In [2]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession, DataFrame

# Root of the shared project directory, the folder holding the stacked
# play-by-play CSVs, and the pipe-delimited results log.
PATH_MAIN = '/project/ds5559/group2nba'
PATH_STACKED = f'{PATH_MAIN}/stacked_data/'
RESULTS_FILE = f'{PATH_MAIN}/results_expanded_bonus.csv'

# Label column and assembled/scaled feature-vector column used by the models.
TARGET = 'Won'
FEATURES = 'features'

# Thread count for the local Spark master (also passed to
# CrossValidator.setParallelism further down).
CORES = 4

# One local SparkSession shared by every cell in the notebook.
spark = (
    SparkSession.builder
    .appName('group2nba')
    .master(f'local[{CORES}]')
    .getOrCreate()
)



In [3]:
from pyspark.sql.types import *
from typing import *
import pyspark.sql.functions as F

# NOTE(review): T is an unbound TypeVar, so the annotation on FIELDS is
# informational only; the values are Spark SQL type *classes*.
T = TypeVar('T')

# Column name -> Spark type class. Insertion order is the field order of
# the StructType built below.
FIELDS: Dict[str, T] = {
    'Date': StringType,
    'HomeTeam': StringType,
    'AwayTeam': StringType,
    'Team': StringType,
    'Year': IntegerType,
    'Won': IntegerType,

    'ScoreDiff': IntegerType,
    'Quarter': IntegerType,
    'SecLeftTotal': IntegerType,
    'LogSecLeftTotal': DoubleType,
    'SecLeftTotalInverse': DoubleType,

    'HasPossession': IntegerType,
    'assist_team_cnt': LongType,
    'assist_opponent_cnt': LongType,
    'turnover_team_cnt': LongType,
    'turnover_opponent_cnt': LongType,
    'block_team_cnt': LongType,
    'block_opponent_cnt': LongType,

    'foul_team_cnt': LongType,
    'foul_opponent_cnt': LongType,
    'rebound_team_cnt': LongType,
    'rebound_opponent_cnt': LongType,
    'shotOnGoal_team_cnt': LongType,
    'shotOnGoal_opponent_cnt': LongType,
    'freeThrow_team_cnt': LongType,
    'freeThrow_opponent_cnt': LongType,

    'SecLeftTotalInverseTimesScoreDiff': DoubleType,
    'assist_diff': IntegerType,
    'turnover_diff': IntegerType,
    'block_diff': IntegerType,
    'foul_diff': IntegerType,
    'rebound_diff': IntegerType,
    'shotOnGoal_diff': IntegerType,
    'freeThrow_diff': IntegerType,
}

# Instantiate each type class to build the explicit schema handed to the
# CSV reader.
schema = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in FIELDS.items()
])

In [4]:
from dataclasses import dataclass, field

@dataclass
class results_cls:
    """Record of a single model run: run metadata plus CV and validation metrics.

    An instance is created in ``Model.__init__``, filled in by
    ``Model.evaluate_cv_model`` / ``Model.evaluate_val_model``, and written
    to ``RESULTS_FILE`` by ``save_results``.
    """
    # Model display name and the feature columns, both set by Model.__init__.
    model_type: str = ''
    list_features: list = field(default_factory=list)
    # Run timestamp, formatted "%m/%d/%Y %H:%M:%S" by Model.__init__.
    date_time_run: str = ''
    # Not assigned anywhere in the visible notebook code.
    user: str = ''
    # Free-text note about the run, set by the caller (e.g. 'Quarter = 4').
    special_description: str = ''
    # NOTE(review): evaluate_cv_model stores round(seconds, 2) -- a float --
    # here even though the annotation says str.
    cv_elapsed_time: str = ''
    # Evaluator metrics on the held-out 30% split used by evaluate_cv_model.
    cv_area_under_roc: float = 0.0
    cv_area_under_pr: float = 0.0
    # Coefficients of the best CV model; only filled for Logistic / SVM.
    cv_best_coefficients: list = field(default_factory=list)
    # Best param map as {param name: value}; unpacked into the validation
    # estimator by the build_val_model methods.
    cv_best_hyperparameters: dict = field(default_factory=dict)
    # NOTE(review): also assigned a float (rounded seconds) by
    # evaluate_val_model despite the str annotation.
    val_elapsed_time: str = ''
    # Evaluator metrics on the validation DataFrame.
    val_area_under_roc: float = 0.0
    val_area_under_pr: float = 0.0
    # Coefficients of the validation model; only filled for Logistic / SVM.
    val_best_coefficients: list = field(default_factory=list)
    # Confusion-matrix counts on the validation predictions (label 'Won'
    # versus the 'prediction' column).
    val_true_positive: int = 0
    val_true_negative: int = 0
    val_false_positive: int = 0
    val_false_negative: int = 0

# results = results_cls(date_time_run = datetime.now().strftime("%m/%d/%Y %H:%M:%S"))
# results.model_type='SVM'
# print(results)

In [5]:
class Model:
    """Shared train / evaluate workflow for the binary ``Won`` classifiers.

    Subclasses are expected to provide two methods:

    * ``build_cv_model()`` -- assign a cross-validating estimator to
      ``self.cvModel``.
    * ``build_val_model()`` -- assign a plain estimator (configured with the
      best hyperparameters found by CV) to ``self.valModel``.

    The base class owns the preprocessing pipeline (VectorAssembler ->
    StandardScaler), the evaluator, and the ``results`` record that is
    filled in by ``evaluate_cv_model`` and ``evaluate_val_model``.
    """

    # Display names stored in ``results.model_type``; also used to decide
    # whether the fitted model exposes ``coefficients``.
    logistic = 'Logistic'
    random_forest = 'Random Forest'
    svm = 'Support Vector Machine'

    # Label column and scaled feature-vector column.
    TARGET = 'Won'
    FEATURES = 'features'

    def __init__(self, list_features, model_type):
        """Set up pipeline, evaluator and an empty results record.

        Args:
            list_features: names of the input columns to assemble into the
                feature vector.
            model_type: display name of the model (one of the class-level
                ``logistic`` / ``random_forest`` / ``svm`` strings).
        """
        from datetime import datetime

        # Estimators assigned by the subclass build_* methods.
        self.cvModel = None
        self.valModel = None

        # Prediction DataFrames assigned by evaluate_cv_model /
        # evaluate_val_model. These are the names the methods actually write.
        self.cv_predictions = None
        self.val_predictions = None

        # Legacy attribute names from the original __init__; never populated,
        # kept only so existing references do not raise AttributeError.
        self.cvPredictions = None
        self.valPredictions = None

        print('Setup Model')
        self.list_features = list_features
        self.pipeline = self.build_pipeline()
        self.model_type = model_type
        self.evaluator = self.build_evaluator()
        self.results = results_cls(
            model_type = model_type,
            list_features = list_features,
            date_time_run = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        )

    def build_pipeline(self):
        """Return an unfitted Pipeline: assemble ``list_features``, then standardise.

        The assembled vector is written to ``'vectors'`` and the scaled
        (mean-centred, unit-variance) vector to ``self.FEATURES``.
        """
        from pyspark.ml import feature as ft
        from pyspark.ml import Pipeline

        # Build the Pipeline
        print('build the pipeline')

        featuresCreator = ft.VectorAssembler(
            inputCols=self.list_features,
            outputCol='vectors'
        )

        sScaler = ft.StandardScaler(
            withMean=True,
            withStd=True,
            inputCol='vectors',
            outputCol=self.FEATURES
        )

        return Pipeline(stages=[featuresCreator, sScaler])

    @staticmethod
    def build_evaluator():
        """Return a BinaryClassificationEvaluator (areaUnderROC) on the label column."""
        import pyspark.ml.evaluation as ev

        return ev.BinaryClassificationEvaluator(
            metricName = 'areaUnderROC',
            rawPredictionCol='rawPrediction',
            labelCol=Model.TARGET
        )

    @staticmethod
    def extract_hyperparams(cvModel):
        """Return the best param map of a fitted cross-validator as ``{name: value}``.

        "Best" is the param map with the highest average metric, which
        matches the larger-is-better areaUnderROC evaluator built above.

        Args:
            cvModel: object exposing ``avgMetrics`` and
                ``getEstimatorParamMaps()`` (a fitted CrossValidatorModel).

        Returns:
            dict mapping each parameter's ``name`` to its chosen value.
        """
        best_index = int(np.argmax(cvModel.avgMetrics))
        best_params = cvModel.getEstimatorParamMaps()[best_index]

        # Param objects carry their name directly; no need to parse repr().
        hyper_dict = {param.name: value for param, value in best_params.items()}

        print(hyper_dict)

        return hyper_dict

    def _area_under(self, predictions, metric):
        """Evaluate ``predictions`` with the shared evaluator, overriding its metric name."""
        return self.evaluator.evaluate(
            predictions,
            {self.evaluator.metricName: metric}
        )

    def _confusion_counts(self, predictions):
        """Return ``(tp, tn, fp, fn)`` for label ``TARGET`` versus ``'prediction'``.

        Uses a single grouped count instead of four filtered counts. Rows
        whose label or prediction is not 0/1 (including nulls) are ignored,
        as they were by the original equality filters.
        """
        grouped = predictions.groupBy(self.TARGET, 'prediction').count().collect()
        counts = {
            (row[self.TARGET], row['prediction']): row['count']
            for row in grouped
        }
        # A prediction of 1.0 hashes and compares equal to 1, so integer keys
        # match the double-typed prediction column.
        return (
            counts.get((1, 1), 0),
            counts.get((0, 0), 0),
            counts.get((0, 1), 0),
            counts.get((1, 0), 0),
        )

    def evaluate_cv_model(self, test_train_data):
        """Fit the cross-validator on a 70% split and score it on the other 30%.

        Requires ``build_cv_model()`` to have been called first. Stores the
        predictions in ``self.cv_predictions`` and fills the ``cv_*`` fields
        of ``self.results``.

        Args:
            test_train_data: Spark DataFrame with the feature columns and
                the label column.
        """
        import time

        # Start the Timer
        start_time = time.time()

        # Fixed seed so the split is repeatable between runs.
        train_data, test_data = test_train_data.randomSplit([0.7, 0.3], seed=123)

        # The scaler is fitted on the training split only.
        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(train_data)

        print('Transform Train Data + Fit CV Model')
        fitted_cv = self.cvModel.setParallelism(CORES).fit(
            data_transformer.transform(train_data)
        )

        print('Transform Test Data')
        test_transformed = data_transformer.transform(test_data)

        print('Evaluate Model Against Test Data')
        self.cv_predictions = fitted_cv.transform(test_transformed)

        print('Store Results')
        self.results.cv_area_under_roc = self._area_under(self.cv_predictions, 'areaUnderROC')
        self.results.cv_area_under_pr = self._area_under(self.cv_predictions, 'areaUnderPR')

        # End Timer
        self.results.cv_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in (self.logistic, self.svm):
            self.results.cv_best_coefficients = fitted_cv.bestModel.coefficients

        self.results.cv_best_hyperparameters = self.extract_hyperparams(fitted_cv)

    def evaluate_val_model(self, test_train_data, validation_data):
        """Fit the validation estimator on all train data and score it on ``validation_data``.

        Requires ``build_val_model()`` to have been called first. Stores the
        predictions in ``self.val_predictions`` and fills the ``val_*``
        fields of ``self.results``, including the confusion-matrix counts.

        Args:
            test_train_data: Spark DataFrame used to fit the scaler and the model.
            validation_data: Spark DataFrame the fitted model is scored on.
        """
        import time

        # Start the Timer
        start_time = time.time()

        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(test_train_data)

        print('Transform TestTrain Data + Fit Val Model')
        fitted_val = self.valModel.fit(data_transformer.transform(test_train_data))

        print('Transform Validation Data')
        validation_transformed = data_transformer.transform(validation_data)

        print('Evaluate Model Against Validation Data')
        self.val_predictions = fitted_val.transform(validation_transformed)

        print('Store Results')
        self.results.val_area_under_roc = self._area_under(self.val_predictions, 'areaUnderROC')
        self.results.val_area_under_pr = self._area_under(self.val_predictions, 'areaUnderPR')

        # End Timer
        self.results.val_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in (self.logistic, self.svm):
            self.results.val_best_coefficients = fitted_val.coefficients

        # Count from this instance's own predictions, not from whatever
        # notebook global happens to be named `test_model`.
        (
            self.results.val_true_positive,
            self.results.val_true_negative,
            self.results.val_false_positive,
            self.results.val_false_negative,
        ) = self._confusion_counts(self.val_predictions)


In [6]:
class SVMModel(Model):
    """Linear SVM variant of ``Model``.

    Provides the cross-validating estimator and the final validation
    estimator that the base-class ``evaluate_*`` methods expect.
    """

    def __init__(self, list_features):
        """Create the model for the given feature column names."""
        # Use the base-class constant so the name always matches the
        # `model_type in (self.logistic, self.svm)` check in Model.
        super().__init__(
            list_features = list_features,
            model_type = Model.svm
        )

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over a LinearSVC grid to ``self.cvModel``."""
        from pyspark.ml.classification import LinearSVC
        import pyspark.ml.tuning as tune

        print('Build CVModel: SVM')

        svm = LinearSVC(featuresCol = self.FEATURES, labelCol = self.TARGET)

        # NOTE(review): aggregationDepth is documented by Spark as the
        # suggested depth for treeAggregate, so it may affect speed rather
        # than the fitted model -- confirm it is worth searching over
        # (regParam could be a more informative axis).
        grid = tune.ParamGridBuilder() \
                    .addGrid(svm.aggregationDepth, [3, 5, 10]) \
                    .addGrid(svm.maxIter, [10, 20, 50]) \
                    .build()

        self.cvModel = tune.CrossValidator(
            estimator=svm,
            estimatorParamMaps=grid,
            evaluator=self.evaluator,
            numFolds=5
        )

    def build_val_model(self):
        """Assign a LinearSVC configured with the best CV hyperparameters to ``self.valModel``.

        ``results.cv_best_hyperparameters`` is an empty dict until
        ``evaluate_cv_model`` has run, in which case LinearSVC defaults apply.
        """
        from pyspark.ml.classification import LinearSVC

        print('Build ValModel: SVM')

        self.valModel = LinearSVC(
            featuresCol = self.FEATURES,
            labelCol = self.TARGET,
            **self.results.cv_best_hyperparameters
        )

       

In [7]:
class RandomForestModel(Model):
    """Random-forest variant of ``Model``: supplies the CV and validation estimators."""

    def __init__(self, list_features):
        """Create the model for the given feature column names."""
        super().__init__(list_features=list_features, model_type='Random Forest')

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over a 2x2x2 random-forest grid to ``self.cvModel``."""
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

        print('Build CVModel: Random Forest')

        classifier = RandomForestClassifier(featuresCol=FEATURES, labelCol=TARGET)

        # Search space: bins per feature, tree depth, and forest size.
        grid_builder = ParamGridBuilder()
        grid_builder = grid_builder.addGrid(classifier.maxBins, [2, 3])
        grid_builder = grid_builder.addGrid(classifier.maxDepth, [3, 5])
        grid_builder = grid_builder.addGrid(classifier.numTrees, [100, 500])
        param_grid = grid_builder.build()

        self.cvModel = CrossValidator(
            estimator=classifier,
            estimatorParamMaps=param_grid,
            evaluator=self.evaluator,
            numFolds=5,
        )

    def build_val_model(self):
        """Assign a RandomForestClassifier using the best CV hyperparameters to ``self.valModel``."""
        from pyspark.ml.classification import RandomForestClassifier

        print('Build ValModel: Random Forest')

        # Empty dict (estimator defaults) if cross validation has not run yet.
        best_params = self.results.cv_best_hyperparameters
        self.valModel = RandomForestClassifier(
            featuresCol=FEATURES,
            labelCol=TARGET,
            **best_params
        )

In [8]:
class LogisticModel(Model):
    """Logistic-regression variant of ``Model``: supplies the CV and validation estimators."""

    def __init__(self, list_features):
        """Create the model for the given feature column names."""
        super().__init__(list_features=list_features, model_type='Logistic')

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over maxIter x regParam to ``self.cvModel``."""
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

        print('Build CVModel: Logistic Regression')

        estimator = LogisticRegression(featuresCol=FEATURES, labelCol=TARGET)

        # 2 x 2 grid: iteration cap and regularisation strength.
        param_grid = (
            ParamGridBuilder()
            .addGrid(estimator.maxIter, [10, 20])
            .addGrid(estimator.regParam, [0.1, 0.5])
            .build()
        )

        self.cvModel = CrossValidator(
            estimator=estimator,
            estimatorParamMaps=param_grid,
            evaluator=self.evaluator,
            numFolds=5,
        )

    def build_val_model(self):
        """Assign a LogisticRegression using the best CV hyperparameters to ``self.valModel``."""
        from pyspark.ml.classification import LogisticRegression

        print('Build ValModel: Logistic Regression')

        # Empty dict (estimator defaults) if cross validation has not run yet.
        best_params = self.results.cv_best_hyperparameters
        self.valModel = LogisticRegression(
            featuresCol=FEATURES,
            labelCol=TARGET,
            **best_params
        )

In [9]:
# Season files used for the train/test data. Uncomment earlier seasons to
# include them as well.
train_files = [
#     'NBA_PBP_2015-16.csv',
#     'NBA_PBP_2016-17.csv',
#     'NBA_PBP_2017-18.csv',
    'NBA_PBP_2018-19.csv'
]

# Season file reserved for validation.
validation_file = 'NBA_PBP_2019-20.csv'

# Full paths of every training file, in the order listed above.
file_list = [PATH_STACKED + file_name for file_name in train_files]

print(file_list)

['/project/ds5559/group2nba/stacked_data/NBA_PBP_2018-19.csv']


In [10]:
def read_in_file(full_file_name):
    """Read CSV data into a Spark DataFrame using the notebook-level ``schema``.

    Args:
        full_file_name: a single path, or a list of paths (both forms are
            used by callers in this notebook).

    Returns:
        Spark DataFrame with the first line of each file treated as a header.
    """
    reader = spark.read.format('csv')
    reader = reader.option('header', True)
    reader = reader.schema(schema)

    return reader.load(full_file_name)

from os import listdir
from os.path import isfile, join

# NOTE(review): this reads the training files without the explicit schema or
# the header option, and `df` is not referenced again in the visible cells --
# looks like a leftover; confirm before removing.
df = spark.read.csv(file_list)

# Alternative single-season training file (disabled).
# test_train = 'NBA_PBP_2015-16.csv'
# Season file used as the validation set.
validation = 'NBA_PBP_2019-20.csv'

# Load the data with the explicit schema. To train on more seasons, add
# their file names to `train_files` (which feeds `file_list`).
test_train_df = read_in_file(file_list)
validation_df = read_in_file(join(PATH_STACKED, validation))


# Optional filter (disabled): keep only rows with SecLeftTotal <= 300.
# test_train_df = test_train_df.where(F.col('SecLeftTotal') <= 300)
# validation_df = validation_df.where(F.col('SecLeftTotal') <= 300)

In [11]:
def save_results(results, results_file=None):
    """Append one results record to the pipe-delimited results CSV.

    The header row is written only when the target file does not exist yet
    or exists but is empty; otherwise the record is appended without one.

    Args:
        results: a single record accepted by ``pd.DataFrame(data=[...])``
            (in this notebook, a ``results_cls`` dataclass instance).
        results_file: path of the CSV to write. Defaults to the
            notebook-level ``RESULTS_FILE`` when omitted.
    """
    from os.path import exists, getsize

    # Resolve the default lazily so the global is only needed when used.
    target = RESULTS_FILE if results_file is None else results_file

    results_df = pd.DataFrame(data=[results])

    # An existing zero-byte file still needs a header row.
    needs_header = (not exists(target)) or getsize(target) == 0

    results_df.to_csv(
        target,
        mode='w' if needs_header else 'a',
        index=False,
        header=needs_header,
        sep="|"
    )

def run_model(test_model):
    """Run the full CV -> validation -> save workflow for one model.

    Reads the notebook-level ``test_train_df`` and ``validation_df`` at call
    time, so whatever those globals currently hold (including any filters
    applied in the calling cell) is what the model is trained and scored on.

    Args:
        test_model: object providing ``build_cv_model``,
            ``evaluate_cv_model``, ``build_val_model``,
            ``evaluate_val_model`` and a ``results`` attribute.
    """

    # Run Cross Validation (Find Best Hyperparameters)
    test_model.build_cv_model()
    test_model.evaluate_cv_model(test_train_df)

    # Run Model Validation
    # (build_val_model reads the hyperparameters stored by the CV step above)
    test_model.build_val_model()
    test_model.evaluate_val_model(test_train_df, validation_df)

    # Save the Results
    save_results(test_model.results)

In [None]:
# Other columns that can be used as features:
#   'HasPossession', 'assist_team_cnt', 'assist_opponent_cnt', 'turnover_team_cnt'
# , 'turnover_opponent_cnt', 'block_team_cnt', 'block_opponent_cnt'
# , 'foul_team_cnt', 'foul_opponent_cnt', 'rebound_team_cnt', 'rebound_opponent_cnt'
# , 'shotOnGoal_team_cnt', 'shotOnGoal_opponent_cnt', 'freeThrow_team_cnt'
# , 'freeThrow_opponent_cnt'
# , 'SecLeftTotalInverseTimesScoreDiff'
# , 'assist_diff', 'turnover_diff', 'block_diff', 'foul_diff', 'rebound_diff'
# , 'shotOnGoal_diff', 'freeThrow_diff'
# list_features = ['ScoreDiff']

# Features used for this run (pick from the list above)
list_features = [
    'assist_diff',
    'turnover_diff',
    'block_diff',
    'foul_diff',
    'rebound_diff',
    'shotOnGoal_diff',
    'freeThrow_diff',
]

# Reload from disk so filters from earlier runs of this cell do not stack.
test_train_df = read_in_file(file_list)
validation_df = read_in_file(join(PATH_STACKED, validation))

# Restrict both data sets to rows where Quarter == 4
fourth_quarter_only = F.col('Quarter') == 4
test_train_df = test_train_df.where(fourth_quarter_only)
validation_df = validation_df.where(fourth_quarter_only)

# Same workflow for each model class. RandomForestModel is left out of this
# run; add it to the tuple to include it. `test_model` stays bound to the
# last model afterwards for the inspection cells below.
for model_class in (SVMModel, LogisticModel):
    test_model = model_class(list_features)
    test_model.results.special_description = 'Quarter = 4'
    run_model(test_model)

Setup Model
build the pipeline
Build CVModel: SVM
Build Data Transformer
Transform Train Data + Fit CV Model


In [16]:
# Load the pipe-delimited results log written by save_results and display
# its last 10 rows.
view_df = pd.read_csv(RESULTS_FILE, sep='|')
view_df.tail(10)

Unnamed: 0,model_type,list_features,date_time_run,user,special_description,cv_elapsed_time,cv_area_under_roc,cv_area_under_pr,cv_best_coefficients,cv_best_hyperparameters,val_elapsed_time,val_area_under_roc,val_area_under_pr,val_best_coefficients,val_true_positive,val_true_negative,val_false_positive,val_false_negative
57,Logistic,"['assist_diff', 'turnover_diff', 'block_diff',...",04/20/2022 12:51:28,,Quarter >= 3,102.19,0.829695,0.828194,[ 0.5812116 -0.344193 -0.22327406 0.035643...,"{'maxIter': 20, 'regParam': 0.1}",22.46,0.786536,0.777428,[ 0.58121907 -0.34495717 -0.22373361 0.034718...,195987,195987,78475,78475
58,Logistic,"['ScoreDiff', 'SecLeftTotalInverseTimesScoreDi...",04/20/2022 12:52:07,,Quarter >= 3,91.5,0.910001,0.910292,[0.449166 0.06199055],"{'maxIter': 20, 'regParam': 0.5}",18.46,0.901117,0.901426,[0.44929998 0.0619288 ],219873,229309,45153,54589
59,Support Vector Machine,"['assist_diff', 'turnover_diff', 'block_diff',...",04/20/2022 12:56:25,,Quarter >= 4,339.56,0.862297,0.863474,[ 0.75415664 -1.36413206 -0.20948062 0.053488...,"{'aggregationDepth': 10, 'maxIter': 50}",25.43,0.820415,0.813079,[ 0.75870619 -1.36246121 -0.20495591 0.059093...,106313,106313,36806,36806
60,Support Vector Machine,"['ScoreDiff', 'SecLeftTotalInverseTimesScoreDi...",04/20/2022 12:56:41,,Quarter >= 4,389.04,0.953682,0.955739,[ 2.10062818 24.57609777],"{'aggregationDepth': 5, 'maxIter': 20}",20.29,0.947664,0.949794,[ 2.21227477 20.56390867],120800,125976,17143,22319
61,Logistic,"['assist_diff', 'turnover_diff', 'block_diff',...",04/20/2022 13:02:43,,Quarter >= 4,71.83,0.84707,0.84568,[ 0.61398379 -0.36178735 -0.23379962 0.028837...,"{'maxIter': 20, 'regParam': 0.1}",17.55,0.801229,0.793778,[ 0.61627394 -0.3620753 -0.23038445 0.030426...,103500,103500,39619,39619
62,Logistic,"['ScoreDiff', 'SecLeftTotalInverseTimesScoreDi...",04/20/2022 13:03:42,,Quarter >= 4,67.53,0.94889,0.95064,[0.48378834 0.08352543],"{'maxIter': 10, 'regParam': 0.5}",16.05,0.941905,0.94344,[0.48369291 0.08341884],120800,125976,17143,22319
63,Support Vector Machine,"['assist_diff', 'turnover_diff', 'block_diff',...",04/20/2022 13:06:53,,Quarter >= 5,208.17,0.650696,0.599396,[-0.05058997 -1.86745284 -0.05936869 0.189597...,"{'aggregationDepth': 5, 'maxIter': 50}",17.76,0.640618,0.629317,[ 0.00450144 -1.78683662 -0.08343831 0.228016...,2636,2636,1876,1876
64,Logistic,"['assist_diff', 'turnover_diff', 'block_diff',...",04/20/2022 13:10:45,,Quarter >= 5,42.27,0.615654,0.607782,[ 0.03786394 -0.24566945 -0.11224499 -0.027887...,"{'maxIter': 20, 'regParam': 0.1}",7.46,0.625692,0.596167,[ 0.04709074 -0.23582563 -0.11578692 -0.025220...,2679,2679,1833,1833
65,Support Vector Machine,"['ScoreDiff', 'SecLeftTotalInverseTimesScoreDi...",04/20/2022 13:06:43,,Quarter >= 5,268.12,0.815262,0.823383,[1.00906297 2.46910011],"{'aggregationDepth': 5, 'maxIter': 50}",24.03,0.5,0.5,[-0. -0.],0,4512,0,4512
66,Logistic,"['ScoreDiff', 'SecLeftTotalInverseTimesScoreDi...",04/20/2022 13:11:42,,Quarter >= 5,38.98,0.811787,0.817189,[0.73565047 0.32991269],"{'maxIter': 10, 'regParam': 0.1}",9.12,0.875325,0.880782,[0.74147983 0.33760455],3113,3833,679,1399


In [61]:
import pyspark.sql.functions as F
# Spot-check up to 20 validation rows where Won != 1 although ScoreDiff > 1,
# showing the label, the score difference and the model's prediction.
# Relies on `test_model` still holding the most recently run model.
test_model.val_predictions.where((F.col('Won') != 1) & (F.col('ScoreDiff') > 1)).select('Won', 'ScoreDiff', 'prediction').take(20)

[Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=3, prediction=1.0),
 Row(Won=0, ScoreDiff=3, prediction=1.0),
 Row(Won=0, ScoreDiff=4, prediction=1.0),
 Row(Won=0, ScoreDiff=5, prediction=1.0)]