In [4]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession, DataFrame

PATH_MAIN = '/project/ds5559/group2nba'
PATH_STACKED = f'{PATH_MAIN}/stacked_data/'
RESULTS_FILE = f'{PATH_MAIN}/results.csv'
TARGET = 'Won'
FEATURES = 'features'

CORES = 4

spark = SparkSession \
    .builder \
    .appName('group2nba') \
    .master(f'local[{CORES}]') \
    .getOrCreate()



In [5]:
from pyspark.sql.types import *
from typing import *
import pyspark.sql.functions as F

T = TypeVar('T')

FIELDS: Dict[str, T] = {
      'Date': StringType
    , 'HomeTeam': StringType
    , 'AwayTeam': StringType
    , 'Team': StringType
    , 'Year': IntegerType
    , 'Won': IntegerType
    
    , 'ScoreDiff': IntegerType
    , 'Quarter': IntegerType
    , 'SecLeftTotal': IntegerType
    , 'LogSecLeftTotal': DoubleType
    , 'SecLeftTotalInverse': DoubleType
    
    , 'HasPossession': IntegerType
    , 'assist_team_cnt': LongType
    , 'assist_opponent_cnt': LongType
    , 'turnover_team_cnt': LongType
    , 'turnover_opponent_cnt': LongType
    , 'block_team_cnt': LongType
    , 'block_opponent_cnt': LongType
    
    , 'foul_team_cnt': LongType
    , 'foul_opponent_cnt': LongType
    , 'rebound_team_cnt': LongType
    , 'rebound_opponent_cnt': LongType
    , 'shotOnGoal_team_cnt': LongType
    , 'shotOnGoal_opponent_cnt': LongType
    , 'freeThrow_team_cnt': LongType
    , 'freeThrow_opponent_cnt': LongType
    
    , 'SecLeftTotalInverseTimesScoreDiff': DoubleType
    , 'assist_diff': IntegerType
    , 'turnover_diff': IntegerType
    , 'block_diff': IntegerType
    , 'foul_diff': IntegerType
    , 'rebound_diff': IntegerType
    , 'shotOnGoal_diff': IntegerType
    , 'freeThrow_diff': IntegerType
}
    
schema = StructType([StructField(k, v()) for k, v in FIELDS.items()])

In [29]:
from dataclasses import dataclass, field

@dataclass
class results_cls:
    model_type: str = ''
    list_features: list = field(default_factory=list)
    date_time_run: str = ''
    user: str = ''
    special_description: str = ''
    cv_elapsed_time: str = ''
    cv_area_under_roc: float = 0.0
    cv_area_under_pr: float = 0.0
    cv_best_coefficients: list = field(default_factory=list)
    cv_best_hyperparameters: dict = field(default_factory=dict)
    val_elapsed_time: str = ''
    val_area_under_roc: float = 0.0
    val_area_under_pr: float = 0.0
    val_best_coefficients: list = field(default_factory=list)

# results = results_cls(date_time_run = datetime.now().strftime("%m/%d/%Y %H:%M:%S"))
# results.model_type='SVM'
# print(results)

In [48]:
class Model:
    logistic = 'Logistic'
    random_forest = 'Random Forest'
    
    TARGET = 'Won'
    FEATURES = 'features'
    
    def __init__(self, list_features, model_type):
        from datetime import datetime
        
        # Defined in Methods
        self.cvModel = None
        self.cvPredictions = None
        self.valModel = None
        self.valPredictions = None
        
        print('Setup Model')
        self.list_features = list_features
        self.pipeline = self.build_pipeline()
        self.model_type = model_type
        self.evaluator = self.build_evaluator()
        self.results = results_cls(
            date_time_run = datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        )
        
    def build_pipeline(self):
        from pyspark.ml import feature as ft
        from pyspark.ml import Pipeline

        # Build the Pipeline
        print('build the pipeline')

        featuresCreator = ft.VectorAssembler(
            inputCols=self.list_features,
            outputCol='vectors'
        )

        sScaler = ft.StandardScaler(
            withMean=True, 
            withStd=True, 
            inputCol='vectors', 
            outputCol='features'
        )

        pipeline = Pipeline(
            stages=[
                featuresCreator,
                sScaler
            ])

        return pipeline
    
    @staticmethod
    def build_evaluator():
        import pyspark.ml.evaluation as ev

        evaluator = ev.BinaryClassificationEvaluator(
            metricName = 'areaUnderROC',
            rawPredictionCol='rawPrediction', 
            labelCol=TARGET
        )
        
        return evaluator
    
    @staticmethod
    def extract_hyperparams(cvModel):
        import re

        hyperparams = cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ]
        hyper_dict = {}

        for i in range(len(hyperparams.items())):
            hyper_name = re.search("name='(.+?)'", str([x for x in hyperparams.items()][i])).group(1)
            hyper_value = [x for x in hyperparams.items()][i][1]

            hyper_dict[hyper_name]= hyper_value

        print(hyper_dict)
    
        return hyper_dict
    
    def evaluate_cv_model(self, test_train_data):
        import time

        # Start the Timer
        start_time = time.time()
        
        train_data, test_data = test_train_data.randomSplit([0.7, 0.3], seed=123) 

        # Fit the Model
        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(train_data)

        print('Transform Train Data + Fit CV Model')
        cvModel = self.cvModel.setParallelism(CORES).fit(data_transformer.transform(train_data))

        print('Transform Test Data')
        data_train = data_transformer.transform(test_data)

        print('Evaluate Model Against Test Data')
        self.cv_predictions = cvModel.transform(data_train)

        print('Store Results')
        
        self.results.cv_area_under_roc = self.evaluator.evaluate(
            self.cv_predictions, 
            {self.evaluator.metricName: 'areaUnderROC'}
        )
        self.results.cv_area_under_pr = self.evaluator.evaluate(
            self.cv_predictions, 
            {self.evaluator.metricName: 'areaUnderPR'}
        )

        # End Timer
        self.results.cv_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in ('Logistic', 'Support Vector Machine'):
            self.results.cv_best_coefficients = cvModel.bestModel.coefficients
            
        self.results.best_hyperparameters = self.extract_hyperparams(cvModel)
        
    def evaluate_val_model(self, test_train_data, validation_data):
        import time

        # Start the Timer
        start_time = time.time()
        
        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(train_test_data)

        print('Transform TestTrain Data + Fit Val Model')
        valModel = self.valModel.fit(data_transformer.transform(train_test_data))
        
        print('Transform Validation Data')
        data_train = data_transformer.transform(validation_data)

        print('Evaluate Model Against Validation Data')
        self.val_predictions = valModel.transform(data_train)
        
        print('Store Results')
        
        self.results.val_area_under_roc = self.evaluator.evaluate(
            self.val_predictions, 
            {self.evaluator.metricName: 'areaUnderROC'}
        )
        self.results.val_area_under_pr = self.evaluator.evaluate(
            self.val_predictions, 
            {self.evaluator.metricName: 'areaUnderPR'}
        )

        # End Timer
        self.results.val_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in ('Logistic', 'Support Vector Machine'):
            self.results.val_best_coefficients = valModel.coefficients
    
    def save_results(results_obj):
        from os.path import exists
    
        results_df = pd.DataFrame(data=[results_obj])
        
        file_exists = exists(RESULTS_FILE)
        if file_exists:
            results_df.to_csv(RESULTS_FILE, mode='a', index=False, header=False, sep="|")
        else:
            results_df.to_csv(RESULTS_FILE, mode='w', index=False, header=True, sep="|")


In [44]:
class SVMModel(Model):
    SVM = 'Support Vector Machine'
    
    def __init__(self, list_features):
        Model.__init__(
            self, 
            list_features = list_features, 
            model_type = self.SVM
        )
        
    def build_cv_model(self):
        import pyspark.ml.evaluation as ev
        from pyspark.ml.classification import LinearSVC
        import pyspark.ml.tuning as tune

        print('Build CVModel: SVM')

        svm = LinearSVC(featuresCol = FEATURES, labelCol=TARGET)

        grid = tune.ParamGridBuilder() \
                    .addGrid(svm.aggregationDepth, [3, 5, 10]) \
                    .addGrid(svm.maxIter, [10, 20, 50]) \
                    .build()

        self.cvModel = tune.CrossValidator( 
            estimator=svm, 
            estimatorParamMaps=grid, 
            evaluator=self.evaluator,
            numFolds=5
        )
    
    def build_val_model(self, hyper_dict):
        from pyspark.ml.classification import LinearSVC

        print('Build ValModel: SVM')

        self.valModel = LinearSVC(featuresCol = FEATURES, labelCol=TARGET, **hyper_dict)

       

In [45]:
class RandomForestModel(Model):
    RANDOMFOREST = 'Random Forest'
    
    def __init__(self, list_features):
        Model.__init__(
            self, 
            list_features = list_features, 
            model_type = self.RANDOMFOREST
        )
        
    def build_cv_model(self):
        from pyspark.ml.classification import RandomForestClassifier
        import pyspark.ml.tuning as tune

        print('Build CVModel: Random Forest')

        random_forest = RandomForestClassifier(featuresCol = FEATURES, labelCol=TARGET)

        grid = tune.ParamGridBuilder() \
            .addGrid(random_forest.maxBins, [2, 3]) \
            .addGrid(random_forest.maxDepth, [3, 5]) \
            .addGrid(random_forest.numTrees, [100, 500]) \
            .build()

        self.cvModel = tune.CrossValidator( 
            estimator=random_forest, 
            estimatorParamMaps=grid, 
            evaluator=self.evaluator,
            numFolds=5
        )
    
    def build_val_model(self, hyper_dict):
        from pyspark.ml.classification import RandomForestClassifier

        print('Build ValModel: Random Forest')

        self.valModel = RandomForestClassifier(featuresCol = FEATURES, labelCol=TARGET, **hyper_dict)

In [46]:
class LogisticModel(Model):
    LOGISTIC = 'Logistic'
    
    def __init__(self, list_features):
        Model.__init__(
            self, 
            list_features = list_features, 
            model_type = self.RANDOMFOREST
        )
        
    def build_cv_model(self):
        from pyspark.ml.classification import LogisticRegression
        import pyspark.ml.tuning as tune

        print('Build CVModel: Logistic Regression')

        logistic = LogisticRegression(featuresCol = FEATURES, labelCol=TARGET)

        grid = tune.ParamGridBuilder() \
            .addGrid(pipeline.getStages()[1].maxIter, [10, 20]) \
            .addGrid(pipeline.getStages()[1].regParam, [0.1, 0.5]) \
            .build()

        self.cvModel = tune.CrossValidator( 
            estimator=logistic, 
            estimatorParamMaps=grid, 
            evaluator=self.evaluator,
            numFolds=5
        )
    
    def build_val_model(self, hyper_dict):
        from pyspark.ml.classification import LogisticRegression

        print('Build ValModel: Logistic Regression')

        self.valModel = LogisticRegression(featuresCol = FEATURES, labelCol=TARGET, **hyper_dict)

In [26]:
def read_in_file(full_file_name):

    df = spark.read \
        .format('csv') \
        .option('header', True) \
        .schema(schema) \
        .load(full_file_name)

#     display(df.count())
#     display(df.printSchema())
#     display(df.head(2))
    
    return df

from os import listdir
from os.path import isfile, join

test_train = 'NBA_PBP_2018-19.csv'
validation = 'NBA_PBP_2019-20.csv'

# Load in your data - Can append more files to test_train if you want
#   by extending the 'read_in_files' with ',' between full file names
test_train_df = read_in_file(join(PATH_STACKED, test_train))
validation_df = read_in_file(join(PATH_STACKED, validation))

# Any Modifications to the Data you want to make
test_train_df = test_train_df.where(F.col('SecLeftTotal') <= 300)
validation_df = validation_df.where(F.col('SecLeftTotal') <= 300)

# LogSecLeftTotal
list_features = ['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftTotalInverseTimesScoreDiff']
    
# test_model = SVMModel(list_features, test_train_df, validation_df)

In [49]:
test_model = SVMModel(list_features)

Attach Data
build the pipeline


In [None]:
test_model.build_cv_model()
test_model.evaluate_cv_model(test_train_df)

Build CVModel: SVM
Build Pipeline
Fit CV Model


In [None]:
test_model.build_val_model()
test_model.evaluate_val_model(test_train_df, validation_df)

test_model.save_results()

In [10]:
view_df = pd.read_csv(RESULTS_FILE, sep='|')
view_df.head()

Unnamed: 0,model_type,list_features,date_time_run,elapsed_time,user,special_description,area_under_roc,area_under_pr,best_coefficients,best_hyperparameters
0,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/17/2022 23:35:57,309.49,Peter,Test Run 19-20 Only,0.820474,0.82324,"[1.4749755253169836,-9.339839834844648e-05,0.4...",[]
1,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/17/2022 23:49:30,306.11,Peter,Test Run 19-20 Only,0.820472,0.823239,"[1.4749598979624223,-9.384355132570414e-05,0.4...",{Param(parent='LogisticRegression_b762bb9a2c14...
2,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/19/2022 11:26:57,151.95,Peter,Test Run 19-20 Only,0.820471,0.82324,"[1.4749598979624223,-9.384355132568476e-05,0.4...",{Param(parent='LogisticRegression_5e121c16d293...
3,Support Vector Machine,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/19/2022 15:31:11,444.19,Peter,Test Run 19-20 Only,0.827061,0.833835,"[1.2087398363120254,0.04048200887116883,21.764...","{Param(parent='LinearSVC_fc987ac4b343', name='..."


In [101]:
train_files = [
    'NBA_PBP_2015-16.csv',
    'NBA_PBP_2016-17.csv',
    'NBA_PBP_2017-18.csv',
    'NBA_PBP_2018-29.csv'
]

validation_file = 'NBA_PBP_2019-20.csv'

# Read in all Train_Files

file_str = ''
for item in train_files:
    file_str = file_str + PATH_STACKED + item + ','
    
file_str = file_str[:-1]
print(file_str)

/project/ds5559/group2nba/stacked_data/NBA_PBP_2015-16.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2016-17.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2017-18.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2018-29.csv
