In [1]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession, DataFrame

# Project locations on the shared file system.
PATH_MAIN = '/project/ds5559/group2nba'
PATH_STACKED = f'{PATH_MAIN}/stacked_data/'
RESULTS_FILE = f'{PATH_MAIN}/results_expanded.csv'

# Column names: the label column and the assembled feature-vector column.
TARGET = 'Won'
FEATURES = 'features'

# Number of local Spark worker threads (also used as CrossValidator parallelism).
CORES = 4

# Local-mode Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName('group2nba')
    .master(f'local[{CORES}]')
    .getOrCreate()
)



In [2]:
from pyspark.sql.types import *
from typing import *
import pyspark.sql.functions as F

# Kept for backward compatibility with cells that may still reference it;
# FIELDS itself no longer uses it (an unbound TypeVar is not a valid value type).
T = TypeVar('T')

# Column name -> Spark type class, in the column order applied to the CSV files.
# Values are the type *classes*; they are instantiated when the schema is built.
FIELDS: Dict[str, Type[DataType]] = {
    # Game / row identification and the label.
    'Date': StringType,
    'HomeTeam': StringType,
    'AwayTeam': StringType,
    'Team': StringType,
    'Year': IntegerType,
    'Won': IntegerType,

    # Score and clock state.
    'ScoreDiff': IntegerType,
    'Quarter': IntegerType,
    'SecLeftTotal': IntegerType,
    'LogSecLeftTotal': DoubleType,
    'SecLeftTotalInverse': DoubleType,

    # Per-side event counts.
    'HasPossession': IntegerType,
    'assist_team_cnt': LongType,
    'assist_opponent_cnt': LongType,
    'turnover_team_cnt': LongType,
    'turnover_opponent_cnt': LongType,
    'block_team_cnt': LongType,
    'block_opponent_cnt': LongType,

    'foul_team_cnt': LongType,
    'foul_opponent_cnt': LongType,
    'rebound_team_cnt': LongType,
    'rebound_opponent_cnt': LongType,
    'shotOnGoal_team_cnt': LongType,
    'shotOnGoal_opponent_cnt': LongType,
    'freeThrow_team_cnt': LongType,
    'freeThrow_opponent_cnt': LongType,

    # Derived interaction / difference columns.
    'SecLeftTotalInverseTimesScoreDiff': DoubleType,
    'assist_diff': IntegerType,
    'turnover_diff': IntegerType,
    'block_diff': IntegerType,
    'foul_diff': IntegerType,
    'rebound_diff': IntegerType,
    'shotOnGoal_diff': IntegerType,
    'freeThrow_diff': IntegerType,
}

# Explicit schema handed to the CSV reader; dict insertion order fixes the
# field order.
schema = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in FIELDS.items()
])

In [3]:
from dataclasses import dataclass, field

@dataclass
class results_cls:
    """One row of the results log: run metadata plus CV and validation metrics.

    Field order is the column order of the saved results file, so new fields
    should only be appended.
    """
    # Run metadata.
    model_type: str = ''
    list_features: list = field(default_factory=list)
    date_time_run: str = ''
    user: str = ''
    special_description: str = ''
    # Cross-validation stage. Elapsed time is stored as a number of seconds
    # (the evaluation code assigns a rounded float), hence float rather than str.
    cv_elapsed_time: float = 0.0
    cv_area_under_roc: float = 0.0
    cv_area_under_pr: float = 0.0
    cv_best_coefficients: list = field(default_factory=list)
    cv_best_hyperparameters: dict = field(default_factory=dict)
    # Validation stage (same conventions as the CV fields).
    val_elapsed_time: float = 0.0
    val_area_under_roc: float = 0.0
    val_area_under_pr: float = 0.0
    val_best_coefficients: list = field(default_factory=list)

# results = results_cls(date_time_run = datetime.now().strftime("%m/%d/%Y %H:%M:%S"))
# results.model_type='SVM'
# print(results)

In [37]:
class Model:
    """Shared fit/evaluate workflow for the binary win classifiers.

    Subclasses are expected to provide ``build_cv_model`` (assigning a
    cross-validator to ``self.cvModel``) and ``build_val_model`` (assigning an
    estimator to ``self.valModel``). This base class owns the feature
    pipeline, the evaluator and the recording of metrics in ``self.results``.
    """
    # Display names stored in results.model_type; also used to decide whether
    # a fitted model exposes coefficients.
    logistic = 'Logistic'
    random_forest = 'Random Forest'
    svm = 'Support Vector Machine'

    TARGET = 'Won'
    FEATURES = 'features'

    def __init__(self, list_features, model_type):
        """Create the pipeline, the evaluator and an empty results record.

        Args:
            list_features: names of the input columns assembled into the
                feature vector.
            model_type: display name of the model (see the class constants).
        """
        from datetime import datetime

        # Assigned by build_cv_model / build_val_model.
        self.cvModel = None
        self.valModel = None
        # Assigned by evaluate_cv_model / evaluate_val_model. Initialised under
        # the same names those methods write, so the attributes always exist.
        self.cv_predictions = None
        self.val_predictions = None

        print('Setup Model')
        self.list_features = list_features
        self.pipeline = self.build_pipeline()
        self.model_type = model_type
        self.evaluator = self.build_evaluator()
        self.results = results_cls(
            model_type=model_type,
            list_features=list_features,
            date_time_run=datetime.now().strftime("%m/%d/%Y %H:%M:%S")
        )

    def build_pipeline(self):
        """Return an unfitted pipeline: assemble the features, then standardise them."""
        from pyspark.ml import feature as ft
        from pyspark.ml import Pipeline

        # Build the Pipeline
        print('build the pipeline')

        featuresCreator = ft.VectorAssembler(
            inputCols=self.list_features,
            outputCol='vectors'
        )

        sScaler = ft.StandardScaler(
            withMean=True,
            withStd=True,
            inputCol='vectors',
            outputCol='features'
        )

        return Pipeline(stages=[featuresCreator, sScaler])

    @staticmethod
    def build_evaluator():
        """Return a binary-classification evaluator defaulting to areaUnderROC."""
        import pyspark.ml.evaluation as ev

        return ev.BinaryClassificationEvaluator(
            metricName='areaUnderROC',
            rawPredictionCol='rawPrediction',
            labelCol=TARGET
        )

    @staticmethod
    def extract_hyperparams(cvModel):
        """Return ``{param name: value}`` for the best grid point of a fitted CV model.

        The best point is the one with the highest average metric, so this
        assumes a larger-is-better metric (true for the evaluator built here).
        """
        best_index = int(np.argmax(cvModel.avgMetrics))
        best_param_map = cvModel.getEstimatorParamMaps()[best_index]

        # Read the name straight off each Param instead of parsing its repr.
        hyper_dict = {param.name: value for param, value in best_param_map.items()}

        print(hyper_dict)

        return hyper_dict

    def _score_predictions(self, predictions):
        """Return ``(areaUnderROC, areaUnderPR)`` for a predictions DataFrame."""
        area_under_roc = self.evaluator.evaluate(
            predictions,
            {self.evaluator.metricName: 'areaUnderROC'}
        )
        area_under_pr = self.evaluator.evaluate(
            predictions,
            {self.evaluator.metricName: 'areaUnderPR'}
        )
        return area_under_roc, area_under_pr

    def evaluate_cv_model(self, test_train_data):
        """Cross-validate on a 70/30 split and record the CV results.

        Fits the feature pipeline and the cross-validator on the 70% split,
        scores the best model on the 30% split, and stores the metrics,
        elapsed seconds, best hyperparameters and (for linear models) the
        coefficients in ``self.results``.

        Raises:
            RuntimeError: if ``build_cv_model`` has not been called yet.
        """
        import time

        if self.cvModel is None:
            raise RuntimeError('build_cv_model() must be called before evaluate_cv_model()')

        # Start the Timer
        start_time = time.time()

        train_data, test_data = test_train_data.randomSplit([0.7, 0.3], seed=123)

        # Fit the Model
        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(train_data)

        print('Transform Train Data + Fit CV Model')
        cvModel = self.cvModel.setParallelism(CORES).fit(data_transformer.transform(train_data))

        print('Transform Test Data')
        test_features = data_transformer.transform(test_data)

        print('Evaluate Model Against Test Data')
        self.cv_predictions = cvModel.transform(test_features)

        print('Store Results')
        (self.results.cv_area_under_roc,
         self.results.cv_area_under_pr) = self._score_predictions(self.cv_predictions)

        # End Timer (seconds, including the scoring above)
        self.results.cv_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in (self.logistic, self.svm):
            self.results.cv_best_coefficients = cvModel.bestModel.coefficients

        self.results.cv_best_hyperparameters = self.extract_hyperparams(cvModel)

    def evaluate_val_model(self, test_train_data, validation_data):
        """Fit on all test/train data, score on the validation data, record results.

        Raises:
            RuntimeError: if ``build_val_model`` has not been called yet.
        """
        import time

        if self.valModel is None:
            raise RuntimeError('build_val_model() must be called before evaluate_val_model()')

        # Start the Timer
        start_time = time.time()

        print('Build Data Transformer')
        data_transformer = self.pipeline.fit(test_train_data)

        print('Transform TestTrain Data + Fit Val Model')
        valModel = self.valModel.fit(data_transformer.transform(test_train_data))

        print('Transform Validation Data')
        validation_features = data_transformer.transform(validation_data)

        print('Evaluate Model Against Validation Data')
        self.val_predictions = valModel.transform(validation_features)

        print('Store Results')
        (self.results.val_area_under_roc,
         self.results.val_area_under_pr) = self._score_predictions(self.val_predictions)

        # End Timer (seconds, including the scoring above)
        self.results.val_elapsed_time = round((time.time() - start_time), 2)

        # Random Forest doesn't have coefficients
        if self.model_type in (self.logistic, self.svm):
            self.results.val_best_coefficients = valModel.coefficients


In [39]:
class SVMModel(Model):
    """Linear support-vector classifier plugged into the shared Model workflow."""

    def __init__(self, list_features):
        """Set up the base workflow with the SVM display name.

        Args:
            list_features: names of the input columns to use as features.
        """
        # Use the shared constant so the coefficient check in Model (which
        # compares model_type against Model.svm) cannot drift from this class.
        super().__init__(
            list_features=list_features,
            model_type=self.svm
        )

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over a LinearSVC grid to ``self.cvModel``."""
        from pyspark.ml.classification import LinearSVC
        import pyspark.ml.tuning as tune

        print('Build CVModel: SVM')

        svm = LinearSVC(featuresCol=FEATURES, labelCol=TARGET)

        # 3 x 3 grid: aggregationDepth by maxIter.
        grid = tune.ParamGridBuilder() \
                    .addGrid(svm.aggregationDepth, [3, 5, 10]) \
                    .addGrid(svm.maxIter, [10, 20, 50]) \
                    .build()

        self.cvModel = tune.CrossValidator(
            estimator=svm,
            estimatorParamMaps=grid,
            evaluator=self.evaluator,
            numFolds=5
        )

    def build_val_model(self):
        """Assign a LinearSVC using the best CV hyperparameters to ``self.valModel``.

        If cross-validation has not stored any hyperparameters, the dict is
        empty and the estimator falls back to its defaults.
        """
        from pyspark.ml.classification import LinearSVC

        print('Build ValModel: SVM')

        self.valModel = LinearSVC(
            featuresCol=FEATURES,
            labelCol=TARGET,
            **self.results.cv_best_hyperparameters
        )

       

In [38]:
class RandomForestModel(Model):
    """Random-forest classifier plugged into the shared Model workflow."""

    def __init__(self, list_features):
        """Set up the base workflow with the random-forest display name.

        Args:
            list_features: names of the input columns to use as features.
        """
        # Use the shared constant rather than retyping the literal.
        super().__init__(
            list_features=list_features,
            model_type=self.random_forest
        )

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over a random-forest grid to ``self.cvModel``."""
        from pyspark.ml.classification import RandomForestClassifier
        import pyspark.ml.tuning as tune

        print('Build CVModel: Random Forest')

        random_forest = RandomForestClassifier(featuresCol=FEATURES, labelCol=TARGET)

        # 2 x 2 x 2 grid: maxBins by maxDepth by numTrees.
        grid = tune.ParamGridBuilder() \
            .addGrid(random_forest.maxBins, [2, 3]) \
            .addGrid(random_forest.maxDepth, [3, 5]) \
            .addGrid(random_forest.numTrees, [100, 500]) \
            .build()

        self.cvModel = tune.CrossValidator(
            estimator=random_forest,
            estimatorParamMaps=grid,
            evaluator=self.evaluator,
            numFolds=5
        )

    def build_val_model(self, hyper_dict=None):
        """Assign a RandomForestClassifier for the validation run to ``self.valModel``.

        Args:
            hyper_dict: optional ``{param name: value}`` overrides. When
                omitted, the best hyperparameters stored by cross-validation
                are used. The argument is optional so this method can be
                called with no arguments, like the other model classes.
        """
        from pyspark.ml.classification import RandomForestClassifier

        print('Build ValModel: Random Forest')

        if hyper_dict is None:
            hyper_dict = self.results.cv_best_hyperparameters

        self.valModel = RandomForestClassifier(
            featuresCol=FEATURES,
            labelCol=TARGET,
            **hyper_dict
        )

In [40]:
class LogisticModel(Model):
    """Logistic-regression classifier plugged into the shared Model workflow."""

    def __init__(self, list_features):
        """Set up the base workflow with the logistic display name.

        Args:
            list_features: names of the input columns to use as features.
        """
        # Use the shared constant so the coefficient check in Model (which
        # compares model_type against Model.logistic) cannot drift from this class.
        super().__init__(
            list_features=list_features,
            model_type=self.logistic
        )

    def build_cv_model(self):
        """Assign a 5-fold CrossValidator over a logistic-regression grid to ``self.cvModel``."""
        from pyspark.ml.classification import LogisticRegression
        import pyspark.ml.tuning as tune

        print('Build CVModel: Logistic Regression')

        logistic = LogisticRegression(featuresCol=FEATURES, labelCol=TARGET)

        # 2 x 2 grid: maxIter by regParam.
        grid = tune.ParamGridBuilder() \
            .addGrid(logistic.maxIter, [10, 20]) \
            .addGrid(logistic.regParam, [0.1, 0.5]) \
            .build()

        self.cvModel = tune.CrossValidator(
            estimator=logistic,
            estimatorParamMaps=grid,
            evaluator=self.evaluator,
            numFolds=5
        )

    def build_val_model(self):
        """Assign a LogisticRegression using the best CV hyperparameters to ``self.valModel``.

        If cross-validation has not stored any hyperparameters, the dict is
        empty and the estimator falls back to its defaults.
        """
        from pyspark.ml.classification import LogisticRegression

        print('Build ValModel: Logistic Regression')

        self.valModel = LogisticRegression(
            featuresCol=FEATURES,
            labelCol=TARGET,
            **self.results.cv_best_hyperparameters
        )

In [74]:
# Season files used for cross-validation and training.
train_files = [
    'NBA_PBP_2015-16.csv',
    'NBA_PBP_2016-17.csv',
    'NBA_PBP_2017-18.csv',
    'NBA_PBP_2018-19.csv'
]

# Season file held out for validation.
validation_file = 'NBA_PBP_2019-20.csv'

# Full paths of every training file (PATH_STACKED already ends with '/').
file_list = [PATH_STACKED + season_file for season_file in train_files]

print(file_list)

['/project/ds5559/group2nba/stacked_data/NBA_PBP_2015-16.csv', '/project/ds5559/group2nba/stacked_data/NBA_PBP_2016-17.csv', '/project/ds5559/group2nba/stacked_data/NBA_PBP_2017-18.csv', '/project/ds5559/group2nba/stacked_data/NBA_PBP_2018-19.csv']


In [76]:
def read_in_file(full_file_name):
    """Load CSV data into a Spark DataFrame using the notebook's fixed schema.

    Args:
        full_file_name: a path, or a list of paths, passed straight to the
            Spark CSV reader.

    Returns:
        The DataFrame read with a header row and the module-level ``schema``.
    """
    return spark.read.csv(full_file_name, schema=schema, header=True)

from os import listdir
from os.path import isfile, join

# NOTE: the former `df = spark.read.csv(file_list)` was removed here. It read
# every training file without a header or schema and its result was never used.

# test_train = 'NBA_PBP_2015-16.csv'
validation = 'NBA_PBP_2019-20.csv'

# Load the data. read_in_file is handed the whole list of training paths, so
# additional seasons can be included by adding their paths to file_list.
test_train_df = read_in_file(file_list)
validation_df = read_in_file(join(PATH_STACKED, validation))

# Any Modifications to the Data you want to make
# test_train_df = test_train_df.where(F.col('SecLeftTotal') <= 300)
# validation_df = validation_df.where(F.col('SecLeftTotal') <= 300)

# test_model = SVMModel(list_features, test_train_df, validation_df)

In [77]:
def save_results(results):
    """Write one results record to RESULTS_FILE as a '|'-separated row.

    The file is created with a header row when it does not exist yet;
    otherwise the row is appended without a header.
    """
    import os.path

    is_new_file = not os.path.exists(RESULTS_FILE)
    row = pd.DataFrame(data=[results])

    row.to_csv(
        RESULTS_FILE,
        mode='w' if is_new_file else 'a',
        index=False,
        header=is_new_file,
        sep="|",
    )

def run_model(test_model, test_train_data=None, validation_data=None):
    """Run cross-validation, then validation, then save the results.

    Args:
        test_model: a model object providing build_cv_model, evaluate_cv_model,
            build_val_model, evaluate_val_model and a ``results`` attribute.
        test_train_data: DataFrame used for cross-validation and final
            fitting. Defaults to the notebook-level ``test_train_df``.
        validation_data: DataFrame used for the final validation scoring.
            Defaults to the notebook-level ``validation_df``.
    """
    # Fall back to the notebook globals so existing run_model(model) calls
    # behave exactly as before.
    if test_train_data is None:
        test_train_data = test_train_df
    if validation_data is None:
        validation_data = validation_df

    # Run Cross Validation (Find Best Hyperparameters)
    test_model.build_cv_model()
    test_model.evaluate_cv_model(test_train_data)

    # Run Model Validation (uses the hyperparameters found above)
    test_model.build_val_model()
    test_model.evaluate_val_model(test_train_data, validation_data)

    # Save the Results
    save_results(test_model.results)

In [None]:
# Other candidate feature columns (add any of these to list_features below):
#   'HasPossession', 'assist_team_cnt', 'assist_opponent_cnt', 'turnover_team_cnt'
# , 'turnover_opponent_cnt', 'block_team_cnt', 'block_opponent_cnt'
# , 'foul_team_cnt', 'foul_opponent_cnt', 'rebound_team_cnt', 'rebound_opponent_cnt'
# , 'shotOnGoal_team_cnt', 'shotOnGoal_opponent_cnt', 'freeThrow_team_cnt'
# , 'freeThrow_opponent_cnt'
# , 'SecLeftTotalInverseTimesScoreDiff'
# , 'assist_diff', 'turnover_diff', 'block_diff', 'foul_diff', 'rebound_diff'
# , 'shotOnGoal_diff', 'freeThrow_diff'

# Also available: LogSecLeftTotal
# Feature columns used for this run.
list_features = ['ScoreDiff']

# Alternative model: linear SVM.
# test_model = SVMModel(list_features)
# run_model(test_model)

# Active model: logistic regression. run_model cross-validates, validates
# and saves the results.
test_model = LogisticModel(list_features)
run_model(test_model)

# Alternative model: random forest.
# test_model = RandomForestModel(list_features)
# run_model(test_model)

Setup Model
build the pipeline
Build CVModel: Logistic Regression
Build Data Transformer
Transform Train Data + Fit CV Model


In [65]:
# Load the accumulated results log; '|' is the separator save_results writes with.
view_df = pd.read_csv(RESULTS_FILE, sep='|')
# Show the first ten logged runs.
view_df.head(10)

Unnamed: 0,model_type,list_features,date_time_run,user,special_description,cv_elapsed_time,cv_area_under_roc,cv_area_under_pr,cv_best_coefficients,cv_best_hyperparameters,val_elapsed_time,val_area_under_roc,val_area_under_pr,val_best_coefficients
0,,[],04/19/2022 19:09:36,,,59.2,0.980536,0.981152,[ 0.50661366 -0.00099486 0.11844983],{},12.56,0.981701,0.982327,[5.06565419e-01 8.31596062e-18 1.18880671e-01]
1,,[],04/19/2022 19:25:37,,,52.73,0.980539,0.981151,[ 0.50661366 -0.00099486 0.11844983],{},12.1,0.981703,0.982326,[5.06565419e-01 5.08373086e-18 1.18880671e-01]
2,,[],04/19/2022 19:25:37,,,52.73,0.980539,0.981151,[ 0.50661366 -0.00099486 0.11844983],{},12.1,0.981703,0.982326,[5.06565419e-01 5.08373086e-18 1.18880671e-01]
3,Logistic,[],04/19/2022 19:30:01,,,57.32,0.980542,0.981152,[ 0.50661366 -0.00099486 0.11844983],{},11.63,0.981703,0.982326,[5.06565419e-01 5.69033792e-18 1.18880671e-01]
4,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/19/2022 19:37:12,,,51.47,0.980539,0.98115,[ 0.50661366 -0.00099486 0.11844983],{},11.17,0.981694,0.982325,[5.06565419e-01 5.08798883e-18 1.18880671e-01]
5,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/19/2022 19:42:58,,,55.17,0.980542,0.981152,[ 0.50661366 -0.00099486 0.11844983],"{'maxIter': 20, 'regParam': 0.5}",12.04,0.981702,0.982329,[5.06565419e-01 1.01821348e-17 1.18880671e-01]
6,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/19/2022 19:46:19,,,51.99,0.984891,0.985332,[0.50903018 0.00076816 0.12166393],"{'maxIter': 20, 'regParam': 0.5}",11.86,0.981698,0.982328,[5.08990856e-01 3.46960544e-18 1.21583108e-01]
7,Logistic,"['ScoreDiff', 'SecLeftTotalInverse']",04/19/2022 19:50:03,,,56.16,0.983363,0.98416,[0.51731155 0.00091616],"{'maxIter': 10, 'regParam': 0.5}",11.22,0.979936,0.980646,[5.17276795e-01 1.48025348e-18]
8,Logistic,['ScoreDiff'],04/19/2022 19:52:49,,,50.71,0.983404,0.983935,[1.34998164],"{'maxIter': 10, 'regParam': 0.1}",11.75,0.979936,0.980652,[1.35004335]
9,Logistic,['ScoreDiff'],04/19/2022 20:12:32,,,125.03,0.834392,0.837142,[0.90569828],"{'maxIter': 10, 'regParam': 0.1}",24.17,0.81786,0.819785,[0.90682137]


In [61]:
import pyspark.sql.functions as F
# First 20 validation rows where Won is not 1 although ScoreDiff is above 1,
# shown alongside the model's prediction.
(
    test_model.val_predictions
    .where((F.col('Won') != 1) & (F.col('ScoreDiff') > 1))
    .select('Won', 'ScoreDiff', 'prediction')
    .take(20)
)

[Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=8, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=6, prediction=1.0),
 Row(Won=0, ScoreDiff=3, prediction=1.0),
 Row(Won=0, ScoreDiff=3, prediction=1.0),
 Row(Won=0, ScoreDiff=4, prediction=1.0),
 Row(Won=0, ScoreDiff=5, prediction=1.0)]

In [31]:
# run_model() already calls save_results(test_model.results), so running this
# cell as well appends a second, identical row to RESULTS_FILE. It is disabled
# for that reason; uncomment only to re-save after a run whose save step failed.
# save_results(test_model.results)