In [105]:
import os
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession, DataFrame

PATH_MAIN = '/project/ds5559/group2nba'
PATH_STACKED = f'{PATH_MAIN}/stacked_data/'
RESULTS_FILE = f'{PATH_MAIN}/results.csv'
TARGET = 'Won'
FEATURES = 'features'

CORES = 2

spark = SparkSession \
    .builder \
    .appName('group2nba') \
    .master(f'local[{CORES}]') \
    .getOrCreate()



In [13]:
from pyspark.sql.types import *
from typing import *
import pyspark.sql.functions as F

T = TypeVar('T')

FIELDS: Dict[str, T] = {
      'Date': StringType
    , 'HomeTeam': StringType
    , 'AwayTeam': StringType
    , 'Team': StringType
    , 'Year': IntegerType
    , 'Won': IntegerType
    
    , 'ScoreDiff': IntegerType
    , 'Quarter': IntegerType
    , 'SecLeftTotal': IntegerType
    , 'LogSecLeftTotal': DoubleType
    , 'SecLeftTotalInverse': DoubleType
    
    , 'HasPossession': IntegerType
    , 'assist_team_cnt': LongType
    , 'assist_opponent_cnt': LongType
    , 'turnover_team_cnt': LongType
    , 'turnover_opponent_cnt': LongType
    , 'block_team_cnt': LongType
    , 'block_opponent_cnt': LongType
    
    , 'foul_team_cnt': LongType
    , 'foul_opponent_cnt': LongType
    , 'rebound_team_cnt': LongType
    , 'rebound_opponent_cnt': LongType
    , 'shotOnGoal_team_cnt': LongType
    , 'shotOnGoal_opponent_cnt': LongType
    , 'freeThrow_team_cnt': LongType
    , 'freeThrow_opponent_cnt': LongType
    
    , 'SecLeftTotalInverseTimesScoreDiff': DoubleType
    , 'assist_diff': IntegerType
    , 'turnover_diff': IntegerType
    , 'block_diff': IntegerType
    , 'foul_diff': IntegerType
    , 'rebound_diff': IntegerType
    , 'shotOnGoal_diff': IntegerType
    , 'freeThrow_diff': IntegerType
}
    
schema = StructType([StructField(k, v()) for k, v in FIELDS.items()])

In [30]:
def read_in_file(full_file_name):

    df = spark.read \
        .format('csv') \
        .option('header', True) \
        .schema(schema) \
        .load(full_file_name)

#     display(df.count())
#     display(df.printSchema())
#     display(df.head(2))
    
    return df

def build_pipeline(list_features):
    from pyspark.ml import feature as ft
    from pyspark.ml import Pipeline

    # Build the Pipeline
    print('build the pipeline')

    featuresCreator = ft.VectorAssembler(
        inputCols=list_features,
        outputCol='vectors'
    )
    
    sScaler = ft.StandardScaler(
        withMean=True, 
        withStd=True, 
        inputCol='vectors', 
        outputCol='features'
    )

    pipeline = Pipeline(
        stages=[
            featuresCreator,
            sScaler
        ])

    return pipeline

def build_cross_validator(model, evaluator, grid):
    import pyspark.ml.tuning as tune
    
    cv = tune.CrossValidator( 
        estimator=model, 
        estimatorParamMaps=grid, 
        evaluator=evaluator
    )
    
    return cv

In [75]:
from typing import NamedTuple

class results_cls(NamedTuple):
    model_type: str
    list_features: list
    date_time_run: str
    elapsed_time: str
    user: str
    special_description: str
    area_under_roc: float
    area_under_pr: float
    best_coefficients: list
    best_hyperparameters: list
        
        
class model_cls(NamedTuple):
    model_type: str
    list_features: list
    pipeline: object
    cross_validator: object
    evaluator: object
    
    
def evaluate_cv_model(train_data, test_data, model_obj, user, special_description):
    from datetime import datetime
    import time
    
    # Start the Timer
    start_time = time.time()
    
    # Fit the Model
    print('Build Pipeline')
    data_transformer = model_obj.pipeline.fit(train_data)
    
    print('Fit CV Model')
    cvModel = model_obj.cross_validator.fit(data_transformer.transform(train_data))
    
    print('Transform Test Data')
    data_train = data_transformer.transform(test_data)
    
    print('Evaluate Model Against Test Data')
    predictions = cvModel.transform(data_train)
    
    print('Save Results')
    area_under_roc = model_obj.evaluator.evaluate(
        predictions, 
        {model_obj.evaluator.metricName: 'areaUnderROC'}
    )
    area_under_pr = model_obj.evaluator.evaluate(
        predictions, 
        {model_obj.evaluator.metricName: 'areaUnderPR'}
    )
    
    print(area_under_roc)
    print(area_under_pr)
    print(model_obj.list_features)
    print(cvModel.bestModel.coefficients)
    
    # End Timer
    elapsed_time = round((time.time() - start_time), 2)
    
    results_obj = results_cls(
        model_type = model_obj.model_type,
        list_features = model_obj.list_features,
        area_under_roc = area_under_roc,
        area_under_pr = area_under_pr,
        best_coefficients = cvModel.bestModel.coefficients,
        best_hyperparameters = cvModel.getEstimatorParamMaps()[ np.argmax(cvModel.avgMetrics) ],
        date_time_run=datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
        elapsed_time=elapsed_time,
        user=user,
        special_description=special_description
    )
    
    return predictions, results_obj, cvModel

In [59]:
def build_logistic_model_object(pipeline, list_features):
    import pyspark.ml.evaluation as ev
    from pyspark.ml.classification import LogisticRegression
    import pyspark.ml.tuning as tune
    
    print('Build Logistic Object')

    logistic = LogisticRegression(featuresCol = FEATURES, labelCol=TARGET)
    
    grid = tune.ParamGridBuilder() \
                .addGrid(logistic.maxIter, [2, 10, 50]) \
                .addGrid(logistic.regParam, [0.01, 0.05, 0.3]) \
                .build()

    evaluator = ev.BinaryClassificationEvaluator(
        metricName = 'areaUnderROC',
        rawPredictionCol='rawPrediction', 
        labelCol=target
    )
    
    cv = tune.CrossValidator( 
        estimator=logistic, 
        estimatorParamMaps=grid, 
        evaluator=evaluator
    )
    
    model_obj = model_cls(
        model_type='Logistic',
        list_features=list_features,
        pipeline=pipeline,
        cross_validator=cv,
        evaluator=evaluator
    )
    
    return model_obj

In [61]:
def build_random_forest_model_object():
    import pyspark.ml.evaluation as ev
    from pyspark.ml.classification import RandomForestClassifier
    import pyspark.ml.tuning as tune
    
    print('Build Random Forest Object')

    random_forest = RandomForestClassifier(featuresCol = FEATURES, labelCol=TARGET)
    
    grid = tune.ParamGridBuilder() \
        .addGrid(random_forest.maxBins, [2, 3]) \
        .addGrid(random_forest.maxDepth, [3, 5]) \
        .addGrid(random_forest.numTrees, [100, 500]) \
        .build()

    evaluator = ev.BinaryClassificationEvaluator(
        metricName = 'areaUnderROC',
        rawPredictionCol='rawPrediction', 
        labelCol=target
    )
    
    cv = tune.CrossValidator( 
        estimator=logistic, 
        estimatorParamMaps=grid, 
        evaluator=evaluator
    )
    
    model_obj = model_cls(
        model_type='Random Forest',
        list_features=list_features,
        pipeline=pipeline,
        cross_validator=cv,
        evaluator=evaluator
    )
    
    return model_obj

In [62]:
def build_SVM_model_object():
    import pyspark.ml.evaluation as ev
    from pyspark.ml.classification import LinearSVC
    import pyspark.ml.tuning as tune
    
    print('Build SVM Model Object')

    svm = LinearSVC(featuresCol = FEATURES, labelCol=TARGET)
    
    grid = tune.ParamGridBuilder() \
                .addGrid(svm.aggregationDepth, [3, 5, 10]) \
                .addGrid(svm.maxIter, [10, 20, 50]) \
                .build()

    evaluator = ev.BinaryClassificationEvaluator(
        metricName = 'areaUnderROC',
        rawPredictionCol='rawPrediction', 
        labelCol=target
    )
    
    cv = tune.CrossValidator( 
        estimator=logistic, 
        estimatorParamMaps=grid, 
        evaluator=evaluator
    )
    
    model_obj = model_cls(
        model_type='Random Forest',
        list_features=list_features,
        pipeline=pipeline,
        cross_validator=cv,
        evaluator=evaluator
    )
    
    return model_obj

In [78]:
# Run A Single File

from os import listdir
from os.path import isfile, join

file = 'NBA_PBP_2019-20.csv'

df = read_in_file(join(PATH_STACKED, file))

train_data, test_data = df.randomSplit([0.7, 0.3], seed=123) # LogSecLeftTotal
list_features = ['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftTotalInverseTimesScoreDiff']

pipeline = build_pipeline(list_features=list_features)

model_obj = build_logistic_model_object(pipeline=pipeline, list_features=list_features)

predictions, results_obj, cvModel = evaluate_cv_model(
    train_data=train_data, 
    test_data=test_data, 
    model_obj=model_obj,
    user='Peter',
    special_description='Test Run 19-20 Only'
)


IndexError: string index out of range

In [81]:
def parse_results_obj(results_obj):
    return pd.DataFrame(data=[results_obj])

results_df = parse_results_obj(results_obj)
print(results_df.head())

from os.path import exists
file_exists = exists(RESULTS_FILE)
if file_exists:
    results_df.to_csv(RESULTS_FILE, mode='a', index=False, header=False, sep="|")
else:
    results_df.to_csv(RESULTS_FILE, mode='w', index=False, header=True, sep="")
    


  model_type                                      list_features  \
0   Logistic  [ScoreDiff, SecLeftTotalInverse, SecLeftTotalI...   

         date_time_run  elapsed_time   user  special_description  \
0  04/17/2022 23:49:30        306.11  Peter  Test Run 19-20 Only   

   area_under_roc  area_under_pr  \
0        0.820472       0.823239   

                                   best_coefficients  \
0  [1.4749598979624223, -9.384355132570414e-05, 0...   

                                best_hyperparameters  
0  {LogisticRegression_b762bb9a2c14__maxIter: 50,...  


In [82]:
view_df = pd.read_csv(RESULTS_FILE, sep='|')
view_df.head()

Unnamed: 0,model_type,list_features,date_time_run,elapsed_time,user,special_description,area_under_roc,area_under_pr,best_coefficients,best_hyperparameters
0,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/17/2022 23:35:57,309.49,Peter,Test Run 19-20 Only,0.820474,0.82324,"[1.4749755253169836,-9.339839834844648e-05,0.4...",[]
1,Logistic,"['ScoreDiff', 'SecLeftTotalInverse', 'SecLeftT...",04/17/2022 23:49:30,306.11,Peter,Test Run 19-20 Only,0.820472,0.823239,"[1.4749598979624223,-9.384355132570414e-05,0.4...",{Param(parent='LogisticRegression_b762bb9a2c14...


In [101]:
train_files = [
    'NBA_PBP_2015-16.csv',
    'NBA_PBP_2016-17.csv',
    'NBA_PBP_2017-18.csv',
    'NBA_PBP_2018-29.csv'
]

validation_file = 'NBA_PBP_2019-20.csv'

# Read in all Train_Files

file_str = ''
for item in train_files:
    file_str = file_str + PATH_STACKED + item + ','
    
file_str = file_str[:-1]
print(file_str)

/project/ds5559/group2nba/stacked_data/NBA_PBP_2015-16.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2016-17.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2017-18.csv,/project/ds5559/group2nba/stacked_data/NBA_PBP_2018-29.csv
