In [1]:
import os

import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import regex as re

from IPython.display import display
from pyspark.ml.classification import GBTClassifier, LinearSVC, LogisticRegression, RandomForestClassifier
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from typing import *
from pyspark.sql.functions import col

In [2]:
spark = SparkSession \
    .builder \
    .appName('group2nba') \
    .getOrCreate()

In [3]:
path_main = '/project/ds5559/group2nba'

T = TypeVar('T')

In [4]:
class ML_CV():
    __slots__: List[str] = [
          'Model'
        , 'HyperParameters'
    ]
    
    def __init__(self, model: T, hyper_params: Dict[str, List[T]]): # todo: touch up
        self.Model = model
        self.HyperParameters = hyper_params

In [5]:
FIELDS: Dict[str, T] = {
      'Url': StringType
    , 'GameType': StringType
    , 'Location': StringType
    , 'Date': StringType
    , 'Time': StringType
    , 'WinningTeam': StringType
    , 'Quarter': IntegerType
    , 'SecLeft': IntegerType
    , 'AwayTeam': StringType
    , 'AwayPlay': StringType
    , 'AwayScore': IntegerType
    , 'HomeTeam': StringType
    , 'HomePlay': StringType
    , 'HomeScore': IntegerType
    , 'Shooter': StringType
    , 'ShotType': StringType
    , 'ShotOutcome': IntegerType
    , 'ShotDist': IntegerType
    , 'Assister': StringType
    , 'Blocker': StringType
    , 'FoulType': StringType
    , 'Fouler': StringType
    , 'Fouled': StringType
    , 'Rebounder': StringType
    , 'ReboundType': IntegerType
    , 'ViolationPlayer': StringType
    , 'ViolationType': StringType
    , 'TimeoutTeam': StringType
    , 'FreeThrowShooter': StringType
    , 'FreeThrowOutcome': IntegerType
    , 'FreeThrowNum': StringType
    , 'EnterGame': StringType
    , 'LeaveGame': StringType
    , 'TurnoverPlayer': StringType
    , 'TurnoverType': StringType
    , 'TurnoverCause': StringType
    , 'TurnoverCauser': StringType
    , 'JumpballAwayPlayer': StringType
    , 'JumpballHomePlayer': StringType
    , 'JumpballPoss': StringType
}

POSSESSION_PLAYS: List[str] = [
      'Shooter'
    , 'Assister'
    , 'Fouled'
    , 'Rebounder'
    , 'ViolationPlayer'
    , 'FreeThrowShooter'
    , 'TurnOverPlayer'
    , 'JumpballPoss'
]

MODEL_FIELDS: List[str] = [
      'Date'
    , 'HomeTeam'
    , 'AwayTeam'
    , 'Team'
    , 'Year'
    , 'Won'
    , 'ScoreDiff'
    , 'Quarter'
    , 'SecLeftTotal'
    , 'HasPossession'
    , 'away_assist_cnt'
    , 'home_assist_cnt'
    , 'away_TO_cnt'
    , 'home_TO_cnt'
    , 'away_block_cnt'
    , 'home_block_cnt'
    , 'away_foul_cnt'
    , 'home_foul_cnt'
    , 'away_rebound_cnt'
    , 'home_rebound_cnt'
    , 'away_shoot_pct'
    , 'home_shoot_pct'
    , 'away_ft_pct'
    , 'home_ft_pct'
]

DICT_ML: Dict[str, ML_CV] = {
      'GradientBoost': ML_CV(GBTClassifier, {
          'featureSubsetStrategy': ['all', 'sqrt', 'onethird', 'log2']
        , 'maxBins': [2, 3]
        , 'maxDepth': [100, 500, 1000]
        , 'weightCol': ['ScoreDiff', 'SecLeftTotal', 'HasPossession']
    })
    , 'LinearSVC': ML_CV(RandomForestClassifier, {
          'aggregationDepth': [10, 20, 50, 100]
        , 'maxIter': [10, 20, 50, 100]
        , 'weightCol': ['ScoreDiff', 'SecLeftTotal', 'HasPossession']
    })
    , 'LogisticRegression': ML_CV(LogisticRegression, {
          'maxIter': [10, 20, 50, 100]
        , 'regParam': [0.1, 0.5, 0.01, 0.05]
        , 'weightCol': ['ScoreDiff', 'SecLeftTotal', 'HasPossession']
    })
    , 'RandomForest': ML_CV(RandomForestClassifier, {
          'maxBins': [2, 3]
        , 'numTrees': [100, 500, 1000, 2000]
        , 'maxDepth': [100, 500, 1000]
        , 'weightCol': ['ScoreDiff', 'SecLeftTotal', 'HasPossession']
    })
}

In [6]:
@F.udf(IntegerType())
def get_has_possession(team: str, plays: List[str]) -> int:
    '''Get: whether the current team has possession of the ball'''
    return int(bool(team) and any([bool(x) for x in plays]))


@F.udf(IntegerType())
def get_score_diff(score1: int, score2: int) -> int:
    '''Get: score differential relative to the specified team'''
    return score1 - score2


@F.udf(IntegerType())
def get_secleft_total(quarter: int, maxquarter: int, sec: int) -> int:
    '''Get: SecLeft by Quarter, accounting for OT'''
    if quarter < 5:
        return ((maxquarter - 4) * 300) + ((4 - quarter) * 720) + sec
    else:
        return ((maxquarter - quarter) * 300) + sec
    
    
@F.udf(StringType())
def get_team(team: str) -> str:
    '''Get: the specified team'''
    return team
    
    
@F.udf(IntegerType())
def get_won(winner: str, team: str) -> int:
    '''Get: whether the specified team won'''
    return int(winner == team)
    
    
@F.udf(IntegerType())
def get_year(date: str) -> int:
    '''Get: Year of game took place in'''
    return int(re.match(r'[A-Z][a-z]+ \d+ (\d{4})', date).groups()[0])

In [7]:
def build_model_win_percent(df: DataFrame) -> DataFrame:
    '''Constucts a model for predicting the win percent on a play by play basis'''
    altered = df \
        .groupBy(['Date', 'HomeTeam', 'AwayTeam']) \
        .agg(F.max('Quarter')
             .alias('MaxQuarter')) \
        .join(df, ['Date', 'HomeTeam', 'AwayTeam']) \
        .withColumn('SecLeftTotal', get_secleft_total('Quarter', 'MaxQuarter', 'SecLeft')) \
        .withColumn('Year', get_year('Date'))

    home = altered \
        .withColumn('Won', get_won('WinningTeam', 'HomeTeam')) \
        .withColumn('ScoreDiff', get_score_diff('HomeScore', 'AwayScore')) \
        .withColumn('HasPossession', get_has_possession('HomePlay', F.array(*POSSESSION_PLAYS))) \
        .withColumn('Team', get_team('HomeTeam')) \
        .select(MODEL_FIELDS)
    
    away = altered \
        .withColumn('Won', get_won('WinningTeam', 'AwayTeam')) \
        .withColumn('ScoreDiff', get_score_diff('AwayScore', 'HomeScore')) \
        .withColumn('HasPossession', get_has_possession('AwayPlay', F.array(*POSSESSION_PLAYS))) \
        .withColumn('Team', get_team('AwayTeam')) \
        .select(MODEL_FIELDS)
    
    ###add in cumulative stats here
    
    
    
    return home.union(away)

In [8]:
def cross_validate(df: DataFrame, ml_method: str, features: List[str], k_folds: int = 10) -> DataFrame:
    '''...'''
    method = DICT_ML[ml_method]
    
    pipeline = Pipeline(stages = [
          VectorAssembler(inputCols = features, outputCol = 'features')
        , method.Model(featuresCol = 'features', labelCol = 'Won') # todo: make response a constant
    ])
    
    param_grid = ParamGridBuilder()
    for attr, params in method.HyperParameters.items():
        param_grid = param_grid.addGrid(getattr(pipeline.stages[1], attr), params)
    param_grid.build()
    
    cv_model = CrossValidator(
          estimator = pipeline
        , estimatorParamMaps = param_grid
        , evaluator = BinaryClassificationEvaluator()
        , numFolds = k_folds
    ).setParallelism(4).fit(df)
    
    # todo: return results as pandas dataframe...
    # write hyperparams to json

In [9]:
schema = StructType([StructField(k, v()) for k, v in FIELDS.items()])

df_train = spark.read \
    .format('csv') \
    .option('header', True) \
    .schema(schema) \
    .load(f'{path_main}/clean_train_data/*')

display(df_train.count())
display(df_train.printSchema())
display(df_train.head(2))

2403581

root
 |-- Url: string (nullable = true)
 |-- GameType: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- WinningTeam: string (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeft: integer (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- AwayPlay: string (nullable = true)
 |-- AwayScore: integer (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- HomePlay: string (nullable = true)
 |-- HomeScore: integer (nullable = true)
 |-- Shooter: string (nullable = true)
 |-- ShotType: string (nullable = true)
 |-- ShotOutcome: integer (nullable = true)
 |-- ShotDist: integer (nullable = true)
 |-- Assister: string (nullable = true)
 |-- Blocker: string (nullable = true)
 |-- FoulType: string (nullable = true)
 |-- Fouler: string (nullable = true)
 |-- Fouled: string (nullable = true)
 |-- Rebounder: string (nullable = true)
 |-- ReboundType: integer (nullable = tru

None

[Row(Url='/boxscores/201810160BOS.html', GameType='regular', Location='TD Garden Boston Massachusetts', Date='October 16 2018', Time='8:00 PM', WinningTeam='BOS', Quarter=1, SecLeft=700, AwayTeam='PHI', AwayPlay='R. Covington misses 3-pt jump shot from 27 ft', AwayScore=0, HomeTeam='BOS', HomePlay=None, HomeScore=0, Shooter='R. Covington - covinro01', ShotType=None, ShotOutcome=0, ShotDist=None, Assister=None, Blocker=None, FoulType=None, Fouler=None, Fouled=None, Rebounder=None, ReboundType=None, ViolationPlayer=None, ViolationType=None, TimeoutTeam=None, FreeThrowShooter=None, FreeThrowOutcome=None, FreeThrowNum=None, EnterGame=None, LeaveGame=None, TurnoverPlayer=None, TurnoverType=None, TurnoverCause=None, TurnoverCauser=None, JumpballAwayPlayer=None, JumpballHomePlayer=None, JumpballPoss=None),
 Row(Url='/boxscores/201810160BOS.html', GameType='regular', Location='TD Garden Boston Massachusetts', Date='October 16 2018', Time='8:00 PM', WinningTeam='BOS', Quarter=1, SecLeft=700, Aw

In [10]:
##create cumulative stats before applying "build_model_win_pct", these features are added to "MODEL_FIELDS"
team = F.when(F.length(col("HomePlay")) >0,'home').otherwise('away')
df_train = df_train.withColumn('team', team)


assist_away = F.when((F.length(col("Assister")) > 0) & (col("team") == 'away'), 1).otherwise(0)
assist_home = F.when((F.length(col("Assister")) > 0) & (col("team") == 'home'), 1).otherwise(0)
TO_away = F.when((F.length(col("TurnoverPlayer")) > 0) & (col("team") == 'away'), 1).otherwise(0)
TO_home = F.when((F.length(col("TurnoverPlayer")) > 0) & (col("team") == 'home'), 1).otherwise(0)
block_away = F.when((F.length(col("Blocker")) > 0) & (col("team") == 'away'), 1).otherwise(0)
block_home = F.when((F.length(col("Blocker")) > 0) & (col("team") == 'home'), 1).otherwise(0)
foul_away = F.when((F.length(col("Fouler")) > 0) & (col("team") == 'away'), 1).otherwise(0)
foul_home = F.when((F.length(col("Fouler")) > 0) & (col("team") == 'home'), 1).otherwise(0)
rebound_away = F.when((F.length(col("Rebounder")) > 0) & (col("team") == 'away'), 1).otherwise(0)
rebound_home = F.when((F.length(col("Rebounder")) > 0) & (col("team") == 'home'), 1).otherwise(0)
score_diff = col("HomeScore")-col('AwayScore')
shot_away = F.when((F.length(col("ShotOutcome")) > 0) & (col("team") == 'away'), 1).otherwise(0)
shot_home = F.when((F.length(col("ShotOutcome")) > 0) & (col("team") == 'home'), 1).otherwise(0)
ft_away = F.when((F.length(col("FreeThrowOutcome")) > 0) & (col("team") == 'away'), 1).otherwise(0)
ft_home = F.when((F.length(col("FreeThrowOutcome")) > 0) & (col("team") == 'home'), 1).otherwise(0)
home_team_win = F.when(col("WinningTeam") == col("HomeTeam"),1).otherwise(0)
away_team_win = F.when(col("WinningTeam") == col("AwayTeam"),1).otherwise(0)

df_train = df_train.withColumn('assist_away', assist_away).withColumn('assist_home',assist_home).withColumn('TO_away',TO_away).withColumn('TO_home',TO_home).withColumn('block_away',block_away).withColumn('block_home',block_home)\
.withColumn('foul_away',foul_away).withColumn('foul_home',foul_home).withColumn('rebound_away',rebound_away).withColumn('rebound_home',rebound_home).withColumn('score_diff',score_diff).withColumn('shot_away',shot_away).withColumn('shot_home',shot_home)\
.withColumn('ft_away',ft_away).withColumn('ft_home',ft_home).withColumn('home_team_win',home_team_win).withColumn('away_team_win',away_team_win)

##create column distinguishing unique games- time and location
from pyspark.sql.functions import concat
df_train = df_train.withColumn('date_location', concat(col("Date"),col("Location")))


from pyspark.sql import Window
windowval = (Window.partitionBy('date_location').orderBy(df_train.Quarter.asc(),df_train.SecLeft.desc())
             .rangeBetween(Window.unboundedPreceding, 0))

away_assist_cnt = F.sum(col("assist_away")).over(windowval)
home_assist_cnt = F.sum(col("assist_home")).over(windowval)
away_TO_cnt = F.sum(col("TO_away")).over(windowval)
home_TO_cnt = F.sum(col("TO_home")).over(windowval)
away_block_cnt = F.sum(col("block_away")).over(windowval)
home_block_cnt = F.sum(col("block_home")).over(windowval)
away_foul_cnt = F.sum(col("foul_away")).over(windowval)
home_foul_cnt = F.sum(col("foul_home")).over(windowval)
away_rebound_cnt = F.sum(col("rebound_away")).over(windowval)
home_rebound_cnt = F.sum(col("rebound_home")).over(windowval)
away_shoot_pct = F.avg(col("shot_away")).over(windowval)
home_shoot_pct = F.avg(col("shot_home")).over(windowval)
away_ft_pct = F.avg(col("ft_away")).over(windowval)
home_ft_pct = F.avg(col("ft_home")).over(windowval)

df_train = df_train.withColumn('away_assist_cnt', away_assist_cnt).withColumn('home_assist_cnt',home_assist_cnt).withColumn('away_TO_cnt',away_TO_cnt).withColumn('home_TO_cnt',home_TO_cnt).withColumn('away_block_cnt',away_block_cnt).withColumn('home_block_cnt',home_block_cnt)\
.withColumn('away_foul_cnt',away_foul_cnt).withColumn('home_foul_cnt',home_foul_cnt).withColumn('away_rebound_cnt',away_rebound_cnt).withColumn('home_rebound_cnt',home_rebound_cnt).withColumn('away_shoot_pct',away_shoot_pct).withColumn('home_shoot_pct',home_shoot_pct)\
.withColumn('away_ft_pct',away_ft_pct).withColumn('home_ft_pct',home_ft_pct)

df_train.head(3)

[Row(Url='/boxscores/201704020NOP.html', GameType='regular', Location='Smoothie King Center New Orleans Louisiana', Date='April 2 2017', Time='6:00 PM', WinningTeam='CHI', Quarter=1, SecLeft=720, AwayTeam='CHI', AwayPlay='Jump ball: A. Davis vs. R. Lopez (N. Miroti gains possession)', AwayScore=0, HomeTeam='NOP', HomePlay=None, HomeScore=0, Shooter=None, ShotType=None, ShotOutcome=None, ShotDist=None, Assister=None, Blocker=None, FoulType=None, Fouler=None, Fouled=None, Rebounder=None, ReboundType=None, ViolationPlayer=None, ViolationType=None, TimeoutTeam=None, FreeThrowShooter=None, FreeThrowOutcome=None, FreeThrowNum=None, EnterGame=None, LeaveGame=None, TurnoverPlayer=None, TurnoverType=None, TurnoverCause=None, TurnoverCauser=None, JumpballAwayPlayer='A. Davis - davisan02', JumpballHomePlayer='R. Lopez - lopezro01', JumpballPoss='N. Mirotić - mirotni01', team='away', assist_away=0, assist_home=0, TO_away=0, TO_home=0, block_away=0, block_home=0, foul_away=0, foul_home=0, rebound_a

In [11]:
df_train = build_model_win_percent(df_train)

display(df_train.count())
display(df_train.printSchema())
display(df_train.head(2))

4807162

root
 |-- Date: string (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- Team: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Won: integer (nullable = true)
 |-- ScoreDiff: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeftTotal: integer (nullable = true)
 |-- HasPossession: integer (nullable = true)
 |-- away_assist_cnt: long (nullable = true)
 |-- home_assist_cnt: long (nullable = true)
 |-- away_TO_cnt: long (nullable = true)
 |-- home_TO_cnt: long (nullable = true)
 |-- away_block_cnt: long (nullable = true)
 |-- home_block_cnt: long (nullable = true)
 |-- away_foul_cnt: long (nullable = true)
 |-- home_foul_cnt: long (nullable = true)
 |-- away_rebound_cnt: long (nullable = true)
 |-- home_rebound_cnt: long (nullable = true)
 |-- away_shoot_pct: double (nullable = true)
 |-- home_shoot_pct: double (nullable = true)
 |-- away_ft_pct: double (nullable = true)
 |-- home_ft_pct: double

None

[Row(Date='December 18 2016', HomeTeam='MEM', AwayTeam='UTA', Team='MEM', Year=2016, Won=0, ScoreDiff=0, Quarter=1, SecLeftTotal=2880, HasPossession=0, away_assist_cnt=0, home_assist_cnt=0, away_TO_cnt=0, home_TO_cnt=0, away_block_cnt=0, home_block_cnt=0, away_foul_cnt=0, home_foul_cnt=0, away_rebound_cnt=0, home_rebound_cnt=0, away_shoot_pct=0.0, home_shoot_pct=0.0, away_ft_pct=0.0, home_ft_pct=0.0),
 Row(Date='December 18 2016', HomeTeam='MEM', AwayTeam='UTA', Team='MEM', Year=2016, Won=0, ScoreDiff=-3, Quarter=1, SecLeftTotal=2862, HasPossession=0, away_assist_cnt=1, home_assist_cnt=0, away_TO_cnt=0, home_TO_cnt=0, away_block_cnt=0, home_block_cnt=0, away_foul_cnt=0, home_foul_cnt=0, away_rebound_cnt=0, home_rebound_cnt=0, away_shoot_pct=0.5, home_shoot_pct=0.0, away_ft_pct=0.0, home_ft_pct=0.0)]

In [12]:
df_train.count()

4807162

In [13]:
##after build_model_win_pct is called...
##..creating difference in cumulative stats


df_train = df_train.withColumn('TO_dif', (df_train.away_TO_cnt-df_train.home_TO_cnt))
df_train = df_train.withColumn('foul_dif', (df_train.away_foul_cnt-df_train.home_foul_cnt))
df_train = df_train.withColumn('shoot_pct_dif', (df_train.away_shoot_pct-df_train.home_shoot_pct))
df_train = df_train.withColumn('ft_pct_dif', (df_train.away_ft_pct-df_train.home_ft_pct))



In [14]:
df_train = df_train.withColumn('year', df_train.Date.substr(-4,4))
df_train.describe('year').show()
df_train.select('year').distinct().show()

+-------+------------------+
|summary|              year|
+-------+------------------+
|  count|           4807162|
|   mean|2017.1074101517693|
| stddev|1.2084794140067494|
|    min|              2015|
|    max|              2019|
+-------+------------------+

+----+
|year|
+----+
|2016|
|2019|
|2017|
|2018|
|2015|
+----+



In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StandardScaler
import numpy as np
import pandas as pd

# training_fraction = .7
seed = 10
max_iterations = 10


columns = ['ScoreDiff','SecLeftTotal','HasPossession','TO_dif','foul_dif','shoot_pct_dif','ft_pct_dif']
assembler = VectorAssembler(inputCols=columns,outputCol="feature")
tr = assembler.transform(df_train)
scaler = StandardScaler(inputCol="feature", outputCol="scaledFeature")
scalerModel = scaler.fit(tr)
scaledData = scalerModel.transform(tr)
train = scaledData.filter(scaledData.year <= 2017)
test = scaledData.filter(scaledData.year > 2017)
lr = LogisticRegression(featuresCol="feature",       
    labelCol='Won',
    maxIter=max_iterations,
    regParam=.3, 
    elasticNetParam=.8, family = "binomial")
lrModel = lr.fit(train)
lrPred = lrModel.transform(test)
# ev = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol='class', metricName = "areaUnderPR")
# auc = ev.evaluate(lrPred)



        

In [16]:
lrPred_EOG = lrPred.filter(lrPred.SecLeftTotal < 15)

In [17]:
lrPred.columns

['Date',
 'HomeTeam',
 'AwayTeam',
 'Team',
 'year',
 'Won',
 'ScoreDiff',
 'Quarter',
 'SecLeftTotal',
 'HasPossession',
 'away_assist_cnt',
 'home_assist_cnt',
 'away_TO_cnt',
 'home_TO_cnt',
 'away_block_cnt',
 'home_block_cnt',
 'away_foul_cnt',
 'home_foul_cnt',
 'away_rebound_cnt',
 'home_rebound_cnt',
 'away_shoot_pct',
 'home_shoot_pct',
 'away_ft_pct',
 'home_ft_pct',
 'TO_dif',
 'foul_dif',
 'shoot_pct_dif',
 'ft_pct_dif',
 'feature',
 'scaledFeature',
 'rawPrediction',
 'probability',
 'prediction']

In [18]:
lrPred_EOG = lrPred_EOG.withColumn('shoot_pct_dif', F.round(lrPred_EOG['shoot_pct_dif'], 2))
lrPred_EOG = lrPred_EOG.withColumn('ft_pct_dif', F.round(lrPred_EOG['ft_pct_dif'], 2))
lrPred_EOG.select('Date','HomeTeam','AwayTeam','Team','ScoreDiff','SecLeftTotal','HasPossession','TO_dif','foul_dif','shoot_pct_dif','ft_pct_dif','probability').show(100)

+----------------+--------+--------+----+---------+------------+-------------+------+--------+-------------+----------+--------------------+
|            Date|HomeTeam|AwayTeam|Team|ScoreDiff|SecLeftTotal|HasPossession|TO_dif|foul_dif|shoot_pct_dif|ft_pct_dif|         probability|
+----------------+--------+--------+----+---------+------------+-------------+------+--------+-------------+----------+--------------------+
| December 3 2018|     NOP|     LAC| NOP|       -3|           7|            0|     5|       5|        -0.02|      0.02|[0.50628118585597...|
| December 3 2018|     NOP|     LAC| NOP|       -3|           6|            0|     5|       5|        -0.02|      0.02|[0.50628118585597...|
| December 3 2018|     NOP|     LAC| NOP|       -3|           6|            0|     5|       5|        -0.02|      0.02|[0.50628118585597...|
| December 3 2018|     NOP|     LAC| NOP|       -3|           6|            0|     5|       5|        -0.02|      0.02|[0.50628118585597...|
| December 3 