In [1]:
import json
import os

import numpy as np
import pandas as pd
import pyspark.sql.functions as F
import regex as re

from IPython.display import display
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, LinearSVC, LogisticRegression, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
from typing import *

In [2]:
# Obtain the Spark session for this notebook (an existing session is reused if present).
spark = (
    SparkSession
    .builder
    .appName('group2nba')
    .getOrCreate()
)

In [3]:
# Generic placeholder used in the type hints of the lookup tables defined below.
T = TypeVar('T')

# Root directory under which the data and model-validation folders are read.
path_main = '/project/ds5559/group2nba'

In [4]:
# Column name -> Spark SQL type class for the play-by-play CSV files.
# The classes are instantiated (and this insertion order is kept) when the
# read schema is built from FIELDS.items().
FIELDS: Dict[str, T] = {
    'Url': StringType,
    'GameType': StringType,
    'Location': StringType,
    'Date': StringType,
    'Time': StringType,
    'WinningTeam': StringType,
    'Quarter': IntegerType,
    'SecLeft': IntegerType,
    'AwayTeam': StringType,
    'AwayPlay': StringType,
    'AwayScore': IntegerType,
    'HomeTeam': StringType,
    'HomePlay': StringType,
    'HomeScore': IntegerType,
    'Shooter': StringType,
    'ShotType': StringType,
    'ShotOutcome': IntegerType,
    'ShotDist': IntegerType,
    'Assister': StringType,
    'Blocker': StringType,
    'FoulType': StringType,
    'Fouler': StringType,
    'Fouled': StringType,
    'Rebounder': StringType,
    # NOTE(review): typed as an integer here - confirm the cleaned data encodes it numerically.
    'ReboundType': IntegerType,
    'ViolationPlayer': StringType,
    'ViolationType': StringType,
    'TimeoutTeam': StringType,
    'FreeThrowShooter': StringType,
    'FreeThrowOutcome': IntegerType,
    'FreeThrowNum': StringType,
    'EnterGame': StringType,
    'LeaveGame': StringType,
    'TurnoverPlayer': StringType,
    'TurnoverType': StringType,
    'TurnoverCause': StringType,
    'TurnoverCauser': StringType,
    'JumpballAwayPlayer': StringType,
    'JumpballHomePlayer': StringType,
    'JumpballPoss': StringType,
}

# Columns whose non-null value marks a play that is taken as evidence of possession.
# Every name must match a key of FIELDS exactly; the turnover column is spelled
# 'TurnoverPlayer' in the schema (the previous 'TurnOverPlayer' only resolved
# because Spark column lookup is case-insensitive by default).
POSSESSION_PLAYS: List[str] = [
      'Shooter'
    , 'Assister'
    , 'Fouled'
    , 'Rebounder'
    , 'ViolationPlayer'
    , 'FreeThrowShooter'
    , 'TurnoverPlayer'
    , 'JumpballPoss'
]

# Input columns assembled into the 'features' vector for the classifier.
MODEL_FEATURES: List[str] = [
    'ScoreDiff',
    'SecLeftTotal',
    'HasPossession',
]
    
# Columns (in output order) kept in the per-team modelling DataFrame.
MODEL_FIELDS: List[str] = [
    'Date',
    'HomeTeam',
    'AwayTeam',
    'Team',
    'Year',
    'Won',
    'ScoreDiff',
    'Quarter',
    'SecLeftTotal',
    'HasPossession',
]

# Method name (as stored in the validation results) -> Spark ML estimator class.
# 'LinearSVC' previously pointed at RandomForestClassifier, so a best model
# recorded as LinearSVC would have been refit with the wrong estimator.
DICT_ML: Dict[str, T] = {
      'GradientBoost': GBTClassifier
    , 'LinearSVC': LinearSVC
    , 'LogisticRegression': LogisticRegression
    , 'RandomForest': RandomForestClassifier
}

In [5]:
# TODO: move these UDFs into a separate library that the notebooks can import.

@F.udf(IntegerType())
def get_has_possession(team: str, plays: List[str]) -> int:
    '''Get: 1 when the team has a play recorded and any possession column is filled, else 0'''
    # No play text for this team on the row -> it cannot be credited with possession.
    if not team:
        return 0
    # A single non-empty possession column is enough.
    for play in plays:
        if play:
            return 1
    return 0


@F.udf(IntegerType())
def get_score_diff(score1: int, score2: int) -> int:
    '''Get: the first score minus the second (positive when the first side leads)'''
    difference = score1 - score2
    return difference


@F.udf(IntegerType())
def get_secleft_total(quarter: int, maxquarter: int, sec: int) -> int:
    '''Get: seconds left in the whole game, including periods still to be played.

    Periods 1-4 are weighted at 720 seconds each and every period after the
    fourth at 300 seconds each; `maxquarter` is the last period of the game.
    '''
    in_extra_period = quarter >= 5
    if in_extra_period:
        # Only the remaining extra periods are still ahead.
        return ((maxquarter - quarter) * 300) + sec
    extra_ahead = (maxquarter - 4) * 300
    regulation_ahead = (4 - quarter) * 720
    return extra_ahead + regulation_ahead + sec
    
    
@F.udf(StringType())
def get_team(team: str) -> str:
    '''Get: the team value unchanged (used to copy a team column into 'Team')'''
    selected = team
    return selected
    
    
@F.udf(IntegerType())
def get_won(winner: str, team: str) -> int:
    '''Get: 1 when the given team is the winner, otherwise 0'''
    if winner == team:
        return 1
    return 0
    
    
@F.udf(IntegerType())
def get_year(date: str) -> Optional[int]:
    '''Get: the four-digit year of a date written like "October 16 2018".

    Returns None (a null in Spark) when the date is missing or does not start
    with the "<Month> <day> <year>" pattern, instead of raising inside the UDF
    and failing the whole job.
    '''
    if not date:
        return None
    matched = re.match(r'[A-Z][a-z]+ \d+ (\d{4})', date)
    if matched is None:
        return None
    return int(matched.group(1))

In [6]:
# TODO: move this DataFrame builder into a separate library that the notebooks can import.

def _build_team_rows(plays: DataFrame, side: str, other: str) -> DataFrame:
    '''Derive the model columns from the point of view of one side ('Home' or 'Away').'''
    return (
        plays
        .withColumn('Won', get_won('WinningTeam', f'{side}Team'))
        .withColumn('ScoreDiff', get_score_diff(f'{side}Score', f'{other}Score'))
        .withColumn('HasPossession', get_has_possession(f'{side}Play', F.array(*POSSESSION_PLAYS)))
        .withColumn('Team', get_team(f'{side}Team'))
        .select(MODEL_FIELDS)
    )


def build_model_win_percent(df: DataFrame) -> DataFrame:
    '''Constructs the play-by-play modelling DataFrame used to predict win percent.

    Each input play produces two output rows - one from the home team's point
    of view and one from the away team's - restricted to MODEL_FIELDS.

    Parameters
    ----------
    df : DataFrame
        Play-by-play rows containing the columns referenced below
        (Date, HomeTeam, AwayTeam, Quarter, SecLeft, WinningTeam, scores,
        plays and the POSSESSION_PLAYS columns).

    Returns
    -------
    DataFrame
        Union of the home-perspective and away-perspective rows.
    '''
    game_keys = ['Date', 'HomeTeam', 'AwayTeam']

    # The last period of each game is needed to count the time still to be played.
    max_quarter = df \
        .groupBy(game_keys) \
        .agg(F.max('Quarter').alias('MaxQuarter'))

    altered = max_quarter \
        .join(df, game_keys) \
        .withColumn('SecLeftTotal', get_secleft_total('Quarter', 'MaxQuarter', 'SecLeft')) \
        .withColumn('Year', get_year('Date'))

    home = _build_team_rows(altered, 'Home', 'Away')
    away = _build_team_rows(altered, 'Away', 'Home')

    return home.union(away)

In [7]:
# Read schema: one StructField per FIELDS entry, with the type class instantiated.
schema_data = StructType([
    StructField(column_name, spark_type())
    for column_name, spark_type in FIELDS.items()
])

In [8]:
# Load the training play-by-play CSV files with the explicit schema.
df_train = spark.read \
    .format('csv') \
    .option('header', True) \
    .schema(schema_data) \
    .load(f'{path_main}/clean_train_data/*')

display(df_train.count())
# printSchema() prints the schema itself and returns None, so wrapping it in
# display() only added a stray "None" to the output.
df_train.printSchema()
display(df_train.head(2))

2403581

root
 |-- Url: string (nullable = true)
 |-- GameType: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- WinningTeam: string (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeft: integer (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- AwayPlay: string (nullable = true)
 |-- AwayScore: integer (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- HomePlay: string (nullable = true)
 |-- HomeScore: integer (nullable = true)
 |-- Shooter: string (nullable = true)
 |-- ShotType: string (nullable = true)
 |-- ShotOutcome: integer (nullable = true)
 |-- ShotDist: integer (nullable = true)
 |-- Assister: string (nullable = true)
 |-- Blocker: string (nullable = true)
 |-- FoulType: string (nullable = true)
 |-- Fouler: string (nullable = true)
 |-- Fouled: string (nullable = true)
 |-- Rebounder: string (nullable = true)
 |-- ReboundType: integer (nullable = tru

None

[Row(Url='/boxscores/201810160BOS.html', GameType='regular', Location='TD Garden Boston Massachusetts', Date='October 16 2018', Time='8:00 PM', WinningTeam='BOS', Quarter=1, SecLeft=700, AwayTeam='PHI', AwayPlay='R. Covington misses 3-pt jump shot from 27 ft', AwayScore=0, HomeTeam='BOS', HomePlay=None, HomeScore=0, Shooter='R. Covington - covinro01', ShotType=None, ShotOutcome=0, ShotDist=None, Assister=None, Blocker=None, FoulType=None, Fouler=None, Fouled=None, Rebounder=None, ReboundType=None, ViolationPlayer=None, ViolationType=None, TimeoutTeam=None, FreeThrowShooter=None, FreeThrowOutcome=None, FreeThrowNum=None, EnterGame=None, LeaveGame=None, TurnoverPlayer=None, TurnoverType=None, TurnoverCause=None, TurnoverCauser=None, JumpballAwayPlayer=None, JumpballHomePlayer=None, JumpballPoss=None),
 Row(Url='/boxscores/201810160BOS.html', GameType='regular', Location='TD Garden Boston Massachusetts', Date='October 16 2018', Time='8:00 PM', WinningTeam='BOS', Quarter=1, SecLeft=700, Aw

In [9]:
# Build the per-team modelling rows and keep only rows with at most 300 total
# seconds remaining.
# NOTE(review): this overwrites the raw df_train, so the cell cannot be re-run
# without re-running the load cell first.
df_train = build_model_win_percent(df_train) \
    .where(F.col('SecLeftTotal') <= 300)

display(df_train.count())
# printSchema() prints the schema itself and returns None, so it is not wrapped in display().
df_train.printSchema()
display(df_train.head(2))

579310

root
 |-- Date: string (nullable = true)
 |-- HomeTeam: string (nullable = true)
 |-- AwayTeam: string (nullable = true)
 |-- Team: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Won: integer (nullable = true)
 |-- ScoreDiff: integer (nullable = true)
 |-- Quarter: integer (nullable = true)
 |-- SecLeftTotal: integer (nullable = true)
 |-- HasPossession: integer (nullable = true)



None

[Row(Date='December 18 2016', HomeTeam='MEM', AwayTeam='UTA', Team='MEM', Year=2016, Won=0, ScoreDiff=0, Quarter=4, SecLeftTotal=297, HasPossession=1),
 Row(Date='December 18 2016', HomeTeam='MEM', AwayTeam='UTA', Team='MEM', Year=2016, Won=0, ScoreDiff=0, Quarter=4, SecLeftTotal=296, HasPossession=1)]

In [None]:
# Load the test play-by-play CSV files with the explicit schema.
df_test = spark.read \
    .format('csv') \
    .option('header', True) \
    .schema(schema_data) \
    .load(f'{path_main}/clean_test_data/*')

display(df_test.count())
# printSchema() prints the schema itself and returns None, so it is not wrapped in display().
df_test.printSchema()
display(df_test.head(2))

In [None]:
# Build the per-team modelling rows from the test data loaded above and keep
# only rows with at most 300 total seconds remaining.
# (The previous code referenced `df_valid`, which is never defined in this notebook.)
df_test = build_model_win_percent(df_test) \
    .where(F.col('SecLeftTotal') <= 300)

display(df_test.count())
# printSchema() prints the schema itself and returns None, so it is not wrapped in display().
df_test.printSchema()
display(df_test.head(2))

In [10]:
# os.listdir() returns entries in arbitrary order, so sort before taking the
# last one to make the choice of "latest" file deterministic.
# NOTE(review): the sort is lexicographic ('..._10.csv' sorts before '..._9.csv');
# confirm this matches the file-naming scheme.
valid_latest = sorted(os.listdir(f'{path_main}/models_validation'))[-1]
valid_latest

'validation_results_0.csv'

In [12]:
# Schema of a model-results row: method name, ROC value, and the
# hyper-parameters kept as a string.
schema_model = StructType([
    StructField('Method', StringType()),
    StructField('ROC', FloatType()),
    StructField('HyperParameters', StringType()),
])

In [14]:
# Load the selected validation-results file (it has no header row).
validation_models = spark.read \
    .format('csv') \
    .option('header', False) \
    .schema(schema_model) \
    .load(f'{path_main}/models_validation/{valid_latest}')
display(validation_models.count())
# printSchema() prints the schema itself and returns None, so it is not wrapped in display().
validation_models.printSchema()
display(validation_models.head(2))

3

root
 |-- Method: string (nullable = true)
 |-- ROC: float (nullable = true)
 |-- HyperParameters: string (nullable = true)



None

[Row(Method='RandomForest', ROC=0.9272347092628479, HyperParameters='{"maxBins": 2, "maxDepth": 3, "numTrees": 100}'),
 Row(Method='LogisticRegression', ROC=0.926864504814148, HyperParameters='{"maxIter": 20, "regParam": 0.1}')]

In [39]:
# Single-element list holding the validation row with the highest ROC.
best_model = (
    validation_models
    .orderBy(F.col('ROC').desc())
    .take(1)
)
best_model

[Row(Method='RandomForest', ROC=0.9272347092628479, HyperParameters='{"maxBins": 2, "maxDepth": 3, "numTrees": 100}')]

In [15]:
# Empty results table; rows are supplied as a list, so the empty case is [].
test_results = spark.createDataFrame([], schema = schema_model)

In [None]:
# Two-stage pipeline: assemble the feature vector, then the estimator class and
# hyper-parameters recorded for the best validation model.
pipeline = Pipeline(stages=[
    VectorAssembler(inputCols=MODEL_FEATURES, outputCol='features'),
    DICT_ML[best_model[0]['Method']](
        featuresCol='features',
        labelCol='Won',
        **json.loads(best_model[0]['HyperParameters'])
    ),
])

# Fit on the training rows and score the test rows.
# (The previous code transformed `df_valid`, which is never defined in this notebook.)
# BinaryClassificationMetrics expects (score, label) pairs, so the model output
# comes first and the true 'Won' label second; the earlier (label, prediction)
# order swapped their roles.
# NOTE(review): make sure the validation notebook uses the same ordering so the
# ROC values are comparable.
results = BinaryClassificationMetrics(pipeline
    .fit(df_train)
    .transform(df_test)
    .select(['prediction', 'Won'])
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)

# Append the test-set score of the best model to the results table.
# (The previous code used `valid_results` and `params`, neither of which is
# defined in this notebook; the table is `test_results` and the
# hyper-parameters are the string stored on the best validation row.)
# NOTE(review): re-running this cell appends another copy of the row.
test_results = test_results.union(spark.createDataFrame([(
      best_model[0][0]
    , float(results.areaUnderROC)
    , best_model[0][2]
)], schema_model))
test_results