In [72]:
import pandas as pd
import numpy as np
import os

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss

In [192]:
# Load data
years = [2019, 2020, 2021, 2022, 2023, 2024]
cols = ['game_id', 'Date', 'season', 'Home', 'Away', 'jumpball_home', 'jumpball_away', 'jumpball_possession_tm']
first_basket = pd.concat([pd.read_csv(f'../data/first_basket_{year}.csv') for year in years])

In [193]:
dfs = []
for year in years :
    df = pd.read_csv(f'../data/rosters.nosync/rosters_{year}.csv')
    df.insert(1, 'Year', year)
    dfs.append(df)
stats = pd.concat(dfs).reset_index(drop = True)
stats = pd.merge(stats, first_basket[['game_id', 'first_basket']], on = 'game_id', how = 'inner')
stats['first_basket_scorer'] = (stats['player_id'] == stats['first_basket']).astype(int)

In [194]:
features = []
for f in ['PTS', 'USG%', 'VORP', 'FGA'] :

    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}'].expanding().mean().reset_index(level = 0, drop = True)
    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}_avg'].shift()
    features.append(f'{f}_avg')

    for w in [25, 50] :
        stats[f'{f}_{w}'] = stats.groupby('player_id')[f'{f}'].rolling(w, min_periods = 1).mean().reset_index(level = 0, drop = True)
        stats[f'{f}_{w}'] = stats.groupby('player_id')[f'{f}_{w}'].shift()
        features.append(f'{f}_{w}')

In [195]:
starters = stats.copy()[stats['starter']]
non_starters = stats.copy()[~stats['starter']]

In [196]:
starters['first_basket_avg'] = starters.groupby('player_id')['first_basket_scorer'].expanding().mean().reset_index(level = 0, drop = True)
starters['first_basket_avg'] = starters.groupby('player_id')['first_basket_avg'].shift()
features.append(f'first_basket_avg')

for w in [25, 50] :
    starters[f'first_basket_{w}'] = starters.groupby('player_id')['first_basket_scorer'].rolling(w, min_periods = 1).mean().reset_index(level = 0, drop = True)
    starters[f'first_basket_{w}'] = starters.groupby('player_id')[f'first_basket_{w}'].shift()
    features.append(f'first_basket_{w}')

In [197]:
ratings = pd.read_csv('../data/2k-ratings-2001-2024.csv')
ratings['Rating'] = (
    (ratings['Rating'] - ratings.groupby('Year')['Rating'].transform('mean'))
    / ratings.groupby('Year')['Rating'].transform('std')
)

In [198]:
starters = starters.merge(
    ratings[['player_id', 'Year', 'Rating']],
    on = ['player_id', 'Year'],
    how = 'left'
)
features.append('Rating')

In [262]:
data = starters[['game_id', 'player_id', 'first_basket_scorer'] + features].dropna().reset_index(drop = True)
features_norm = [f for f in features if 'first_basket' not in f and f != 'Rating']
# Normalize per game
data[features_norm] = (
    (
        data[features_norm] - data.groupby('game_id')[features_norm].transform('mean')
    )
    / data.groupby('game_id')[features_norm].transform('std')
)

in_2024 = data['game_id'].apply(lambda x: x[:4] == '2023')
train = data.copy()[~in_2024]
test = data.copy()[in_2024]

X_train = train[features].to_numpy()
X_test = test[features].to_numpy()

y_train = train['first_basket_scorer'].to_numpy()
y_test = test['first_basket_scorer'].to_numpy()

In [254]:
def evaluate(model, X_train, y_train, X_test, y_test, train, test) :

    # Test
    y_pred = model.predict_proba(X_test)[:, -1]
    loss_baseline = log_loss(y_test, 0.1 * np.ones_like(y_test))
    loss = log_loss(y_test, y_pred)
    loss_test = 100 * (loss - loss_baseline) / loss_baseline
    test['pred'] = y_pred
    test['argmax'] = (test.groupby('game_id')['pred'].transform('max') == test['pred']).astype(int)
    test['correct'] = (test['first_basket_scorer'] == test['argmax']).astype(int)
    acc_test = (test.groupby('game_id')['correct'].mean() == 1).mean()

    # Train
    y_pred = model.predict_proba(X_train)[:, -1]
    loss_baseline = log_loss(y_train, 0.1 * np.ones_like(y_train))
    loss = log_loss(y_train, y_pred)
    loss_train = 100 * (loss - loss_baseline) / loss_baseline
    train['pred'] = y_pred
    train['argmax'] = (train.groupby('game_id')['pred'].transform('max') == train['pred']).astype(int)
    train['correct'] = (train['first_basket_scorer'] == train['argmax']).astype(int)
    acc_train = (train.groupby('game_id')['correct'].mean() == 1).mean()

    return loss_test, acc_test, loss_train, acc_train

In [264]:
test

Unnamed: 0,game_id,player_id,first_basket_scorer,PTS_avg,PTS_25,PTS_50,USG%_avg,USG%_25,USG%_50,VORP_avg,VORP_25,VORP_50,FGA_avg,FGA_25,FGA_50,first_basket_avg,first_basket_25,first_basket_50,Rating
50768,202301010DEN,brownja02,0,1.019728,1.261514,1.089535,1.154631,1.404714,1.321383,-0.265292,-0.008006,-0.197989,1.161726,1.517045,1.435030,0.133333,0.00,0.12,1.931067
50769,202301010DEN,caldwke01,0,-0.733749,-0.697785,-0.583533,-0.894915,-0.839785,-0.680068,-0.821390,-1.009388,-0.914813,-0.762892,-0.682855,-0.578415,0.070485,0.04,0.10,0.128142
50770,202301010DEN,tatumja01,0,1.540131,1.708371,1.683780,1.414165,1.563391,1.505586,0.543576,0.741390,1.067634,1.711447,1.797569,1.667197,0.090909,0.20,0.18,2.914481
50771,202301010DEN,gordoaa01,0,-0.046388,0.043704,-0.124460,-0.075672,-0.122987,-0.130961,-0.460847,0.078614,-0.474625,0.004432,-0.343273,-0.302217,0.118644,0.08,0.08,0.619849
50772,202301010DEN,smartma01,0,-0.664530,-0.687964,-0.713604,-0.744058,-0.622043,-0.525758,-0.401468,-0.341362,-0.550013,-0.565861,-0.557357,-0.526378,0.076613,0.00,0.06,1.111556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62352,202312310WAS,youngtr01,0,2.059498,2.166122,1.941661,2.047084,1.580722,1.611911,1.872218,1.998159,1.988046,1.722784,1.675259,1.480416,0.094241,0.00,0.06,2.140643
62353,202312310WAS,johnsja05,0,-1.228351,-0.636936,-1.098626,-0.801660,-0.415123,-0.617182,-0.478161,0.460368,0.284604,-1.244231,-0.756834,-1.089239,0.250000,0.25,0.25,-0.032648
62354,202312310WAS,capelca01,0,0.000935,-0.650445,-0.579887,-0.617233,-0.732332,-0.857938,1.032318,0.295535,0.672448,-0.245324,-0.822790,-0.790537,0.104478,0.04,0.06,1.053998
62355,202312310WAS,beysa01,0,0.083244,-0.481586,-0.456215,-0.179577,-0.835565,-0.671064,-0.335752,-0.559655,-0.295765,0.180675,-0.352860,-0.344530,0.047120,0.04,0.02,0.433057


In [265]:
model = RandomForestClassifier(
    max_depth = 3,
    max_features = 3,
    n_estimators = 400
)

model.fit(X_train, y_train)

evaluate(model, X_train, y_train, X_test, y_test, train.copy(), test.copy())

(-1.2818500001096462,
 np.float64(0.15893470790378006),
 -1.3924665789808932,
 np.float64(0.16561809216119708))

In [None]:
y_pred = model.predict_proba(X_test)[:, -1]
loss_baseline = log_loss(y_test, 0.1 * np.ones_like(y_test))
loss = log_loss(y_test, y_pred)
loss_test = 100 * (loss - loss_baseline) / loss_baseline
test['pred'] = y_pred
test['argmax'] = (test.groupby('game_id')['pred'].transform('max') == test['pred']).astype(int)
test['correct'] = (test['first_basket_scorer'] == test['argmax']).astype(int)

acc_test = (test.groupby('game_id')['correct'].mean() == 1).mean()

In [273]:
test[test['argmax'] == 1].groupby('player_id')['game_id'].count().reset_index().sort_values('game_id', ascending = False).tail(20)

Unnamed: 0,player_id,game_id
38,holmgch01,2
4,baglema01,1
25,gaffoda01,1
75,wagnemo01,1
77,wiggian01,1
78,willija06,1
73,vassede01,1
22,durenja01,1
31,grantje01,1
45,kesslwa01,1


In [259]:
model = LogisticRegression()
model.fit(X_train, y_train)
evaluate(model, X_train, y_train, X_test, y_test, train.copy(), test.copy())

(-1.0409602591023905,
 np.float64(0.15066666666666667),
 -1.138427661808673,
 np.float64(0.15091342335186655))

In [257]:
model = LogisticRegression(penalty = 'l1', solver = 'liblinear', C = 0.01)
model.fit(X_train, y_train)
evaluate(model, X_train, y_train, X_test, y_test, train.copy(), test.copy())

(-0.8708552295799604,
 np.float64(0.14933333333333335),
 -0.8008258273858235,
 np.float64(0.1431294678316124))

In [216]:
from sklearn.decomposition import PCA

In [219]:
model = LogisticRegression()
pca = PCA(n_components = 5)
pcs = pca.fit_transform(X_train)
model.fit(pcs, y_train)
evaluate(model, pca.transform(X_test), y_test)

-0.9007088849159999

In [141]:
model.fit(pcs, y_train)

  y = column_or_1d(y, warn=True)


In [142]:
model.coef_

array([[0.07205394, 0.02603759]])

In [143]:
y_pred = model.predict_proba(pca.transform(X_test))[:, -1]

In [144]:
log_loss(y_test, y_pred) - log_loss(y_test, len(y_test) * [0.1])

-0.0026404069410321807

In [145]:
test['y_pred'] = y_pred

In [146]:
test.sort_values('y_pred', ascending = False)

Unnamed: 0,game_id,player_id,first_basket_scorer,PTS_avg,PTS_25,PTS_50,USG%_avg,USG%_25,USG%_50,VORP_avg,VORP_25,VORP_50,FGA_avg,FGA_25,FGA_50,first_basket_avg,first_basket_25,first_basket_50,y_pred
69348,202404090PHI,embiijo01,0,2.262617,2.424091,2.367581,2.274667,2.217474,2.291331,2.603081,2.699874,2.745917,1.905663,1.974658,1.978467,0.249258,0.40,0.280000,0.169184
68781,202404020PHI,embiijo01,0,2.210108,2.464780,2.469483,2.398856,2.270532,2.452733,2.315665,2.610013,2.608444,1.803999,2.060674,2.115291,0.251497,0.40,0.280000,0.168364
69313,202404090MEM,wembavi01,0,2.415832,2.187874,2.448943,2.335628,1.936079,2.269823,2.207069,2.489463,2.393843,2.409856,2.127616,2.443934,0.217391,0.32,0.240000,0.167289
63286,202401130DAL,irvinky01,0,2.307803,2.205828,2.358785,2.004522,1.878044,1.995839,2.558945,2.547740,2.510361,2.268607,2.141940,2.260537,0.162055,0.28,0.140000,0.166666
68818,202404020WAS,antetgi01,0,2.269727,2.136975,2.384523,2.132734,1.993625,2.103022,2.657622,2.706557,2.734604,1.879324,1.605499,1.913319,0.174684,0.24,0.220000,0.166173
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63962,202401230BRK,simsje01,0,-1.872283,-1.727886,-1.737376,-1.834889,-1.862485,-1.878536,-1.942436,-1.129638,-1.262001,-1.916913,-1.875397,-1.825793,0.000000,0.00,0.000000,0.066759
68678,202404010CHI,krejcvi01,0,-1.676529,-1.903480,-1.941171,-1.621423,-1.879858,-1.890135,-1.421309,-1.421539,-1.886777,-1.633112,-1.905871,-1.853333,0.000000,0.00,0.000000,0.066172
67061,202403100ATL,krejcvi01,0,-1.958374,-2.116185,-1.995591,-1.700793,-1.670654,-1.621971,-1.999408,-1.238079,-1.701228,-1.910293,-1.996344,-1.843386,0.000000,0.00,0.000000,0.065106
68414,202403280ATL,krejcvi01,0,-2.088776,-1.894712,-1.925592,-1.934995,-1.808547,-1.742394,-1.743468,-1.418082,-1.594648,-2.059752,-1.876977,-1.870955,0.000000,0.00,0.000000,0.065098


In [101]:
train['pred_ML'] = (train.groupby('game_id')['y_pred'].transform('max') == train['y_pred']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_ML'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.14805401111993646)

In [102]:
train['pred_PTS'] = (train.groupby('game_id')['PTS_avg'].transform('max') == train['PTS_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_PTS'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.13200953137410643)

In [103]:
train['pred_USG'] = (train.groupby('game_id')['USG%_avg'].transform('max') == train['USG%_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_USG'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.12883240667196189)

In [104]:
test['y_pred'] = model.predict_proba(X_test)[:, -1]

In [105]:
test['pred_ML'] = (test.groupby('game_id')['y_pred'].transform('max') == test['y_pred']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_ML'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.16266666666666665)

In [106]:
test['pred_PTS'] = (test.groupby('game_id')['PTS_avg'].transform('max') == test['PTS_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_PTS'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12933333333333333)

In [115]:
test['pred_FGA'] = (test.groupby('game_id')['FGA_avg'].transform('max') == test['FGA_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_FGA'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12933333333333333)

In [108]:
(model.predict(X_test) == y_test).mean()

np.float64(0.8994772818657016)

In [109]:
(model.predict(X_train) == y_train).mean()

np.float64(0.8999150055326587)

In [110]:
((np.random.uniform(0, 1, (len(test),)) < 0.1) == y_test).mean()

np.float64(0.8232140463744806)

In [111]:
-np.mean(y_test * np.log(model.predict_proba(X_test)[:, -1] + 1e-15) + (1 - y_test) * np.log(1 - model.predict_proba(X_test)[:, -1] + 1e-15))

np.float64(0.3229264521309898)

In [112]:
-np.mean(y_test * np.log(0.1) + (1 - y_test) * np.log(1 - 0.1))

np.float64(0.32623150252314786)

In [113]:
-np.mean(y_train * np.log(model.predict_proba(X_train)[:, -1] + 1e-15) + (1 - y_train) * np.log(1 - model.predict_proba(X_train)[:, -1] + 1e-15))

np.float64(0.32165258292608306)

In [114]:
-np.mean(y_train * np.log(0.1) + (1 - y_train) * np.log(1 - 0.1))

np.float64(0.32526972532402804)