In [1]:
import pandas as pd
import numpy as np
import os

In [66]:
# Load data
years = [2019, 2020, 2021, 2022, 2023, 2024]
cols = ['game_id', 'Date', 'season', 'Home', 'Away', 'jumpball_home', 'jumpball_away', 'jumpball_possession_tm']
first_basket = pd.concat([pd.read_csv(f'../data/first_basket_{year}.csv') for year in years])

In [67]:
stats = pd.concat([pd.read_csv(f'../data/rosters.nosync/rosters_{year}.csv') for year in years]).reset_index(drop = True)
stats = pd.merge(stats, first_basket[['game_id', 'first_basket']], on = 'game_id', how = 'inner')
stats['first_basket_scorer'] = (stats['player_id'] == stats['first_basket']).astype(int)

In [68]:
features = []
for f in ['PTS', 'USG%', 'VORP', 'FGA'] :

    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}'].expanding().mean().reset_index(level = 0, drop = True)
    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}_avg'].shift()
    features.append(f'{f}_avg')

    for w in [25, 50] :
        stats[f'{f}_{w}'] = stats.groupby('player_id')[f'{f}'].rolling(w, min_periods = 1).mean().reset_index(level = 0, drop = True)
        stats[f'{f}_{w}'] = stats.groupby('player_id')[f'{f}_{w}'].shift()
        features.append(f'{f}_{w}')

In [69]:
starters = stats.copy()[stats['starter']]
non_starters = stats.copy()[~stats['starter']]

In [70]:
starters['first_basket_avg'] = starters.groupby('player_id')['first_basket_scorer'].expanding().mean().reset_index(level = 0, drop = True)
starters['first_basket_avg'] = starters.groupby('player_id')['first_basket_avg'].shift()
features.append(f'first_basket_avg')

for w in [25, 50] :
    starters[f'first_basket_{w}'] = starters.groupby('player_id')['first_basket_scorer'].rolling(w, min_periods = 1).mean().reset_index(level = 0, drop = True)
    starters[f'first_basket_{w}'] = starters.groupby('player_id')[f'first_basket_{w}'].shift()
    features.append(f'first_basket_{w}')

In [116]:
for _, game in starters.groupby('game_id') :
    pass

In [148]:
game[features].isna().sum().sum()

np.int64(0)

In [168]:
Xs, ys = [], []
for _, game in starters.groupby('game_id') :

    if game[features].isna().sum().sum() == 0 :

        game[features + ['first_basket_scorer']]
        Xy = game[features + ['first_basket_scorer']].to_numpy()
        for _ in range(10) :
            np.random.shuffle(Xy)
            X = Xy[:, :-1].flatten()
            y = Xy[:, -1]

            Xs.append(X)
            ys.append(y)

In [169]:
X = np.array(Xs)
y = np.array(ys)

In [172]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

In [232]:
# Input layer
inputs = Input(shape = (10 * len(features),))

# Hidden layers
x = Dense(64, activation='relu')(inputs)  
x = Dense(32, activation='relu')(x)

# Output layer
outputs = Dense(10, activation = 'softmax')(x)

# Create the model
model = Model(inputs = inputs, outputs = outputs)

# Compile the model
model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy', 'categorical_crossentropy'])

# Model summary
model.summary()

In [233]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 128)

Epoch 1/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 584us/step - accuracy: 0.1025 - categorical_crossentropy: 2.3234 - loss: 2.3234
Epoch 2/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 526us/step - accuracy: 0.1124 - categorical_crossentropy: 2.2970 - loss: 2.2970
Epoch 3/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 528us/step - accuracy: 0.1128 - categorical_crossentropy: 2.2949 - loss: 2.2949
Epoch 4/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 512us/step - accuracy: 0.1166 - categorical_crossentropy: 2.2936 - loss: 2.2936
Epoch 5/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 516us/step - accuracy: 0.1184 - categorical_crossentropy: 2.2911 - loss: 2.2911
Epoch 6/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 511us/step - accuracy: 0.1179 - categorical_crossentropy: 2.2900 - loss: 2.2900
Epoch 7/10
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [234]:
y_pred = model.predict(X_train)

[1m1807/1807[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210us/step


In [239]:
np.where(y_pred == y_pred.max())

(array([50536]), array([4]))

In [243]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

data = starters[['game_id', 'player_id', 'first_basket_scorer'] + features].dropna().reset_index(drop = True)
features_norm = [f for f in features if 'first_basket' not in f]
# Normalize per game
data[features_norm] = (
    (
        data[features_norm] - data.groupby('game_id')[features_norm].transform('mean')
    )
    / data.groupby('game_id')[features_norm].transform('std')
)

in_2024 = data['game_id'].apply(lambda x: x[:4] == '2024')
train = data.copy()[~in_2024]
test = data.copy()[in_2024]

In [206]:
len(features)

15

In [209]:
Xs, ys = [], []
for _, game in train.groupby('game_id') :
        
    if game.shape[0] == 10 :

        Xy = game[features + ['first_basket_scorer']].to_numpy()
        for _ in range(10) :
            np.random.shuffle(Xy)
            X = Xy[:, :-1].flatten()
            y = Xy[:, -1]

            Xs.append(X)
            ys.append(y)

In [211]:
X_train = np.array(Xs)
y_train = np.array(ys)

In [None]:

X_train = train[features].to_numpy()
X_test = test[features].to_numpy()

y_train = train['first_basket_scorer'].to_numpy()
y_test = test['first_basket_scorer'].to_numpy()

In [99]:
model = LogisticRegression()
# model = RandomForestClassifier(n_estimators = 20)
model.fit(X_train, y_train)

In [100]:
train['y_pred'] = model.predict_proba(X_train)[:, -1]

In [101]:
train['pred_ML'] = (train.groupby('game_id')['y_pred'].transform('max') == train['y_pred']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_ML'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.14805401111993646)

In [102]:
train['pred_PTS'] = (train.groupby('game_id')['PTS_avg'].transform('max') == train['PTS_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_PTS'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.13200953137410643)

In [103]:
train['pred_USG'] = (train.groupby('game_id')['USG%_avg'].transform('max') == train['USG%_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_USG'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.12883240667196189)

In [104]:
test['y_pred'] = model.predict_proba(X_test)[:, -1]

In [105]:
test['pred_ML'] = (test.groupby('game_id')['y_pred'].transform('max') == test['y_pred']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_ML'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.16266666666666665)

In [106]:
test['pred_PTS'] = (test.groupby('game_id')['PTS_avg'].transform('max') == test['PTS_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_PTS'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12933333333333333)

In [115]:
test['pred_FGA'] = (test.groupby('game_id')['FGA_avg'].transform('max') == test['FGA_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_FGA'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12933333333333333)

In [108]:
(model.predict(X_test) == y_test).mean()

np.float64(0.8994772818657016)

In [109]:
(model.predict(X_train) == y_train).mean()

np.float64(0.8999150055326587)

In [110]:
((np.random.uniform(0, 1, (len(test),)) < 0.1) == y_test).mean()

np.float64(0.8232140463744806)

In [111]:
-np.mean(y_test * np.log(model.predict_proba(X_test)[:, -1] + 1e-15) + (1 - y_test) * np.log(1 - model.predict_proba(X_test)[:, -1] + 1e-15))

np.float64(0.3229264521309898)

In [112]:
-np.mean(y_test * np.log(0.1) + (1 - y_test) * np.log(1 - 0.1))

np.float64(0.32623150252314786)

In [113]:
-np.mean(y_train * np.log(model.predict_proba(X_train)[:, -1] + 1e-15) + (1 - y_train) * np.log(1 - model.predict_proba(X_train)[:, -1] + 1e-15))

np.float64(0.32165258292608306)

In [114]:
-np.mean(y_train * np.log(0.1) + (1 - y_train) * np.log(1 - 0.1))

np.float64(0.32526972532402804)