In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load data
years = [2019, 2020, 2021, 2022, 2023, 2024]
cols = ['game_id', 'Date', 'season', 'Home', 'Away', 'jumpball_home', 'jumpball_away', 'jumpball_possession_tm']
first_basket = pd.concat([pd.read_csv(f'../data/first_basket_{year}.csv') for year in years])

In [116]:
stats = pd.concat([pd.read_csv(f'../data/rosters.nosync/rosters_{year}.csv') for year in years]).reset_index(drop = True)

In [117]:
features = []
for f in ['PTS', 'USG%', 'VORP', 'FGA'] :

    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}'].expanding().mean().reset_index(level = 0, drop = True)
    stats[f'{f}_avg'] = stats.groupby('player_id')[f'{f}_avg'].shift()
    features.append(f'{f}_avg')

    stats[f'{f}_25'] = stats.groupby('player_id')[f'{f}'].rolling(25, min_periods = 1).mean().reset_index(level = 0, drop = True)
    stats[f'{f}_25'] = stats.groupby('player_id')[f'{f}_25'].shift()
    features.append(f'{f}_25')

In [118]:
stats = stats.merge(first_basket[['game_id', 'first_basket']],
          on = 'game_id',
          how = 'left')

In [119]:
stats['first_basket_scorer'] = (stats['player_id'] == stats['first_basket']).astype(int)

In [120]:
stats = stats[stats['starter']]

In [122]:
# Create dataframe with all rosters
    # Create starter_flag column
    # Sort dataframe by date
    # Compute rolling averages of [PTS, FGA, USG%, VORP, ...?] with player groupby

# Left merge with first_basket to create target

In [132]:
stats

Unnamed: 0,game_id,Player,player_id,Team,MP,FG,FGA,FG%,3P,3PA,...,PTS_avg,PTS_25,USG%_avg,USG%_25,VORP_avg,VORP_25,FGA_avg,FGA_25,first_basket,first_basket_scorer
0,201810160BOS,Jaylen Brown,brownja02,BOS,28.383333,5,13,0.385,1,3,...,,,,,,,,,embiijo01,0
1,201810160BOS,Kyrie Irving,irvinky01,BOS,29.000000,2,14,0.143,1,8,...,,,,,,,,,embiijo01,0
2,201810160BOS,Jayson Tatum,tatumja01,BOS,28.933333,9,17,0.529,1,5,...,,,,,,,,,embiijo01,0
3,201810160BOS,Gordon Hayward,haywago01,BOS,24.616667,4,12,0.333,1,3,...,,,,,,,,,embiijo01,0
10,201810160BOS,Al Horford,horfoal01,BOS,29.950000,4,7,0.571,0,1,...,,,,,,,,,embiijo01,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149869,202404140SAS,Jaden Ivey,iveyja01,DET,27.066667,5,13,0.385,1,6,...,15.840000,15.80,25.172667,26.204,-0.725333,-1.216,12.953333,14.16,wisemja01,0
149870,202404140SAS,Troy Brown Jr.,browntr01,DET,11.166667,0,3,0.000,0,0,...,6.185915,3.92,15.380563,12.188,0.082535,-1.180,5.481690,4.40,wisemja01,0
149877,202404140SAS,Blake Wesley,weslebl01,SAS,30.283333,7,9,0.778,1,1,...,4.484536,4.24,16.603093,14.624,-1.518557,-1.000,4.340206,3.68,wisemja01,0
149879,202404140SAS,Chimezie Metu,metuch01,DET,33.433333,4,9,0.444,1,4,...,5.841699,6.96,19.548649,14.740,0.128185,0.520,4.563707,5.24,wisemja01,0


In [179]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

data = stats[['game_id', 'player_id', 'first_basket_scorer'] + features].dropna().reset_index(drop = True)
# Normalize per game
data[features] = (
    (
        data[features] - data.groupby('game_id')[features].transform('mean')
    )
    / data.groupby('game_id')[features].transform('std')
)

in_2024 = data['game_id'].apply(lambda x: x[:4] == '2024')
train = data.copy()[~in_2024]
test = data.copy()[in_2024]

X_train = train[features].to_numpy()
X_test = test[features].to_numpy()

y_train = train['first_basket_scorer'].to_numpy()
y_test = test['first_basket_scorer'].to_numpy()

In [200]:
model = LogisticRegression()
model = RandomForestClassifier(n_estimators = 20)
model.fit(X_train, y_train)

In [201]:
train['y_pred'] = model.predict_proba(X_train)[:, -1]

In [202]:
train['pred_ML'] = (train.groupby('game_id')['y_pred'].transform('max') == train['y_pred']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_ML'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.9977760127084988)

In [203]:
train['pred_PTS'] = (train.groupby('game_id')['PTS_avg'].transform('max') == train['PTS_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_PTS'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.13200953137410643)

In [204]:
train['pred_USG'] = (train.groupby('game_id')['USG%_avg'].transform('max') == train['USG%_avg']).astype(int)
((train['first_basket_scorer'] == 1 ) & (train['pred_USG'] == 1)).sum() / train['game_id'].unique().shape[0]

np.float64(0.1289912629070691)

In [205]:
test['y_pred'] = model.predict_proba(X_test)[:, -1]

In [206]:
test['pred_ML'] = (test.groupby('game_id')['y_pred'].transform('max') == test['y_pred']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_ML'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.15066666666666667)

In [207]:
test['pred_PTS'] = (test.groupby('game_id')['PTS_avg'].transform('max') == test['PTS_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_PTS'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12933333333333333)

In [208]:
test['pred_USG'] = (test.groupby('game_id')['USG%_avg'].transform('max') == test['USG%_avg']).astype(int)
((test['first_basket_scorer'] == 1 ) & (test['pred_USG'] == 1)).sum() / test['game_id'].unique().shape[0]

np.float64(0.12133333333333333)

In [209]:
(model.predict(X_test) == y_test).mean()

np.float64(0.8995866115482064)

In [210]:
(model.predict(X_train) == y_train).mean()

np.float64(0.9916368550759201)

In [211]:
((np.random.uniform(0, 1, (len(test),)) < 0.1) == y_test).mean()

np.float64(0.8209094545939458)

In [212]:
-np.mean(y_test * np.log(model.predict_proba(X_test)[:, -1] + 1e-15) + (1 - y_test) * np.log(1 - model.predict_proba(X_test)[:, -1] + 1e-15))

np.float64(0.7402103969577682)

In [213]:
-np.mean(y_test * np.log(0.1) + (1 - y_test) * np.log(1 - 0.1))

np.float64(0.3251122736258439)

In [214]:
-np.mean(y_train * np.log(model.predict_proba(X_train)[:, -1] + 1e-15) + (1 - y_train) * np.log(1 - model.predict_proba(X_train)[:, -1] + 1e-15))

np.float64(0.07878734606989182)

In [215]:
-np.mean(y_train * np.log(0.1) + (1 - y_train) * np.log(1 - 0.1))

np.float64(0.32478602754666946)