In [61]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import r2_score, mean_absolute_error

In [127]:
df = pd.read_csv('processed_data/data.csv')
df = df.sort_values('game_day').reset_index(drop=True)
exclude_cols = ['game_id', 'team', 'game_day', 'score']


In [126]:
df.columns

Index(['game_id', 'fgm_5_home', 'fga_5_home', '2pm_5_home', '2pa_5_home',
       '3pm_5_home', '3pa_5_home', 'ftm_5_home', 'fta_5_home', 'oreb_5_home',
       'dreb_5_home', 'reb_5_home', 'ast_5_home', 'stl_5_home', 'blk_5_home',
       'to_5_home', 'pf_5_home', 'home_rank', 'away_rank', 'is_conference',
       'is_neutral', 'game_day', 'homeTeam_wins', 'homeTeam_losses',
       'awayTeam_wins', 'awayTeam_losses', 'fgm_5_away', 'fga_5_away',
       '2pm_5_away', '2pa_5_away', '3pm_5_away', '3pa_5_away', 'ftm_5_away',
       'fta_5_away', 'oreb_5_away', 'dreb_5_away', 'reb_5_away', 'ast_5_away',
       'stl_5_away', 'blk_5_away', 'to_5_away', 'pf_5_away', 'is_home', 'team',
       'score'],
      dtype='object')

In [64]:
df.shape

(9214, 45)

In [138]:
df.head()

Unnamed: 0,game_id,fgm_5_home,fga_5_home,2pm_5_home,2pa_5_home,3pm_5_home,3pa_5_home,ftm_5_home,fta_5_home,oreb_5_home,...,dreb_5_away,reb_5_away,ast_5_away,stl_5_away,blk_5_away,to_5_away,pf_5_away,is_home,team,score
0,401606133,0.469091,0.6,0.379592,0.485333,0.327273,0.476923,0.375,0.431373,0.284848,...,0.478571,0.5175,0.356757,0.191304,0.255556,0.24,0.405,1,South Alabama Jaguars,82
1,401606133,0.469091,0.6,0.379592,0.485333,0.327273,0.476923,0.375,0.431373,0.284848,...,0.478571,0.5175,0.356757,0.191304,0.255556,0.24,0.405,0,Denver Pioneers,75
2,401584339,0.465455,0.603922,0.293878,0.397333,0.509091,0.611538,0.435,0.419608,0.290909,...,0.417857,0.465,0.275676,0.373913,0.222222,0.4,0.425,0,Northwestern State Demons,74
3,401604346,0.476364,0.609804,0.334694,0.456,0.445455,0.538462,0.38,0.396078,0.30303,...,0.446429,0.455,0.308108,0.243478,0.255556,0.291429,0.38,0,SIU Edwardsville Cougars,60
4,401577562,0.494545,0.535294,0.428571,0.512,0.281818,0.311538,0.415,0.447059,0.212121,...,0.464286,0.4275,0.378378,0.46087,0.177778,0.32,0.45,0,Maine Black Bears,80


In [128]:
rr = RidgeClassifier(alpha=1)
split=TimeSeriesSplit(n_splits=3)

sfs= SequentialFeatureSelector(rr, n_features_to_select=25, direction='forward', cv=split)

In [129]:
feature_cols = df.columns[~df.columns.isin(exclude_cols)]
target_col = 'score'

In [130]:
sfs.fit(df[feature_cols],df[target_col])

In [131]:
predictors = list(feature_cols[sfs.get_support()])

In [None]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions=[]
    unique_days=sorted(df['game_day'].unique())

    for i in range(start, len(unique_days), step):
        day = unique_days[i]
        train = df[df['game_day']<day]
        test = df[df['game_day']==day]

        model.fit(train[predictors], train[target_col])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test[target_col], preds], axis=1)
        combined.columns = ['actual','prediction']

        all_predictions.append(combined)
    return pd.concat(all_predictions)


In [132]:
predictions = backtest(df, rr, predictors)

In [133]:
predictions

Unnamed: 0,actual,prediction
216,68,58
217,55,58
218,79,71
219,56,88
220,69,74
...,...,...
7861,64,77
7862,80,75
7863,79,68
7864,76,78


In [135]:
print("r2:", r2_score(predictions['actual'],predictions['prediction']))
print("MAE:", mean_absolute_error(predictions['actual'],predictions['prediction']))

r2: 0.06131634881042991
MAE: 9.000915032679739


In [136]:
predictors

['fgm_5_home',
 'fga_5_home',
 '2pm_5_home',
 '2pa_5_home',
 '3pm_5_home',
 '3pa_5_home',
 'dreb_5_home',
 'ast_5_home',
 'to_5_home',
 'is_conference',
 'is_neutral',
 'homeTeam_wins',
 'fgm_5_away',
 'fga_5_away',
 '2pm_5_away',
 '2pa_5_away',
 '3pm_5_away',
 '3pa_5_away',
 'ftm_5_away',
 'fta_5_away',
 'reb_5_away',
 'ast_5_away',
 'blk_5_away',
 'to_5_away',
 'is_home']

In [137]:
for col in df.columns:
    if col not in predictors:
        print(col)

game_id
ftm_5_home
fta_5_home
oreb_5_home
reb_5_home
stl_5_home
blk_5_home
pf_5_home
home_rank
away_rank
game_day
homeTeam_losses
awayTeam_wins
awayTeam_losses
oreb_5_away
dreb_5_away
stl_5_away
pf_5_away
team
score
