In [64]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier

from sklearn.metrics import r2_score, mean_absolute_error

In [90]:
df = pd.read_csv('processed_data/data.csv')
df = df.sort_values('game_day')
exclude_cols = ['game_id','game_day', 'team']#,'away_score','home_score']

In [91]:
df.columns

Index(['game_id', 'fgm_home', 'fga_home', '2pm_home', '2pa_home', '3pm_home',
       '3pa_home', 'ftm_home', 'fta_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'to_home', 'pf_home',
       'home_rank', 'away_rank', 'is_conference', 'is_neutral', 'game_day',
       'homeTeam_wins', 'homeTeam_losses', 'awayTeam_wins', 'awayTeam_losses',
       'fgm_away', 'fga_away', '2pm_away', '2pa_away', '3pm_away', '3pa_away',
       'ftm_away', 'fta_away', 'oreb_away', 'dreb_away', 'reb_away',
       'ast_away', 'stl_away', 'blk_away', 'to_away', 'pf_away', 'is_home',
       'team', 'score'],
      dtype='object')

In [92]:
df.shape

(9214, 45)

In [94]:
df.head()

Unnamed: 0,game_id,fgm_home,fga_home,2pm_home,2pa_home,3pm_home,3pa_home,ftm_home,fta_home,oreb_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,to_away,pf_away,is_home,team,score
0,401575451,0.709091,0.54902,0.530612,0.44,0.590909,0.442308,0.2,0.196078,0.121212,...,0.25,0.2875,0.189189,0.304348,0.055556,0.257143,0.325,1,Kansas Jayhawks,99
4637,401583759,0.490909,0.607843,0.285714,0.4,0.590909,0.615385,0.275,0.235294,0.30303,...,0.428571,0.375,0.162162,0.217391,0.0,0.4,0.325,0,Winthrop Eagles,56
4638,401583744,0.418182,0.480392,0.285714,0.386667,0.409091,0.384615,0.425,0.490196,0.30303,...,0.357143,0.4,0.27027,0.391304,0.222222,0.342857,0.525,0,Monmouth Hawks,61
4639,401583589,0.527273,0.637255,0.387755,0.493333,0.454545,0.538462,0.1,0.137255,0.272727,...,0.517857,0.4875,0.216216,0.217391,0.0,0.4,0.25,0,The Citadel Bulldogs,59
4640,401582026,0.618182,0.666667,0.653061,0.72,0.090909,0.269231,0.6,0.764706,0.606061,...,0.410714,0.4375,0.351351,0.347826,0.166667,0.314286,0.65,0,UMBC Retrievers,93


In [84]:
rr = RidgeClassifier(alpha=1)
split=TimeSeriesSplit(n_splits=3)

sfs= SequentialFeatureSelector(rr, n_features_to_select=20, direction='forward', cv=split)

In [95]:
feature_cols = df.columns[~df.columns.isin(exclude_cols)]
target_col = 'score'

In [96]:
sfs.fit(df[feature_cols],df[target_col])

In [97]:
predictors = list(feature_cols[sfs.get_support()])

In [98]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions=[]
    unique_days=sorted(df['game_day'].unique())

    for i in range(start, len(unique_days), step):
        day = unique_days[i]
        train = df[df['game_day']<day]
        test = df[df['game_day']==day]

        model.fit(train[predictors], train[target_col])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)

        combined = pd.concat([test[target_col], preds], axis=1)
        combined.columns = ['actual','prediction']

        all_predictions.append(combined)
    return pd.concat(all_predictions)


In [99]:
predictions = backtest(df, rr, predictors)

In [100]:
predictions

Unnamed: 0,actual,prediction
4897,77,73
4890,65,73
4891,66,61
4892,64,70
4893,71,63
...,...,...
4602,64,63
4603,80,79
4604,79,75
4598,76,74


In [101]:
print("r2:", r2_score(predictions['actual'],predictions['prediction']))
print("MAE:", mean_absolute_error(predictions['actual'],predictions['prediction']))

r2: 0.5769069914645399
MAE: 5.915081366032546


In [102]:
#TODO: rolling avg