# Basic data cleaning

In [1]:
import pandas as pd
df = pd.read_csv('nba_games.csv',index_col=0)

In [2]:
df = df.sort_values("date")
df = df.reset_index(drop=True)

In [3]:
del df["mp.1"]
del df["mp_opp.1"]
del df["index_opp"]

In [4]:
def add_target(group):
    group["target"] = group["won"].shift(-1)
    return group

df = df.groupby("team", group_keys=False).apply(add_target)

In [5]:
df["target"][pd.isnull(df["target"])] = 2
df["target"] = df["target"].astype(int, errors="ignore")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["target"][pd.isnull(df["target"])] = 2


In [6]:
nulls = pd.isnull(df).sum()
nulls = nulls[nulls > 0]
valid_columns = df.columns[~df.columns.isin(nulls.index)]
df = df[valid_columns].copy()

In [7]:
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = RidgeClassifier(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=30, 
                                direction="forward",
                                cv=split,
                                n_jobs=1
                               )

In [8]:
removed_columns = ["season", "date", "won", "target", "team", "team_opp"]
selected_columns = df.columns[~df.columns.isin(removed_columns)]
selected_columns

Index(['mp', 'fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%',
       ...
       'trb%_max_opp', 'ast%_max_opp', 'stl%_max_opp', 'blk%_max_opp',
       'tov%_max_opp', 'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp',
       'total_opp', 'home_opp'],
      dtype='object', length=136)

In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

In [10]:
sfs.fit(df[selected_columns], df["target"])

In [11]:
#provided a boolean mask of selected features which is nice
predictors = list(selected_columns[sfs.get_support()])

In [12]:
#split data up by seasons to predict future seasons
def backtest(data, model, predictors, start=2, step=1):
    all_predictions = []
    
    seasons = sorted(data["season"].unique())
    
    for i in range(start, len(seasons), step):
        season = seasons[i]
        train = data[data["season"] < season]
        test = data[data["season"] == season]
        
        model.fit(train[predictors], train["target"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["target"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined) #list of df's
    return pd.concat(all_predictions) #combine all df's into one

In [13]:
predictions = backtest(df, rr, predictors)


In [14]:
from sklearn.metrics import accuracy_score

predictions = predictions[predictions['actual']!=2]
accuracy_score(predictions["actual"], predictions["prediction"])

0.5485110470701249

In [15]:
#shape[0] here is the same as len()
#We want to beat 57% because we can predict winner of game of we simply predict
#The home team so as it stands our model is actually worsee then just
#always picking home team
df.groupby('home').apply(lambda x: x[x['won']==1].shape[0]/x.shape[0])

home
0.0    0.428314
1.0    0.571686
dtype: float64

In [18]:
#Instead of just using 1 game to predict the next game lets predict
#using the last 10 games

df_rolling = df[list(selected_columns) + ['won','team','season']]

In [24]:
#rolling groups the previous ten 
#by previous ten I mean the current row and the 9 row before it
def find_team_averages(team):
    rolling = team.rolling(10).mean()
    return rolling

In [27]:
df_rolling = df_rolling.groupby(['team','season'],
                                group_keys=False).apply(find_team_averages)

In [29]:
rolling_cols = [f"{col}_10" for col in df_rolling.columns]
df_rolling.columns = rolling_cols
df = pd.concat([df, df_rolling], axis=1)

In [46]:
df = df.dropna()

In [49]:
#want a col that tells us if the next game is a home game
#who the next opposing tema is
#and the date of the next game
def shift_col(team, col_name):
    next_col = team[col_name].shift(-1)
    return next_col

def add_col(df, col_name):
    return df.groupby("team", group_keys=False).apply(lambda x: shift_col(x, col_name))

df["home_next"] = add_col(df, "home")
df["team_opp_next"] = add_col(df, "team_opp")
df["date_next"] = add_col(df, "date")

In [53]:
df = df.copy()

In [54]:
full = df.merge(df[rolling_cols + ["team_opp_next", "date_next", "team"]], left_on=["team", "date_next"], right_on=["team_opp_next", "date_next"])

In [55]:
full[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]


Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,SAC,TOR,TOR,SAC,2015-11-15
1,TOR,SAC,SAC,TOR,2015-11-15
2,CLE,DET,DET,CLE,2015-11-17
3,GSW,TOR,TOR,GSW,2015-11-17
4,DEN,NOP,NOP,DEN,2015-11-17
...,...,...,...,...,...
15769,BOS,GSW,GSW,BOS,2022-06-10
15770,GSW,BOS,BOS,GSW,2022-06-13
15771,BOS,GSW,GSW,BOS,2022-06-13
15772,GSW,BOS,BOS,GSW,2022-06-16


In [56]:
removed_columns = list(full.columns[full.dtypes == "object"]) + removed_columns


In [57]:
selected_columns = full.columns[~full.columns.isin(removed_columns)]
sfs.fit(full[selected_columns], full["target"])

In [58]:
predictors = list(selected_columns[sfs.get_support()])

In [59]:
predictions = backtest(full, rr, predictors)

In [60]:
accuracy_score(predictions["actual"], predictions["prediction"])

0.6296296296296297