In [2]:
# import libraries
import pandas as pd

In [3]:
df_games = pd.read_csv('data/games_rolling.csv')
df_games.head()

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,team_name_home,game_id,game_date,wl_home,fgm_home,fga_home,fg_pct_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,elo_away
0,21981,1610612749,MIL,Milwaukee Bucks,28100518,1982-02-05,1,43.5,92.0,0.48,...,13.5,38.5,20.5,4.5,4.0,20.0,28.0,89.5,-19.0,1361.309587
1,21981,1610612741,CHI,Chicago Bulls,28100564,1982-02-13,0,40.0,92.33,0.44,...,18.33,40.67,24.0,8.67,6.33,20.0,28.0,99.67,-2.67,1652.547934
2,21981,1610612751,NJN,New Jersey Nets,28100625,1982-02-24,0,40.25,90.75,0.45,...,19.5,40.25,22.75,11.0,6.25,22.0,28.0,103.0,-0.25,1673.129963
3,21981,1610612749,MIL,Milwaukee Bucks,28100656,1982-03-02,1,40.8,90.0,0.46,...,21.0,42.2,22.0,10.4,5.2,22.0,27.4,100.6,-2.2,1479.863124
4,21981,1610612759,SAN,San Antonio Spurs,28100679,1982-03-06,1,45.33,94.33,0.48,...,22.33,41.0,24.0,10.33,4.83,20.17,28.5,111.5,-2.67,1682.372699


In [4]:
# drop non-numeric columns
df_games = df_games.select_dtypes(include=['float64', 'int64'])
df_games.drop(['season_id', 'team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
df_games.head()

Unnamed: 0,wl_home,fgm_home,fga_home,fg_pct_home,fg3m_home,fg3a_home,fg3_pct_home,ftm_home,fta_home,ft_pct_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,elo_away
0,1,43.5,92.0,0.48,1.0,2.0,0.67,20.5,31.0,0.67,...,13.5,38.5,20.5,4.5,4.0,20.0,28.0,89.5,-19.0,1361.309587
1,0,40.0,92.33,0.44,1.67,4.67,0.54,20.67,30.33,0.69,...,18.33,40.67,24.0,8.67,6.33,20.0,28.0,99.67,-2.67,1652.547934
2,0,40.25,90.75,0.45,1.75,5.0,0.49,21.0,30.0,0.71,...,19.5,40.25,22.75,11.0,6.25,22.0,28.0,103.0,-0.25,1673.129963
3,1,40.8,90.0,0.46,1.4,4.2,0.39,19.8,27.6,0.73,...,21.0,42.2,22.0,10.4,5.2,22.0,27.4,100.6,-2.2,1479.863124
4,1,45.33,94.33,0.48,1.33,4.0,0.38,22.17,29.5,0.75,...,22.33,41.0,24.0,10.33,4.83,20.17,28.5,111.5,-2.67,1682.372699


First we will separate the features from the target data. 

In [5]:
features = df_games.drop(columns=['wl_home', 'wl_away'])
target = df_games['wl_home']

print(f"{features.shape[0]} games with {features.shape[1]} features")
print(features.columns)

41865 games with 40 features
Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')


The baseline accuracy of the model will be predicting the home team always wins.

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def find_metrics(labels, preds):
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds),
        'f1': f1_score(labels, preds)
    }

def print_metrics(labels, preds):
    metrics = find_metrics(labels, preds)
    print(f"Accuracy: {metrics['accuracy']}")
    print(f"Precision: {metrics['precision']}")
    print(f"Recall: {metrics['recall']}")
    print(f"F1 Score: {metrics['f1']}")

# baseline model
baseline_preds = [1] * len(target) # home team always wins
print_metrics(target, baseline_preds)

# elo predictions, elo_home + 100 > elo_away
elo_preds = [1 if x + 100 > y else 0 for (x,y) in zip(features['elo_home'], features['elo_away'])]
# print_metrics(target, elo_preds)

Accuracy: 0.6023886301206258
Precision: 0.6023886301206258
Recall: 1.0
F1 Score: 0.751863335519647


## Feature Selection

In [7]:
features.columns

Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')

We use the `StandardScalar` from `sklearn.preprocessing`, which standardizes data with it's z-score for each feature.

In [13]:
# preprocessing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [9]:
# time series split for cross validation
from sklearn.model_selection import TimeSeriesSplit

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

def train(model, cv = tscv, n_splits = 5, X = features, y = target, scaler=scaler):
    X_scaled = scaler.fit_transform(X)

    for train_index, test_index in tscv.split(features_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        # print_metrics(y_test, preds)
    return find_metrics(y_test, preds)

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)
knn_tests = {x: train(KNeighborsClassifier(n_neighbors=x)) for x in range(2,15)}

# print_metrics(y_test, preds)

Accuracy: 0.5817686684821557
Precision: 0.7072293097808309
Recall: 0.5166069295101553
F1 Score: 0.5970726318696493
Accuracy: 0.5455066647556256
Precision: 0.7013422818791947
Recall: 0.442484121383204
F1 Score: 0.5426222414539161
Accuracy: 0.5423534470402752
Precision: 0.6920077972709552
Recall: 0.42484442316898036
F1 Score: 0.5264718967818478
Accuracy: 0.568582485308872
Precision: 0.6937679083094556
Recall: 0.47336265884652984
F1 Score: 0.5627542126670539
Accuracy: 0.5437867278199799
Precision: 0.6693262411347518
Recall: 0.38256903977704587
F1 Score: 0.4868611961953894
Accuracy: 0.6303568869141465
Precision: 0.666943866943867
Recall: 0.766547192353644
F1 Score: 0.7132851584213453
Accuracy: 0.6120108929339257
Precision: 0.6744238590149119
Recall: 0.7021877205363444
F1 Score: 0.6880258153739771
Accuracy: 0.6141608141034829
Precision: 0.6686336813436223
Recall: 0.7051220679751077
F1 Score: 0.6863932898415657
Accuracy: 0.6230471549376523
Precision: 0.6624444444444444
Recall: 0.728494623655

At first I was getting almost 90% accuracy from the KNN classifier. This was caused because there was a "wl_away" column in the features that I did not account for. It was an artifact from the preprocessing process. 

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

tscv = TimeSeriesSplit(n_splits=5)
pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())])

X = features[['']]
y = target
rfc = RandomForestClassifier()

for train_index, test_index in tscv.split(X):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    rfc.fit(X_train, y_train)
    preds = rfc.predict(X_test)
print_metrics(y_test, preds)

# Step 5: Evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")

KeyError: "None of [Index([''], dtype='object')] are in the [columns]"

Logistic regression and the SVM with a linear kernel both performed better than the baseline. We will try to improve both models using their hyperparameters. From https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# find best features using SequentialFeatureSelector
from sklearn.feature_selection import SequentialFeatureSelector

cv = TimeSeriesSplit(n_splits=5)
rfc = RandomForestClassifier()

sfs = SequentialFeatureSelector(rfc, n_features_to_select=10, cv=cv, n_jobs=-1)
sfs.fit(X, y)

# get the best features
best_features = X.columns[sfs.get_support()]
X = X[best_features]


KeyboardInterrupt: 

In [None]:
# try a neural network
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=42)

for train_index, test_index in tscv.split(features_scaled):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    mlp.fit(X_train, y_train)
    preds = mlp.predict(X_test)
    
print_metrics(y_test, preds)

