In [1]:
# import libraries
import pandas as pd

In [2]:
df_games = pd.read_csv('data/games_rolling.csv')
df_games.head()

Unnamed: 0,season_id,team_id_home,team_abbreviation_home,team_name_home,game_id,game_date,wl_home,fgm_home,fga_home,fg_pct_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,elo_away
0,21981,1610612749,MIL,Milwaukee Bucks,28100518,1982-02-05,1,43.5,92.0,0.48,...,13.5,38.5,20.5,4.5,4.0,20.0,28.0,89.5,-19.0,1361.309587
1,21981,1610612741,CHI,Chicago Bulls,28100564,1982-02-13,0,40.0,92.33,0.44,...,18.33,40.67,24.0,8.67,6.33,20.0,28.0,99.67,-2.67,1652.547934
2,21981,1610612751,NJN,New Jersey Nets,28100625,1982-02-24,0,40.25,90.75,0.45,...,19.5,40.25,22.75,11.0,6.25,22.0,28.0,103.0,-0.25,1673.129963
3,21981,1610612749,MIL,Milwaukee Bucks,28100656,1982-03-02,1,40.8,90.0,0.46,...,21.0,42.2,22.0,10.4,5.2,22.0,27.4,100.6,-2.2,1479.863124
4,21981,1610612759,SAN,San Antonio Spurs,28100679,1982-03-06,1,45.33,94.33,0.48,...,22.33,41.0,24.0,10.33,4.83,20.17,28.5,111.5,-2.67,1682.372699


In [3]:
# drop non-numeric columns
df_games = df_games.select_dtypes(include=['float64', 'int64'])
df_games.drop(['season_id', 'team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
df_games.head()

Unnamed: 0,wl_home,fgm_home,fga_home,fg_pct_home,fg3m_home,fg3a_home,fg3_pct_home,ftm_home,fta_home,ft_pct_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,elo_away
0,1,43.5,92.0,0.48,1.0,2.0,0.67,20.5,31.0,0.67,...,13.5,38.5,20.5,4.5,4.0,20.0,28.0,89.5,-19.0,1361.309587
1,0,40.0,92.33,0.44,1.67,4.67,0.54,20.67,30.33,0.69,...,18.33,40.67,24.0,8.67,6.33,20.0,28.0,99.67,-2.67,1652.547934
2,0,40.25,90.75,0.45,1.75,5.0,0.49,21.0,30.0,0.71,...,19.5,40.25,22.75,11.0,6.25,22.0,28.0,103.0,-0.25,1673.129963
3,1,40.8,90.0,0.46,1.4,4.2,0.39,19.8,27.6,0.73,...,21.0,42.2,22.0,10.4,5.2,22.0,27.4,100.6,-2.2,1479.863124
4,1,45.33,94.33,0.48,1.33,4.0,0.38,22.17,29.5,0.75,...,22.33,41.0,24.0,10.33,4.83,20.17,28.5,111.5,-2.67,1682.372699


First we will separate the features from the target data. 

In [4]:
features = df_games.drop(columns=['wl_home', 'wl_away'])
target = df_games['wl_home']

print(f"{features.shape[0]} games with {features.shape[1]} features")
print(features.columns)

41865 games with 40 features
Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')


The baseline accuracy of the model will be predicting the home team always wins.

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def find_metrics(labels, preds):
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds),
        'f1': f1_score(labels, preds)
    }

def print_metrics(labels, preds):
    metrics = find_metrics(labels, preds)
    print(f"Accuracy: {metrics['accuracy']:0.4f}")
    print(f"Precision: {metrics['precision']:0.4f}")
    print(f"Recall: {metrics['recall']:0.4f}")
    print(f"F1 Score: {metrics['f1']:0.4f}")

# baseline model
baseline_preds = [1] * len(target) # home team always wins
print_metrics(target, baseline_preds)

# elo predictions, elo_home + 100 > elo_away
elo_preds = [1 if x + 100 > y else 0 for (x,y) in zip(features['elo_home'], features['elo_away'])]
print_metrics(target, elo_preds)

Accuracy: 0.6024
Precision: 0.6024
Recall: 1.0000
F1 Score: 0.7519
Accuracy: 0.6763
Precision: 0.6881
Recall: 0.8463
F1 Score: 0.7590


## Feature Selection

In [6]:
features.columns

Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')

We use the `StandardScalar` from `sklearn.preprocessing`, which standardizes data with it's z-score for each feature.

In [7]:
# preprocessing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [20]:
# time series split for cross validation
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)

def train(model, cv = tscv, n_splits = 5, X = features, y = target, scaler=scaler):
    X_scaled = scaler.fit_transform(X) if scaler else X
    cv = TimeSeriesSplit(n_splits=n_splits)

    for train_index, test_index in cv.split(features_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

    return y_test, preds

In [23]:
from sklearn.neighbors import KNeighborsClassifier

def find_best_k(X, y, max_k=15):
    best_n = 0
    best_score = 0

    for k in range(2, max_k):
        knn = KNeighborsClassifier(n_neighbors=k)
        test, preds = train(knn)
        score = accuracy_score(test, preds)
        if score > best_score:
            best_score = score
            best_n = k

    return (best_n, best_score)

best_n, best_score = find_best_k(features, target)

def print_best_knn(X, y, max_k=15, best_n = 0):
    if best_n != 0:
        print(f"Best n_neighbors: {best_n}")
        test, pred = train(KNeighborsClassifier(n_neighbors=best_n))
        print(f"Best score: {accuracy_score(test, pred):0.4f}")
        return
    best_n, best_score = find_best_k(X, y, max_k)
    print(f"Best n_neighbors: {best_n}")
    print(f"Best score: {best_score:0.4f}")

print_best_knn(features, target)

Best n_neighbors: 13
Best score: 0.6198


In [24]:
from sklearn.feature_selection import SelectKBest, f_classif

# find best k features

def find_best_k_features(X, y, max_k=30):
    best_k = 0
    best_score = 0

    for k in range(2, max_k):
        X_new = SelectKBest(f_classif, k=k).fit_transform(X, y)
        knn = KNeighborsClassifier(n_neighbors=best_n)
        test, preds = train(knn, X=X_new)
        score = accuracy_score(test, preds)
        if score > best_score:
            best_features = X_new
            best_score = score
            best_k = k

    return (best_k, best_score, best_features)

features = pd.read_csv('data/best_features.csv')
best_k, best_score, best_features = find_best_k_features(features, target)
print(f"Best k: {best_k}")
print(f"Best features: {best_features.shape[1]}")
print(f"Best score: {best_score:0.4f}")


KeyboardInterrupt: 

At first I was getting almost 90% accuracy from the KNN classifier. This was caused because there was a "wl_away" column in the features that I did not account for. It was an artifact from the preprocessing process. 

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

tscv = TimeSeriesSplit(n_splits=5)
# pipeline = Pipeline([('scaler', StandardScaler()), ('classifier', RandomForestClassifier())])

y = target
# X = df_games.drop(columns=['wl_home', 'wl_away'])
rfc = RandomForestClassifier()

y_test, pred = train(RandomForestClassifier(), X=X, y=y, scaler=None)
print(f"Accuracy: {accuracy_score(y_test, pred):0.4f}")

y_yest, pred = train(RandomForestClassifier(), X=best_features, y=y, scaler=None)
print(f"Accuracy: {accuracy_score(y_test, pred):0.4f}")

KeyError: "None of [Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,\n       ...\n       6970, 6971, 6972, 6973, 6974, 6975, 6976, 6977, 6978, 6979],\n      dtype='int32', length=6980)] are in the [columns]"

Logistic regression and the SVM with a linear kernel both performed better than the baseline. We will try to improve both models using their hyperparameters. From https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [12]:
# try a neural network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=42)

X = df_games.drop(columns=['wl_home', 'wl_away'])
y = target
# features_scaled = scaler.fit_transform(X, y)
features_scaled = best_features

for train_index, test_index in tscv.split(features_scaled):
    X_train, X_test = features_scaled[train_index], features_scaled[test_index]
    y_train, y_test = target[train_index], target[test_index]
    
    mlp.fit(X_train, y_train)
    preds = mlp.predict(X_test)
    
print_metrics(y_test, preds)

Accuracy: 0.5697
Precision: 0.6301
Recall: 0.5799
F1 Score: 0.6040


In [14]:
# find best features using SequentialFeatureSelector
from sklearn.feature_selection import SequentialFeatureSelector

cv = TimeSeriesSplit(n_splits=5)
rfc = RandomForestClassifier()

sfs = SequentialFeatureSelector(rfc, n_features_to_select=10, cv=cv, n_jobs=-1)
sfs.fit(X, y)

# get the best features
best_features = X.columns[sfs.get_support()]
X = X[best_features]

In [17]:
# export best features since it took 36 minutes to find them
X.to_csv('data/best_features.csv', index=False)

In [18]:
best_features

Index(['fg_pct_home', 'fg3_pct_home', 'oreb_home', 'stl_home',
       'plus_minus_home', 'elo_home', 'ft_pct_away', 'reb_away',
       'plus_minus_away', 'elo_away'],
      dtype='object')

Unnamed: 0,fgm_home,fga_home,fg_pct_home,fg3m_home,fg3a_home,fg3_pct_home,ftm_home,fta_home,ft_pct_home,oreb_home,...,dreb_away,reb_away,ast_away,stl_away,blk_away,tov_away,pf_away,pts_away,plus_minus_away,elo_away
0,43.50,92.00,0.48,1.00,2.00,0.67,20.50,31.00,0.67,20.50,...,13.50,38.50,20.50,4.50,4.00,20.00,28.0,89.50,-19.00,1361.309587
1,40.00,92.33,0.44,1.67,4.67,0.54,20.67,30.33,0.69,23.67,...,18.33,40.67,24.00,8.67,6.33,20.00,28.0,99.67,-2.67,1652.547934
2,40.25,90.75,0.45,1.75,5.00,0.49,21.00,30.00,0.71,21.50,...,19.50,40.25,22.75,11.00,6.25,22.00,28.0,103.00,-0.25,1673.129963
3,40.80,90.00,0.46,1.40,4.20,0.39,19.80,27.60,0.73,19.20,...,21.00,42.20,22.00,10.40,5.20,22.00,27.4,100.60,-2.20,1479.863124
4,45.33,94.33,0.48,1.33,4.00,0.38,22.17,29.50,0.75,18.50,...,22.33,41.00,24.00,10.33,4.83,20.17,28.5,111.50,-2.67,1682.372699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41860,43.10,93.40,0.46,15.30,41.20,0.37,15.70,21.40,0.73,10.00,...,38.90,50.40,29.40,8.50,4.50,12.70,18.1,123.80,6.60,1612.431821
41861,43.20,94.70,0.46,14.60,40.60,0.35,15.20,20.50,0.74,10.30,...,39.50,51.40,29.60,7.30,4.50,13.70,17.3,121.10,4.90,1274.148794
41862,43.10,94.50,0.46,14.60,41.40,0.34,14.90,20.40,0.73,10.50,...,39.40,51.20,29.80,7.80,4.60,14.20,17.0,120.60,4.90,1323.535205
41863,42.80,94.60,0.45,13.50,39.80,0.33,15.20,20.90,0.72,11.30,...,38.80,50.30,29.30,7.20,5.10,14.30,17.1,119.10,4.80,1548.706303
