In [39]:
# import libraries
import pandas as pd

In [40]:
def get_features_target():
    df_games = pd.read_csv('data/games_rolling.csv')
    df_games = df_games.select_dtypes(include=['float64', 'int64'])
    df_games.drop(['season_id', 'team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)

    features = df_games.drop(columns=['wl_home', 'wl_away'])
    target = df_games['wl_home']
    
    return features, target

First we will separate the features from the target data. 

In [41]:
features, target = get_features_target()

print(f"{features.shape[0]} games with {features.shape[1]} features")
print(features.columns)

41865 games with 40 features
Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')


The baseline accuracy of the model will be predicting the home team always wins.

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def find_metrics(labels, preds):
    return {
        'accuracy': accuracy_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds),
        'f1': f1_score(labels, preds)
    }

def print_metrics(labels, preds):
    metrics = find_metrics(labels, preds)
    print(f"Accuracy: {metrics['accuracy']:0.4f}")
    print(f"Precision: {metrics['precision']:0.4f}")
    print(f"Recall: {metrics['recall']:0.4f}")
    print(f"F1 Score: {metrics['f1']:0.4f}")

# baseline model
baseline_preds = [1] * len(target) # home team always wins
print("Baseline Model")
print_metrics(target, baseline_preds)

# elo predictions, elo_home + 100 > elo_away
elo_preds = [1 if x + 100 > y else 0 for (x,y) in zip(features['elo_home'], features['elo_away'])]
print("Elo Model")
print_metrics(target, elo_preds)

Accuracy: 0.6024
Precision: 0.6024
Recall: 1.0000
F1 Score: 0.7519
Accuracy: 0.6763
Precision: 0.6881
Recall: 0.8463
F1 Score: 0.7590


## Feature Selection

In [43]:
features.columns

Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')

We use the `StandardScalar` from `sklearn.preprocessing`, which standardizes data with it's z-score for each feature.

In [44]:
# preprocessing data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [45]:
# time series split for cross validation
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)

def train(model, cv = tscv, n_splits = 5, X = features, y = target, scaler=scaler):
    X_scaled = scaler.fit_transform(X) if scaler else X
    cv = TimeSeriesSplit(n_splits=n_splits)

    for train_index, test_index in cv.split(features_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

    return y_test, preds

In [47]:
# find best features using SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector

cv = TimeSeriesSplit(n_splits=5)
knn = KNeighborsClassifier(n_neighbors=10)

X, y = get_features_target()

sfs = SequentialFeatureSelector(knn, n_features_to_select=30, cv=cv, n_jobs=-1)
sfs.fit(X, y)

# get the best features
best_features = X.columns[sfs.get_support()]
X = X[best_features]

# export best features since it took 36 minutes to find them
X.to_csv('data/best_features_knn.csv', index=False)

best_featres_knn = pd.read_csv('data/best_features_knn.csv')

In [51]:
from sklearn.neighbors import KNeighborsClassifier

def find_best_k(X, y, max_k=15):
    best_n = 0
    best_score = 0

    for k in range(2, max_k):
        knn = KNeighborsClassifier(n_neighbors=k)
        test, preds = train(knn, X=X, y=y)
        score = accuracy_score(test, preds)
        if score > best_score:
            best_score = score
            best_n = k

    return (best_n, best_score)

best_n, best_score = find_best_k(X, y)

def print_best_knn(X, y, max_k=15, best_n = 0):
    if best_n != 0:
        print(f"Best n_neighbors: {best_n}")
        test, pred = train(KNeighborsClassifier(n_neighbors=best_n))
        print_metrics(test, pred)
        return
    best_n, best_score = find_best_k(X, y, max_k)
    print(f"Best n_neighbors: {best_n}")
    print_metrics(test, pred)

print_best_knn(X, y)

Best n_neighbors: 11
Best score: 0.6306


In [50]:
from sklearn.feature_selection import SelectKBest, f_classif

# find best k features

def find_best_k_features(X, y, max_k=30):
    best_k = 0
    best_score = 0

    for k in range(2, max_k):
        X_new = SelectKBest(f_classif, k=k).fit_transform(X, y)
        knn = KNeighborsClassifier(n_neighbors=best_n)
        test, preds = train(knn, X=X_new)
        score = accuracy_score(test, preds)
        if score > best_score:
            best_features = X_new
            best_score = score
            best_k = k

    return (best_k, best_score, best_features, test, preds)

features = pd.read_csv('data/best_features.csv')
best_k, best_score, best_features, test, preds = find_best_k_features(X, y)
print_metrics(test, preds)

Best k: 4
Best features: 4
Accuracy: 0.6318
Precision: 0.6540
Recall: 0.7413
F1 Score: 0.6949


65.11% accuracy is good. 

In [68]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# cv = TimeSeriesSplit(n_splits=5)
# rfc = RandomForestClassifier()
# X, y = get_features_target()

# sfs = SequentialFeatureSelector(rfc, n_features_to_select=30, cv=cv, n_jobs=-1, verbose=2)
# sfs.fit(X, y)

# # get the best features
# best_features = X.columns[sfs.get_support()]
# X = X[best_features]

# # export best features since it took 36 minutes to find them
# X.to_csv('data/best_features_rfc.csv', index=False)

X = pd.read_csv('data/best_features_rfc.csv')
y_test, pred = train(RandomForestClassifier(), X=X, y=y)
print_metrics(y_test, pred)

Accuracy: 0.6572
Precision: 0.6896
Recall: 0.7165
F1 Score: 0.7028


Logistic regression and the SVM with a linear kernel both performed better than the baseline. We will try to improve both models using their hyperparameters. From https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [54]:
# using grid search to find best parameters

# X = pd.read_csv('data/best_features_rfc.csv')
# y = pd.read_csv('data/games.csv')['wl_home']

# def find_best_params(X, y, model, params):
#     grid = GridSearchCV(model, params, cv=TimeSeriesSplit(), scoring=accuracy_score ,n_jobs=-1, verbose=2)
#     grid.fit(X, y)
#     return grid.best_params_

# params = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [None, 5, 10, 15, 20],
#     'min_samples_split': [2, 5, 10, 15, 20],
#     'min_samples_leaf': [1, 2, 5, 10, 15, 20]
# }

# best_params = find_best_params(X, y, RandomForestClassifier(), params)

# pd.DataFrame(best_params, index=[0]).to_csv('data/best_params_rfc.csv', index=False)
best_params = pd.read_csv('data/best_params_rfc.csv').to_dict('records')[0]

print(best_params)
print(len(best_features.to_list()))

In [59]:
y_test, pred = train(RandomForestClassifier(**best_params), X=X, y=y)
print_metrics(y_test, pred)

Accuracy: 0.6670
Precision: 0.6891
Recall: 0.7497
F1 Score: 0.7181
