In [215]:
# import libraries
import pandas as pd

In [274]:
def get_data():
    df_games = pd.read_csv('data/games_rolling.csv')
    df_games = df_games.select_dtypes(include=['float64', 'int64'])
    # df_games.drop(['season_id', 'team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
    # df_games.drop(['team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
    return df_games

First we will separate the features from the target data. 

In [239]:
features, target = get_data()

print(f"{features.shape[0]} games with {features.shape[1]} features")
print(features.columns)

features.info()

40579 games with 41 features
Index(['season_id', 'fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home',
       'fg3a_home', 'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home',
       'oreb_home', 'dreb_home', 'reb_home', 'ast_home', 'stl_home',
       'blk_home', 'tov_home', 'pf_home', 'pts_home', 'plus_minus_home',
       'elo_home', 'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away',
       'fg3a_away', 'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away',
       'oreb_away', 'dreb_away', 'reb_away', 'ast_away', 'stl_away',
       'blk_away', 'tov_away', 'pf_away', 'pts_away', 'plus_minus_away',
       'elo_away'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40579 entries, 0 to 40578
Data columns (total 41 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season_id        40579 non-null  int64  
 1   fgm_home         40579 non-null  float64
 2   fga_home         40579 non-null  float64
 3   fg_pct_h

The baseline accuracy of the model will be predicting the home team always wins.

In [240]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def find_metrics(labels, pred):
    return {
        'accuracy': accuracy_score(labels, pred),
        'precision': precision_score(labels, pred),
        'recall': recall_score(labels, pred),
        'f1': f1_score(labels, pred)
    }

def print_metrics(labels, pred):
    metrics = find_metrics(labels, pred)
    print(f"Accuracy: {metrics['accuracy']:0.4f}")
    print(f"Precision: {metrics['precision']:0.4f}")
    print(f"Recall: {metrics['recall']:0.4f}")
    print(f"F1 Score: {metrics['f1']:0.4f}")

# baseline model
baseline_pred = [1] * len(target) # home team always wins
print("Baseline Model")
print_metrics(target, baseline_pred)

Baseline Model
Accuracy: 0.6019
Precision: 0.6019
Recall: 1.0000
F1 Score: 0.7515


## Feature Selection

In [219]:
features.columns

Index(['fgm_home', 'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home',
       'fg3_pct_home', 'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home',
       'pf_home', 'pts_home', 'plus_minus_home', 'elo_home', 'fgm_away',
       'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away', 'fg3_pct_away',
       'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away', 'dreb_away',
       'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away', 'pf_away',
       'pts_away', 'plus_minus_away', 'elo_away'],
      dtype='object')

We use the `StandardScalar` from `sklearn.preprocessing`, which standardizes data with it's z-score for each feature.

In [241]:
# data normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# time series split for cross validation
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector

# classifiers we will use
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

None


For consistency in testing the models, the function below `train`, will be used to return the final test values and predictions of the time-series cross-validation to measure metrics of the model.



In [221]:
def train(model, X, y, scaler=StandardScaler()):
    cv = TimeSeriesSplit()
    X_scaled = scaler.fit_transform(X) if scaler else X

    for train_index, test_index in cv.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

    return y_test, pred

In [236]:
knn = KNeighborsClassifier(n_neighbors=5)

X, y = get_data()
back_test, pred = train(knn, X=X, y=y)
print_metrics(back_test, pred)

sfs = SequentialFeatureSelector(knn, n_features_to_select=5, n_jobs=-1)
sfs.fit(X, y)

features = X.columns[sfs.get_support()]
X = X[features]

X.to_csv('data/best_features_knn.csv', index=False)
best_features_knn = pd.read_csv('data/best_features_knn.csv')

back_test, pred = train(knn, X=X, y=y)
print_metrics(back_test, pred)

Accuracy: 0.5641
Precision: 0.5962
Recall: 0.7103
F1 Score: 0.6483
Accuracy: 0.6011
Precision: 0.6255
Recall: 0.7344
F1 Score: 0.6756


In [223]:
def find_best_k(X, y, max_k=15):
    best_n = 0
    best_score = 0

    for k in range(2, max_k):
        knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1)
        test, pred = train(knn, X=X, y=y)
        score = accuracy_score(test, pred)
        if score > best_score:
            best_score = score
            best_n = k

    return (best_n, best_score)

best_n, best_score = find_best_k(X, y)

def print_best_knn(X, y, max_k=15, best_n = 0):
    if best_n != 0:
        print(f"Best n_neighbors: {best_n}")
        test, pred = train(KNeighborsClassifier(n_neighbors=best_n), X=X, y=y)
        print_metrics(test, pred)
        return
    best_n, best_score = find_best_k(X, y, max_k)
    test, pred = train(KNeighborsClassifier(n_neighbors=best_n), X=X, y=y)
    print(f"Best n_neighbors: {best_n}")
    print_metrics(test, pred)

print_best_knn(X, y)

Best n_neighbors: 14
Accuracy: 0.6256
Precision: 0.6457
Recall: 0.7490
F1 Score: 0.6935


65.11% accuracy is good. 

Logistic regression and the SVM with a linear kernel both performed better than the baseline. We will try to improve both models using their hyperparameters. From https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [224]:
# logistic regression
scaler = MinMaxScaler()

logreg = LogisticRegression(max_iter = 1000, n_jobs=-1, verbose=2)
X, y = get_data()
back_test, pred = train(logreg, X=X, y=y, scaler=scaler)
print_metrics(back_test, pred)

# find best features using SequentialFeatureSelector
sfs = SequentialFeatureSelector(logreg, n_features_to_select=5, cv=cv, n_jobs=-1)

X_scaled = scaler.fit_transform(X)
sfs.fit(X_scaled, y)

# get the best features
features = X.columns[sfs.get_support()]
X = X[features]
print(features.to_list())

X.to_csv('data/best_features_logreg.csv', index=False)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Accuracy: 0.6436
Precision: 0.6593
Recall: 0.7655
F1 Score: 0.7084
['fg_pct_home', 'pts_home', 'elo_home', 'ast_away', 'elo_away']


In [225]:
back_test, pred = train(logreg, X=X, y=y, scaler=MinMaxScaler())
print_metrics(back_test, pred)
#64.33

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


Accuracy: 0.6460
Precision: 0.6405
Recall: 0.8525
F1 Score: 0.7315


In [226]:
# elo base model
elo_home = X['elo_home']
elo_preds = [1 if elo_home + 100> elo_away else 0 for elo_home, elo_away in zip(X['elo_home'], X['elo_away']) ]
print_metrics(y, elo_preds)

Accuracy: 0.6759
Precision: 0.6876
Recall: 0.8461
F1 Score: 0.7586


In [235]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('scaler', MinMaxScaler()), 
    ('logreg', LogisticRegression(max_iter = 1000, n_jobs=-1, verbose=2))
    ])

cross_val_score(pipe, X, y, cv=TimeSeriesSplit(n_splits=5),n_jobs=-1)

array([0.69347923, 0.66020997, 0.67839716, 0.67174331, 0.6435014 ])

In [289]:
def back_test(data, model, features):
    # test the model on previous seasons iteratively
    # each iteration, the model is trained on all previous seasons
    # and tested on the current season
    all_predictions = []
    seasons = data['season_id'].unique()

    for i in range(2, len(seasons)):
        # start on the 3rd season
        season = seasons[i]
        train = data[data['season_id'] < season]
        test = data[data['season_id'] == season]

        X = train[features]
        y = train['wl_home']

        model.fit(X, y)

        predictions = model.predict(test[features])
        predictions = pd.Series(predictions, index=test.index)

        combined = pd.concat([test['wl_home'], predictions], axis=1)
        combined.columns = ['actual', 'predicted']

        all_predictions.append(combined)
    return pd.concat(all_predictions)

def pipeline(data, model, n_features):
    ignored_cols = ['season_id', 'team_id_home', 'game_id', 'team_id_away', 'wl_home']
    feature_cols = data.columns[~data.columns.isin(ignored_cols)]
    target_col = 'wl_home'

    # 5 fold cross validation for time series data
    split = TimeSeriesSplit(n_splits=5)
    # finds the best features using sequential feature selector
    sfs = SequentialFeatureSelector(model, n_features_to_select=n_features, cv=split, n_jobs=-1)

    # normalize the data using min max scaler
    data = data.copy()
    scaler = MinMaxScaler()
    data[feature_cols] = scaler.fit_transform(data[feature_cols])

    sfs.fit(data[feature_cols], data[target_col])

    best_features = list(feature_cols[sfs.get_support()])
    predictions = back_test(data, model, best_features)
    print_metrics(predictions['actual'], predictions['predicted'])

In [291]:
model = LogisticRegression(max_iter=1000, n_jobs=-1)
pipeline(get_data(), model, 5)


# model = KNeighborsClassifier()
# pipeline(get_data(), model, 5)

Accuracy: 0.6717
Precision: 0.6894
Recall: 0.8274
F1 Score: 0.7521
