In [33]:
# import libraries
import pandas as pd

In [34]:
def get_data():
    df_games = pd.read_csv('data/games_rolling.csv')
    df_games = df_games.select_dtypes(include=['float64', 'int64'])
    # df_games.drop(['season_id', 'team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
    # df_games.drop(['team_id_home', 'game_id', 'team_id_away'], axis=1, inplace=True)
    return df_games

In [35]:
def get_xy():
    data = get_data()

    ignored_cols = ['season_id', 'team_id_home',
                    'game_id', 'team_id_away', 'wl_home']
    feature_cols = data.columns[~data.columns.isin(ignored_cols)]
    target_col = 'wl_home'

    X = data[feature_cols]
    y = data[target_col]
    return X, y

In [36]:
def find_metrics(true, pred):
    # find the metrics for the binary classification model

    # some bitwise operations to find the metrics
    true_positives = ((true == 1) & (pred == 1)).sum()
    false_positives = ((true == 0) & (pred == 1)).sum()
    true_negatives = ((true == 0) & (pred == 0)).sum()
    false_negatives = ((true == 1) & (pred == 0)).sum()
    total = true_positives + false_positives + true_negatives + false_negatives

    

    accuracy = (true_positives + true_negatives) / total
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * precision * recall / (precision + recall)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

def print_metrics(labels, pred):
    metrics = find_metrics(labels, pred)
    print(f"Accuracy: {metrics['accuracy']:0.4f}")
    print(f"Precision: {metrics['precision']:0.4f}")
    print(f"Recall: {metrics['recall']:0.4f}")
    print(f"F1 Score: {metrics['f1']:0.4f}")

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    Plots a confusion matrix graphic.
    
    Parameters:
        y_true (array-like): The true target values.
        y_pred (array-like): The predicted target values.
        classes (list): List of class labels (strings) in the order of the confusion matrix.
        title (str): Title of the confusion matrix plot.
        cmap (matplotlib colormap): Colormap to be used for the heatmap (default is plt.cm.Blues).

    Returns:
        None
    """
    cm = confusion_matrix(y_true, y_pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_normalized, annot=True, cmap=cmap, fmt=".2f", xticklabels=classes, yticklabels=classes)
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()



In [37]:
# data normalization
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# time series split for cross validation
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector

# classifiers we will use
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [38]:
# elo base model
X, y = get_xy()
elo_preds = [1 if elo_home + 100> elo_away else 0 for elo_home, elo_away in zip(X['elo_home'], X['elo_away']) ]
print_metrics(y, elo_preds)

Accuracy: nan
Precision: nan
Recall: nan
F1 Score: nan


  accuracy = (true_positives + true_negatives) / total
  precision = true_positives / (true_positives + false_positives)
  recall = true_positives / (true_positives + false_negatives)


In [39]:
def back_test(data, model, features):
    # test the model on previous seasons iteratively
    # each iteration, the model is trained on all previous seasons
    # and tested on the current season
    target = 'wl_home'
    seasons = data['season_id'].unique()

    all_predictions = pd.DataFrame(columns=['actual', 'predicted'])

    for i in range(2, len(seasons)):
        # start on the 3rd season
        season = seasons[i]
        train = data[data['season_id'] < season]
        test = data[data['season_id'] == season]

        X_train, y_train = train[features], train[target]
        X_test, y_test = test[features], test[target]

        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        season_predictions = pd.DataFrame(
            {'actual': y_test, 'predicted': predictions}
        )
        all_predictions = pd.concat([all_predictions, season_predictions])

    return all_predictions


def find_best_features(data, model, n_features):
    # find the best features using sequential feature selector
    ignored_cols = ['season_id', 'team_id_home',
                    'game_id', 'team_id_away', 'wl_home']
    feature_cols = data.columns[~data.columns.isin(ignored_cols)]
    target_col = 'wl_home'

    # 5 fold cross validation for time series data
    split = TimeSeriesSplit(n_splits=5)
    # finds the best features using sequential feature selector
    sfs = SequentialFeatureSelector(
        model, n_features_to_select=n_features, cv=split, n_jobs=-1)

    # Create a copy of the data to avoid modifying the original data
    data_copy = data.copy()

    # Normalize the data using MinMaxScaler
    scaler = MinMaxScaler()
    data_copy[feature_cols] = scaler.fit_transform(data_copy[feature_cols])

    # fit the model
    sfs.fit(data_copy[feature_cols], data_copy[target_col])

    return list(feature_cols[sfs.get_support()])


def evaluate(data, model, n_features):
    print(f'Finding best {n_features} features...')
    best_features = find_best_features(data, model, n_features)
    print(f'Best features: {best_features}')

    print('Back testing model...')
    predictions = back_test(data, model, best_features)

    print('Metrics:')
    actual, predicted = predictions['actual'], predictions['predicted']
    print_metrics(actual, predicted)

In [40]:
data = get_data()
predictions = back_test(data, LogisticRegression(), ['elo_home', 'elo_away', 'plus_minus_home', 'plus_minus_away']).reset_index(drop=True)

# get metrics from predictions
actual, predicted = predictions['actual'], predictions['predicted']

print_metrics(actual, predicted)

Accuracy: 0.6752
Precision: 0.6869
Recall: 0.8426
F1 Score: 0.7568


In [41]:
data = get_data()
true = data['wl_home']
pred = data['elo_home'] > data['elo_away']
print_metrics(true, pred)

# baseline, always predict home team wins
pred = 1
print_metrics(true, pred)

Accuracy: 0.6545
Precision: 0.7563
Recall: 0.6286
F1 Score: 0.6865
Accuracy: 0.6019
Precision: 0.6019
Recall: 1.0000
F1 Score: 0.7515


In [42]:
knn = KNeighborsClassifier(n_neighbors=10)
evaluate(data, knn, 5) 
evaluate(data, knn, 3)
evaluate(data, knn, 10)


Finding best 5 features...
Best features: ['fg3a_home', 'fg3_pct_home', 'elo_home', 'fg_pct_away', 'elo_away']
Back testing model...
Metrics:
Accuracy: 0.6317
Precision: 0.6753
Recall: 0.7435
F1 Score: 0.7077
