In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, matthews_corrcoef, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import KFold


df_1 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\Shooting_Splits.csv')
df_2 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\Barttorvik_Away_Neutral.csv')
df_3 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\mmWinLoss.csv')
df_4 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\teamID.csv')

In [10]:
#Choose preferred columns
df_3_col = [
    'Season', 'WTeamID' , 'LTeamID'
]

df_4_col = [
    'TeamID', 'TeamName'
]

df_splits = df_1.drop('CONF', axis=1)
df_stats = df_2.drop('ROUND', axis=1)
df_WL = df_3[df_3_col]
df_ID = df_4[df_4_col]

team_stats = pd.merge(df_splits, df_stats, on=['YEAR', 'TEAM'], how='inner')

#Match ID to team name
df_merged = df_WL.merge(df_ID, left_on='WTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'WTeamName'})

df_merged = df_merged.merge(df_ID, left_on='LTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'LTeamName'})
df_merged = df_merged.drop(columns=['TeamID_x', 'TeamID_y', 'WTeamID', 'LTeamID'])

#Name mapping
stat_name = set(team_stats['TEAM'].unique())
record_name = set(df_merged['WTeamName'].unique()) | set(df_merged['LTeamName'].unique())
mismatch_name = stat_name - record_name
name_map = {
    'Abilene Christian' : 'Abilene Chr', 'Alabama St.' : 'Alabama St', 'Albany' : 'SUNY Albany', 'American' : 'American Univ', 'Appalachian St.' : 'Appalachian St',
    'Arizona St.' : 'Arizona St', 'Arkansas Pine Bluff' : 'Ark Pine Bluff', 'Boise St.' : 'Boise St', 'Boston University' : 'Boston Univ', 'Cal St. Bakersfield' : 'CS Bakersfield',
    'Cal St. Fullerton' : 'CS Fullerton', 'Cleveland St.' : 'Cleveland St', 'Coastal Carolina' : 'Coastal Car', 'College of Charleston' : 'Col Charleston', 'Colorado St.' : 'Colorado St',
    'East Tennessee St.' : 'ETSU', 'Eastern Kentucky' : 'E Kentucky', 'Eastern Washington' : 'E Washington', 'Fairleigh Dickinson' : 'F Dickinson', 'Florida Atlantic' : 'FL Atlantic', 
    'Florida Gulf Coast' : 'FGCU', 'Florida St.' : 'Florida St', 'Fresno St.' : 'Fresno St', 'George Washington' : 'G Washington', 'Georgia St.' : 'Georgia St', 'Grambling St.' : 'Grambling',
    'Indiana St.' : 'Indiana St', 'Iowa St.' : 'Iowa St', 'Jacksonville St.' : 'Jacksonville St', 'Kansas St.' : 'Kansas St', 'Kennesaw St.' : 'Kennesaw', 'Kent St.' : 'Kent', 'Little Rock' : 'Ark Little Rock',
    'Long Beach St.' : 'Long Beach St', 'Louisiana Lafayette' : 'Lafayette', 'Loyola Chicago' : 'Loyola-Chicago', 'McNeese St.' : 'McNeese St', 'Michigan St.' : 'Michigan St',
    'Middle Tennessee' : 'MTSU', 'Milwaukee' : 'WI Milwaukee', 'Mississippi St.' : 'Mississippi St', 'Mississippi Valley St.' : 'MS Valley St', 'Montana St.' : 'Montana St', 'Morehead St.' : 'Morehead St',
    'Morgan St.' : 'Morgan St', "Mount St. Mary's" : "Mt St Mary's", 'Murray St.' : 'Murray St', 'Nebraska Omaha' : 'NE Omaha', 'New Mexico St.' : 'New Mexico St', 'Norfolk St.' : 'Norfolk St', 
    'North Carolina Central' : 'NC Central', 'North Carolina A&T' : 'NC A&T', 'North Carolina St.' : 'NC State', 'Northern Colorado' : 'N colorado', 'Northern Kentucky' : 'N Kentucky',
    'Northwestern St.' : 'Northwestern LA', 'Ohio St.' : 'Ohio St', 'Oklahoma St.' : 'Oklahoma St', 'Oregon St.' : 'Oregon St', 'Penn St.' : 'Penn St', 'Prairie View A&M' : 'Prairie View',
    'SIU Edwardsville' : 'S Illinois', 'Saint Francis' : 'St Francis PA', "Saint Joseph's" : "St Joseph's PA", 'Saint Louis' : 'St Louis', "Saint Mary's" : "St Mary's CA", 
    "Saint Peter's" : "St Peter's", 'Sam Houston St.' : 'Sam Houston St', 'San Diego St.' : 'San Diego St', 'South Dakota St.' : 'S Dakota St', 'Southeast Missouri St.' : 'SE Missouri St',
    'Southern' : 'Southern Univ', 'St. Bonaventure' : 'St Bonaventure', "St. John's" : "St John's", 'Stephen F. Austin' : 'SF Austin', 'Texas A&M Corpus Chris' : 'TAM C. Christi',
    'Texas Southern' : 'TX Southern', 'UTSA' : 'UT San Antonio', 'Utah St.' : 'Utah St', 'Washington St.' : 'Washington St', 'Weber St.' : 'Weber St', 'Western Kentucky' : 'WKU',
    'Western Michigan' : 'W Michigan', 'Wichita St.' : 'Wichita St', 'Wright St.' : 'Wright St'
}
team_stats['TEAM'] = team_stats['TEAM'].replace(name_map)

w_stats = team_stats.rename(columns=lambda col: f'W_{col}' if col not in ['YEAR', 'TEAM'] else col)
l_stats = team_stats.rename(columns=lambda col: f'L_{col}' if col not in ['YEAR', 'TEAM'] else col)

#Table with individual game and team stats
df_merged = df_merged[df_merged['Season'] >= 2010]
df_merged = df_merged.merge(w_stats, left_on=['Season', 'WTeamName'], right_on=['YEAR', 'TEAM'], how='left')
df_merged = df_merged.merge(l_stats, left_on=['Season', 'LTeamName'], right_on=['YEAR', 'TEAM'], how='left')
final_df = df_merged.dropna()
final_df = final_df.drop(columns=['YEAR_x', 'TEAM_x', 'YEAR_y', 'TEAM_y'])



In [12]:
df = final_df.copy()

flip = np.random.rand(len(df)) < 0.5


team1_stats = df.filter(regex='^W_').copy()
team2_stats = df.filter(regex='^L_').copy()

team1_stats.columns = team1_stats.columns.str.replace('^W_', 'team1_', regex=True)
team2_stats.columns = team2_stats.columns.str.replace('^L_', 'team2_', regex=True)

team1_stats_flipped = df.filter(regex='^L_').copy()
team2_stats_flipped = df.filter(regex='^W_').copy()

team1_stats_flipped.columns = team1_stats_flipped.columns.str.replace('^L_', 'team1_', regex=True)
team2_stats_flipped.columns = team2_stats_flipped.columns.str.replace('^W_', 'team2_', regex=True)

team1_stats[flip] = team1_stats_flipped[flip]
team2_stats[flip] = team2_stats_flipped[flip]

team1_stats.to_csv('team1_stats.csv')
team2_stats.to_csv('team2_stats.csv')

X = pd.concat([
    team1_stats, team2_stats
 ], axis=1)

y = (~flip).astype(int)

X = X.to_numpy()

In [None]:
import time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter = 10000)
lr.fit(X_train, y_train)

l_train_acc = accuracy_score(y_train, lr.predict(X_train))
l_train_mcc = matthews_corrcoef(y_train, lr.predict(X_train))
l_train_f1 = f1_score(y_train, lr.predict(X_train))

l_test_acc = accuracy_score(y_test, lr.predict(X_test))
l_test_mcc = matthews_corrcoef(y_test, lr.predict(X_test))
l_test_f1 = f1_score(y_test, lr.predict(X_test))

print(f"Test Accuracy: {l_test_acc:.4f}")
print(f"MCC: {l_test_mcc:.4f}")
print(f"F1 Score: {l_test_f1:.4f}")

Test Accuracy: 0.6703
MCC: 0.3405
F1 Score: 0.6738


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
model = XGBClassifier(
    n_estimators=1000,      # number of trees, fewer to avoid overfitting small data
    max_depth=4,           # control model complexity, smaller depth to prevent overfitting
    learning_rate=0.1,     # moderate learning rate
    subsample=0.8,         # use 80% of data per tree (helps generalize)
    colsample_bytree=0.8,  # use 80% of features per tree (reduces overfitting)
    gamma=1,               # minimum loss reduction to make split (regularization)
    reg_lambda=1,          # L2 regularization term on weights
    random_state=42,
    eval_metric='logloss'
)

xg = model.fit(X_train, y_train)

xg_train_acc = accuracy_score(y_train, xg.predict(X_train))
xg_train_mcc = matthews_corrcoef(y_train, xg.predict(X_train))
xg_train_f1 = f1_score(y_train, xg.predict(X_train))

xg_test_acc = accuracy_score(y_test, xg.predict(X_test))
xg_test_mcc = matthews_corrcoef(y_test, xg.predict(X_test))
xg_test_f1 = f1_score(y_test, xg.predict(X_test))

print(f"Test Accuracy: {xg_test_acc:.4f}")
print(f"MCC: {xg_test_mcc:.4f}")
print(f"F1 Score: {xg_test_f1:.4f}")

Test Accuracy: 0.7135
MCC: 0.4270
F1 Score: 0.7166


In [None]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)


rf = rf_model.fit(X_train, y_train)

rf_train_acc = accuracy_score(y_train, rf.predict(X_train))
rf_train_mcc = matthews_corrcoef(y_train, rf.predict(X_train))
rf_train_f1 = f1_score(y_train, rf.predict(X_train))

rf_test_acc = accuracy_score(y_test, rf.predict(X_test))
rf_test_mcc = matthews_corrcoef(y_test, rf.predict(X_test))
rf_test_f1 = f1_score(y_test, rf.predict(X_test))

print(f"Accuracy: {rf_test_acc:.4f}")
print(f"MCC: {rf_test_mcc:.4f}")
print(f"F1 Score: {rf_test_f1:.4f}")

Accuracy: 0.7027
MCC: 0.4058
F1 Score: 0.7120


In [None]:
lr_train = lr.predict_proba(X_train)[:, 1]
rf_train = rf.predict_proba(X_train)[:, 1]
xg_train = xg.predict_proba(X_train)[:, 1]

stack_X_train = np.column_stack((lr_train, rf_train, xg_train))

meta_model = LogisticRegression(max_iter=10000)
meta_model.fit(stack_X_train, y_train)


lr_test = lr.predict_proba(X_test)[:, 1]
rf_test = rf.predict_proba(X_test)[:, 1]
xg_test = xg.predict_proba(X_test)[:, 1]

stack_X_test = np.column_stack((lr_test, rf_test, xg_test))

final_preds = meta_model.predict(stack_X_test)

s_test_acc = accuracy_score(y_test, final_preds)
s_test_f1 = f1_score(y_test, final_preds)
s_test_mcc = matthews_corrcoef(y_test, final_preds)

print(f"Stacked Model Accuracy: {s_test_acc:.4f}")
print(f"Stacked Model F1 Score: {s_test_f1:.4f}")
print(f"Stacked Model MCC: {s_test_mcc:.4f}")


Stacked Model Accuracy: 0.7081
Stacked Model F1 Score: 0.7065
Stacked Model MCC: 0.4164
