In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, matthews_corrcoef, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.model_selection import KFold

df_2 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\Barttorvik_Away_Neutral.csv')
df_3 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\games.csv')
df_4 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\raw\teamID.csv')

In [None]:
#Choose preferred columns
df_3_col = [
    'Season', 'WTeamID' , 'LTeamID'
]

df_4_col = [
    'TeamID', 'TeamName'
]
df_stats = df_2.drop('ROUND', axis=1)
df_WL = df_3[df_3_col]
df_ID = df_4[df_4_col]


#Match ID to team name
df_merged = df_WL.merge(df_ID, left_on='WTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'team1'})

df_merged = df_merged.merge(df_ID, left_on='LTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'team2'})
df_merged = df_merged.drop(columns=['TeamID_x', 'TeamID_y', 'WTeamID', 'LTeamID'])

#Name mapping
Bart_names = set(df_stats['TEAM'].unique())
record_name = set(df_merged['team1'].unique()) | set(df_merged['team2'].unique())
mismatch_name = Bart_names - record_name
name_map = {
    'Abilene Christian' : 'Abilene Chr', 'Alabama St.' : 'Alabama St', 'Albany' : 'SUNY Albany', 'American' : 'American Univ', 'Appalachian St.' : 'Appalachian St',
    'Arizona St.' : 'Arizona St', 'Arkansas Pine Bluff' : 'Ark Pine Bluff', 'Boise St.' : 'Boise St', 'Boston University' : 'Boston Univ', 'Cal St. Bakersfield' : 'CS Bakersfield',
    'Cal St. Fullerton' : 'CS Fullerton', 'Cleveland St.' : 'Cleveland St', 'Coastal Carolina' : 'Coastal Car', 'College of Charleston' : 'Col Charleston', 'Colorado St.' : 'Colorado St',
    'East Tennessee St.' : 'ETSU', 'Eastern Kentucky' : 'E Kentucky', 'Eastern Washington' : 'E Washington', 'Fairleigh Dickinson' : 'F Dickinson', 'Florida Atlantic' : 'FL Atlantic', 
    'Florida Gulf Coast' : 'FGCU', 'Florida St.' : 'Florida St', 'Fresno St.' : 'Fresno St', 'George Washington' : 'G Washington', 'Georgia St.' : 'Georgia St', 'Grambling St.' : 'Grambling',
    'Indiana St.' : 'Indiana St', 'Iowa St.' : 'Iowa St', 'Jacksonville St.' : 'Jacksonville St', 'Kansas St.' : 'Kansas St', 'Kennesaw St.' : 'Kennesaw', 'Kent St.' : 'Kent', 'Little Rock' : 'Ark Little Rock',
    'Long Beach St.' : 'Long Beach St', 'Louisiana Lafayette' : 'Lafayette', 'Loyola Chicago' : 'Loyola-Chicago', 'McNeese St.' : 'McNeese St', 'Michigan St.' : 'Michigan St',
    'Middle Tennessee' : 'MTSU', 'Milwaukee' : 'WI Milwaukee', 'Mississippi St.' : 'Mississippi St', 'Mississippi Valley St.' : 'MS Valley St', 'Montana St.' : 'Montana St', 'Morehead St.' : 'Morehead St',
    'Morgan St.' : 'Morgan St', "Mount St. Mary's" : "Mt St Mary's", 'Murray St.' : 'Murray St', 'Nebraska Omaha' : 'NE Omaha', 'New Mexico St.' : 'New Mexico St', 'Norfolk St.' : 'Norfolk St', 
    'North Carolina Central' : 'NC Central', 'North Carolina A&T' : 'NC A&T', 'North Carolina St.' : 'NC State', 'Northern Colorado' : 'N colorado', 'Northern Kentucky' : 'N Kentucky',
    'Northwestern St.' : 'Northwestern LA', 'Ohio St.' : 'Ohio St', 'Oklahoma St.' : 'Oklahoma St', 'Oregon St.' : 'Oregon St', 'Penn St.' : 'Penn St', 'Prairie View A&M' : 'Prairie View',
    'SIU Edwardsville' : 'S Illinois', 'Saint Francis' : 'St Francis PA', "Saint Joseph's" : "St Joseph's PA", 'Saint Louis' : 'St Louis', "Saint Mary's" : "St Mary's CA", 
    "Saint Peter's" : "St Peter's", 'Sam Houston St.' : 'Sam Houston St', 'San Diego St.' : 'San Diego St', 'South Dakota St.' : 'S Dakota St', 'Southeast Missouri St.' : 'SE Missouri St',
    'Southern' : 'Southern Univ', 'St. Bonaventure' : 'St Bonaventure', "St. John's" : "St John's", 'Stephen F. Austin' : 'SF Austin', 'Texas A&M Corpus Chris' : 'TAM C. Christi',
    'Texas Southern' : 'TX Southern', 'UTSA' : 'UT San Antonio', 'Utah St.' : 'Utah St', 'Washington St.' : 'Washington St', 'Weber St.' : 'Weber St', 'Western Kentucky' : 'WKU',
    'Western Michigan' : 'W Michigan', 'Wichita St.' : 'Wichita St', 'Wright St.' : 'Wright St'
}
df_stats['TEAM'] = df_stats['TEAM'].replace(name_map)

team1_stats = df_stats.rename(columns=lambda col: f'team1_{col}' if col not in ['YEAR', 'TEAM'] else col)
team2_stats = df_stats.rename(columns=lambda col: f'team2_{col}' if col not in ['YEAR', 'TEAM'] else col)

#Table with individual game and team stats
df_merged = df_merged[df_merged['Season'] >= 2010]
df_merged = df_merged.merge(team1_stats, left_on=['Season', 'team1'], right_on=['YEAR', 'TEAM'], how='left')
df_merged = df_merged.merge(team2_stats, left_on=['Season', 'team2'], right_on=['YEAR', 'TEAM'], how='left')
final_df = df_merged.dropna()
final_df = final_df.drop(columns=['YEAR_x', 'TEAM_x', 'YEAR_y', 'TEAM_y'])
final_df

stats_2010 = final_df.loc[final_df['Season'] == 2010]
stats_2010.to_csv('2010_stats.csv', index=False) 

stats_2011 = final_df.loc[final_df['Season'] == 2011]
stats_2011.to_csv('2011_stats.csv', index=False) 

stats_2012 = final_df.loc[final_df['Season'] == 2012]
stats_2012.to_csv('2012_stats.csv', index=False) 

stats_2013 = final_df.loc[final_df['Season'] == 2013]
stats_2013.to_csv('2013_stats.csv', index=False) 

stats_2014 = final_df.loc[final_df['Season'] == 2014]
stats_2014.to_csv('2014_stats.csv', index=False) 

stats_2015 = final_df.loc[final_df['Season'] == 2015]
stats_2015.to_csv('2015_stats.csv', index=False) 

stats_2016 = final_df.loc[final_df['Season'] == 2016]
stats_2016.to_csv('2016_stats.csv', index=False) 

stats_2017 = final_df.loc[final_df['Season'] == 2017]
stats_2017.to_csv('2017_stats.csv', index=False) 

stats_2018 = final_df.loc[final_df['Season'] == 2018]
stats_2018.to_csv('2018_stats.csv', index=False) 

stats_2019 = final_df.loc[final_df['Season'] == 2019]
stats_2019.to_csv('2019_stats.csv', index=False) 

stats_2020 = final_df.loc[final_df['Season'] == 2020]
stats_2020.to_csv('2020_stats.csv', index=False) 

stats_2021 = final_df.loc[final_df['Season'] == 2021]
stats_2021.to_csv('2021_stats.csv', index=False) 

stats_2022 = final_df.loc[final_df['Season'] == 2022]
stats_2022.to_csv('2022_stats.csv', index=False) 

stats_2023 = final_df.loc[final_df['Season'] == 2023]
stats_2023.to_csv('2023_stats.csv', index=False) 

stats_2024 = final_df.loc[final_df['Season'] == 2024]
stats_2024.to_csv('2024_stats.csv', index=False) 


In [None]:
df = final_df.copy()

flip = np.random.rand(len(df)) < 0.5

team1_cols = df.filter(regex='^team1_').copy()
team2_cols = df.filter(regex='^team2_').copy()

team1_flipped = team2_cols.copy()
team2_flipped = team1_cols.copy()

team1_cols.loc[flip] = team1_flipped.loc[flip].values
team2_cols.loc[flip] = team2_flipped.loc[flip].values


X = pd.concat([
    team1_cols, team2_cols
 ], axis=1)

X = X.drop(columns=['team1_TEAM NO', 'team1_TEAM ID', 'team2_TEAM NO', 'team2_TEAM ID'])

y = (~flip).astype(int)

feature_names = X.columns.to_numpy()

X


Unnamed: 0,team1_SEED,team1_BADJ EM,team1_BADJ O,team1_BADJ D,team1_BARTHAG,team1_GAMES,team1_W,team1_L,team1_WIN%,team1_EFG%,...,team2_BADJT RANK,team2_AVG HGT RANK,team2_EFF HGT RANK,team2_EXP RANK,team2_TALENT RANK,team2_FT% RANK,team2_OP FT% RANK,team2_PPPO RANK,team2_PPPD RANK,team2_ELITE SOS RANK
0,16.0,-7.0,92.1,99.1,0.301,23.0,9.0,14.0,39.130400,44.1,...,252.0,112.0,113.0,231.0,168.0,274.0,18.0,299.0,14.0,281.0
1,3.0,22.3,116.3,94.0,0.920,16.0,10.0,6.0,62.500000,54.3,...,79.0,339.0,331.0,26.0,268.0,67.0,269.0,11.0,194.0,260.0
2,12.0,21.1,110.4,89.3,0.920,15.0,11.0,4.0,73.333300,52.1,...,252.0,225.0,136.0,191.0,93.0,35.0,209.0,55.0,14.0,129.0
3,10.0,15.8,114.6,98.8,0.846,16.0,8.0,8.0,50.000000,51.1,...,22.0,93.0,117.0,224.0,97.0,1.0,263.0,1.0,77.0,97.0
4,16.0,-4.2,103.6,107.8,0.388,15.0,7.0,8.0,46.666700,50.3,...,203.0,70.0,23.0,269.0,9.0,125.0,45.0,13.0,20.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,11.0,12.7,113.8,101.1,0.796,19.0,10.0,9.0,52.631579,49.8,...,300.0,32.0,10.0,351.0,1.0,146.0,13.0,9.0,86.0,66.0
930,2.0,18.6,113.9,95.3,0.886,16.0,10.0,6.0,62.500000,49.3,...,227.0,43.0,1.0,225.0,49.0,109.0,174.0,18.0,94.0,4.0
931,1.0,31.2,128.5,97.3,0.961,18.0,15.0,3.0,83.333333,56.7,...,10.0,33.0,8.0,156.0,106.0,12.0,357.0,3.0,351.0,8.0
932,11.0,12.7,113.8,101.1,0.796,19.0,10.0,9.0,52.631579,49.8,...,227.0,43.0,1.0,225.0,49.0,109.0,174.0,18.0,94.0,4.0


In [None]:
import time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

lr = LogisticRegression(max_iter = 3000)
lr.fit(X_train, y_train)

l_train_acc = accuracy_score(y_train, lr.predict(X_train))
l_train_mcc = matthews_corrcoef(y_train, lr.predict(X_train))
l_train_f1 = f1_score(y_train, lr.predict(X_train))

l_test_acc = accuracy_score(y_test, lr.predict(X_test))
l_test_mcc = matthews_corrcoef(y_test, lr.predict(X_test))
l_test_f1 = f1_score(y_test, lr.predict(X_test))

print(f"Test Accuracy: {l_test_acc:.4f}")
print(f"MCC: {l_test_mcc:.4f}")
print(f"F1 Score: {l_test_f1:.4f}")

coefficients = lr.coef_[0]
feature_importance = pd.Series(coefficients, index=feature_names)
feature_importance = feature_importance.sort_values(key=np.abs, ascending=False)

print(feature_importance.head(10))


Test Accuracy: 0.6432
MCC: 0.2828
F1 Score: 0.6733


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
model = XGBClassifier(
    n_estimators=1000,      # number of trees, fewer to avoid overfitting small data
    max_depth=4,           # control model complexity, smaller depth to prevent overfitting
    learning_rate=0.1,     # moderate learning rate
    subsample=0.8,         # use 80% of data per tree (helps generalize)
    colsample_bytree=0.8,  # use 80% of features per tree (reduces overfitting)
    gamma=1,               # minimum loss reduction to make split (regularization)
    reg_lambda=1,          # L2 regularization term on weights
    random_state=42,
    eval_metric='logloss'
)

xg = model.fit(X_train, y_train)

xg_train_acc = accuracy_score(y_train, xg.predict(X_train))
xg_train_mcc = matthews_corrcoef(y_train, xg.predict(X_train))
xg_train_f1 = f1_score(y_train, xg.predict(X_train))

xg_test_acc = accuracy_score(y_test, xg.predict(X_test))
xg_test_mcc = matthews_corrcoef(y_test, xg.predict(X_test))
xg_test_f1 = f1_score(y_test, xg.predict(X_test))

print(f"Test Accuracy: {xg_test_acc:.4f}")
print(f"MCC: {xg_test_mcc:.4f}")
print(f"F1 Score: {xg_test_f1:.4f}")

Test Accuracy: 0.6378
MCC: 0.2721
F1 Score: 0.6825


In [68]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)


rf = rf_model.fit(X_train, y_train)

rf_train_acc = accuracy_score(y_train, rf.predict(X_train))
rf_train_mcc = matthews_corrcoef(y_train, rf.predict(X_train))
rf_train_f1 = f1_score(y_train, rf.predict(X_train))

rf_test_acc = accuracy_score(y_test, rf.predict(X_test))
rf_test_mcc = matthews_corrcoef(y_test, rf.predict(X_test))
rf_test_f1 = f1_score(y_test, rf.predict(X_test))

print(f"Accuracy: {rf_test_acc:.4f}")
print(f"MCC: {rf_test_mcc:.4f}")
print(f"F1 Score: {rf_test_f1:.4f}")

rf_importance = pd.Series(rf.feature_importances_, index=feature_names)
rf_importance = rf_importance.sort_values(ascending=False)

print("Top features (Random Forest):")
print(rf_importance.head(10))

Accuracy: 0.7081
MCC: 0.4163
F1 Score: 0.7429
Top features (Random Forest):
team2_WAB             0.021004
team2_BARTHAG         0.019853
team2_BADJ EM         0.018173
team2_BARTHAG RANK    0.016028
team2_BADJ EM RANK    0.015936
team2_TALENT          0.013194
team1_BARTHAG         0.012317
team2_SEED            0.012174
team2_ELITE SOS       0.011889
team1_BADJ EM RANK    0.011361
dtype: float64


In [None]:
lr_train = lr.predict_proba(X_train)[:, 1]
rf_train = rf.predict_proba(X_train)[:, 1]
xg_train = xg.predict_proba(X_train)[:, 1]

stack_X_train = np.column_stack((lr_train, rf_train, xg_train))

meta_model = LogisticRegression(max_iter=3000)
meta_model.fit(stack_X_train, y_train)


lr_test = lr.predict_proba(X_test)[:, 1]
rf_test = rf.predict_proba(X_test)[:, 1]
xg_test = xg.predict_proba(X_test)[:, 1]

stack_X_test = np.column_stack((lr_test, rf_test, xg_test))

final_preds = meta_model.predict(stack_X_test)

s_test_acc = accuracy_score(y_test, final_preds)
s_test_f1 = f1_score(y_test, final_preds)
s_test_mcc = matthews_corrcoef(y_test, final_preds)

print(f"Stacked Model Accuracy: {s_test_acc:.4f}")
print(f"Stacked Model F1 Score: {s_test_f1:.4f}")
print(f"Stacked Model MCC: {s_test_mcc:.4f}")

xg_importance = pd.Series(xg.feature_importances_, index=feature_names)
xg_importance = xg_importance.sort_values(ascending=False)

print("Top features (XGBoost):")
print(xg_importance.head(10))


Stacked Model Accuracy: 0.6649
Stacked Model F1 Score: 0.7048
Stacked Model MCC: 0.3275
Top features (XGBoost):
team2_WAB               0.018140
team2_SEED              0.017445
team2_BARTHAG           0.015060
team1_BARTHAG           0.013361
team1_BARTHAG RANK      0.012924
team1_ELITE SOS RANK    0.011911
team1_BADJ O RANK       0.009056
team1_WAB               0.008984
team2_TOV%              0.008949
team2_BADJ EM           0.008849
dtype: float32
