In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform


df_1 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\Shooting_Splits.csv')
df_2 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\Barttorvik_Away_Neutral.csv')
df_3 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\mmWinLoss.csv')
df_4 = pd.read_csv(r'C:\Users\Sweat\OneDrive\Desktop\march_madness_ai\data\teamID.csv')

In [92]:
#Choose preferred columns
df_3_col = [
    'Season', 'WTeamID' , 'LTeamID'
]

df_4_col = [
    'TeamID', 'TeamName'
]

df_splits = df_1.drop('CONF', axis=1)
df_stats = df_2.drop('ROUND', axis=1)
df_WL = df_3[df_3_col]
df_ID = df_4[df_4_col]

team_stats = pd.merge(df_splits, df_stats, on=['YEAR', 'TEAM'], how='inner')

#Match ID to team name
df_merged = df_WL.merge(df_ID, left_on='WTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'WTeamName'})

df_merged = df_merged.merge(df_ID, left_on='LTeamID', right_on='TeamID', how='left')
df_merged = df_merged.rename(columns={'TeamName':'LTeamName'})
df_merged = df_merged.drop(columns=['TeamID_x', 'TeamID_y', 'WTeamID', 'LTeamID'])

#Name mapping
stat_name = set(team_stats['TEAM'].unique())
record_name = set(df_merged['WTeamName'].unique()) | set(df_merged['LTeamName'].unique())
mismatch_name = stat_name - record_name
name_map = {
    'Abilene Christian' : 'Abilene Chr', 'Alabama St.' : 'Alabama St', 'Albany' : 'SUNY Albany', 'American' : 'American Univ', 'Appalachian St.' : 'Appalachian St',
    'Arizona St.' : 'Arizona St', 'Arkansas Pine Bluff' : 'Ark Pine Bluff', 'Boise St.' : 'Boise St', 'Boston University' : 'Boston Univ', 'Cal St. Bakersfield' : 'CS Bakersfield',
    'Cal St. Fullerton' : 'CS Fullerton', 'Cleveland St.' : 'Cleveland St', 'Coastal Carolina' : 'Coastal Car', 'College of Charleston' : 'Col Charleston', 'Colorado St.' : 'Colorado St',
    'East Tennessee St.' : 'ETSU', 'Eastern Kentucky' : 'E Kentucky', 'Eastern Washington' : 'E Washington', 'Fairleigh Dickinson' : 'F Dickinson', 'Florida Atlantic' : 'FL Atlantic', 
    'Florida Gulf Coast' : 'FGCU', 'Florida St.' : 'Florida St', 'Fresno St.' : 'Fresno St', 'George Washington' : 'G Washington', 'Georgia St.' : 'Georgia St', 'Grambling St.' : 'Grambling',
    'Indiana St.' : 'Indiana St', 'Iowa St.' : 'Iowa St', 'Jacksonville St.' : 'Jacksonville St', 'Kansas St.' : 'Kansas St', 'Kennesaw St.' : 'Kennesaw', 'Kent St.' : 'Kent', 'Little Rock' : 'Ark Little Rock',
    'Long Beach St.' : 'Long Beach St', 'Louisiana Lafayette' : 'Lafayette', 'Loyola Chicago' : 'Loyola-Chicago', 'McNeese St.' : 'McNeese St', 'Michigan St.' : 'Michigan St',
    'Middle Tennessee' : 'MTSU', 'Milwaukee' : 'WI Milwaukee', 'Mississippi St.' : 'Mississippi St', 'Mississippi Valley St.' : 'MS Valley St', 'Montana St.' : 'Montana St', 'Morehead St.' : 'Morehead St',
    'Morgan St.' : 'Morgan St', "Mount St. Mary's" : "Mt St Mary's", 'Murray St.' : 'Murray St', 'Nebraska Omaha' : 'NE Omaha', 'New Mexico St.' : 'New Mexico St', 'Norfolk St.' : 'Norfolk St', 
    'North Carolina Central' : 'NC Central', 'North Carolina A&T' : 'NC A&T', 'North Carolina St.' : 'NC State', 'Northern Colorado' : 'N colorado', 'Northern Kentucky' : 'N Kentucky',
    'Northwestern St.' : 'Northwestern LA', 'Ohio St.' : 'Ohio St', 'Oklahoma St.' : 'Oklahoma St', 'Oregon St.' : 'Oregon St', 'Penn St.' : 'Penn St', 'Prairie View A&M' : 'Prairie View',
    'SIU Edwardsville' : 'S Illinois', 'Saint Francis' : 'St Francis PA', "Saint Joseph's" : "St Joseph's PA", 'Saint Louis' : 'St Louis', "Saint Mary's" : "St Mary's CA", 
    "Saint Peter's" : "St Peter's", 'Sam Houston St.' : 'Sam Houston St', 'San Diego St.' : 'San Diego St', 'South Dakota St.' : 'S Dakota St', 'Southeast Missouri St.' : 'SE Missouri St',
    'Southern' : 'Southern Univ', 'St. Bonaventure' : 'St Bonaventure', "St. John's" : "St John's", 'Stephen F. Austin' : 'SF Austin', 'Texas A&M Corpus Chris' : 'TAM C. Christi',
    'Texas Southern' : 'TX Southern', 'UTSA' : 'UT San Antonio', 'Utah St.' : 'Utah St', 'Washington St.' : 'Washington St', 'Weber St.' : 'Weber St', 'Western Kentucky' : 'WKU',
    'Western Michigan' : 'W Michigan', 'Wichita St.' : 'Wichita St', 'Wright St.' : 'Wright St'
}
team_stats['TEAM'] = team_stats['TEAM'].replace(name_map)

w_stats = team_stats.rename(columns=lambda col: f'W_{col}' if col not in ['YEAR', 'TEAM'] else col)
l_stats = team_stats.rename(columns=lambda col: f'L_{col}' if col not in ['YEAR', 'TEAM'] else col)

#Table with individual game and team stats
df_merged = df_merged[df_merged['Season'] >= 2010]
df_merged = df_merged.merge(w_stats, left_on=['Season', 'WTeamName'], right_on=['YEAR', 'TEAM'], how='left')
df_merged = df_merged.merge(l_stats, left_on=['Season', 'LTeamName'], right_on=['YEAR', 'TEAM'], how='left')
final_df = df_merged.dropna()
final_df = final_df.drop(columns=['YEAR_x', 'TEAM_x', 'YEAR_y', 'TEAM_y'])
final_df.to_csv('completedata.csv')


In [93]:
df = final_df.copy()

flip = np.random.rand(len(df)) < 0.5


team1_stats = df.filter(regex='^W_').copy()
team2_stats = df.filter(regex='^L_').copy()

team1_stats.columns = team1_stats.columns.str.replace('^W_', 'team1_', regex=True)
team2_stats.columns = team2_stats.columns.str.replace('^L_', 'team2_', regex=True)

team1_stats_flipped = df.filter(regex='^L_').copy()
team2_stats_flipped = df.filter(regex='^W_').copy()

team1_stats_flipped.columns = team1_stats_flipped.columns.str.replace('^L_', 'team1_', regex=True)
team2_stats_flipped.columns = team2_stats_flipped.columns.str.replace('^W_', 'team2_', regex=True)

team1_stats[flip] = team1_stats_flipped[flip]
team2_stats[flip] = team2_stats_flipped[flip]

team1_stats.to_csv('team1_stats.csv')
team2_stats.to_csv('team2_stats.csv')

X = pd.concat([
    team1_stats, team2_stats
 ], axis=1)

y = (~flip).astype(int)

X = X.to_numpy()

In [91]:
import time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

l_model = LogisticRegression(max_iter = 10000)
l_model.fit(X_train, y_train)

train_acc = accuracy_score(y_train, l_model.predict(X_train))
test_acc = accuracy_score(y_test, l_model.predict(X_test))

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
probs = l_model.predict_proba(X_test)
probs *100
np.set_printoptions(suppress=True)
print(probs)

Train Accuracy: 0.8041
Test Accuracy: 0.7027
[[0.27523154 0.72476846]
 [0.58762173 0.41237827]
 [0.06695939 0.93304061]
 [0.1828965  0.8171035 ]
 [0.0192277  0.9807723 ]
 [0.74089322 0.25910678]
 [0.05813355 0.94186645]
 [0.93725493 0.06274507]
 [0.88210007 0.11789993]
 [0.35719444 0.64280556]
 [0.91856448 0.08143552]
 [0.09223102 0.90776898]
 [0.29443497 0.70556503]
 [0.52664961 0.47335039]
 [0.52944828 0.47055172]
 [0.23988703 0.76011297]
 [0.82218518 0.17781482]
 [0.00269871 0.99730129]
 [0.85867715 0.14132285]
 [0.75228606 0.24771394]
 [0.15060324 0.84939676]
 [0.07455806 0.92544194]
 [0.87352088 0.12647912]
 [0.02189957 0.97810043]
 [0.05691167 0.94308833]
 [0.03565338 0.96434662]
 [0.51453035 0.48546965]
 [0.34941583 0.65058417]
 [0.53280475 0.46719525]
 [0.14364867 0.85635133]
 [0.9720866  0.0279134 ]
 [0.17494267 0.82505733]
 [0.38219689 0.61780311]
 [0.9680358  0.0319642 ]
 [0.69685793 0.30314207]
 [0.97555673 0.02444327]
 [0.11179451 0.88820549]
 [0.00161133 0.99838867]
 [0.9

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=10000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [88]:
start = time.time()

model = XGBClassifier(
    n_estimators=1000,      # number of trees, fewer to avoid overfitting small data
    max_depth=4,           # control model complexity, smaller depth to prevent overfitting
    learning_rate=0.1,     # moderate learning rate
    subsample=0.8,         # use 80% of data per tree (helps generalize)
    colsample_bytree=0.8,  # use 80% of features per tree (reduces overfitting)
    gamma=1,               # minimum loss reduction to make split (regularization)
    reg_lambda=1,          # L2 regularization term on weights
    random_state=42,
    eval_metric='logloss'
)

# Fit the model
model.fit(X_train, y_train)

# Evaluate accuracy
train_acc = accuracy_score(y_train, model.predict(X_train))
test_acc = accuracy_score(y_test, model.predict(X_test))

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

end = time.time()
print(f"Time taken: {(end - start)/60:.2f} minutes")

Train Accuracy: 1.0000
Test Accuracy: 0.7243
Time taken: 0.01 minutes
