In [None]:
import glob
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import catboost as cb
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, brier_score_loss, mean_squared_error, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Glob pattern for the competition CSVs. The Kaggle input mount is used when it
# is present; otherwise the local ./data copy is used. (Previously the Kaggle
# path was assigned and then unconditionally overwritten.)
KAGGLE_DATA_GLOB = '/kaggle/input/march-machine-learning-mania-2025/**'
LOCAL_DATA_GLOB = 'data/**'
data_dir = KAGGLE_DATA_GLOB if glob.glob(KAGGLE_DATA_GLOB) else LOCAL_DATA_GLOB

In [None]:
class TournamentPredictor:
    def __init__(self, data_dir):
        """Create an empty predictor; ``load_data()`` fills in the data attributes.

        Args:
            data_dir: Glob pattern that ``load_data`` passes to ``glob.glob``
                to locate the competition CSV files.
        """
        self.data_path = data_dir
        self.data = None   # dict of raw DataFrames keyed by file stem (set by load_data)
        self.teams = None  # teams table with spelling counts (set by load_data)
        self.seeds = None  # "<Season>_<TeamID>" -> numeric seed (set by load_data)
        self.games = None  # per-game training frame (set by load_data)
        self.sub = None    # submission frame with features (set by load_data)
        self.gb = None     # not referenced by any method of this class
        self.col = None    # feature column names (set by load_data)
        # Attributes that other methods assign or read; declared here so that
        # every instance attribute is visible in one place.
        self.teamsResults = None       # per-team lagged features (set by load_data)
        self.featPerTeam = None        # column names of teamsResults (set by getTeamStats)
        self.model = None              # fitted estimator read by predict_submission
        self.calibration_model = None  # optional calibrator read by predict_submission
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()

    def load_data(self):
        files = glob.glob(self.data_path)
        self.data = {p.split('/')[-1].split('.')[0]: pd.read_csv(p, encoding='latin-1') for p in files}

        teams = pd.concat([self.data['MTeams'], self.data['WTeams']])
        teams_spelling = pd.concat([self.data['MTeamSpellings'], self.data['WTeamSpellings']])
        teams_spelling = teams_spelling.groupby(by='TeamID', as_index=False)['TeamNameSpelling'].count()
        teams_spelling.columns = ['TeamID', 'TeamNameCount']
        self.teams = pd.merge(teams, teams_spelling, how='left', on=['TeamID'])

        season_cresults = pd.concat([self.data['MRegularSeasonCompactResults'], self.data['WRegularSeasonCompactResults']])
        season_dresults = pd.concat([self.data['MRegularSeasonDetailedResults'], self.data['WRegularSeasonDetailedResults']])
        tourney_cresults = pd.concat([self.data['MNCAATourneyCompactResults'], self.data['WNCAATourneyCompactResults']])
    
        tourney_dresults = pd.concat([self.data['MNCAATourneyDetailedResults'], self.data['WNCAATourneyDetailedResults']])

        seeds_df = pd.concat([self.data['MNCAATourneySeeds'], self.data['WNCAATourneySeeds']])
        self.seeds = {'_'.join(map(str, [int(k1), k2])): int(v[1:3]) for k1, v, k2 in seeds_df[['Season', 'Seed', 'TeamID']].values}

        

        season_cresults['ST'] = 'S'
        season_dresults['ST'] = 'S'
        tourney_cresults['ST'] = 'T'
        tourney_dresults['ST'] = 'T'

        self.games = pd.concat((season_dresults, tourney_dresults), axis=0, ignore_index=True)
        self.games['Team1'] = self.games.apply(lambda r: sorted([r['WTeamID'], r['LTeamID']])[0], axis=1)
        self.games['Team2'] = self.games.apply(lambda r: sorted([r['WTeamID'], r['LTeamID']])[1], axis=1)
        self.games['Pred'] = self.games.apply(lambda r: 1.0 if sorted([r['WTeamID'], r['LTeamID']])[0] == r['WTeamID'] else 0.0, axis=1)
        #restrict to only tournament data (faster & better) // edit: wrong bc we only predict like 4 games
        self.teamsResults = self.getTeamStats(self.games)


        # merge games with team results
        self.games = self.games[['Season', 'DayNum', 'Team1', 'Team2','Pred']]
        self.games = pd.merge(self.games, self.teamsResults, how='left', left_on=['Season', 'DayNum', 'Team1'], right_on=['Season','DayNum','TeamID'], suffixes=('', '_1'))
        self.games = pd.merge(self.games, self.teamsResults, how='left', left_on=['Season', 'DayNum', 'Team2'], right_on=['Season','DayNum','TeamID'], suffixes=('', '_2'))
        self.col = self.games.drop(['Pred'], axis=1).columns
        

        self.sub = self.data['SampleSubmissionStage1']
        self.sub['Season'] = self.sub['ID'].map(lambda x: x.split('_')[0]).astype(int)
        self.sub['Team1'] = self.sub['ID'].map(lambda x: x.split('_')[1]).astype(int)
        self.sub['Team2'] = self.sub['ID'].map(lambda x: x.split('_')[2]).astype(int)

        last_day = self.teamsResults.groupby(['Season', 'TeamID'])['DayNum'].max().reset_index()
        last_day = last_day.rename(columns={'DayNum': 'LastDayNum'})
        # Merge last day for Team1
        self.sub = pd.merge(self.sub, last_day, how='left', left_on=['Season', 'Team1'], right_on=['Season', 'TeamID'])
        self.sub = self.sub.rename(columns={'LastDayNum': 'LastDayNum1'})
        self.sub = self.sub.drop('TeamID', axis=1)

        # Merge last day for Team2
        self.sub = pd.merge(self.sub, last_day, how='left', left_on=['Season', 'Team2'], right_on=['Season', 'TeamID'])
        self.sub = self.sub.rename(columns={'LastDayNum': 'LastDayNum2'})
        self.sub = self.sub.drop('TeamID', axis=1)

        self.sub = pd.merge(self.sub, self.teamsResults, how='left', left_on=['Season', 'LastDayNum1', 'Team1'], right_on=['Season','DayNum','TeamID'], suffixes=('', '_1'))
        self.sub = pd.merge(self.sub, self.teamsResults, how='left', left_on=['Season', 'LastDayNum2', 'Team2'], right_on=['Season','DayNum','TeamID'], suffixes=('', '_2'))
    
   

    def getTeamStats(self, df):
        def calculate_perc(made, att):
            if att == 0:
                return 0.0  # Avoid division by zero
            return np.round(made / att, 3)
        
        winF =  ['WTeamID','LTeamID','Season','DayNum','WScore','LScore','WLoc','NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF','LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
        loseF = ['LTeamID','WTeamID','Season','DayNum','LScore','WScore','WLoc','NumOT', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF','WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']

        winners = df[winF]
        losers = df[loseF]
        teamF =     ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast', 'TO', 'Stl', 'Blk', 'PF']
        opponentF = ['OFGM', 'OFGA', 'OFGM3', 'OFGA3', 'OFTM', 'OFTA', 'OOR', 'ODR', 'OAst', 'OTO', 'OStl', 'OBlk', 'OPF']
        winners.columns = ['TeamID','OpponentID','Season','DayNum','Score','OppScore','WLoc','NumOT'] + teamF + opponentF
        losers.columns =  ['TeamID','OpponentID','Season','DayNum','Score','OppScore','WLoc','NumOT'] + teamF + opponentF

        winners['Pred'] = 1
        losers['Pred'] = 0

        # 1 = away, 2 = home, 3 = neutral
        winners['WLoc'] = winners['WLoc'].map({'A': -1, 'H': 1, 'N': 0}) #WLoc is winner location, looser is opposite
        losers['WLoc'] = losers['WLoc'].map({'A': 1, 'H': -1, 'N': 0})
        teamF.append('WLoc')

        concatDf = pd.concat((winners, losers), axis=0, ignore_index=True)
        concatDf['Margin'] = concatDf['Score'] - concatDf['OppScore']
        concatDf['NumOT'] = concatDf['NumOT'] > 0
        teamF.append('NumOT')

        # calculate features
        concatDf['FGP2'] = concatDf.apply(lambda row: calculate_perc(row['FGM']-row['FGM3'], row['FGA']-row['FGA3']), axis=1) 
        concatDf['FGP'] = concatDf.apply(lambda row: calculate_perc(row['FGM'], row['FGA']), axis=1)  #field goals made
        concatDf['FGP3'] = concatDf.apply(lambda row: calculate_perc(row['FGM3'], row['FGA3']), axis=1)
        concatDf['FTP'] = concatDf.apply(lambda row: calculate_perc(row['FTM'], row['FTA']), axis=1)
        teamF += ['FGP2', 'FGP', 'FGP3', 'FTP']

        concatDf['OFGP2'] = concatDf.apply(lambda row: calculate_perc(row['OFGM']-row['OFGM3'], row['OFGA']-row['OFGA3']), axis=1) 
        concatDf['OFGP'] = concatDf.apply(lambda row: calculate_perc(row['OFGM'], row['OFGA']), axis=1)  #field goals made
        concatDf['OFGP3'] = concatDf.apply(lambda row: calculate_perc(row['FGM3'], row['OFGA3']), axis=1)
        concatDf['OFTP'] = concatDf.apply(lambda row: calculate_perc(row['OFTM'], row['OFTA']), axis=1)
        opponentF+=['OFGP2', 'OFGP', 'OFGP3', 'OFTP']

        # diff features
        concatDf['FGP2_Diff'] = concatDf['FGP2'] - concatDf['OFGP2']
        concatDf['FGP_Diff'] = concatDf['FGP'] - concatDf['OFGP']
        concatDf['FGP3_Diff'] = concatDf['FGP3'] - concatDf['OFGP3']
        concatDf['FTP_Diff'] = concatDf['FTP'] - concatDf['OFTP']

        concatDf['OR_Diff'] = concatDf['OR'] - concatDf['OOR']
        concatDf['DR_Diff'] = concatDf['DR'] - concatDf['ODR']
        concatDf['Ast_Diff'] = concatDf['Ast'] - concatDf['OAst']
        concatDf['TO_Diff'] = concatDf['TO'] - concatDf['OTO']
        concatDf['Stl_Diff'] = concatDf['Stl'] - concatDf['OStl']
        concatDf['Blk_Diff'] = concatDf['Blk'] - concatDf['OBlk']
        concatDf['PF_Diff'] = concatDf['PF'] - concatDf['OPF']
        diffF = ['Margin','FGP2_Diff', 'FGP_Diff', 'FGP3_Diff', 'FTP_Diff', 'OR_Diff', 'DR_Diff', 'Ast_Diff', 'TO_Diff', 'Stl_Diff', 'Blk_Diff', 'PF_Diff']

        # seeds
        concatDf['IDTeam'] = concatDf.apply(lambda r: '_'.join(map(str, [r['Season'], r['TeamID']])), axis=1)
        concatDf['IDTeamOpp'] = concatDf.apply(lambda r: '_'.join(map(str, [r['Season'], r['OpponentID']])), axis=1)
        concatDf['TeamSeed'] = concatDf['IDTeam'].map(self.seeds).fillna(0)
        concatDf['Opp2Seed'] = concatDf['IDTeamOpp'].map(self.seeds).fillna(0)
        concatDf['SeedDiff'] = concatDf['TeamSeed'] - concatDf['Opp2Seed']
        diffF.append('SeedDiff')
        teamF.append('TeamSeed')
        opponentF.append('Opp2Seed')
        concatDf.drop(['IDTeam', 'IDTeamOpp'], axis=1, inplace=True)

        gameF = teamF + opponentF + diffF

        # create lag features
        # log: wloc, 
        concatDf = concatDf.sort_values(['Season', 'TeamID', 'DayNum'])
        concatDf['cum_wins'] = concatDf.groupby(['Season', 'TeamID'])['Pred'].apply(lambda x: x.cumsum().shift(1)).reset_index(level=[0, 1], drop=True).fillna(0)
        concatDf['cum_games'] = concatDf.groupby(['Season', 'TeamID'])['DayNum'].cumcount().fillna(1)#.shift(1) + 1
        concatDf['win_pct'] = (concatDf['cum_wins'] / concatDf['cum_games']).fillna(0)
        
        meanF = [ 'cum_wins', 'cum_games', 'win_pct'] # season stats
        for f in ['Pred']+teamF+diffF:
            print(f)
            for i in [3]:#,5,10]:
                concatDf, mF = self.getLastNGamesAverage(concatDf, i, f)
                meanF.append(mF)
            for i in [3]:#,5,10]:
                concatDf, mF = self.getLastNGamesStd(concatDf, i, f)
                meanF.append(mF)
            #for i in [3,5,10]:
            #    concatDf, mF = self.getLastNGamesMax(concatDf, i, f)
            #    meanF.append(mF)

        self.featPerTeam = meanF + ['TeamID', 'Season', 'DayNum','TeamSeed'] 

        return concatDf[self.featPerTeam]
    
    def getLastNGamesAverage(self, df, n, f):
            df[f+'_last_'+str(n)+'_games_avg'] = df.groupby(['Season', 'TeamID'])[f].apply(lambda x: x.rolling(n, min_periods=1).mean().shift(1)).reset_index(level=[0, 1], drop=True).fillna(0)
            return df, f+'_last_'+str(n)+'_games_avg'
    def getLastNGamesStd(self, df, n, f):
            df[f+'_last_'+str(n)+'_games_std'] = df.groupby(['Season', 'TeamID'])[f].apply(lambda x: x.rolling(n, min_periods=1).std().shift(1)).reset_index(level=[0, 1], drop=True).fillna(0)
            return df, f+'_last_'+str(n)+'_games_std'
    def getLastNGamesMin(self, df, n, f):
            df[f+'_last_'+str(n)+'_games_min'] = df.groupby(['Season', 'TeamID'])[f].apply(lambda x: x.rolling(n, min_periods=1).min().shift(1)).reset_index(level=[0, 1], drop=True).fillna(0)
            return df, f+'_last_'+str(n)+'_games_min'
    def getLastNGamesMax(self, df, n, f):
            df[f+'_last_'+str(n)+'_games_max'] = df.groupby(['Season', 'TeamID'])[f].apply(lambda x: x.rolling(n, min_periods=1).max().shift(1)).reset_index(level=[0, 1], drop=True).fillna(0)
            return df, f+'_last_'+str(n)+'_games_max'


    def get_xbg(self):
        """Return a new, unfitted ``XGBRegressor`` with the baseline settings."""
        baseline_params = {
            'max_depth': 5,
            'colsample_bytree': 0.5,
            'subsample': 0.8,
            'n_estimators': 3000,
            'learning_rate': 0.1,
            'early_stopping_rounds': 25,
            'objective': 'reg:logistic',
            'enable_categorical': True,
            'min_child_weight': 5,
            # 'eval_metric': "rmse" was left disabled by the author
        }
        return XGBRegressor(**baseline_params)
    def scaled_data(self, input_data):
        X_scaled = self.scaler.fit_transform(input_data)
        return X_scaled
    def impute_data(self, input_data):
        X_imputed = self.imputer.fit_transform(input_data)
        return X_imputed

    def train_model(self, model, df):
        """Fit ``model`` on a random 80/20 split of ``df`` and report diagnostics.

        The features are ``df[self.col]`` and the target is ``df['Pred']``.
        After fitting, log loss, Brier score and MSE are printed for both
        parts of the split, and the ROC curve, feature importances,
        calibration curve and prediction histogram are plotted for the
        held-out part. The fitted estimator is stored in ``self.model`` so
        that ``predict_submission`` can use it.

        Args:
            model: Estimator whose ``fit`` accepts ``eval_set`` and ``verbose``
                and which exposes ``predict`` and ``feature_importances_``.
            df: Frame containing the columns in ``self.col`` plus ``Pred``.
        """
        X = df[self.col].reset_index(drop=True)
        y = df['Pred'].reset_index(drop=True)  # keep y's index aligned with X

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], verbose=100)
        self.model = model  # predict_submission reads the fitted estimator from here

        # Clip so that log loss stays finite for extreme predictions.
        train_preds = model.predict(X_train).clip(0.001, 0.999)
        test_preds = model.predict(X_test).clip(0.001, 0.999)

        print(f'Log Loss (Train/Test): {log_loss(y_train, train_preds):.4f}, {log_loss(y_test, test_preds):.4f}')
        print(f'Brier Score (Train/Test): {brier_score_loss(y_train, train_preds):.4f}, {brier_score_loss(y_test, test_preds):.4f}')
        print(f'MSE (TrainTest): {mean_squared_error(y_train, train_preds):.4f}, {mean_squared_error(y_test, test_preds):.4f}')

        # ROC curve on the held-out 20 %.
        self.plot_roc_curve(y_test, test_preds, "Calibration Set ROC Curve")

        self.plot_feature_importance(model.feature_importances_, self.col)

        self.plot_calibration_curve(y_test, test_preds)

        # Histogram of the held-out *predictions* (the true labels were
        # passed here before, which only ever showed two spikes at 0 and 1).
        self.plot_prediction_distribution(test_preds, "Distribution of Test Predictions")

    def train_model_cv(self, model, df):
        """Run shuffled 5-fold cross-validation of ``model`` on ``df``.

        For every fold a fresh clone of ``model`` is fitted on the training
        part, with the held-out part passed as ``eval_set``. The mean MSE and
        log loss over the folds are printed as "train,held-out" pairs,
        followed by the list of per-fold held-out MSE values.

        Args:
            model: Unfitted estimator compatible with ``sklearn.base.clone``
                whose ``fit`` accepts ``eval_set`` and ``verbose``.
            df: Frame containing the columns in ``self.col`` plus ``Pred``.
        """
        features = df[self.col].reset_index(drop=True)
        target = df['Pred'].reset_index(drop=True)

        cv_mse_scores, cv_logloss_scores = [], []
        cv_test_mse_scores, cv_test_logloss_scores = [], []

        splitter = KFold(n_splits=5, shuffle=True, random_state=42)
        for fit_idx, held_idx in splitter.split(features):
            X_fit, y_fit = features.iloc[fit_idx], target.iloc[fit_idx]
            X_held, y_held = features.iloc[held_idx], target.iloc[held_idx]

            # A clone per fold so that no state carries over between folds.
            fold_model = clone(model)
            fold_model.fit(X_fit, y_fit, eval_set=[(X_held, y_held)], verbose=100)

            fit_preds = fold_model.predict(X_fit).clip(0.001, 0.999)
            held_preds = fold_model.predict(X_held).clip(0.001, 0.999)

            cv_mse_scores.append(mean_squared_error(y_fit, fit_preds))
            cv_logloss_scores.append(log_loss(y_fit, fit_preds))
            cv_test_mse_scores.append(mean_squared_error(y_held, held_preds))
            cv_test_logloss_scores.append(log_loss(y_held, held_preds))

        print(f'Cross-validated MSE: {np.mean(cv_mse_scores):.4f},{np.mean(cv_test_mse_scores):.4f}')
        print(f'Cross-validated LogLoss: {np.mean(cv_logloss_scores):.4f},{np.mean(cv_test_logloss_scores):.4f}')
        print("Test cv array: ",cv_test_mse_scores)

    def predict_submission(self, output_file='submission.csv'):
        sub_X = self.sub[self.col].fillna(-1)
        sub_X_imputed = self.imputer.transform(sub_X)
        sub_X_scaled = self.scaler.transform(sub_X_imputed)

        preds = self.model.predict(sub_X_scaled).clip(0.001, 0.999)
        preds_calibrated = self.calibration_model.predict(preds.reshape(-1, 1)).clip(0.001, 0.999)

        self.sub['Pred'] = preds_calibrated
        self.sub[['ID', 'Pred']].to_csv(output_file, index=False)
        print(f"Submission file saved to {output_file}")

    def plot_feature_importance(self, importances, feature_names, top_n=20):
        """Show a horizontal bar chart of the ``top_n`` largest importances.

        Args:
            importances: Importance value per feature.
            feature_names: Feature names, in the same order as ``importances``.
            top_n: Number of features to display.
        """
        ranked = (
            pd.DataFrame({'feature': feature_names, 'importance': importances})
            .sort_values('importance', ascending=False)
            .head(top_n)
        )

        plt.figure(figsize=(10, 6))
        sns.barplot(data=ranked, x='importance', y='feature', palette='viridis')
        plt.title('Top {} Feature Importances'.format(top_n))
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()

    def plot_calibration_curve(self, y_true, y_proba, n_bins=10):
        """Plot the observed positive rate per equal-width probability bin.

        Predictions are grouped into ``n_bins`` equal-width bins on [0, 1];
        for every non-empty bin the fraction of positive labels is drawn at
        the bin midpoint, next to the diagonal of perfect calibration.

        Args:
            y_true: Binary labels (0/1).
            y_proba: Predicted probabilities, same length as ``y_true``.
            n_bins: Number of equal-width bins.
        """
        probas = np.asarray(y_proba, dtype=float)
        labels = np.asarray(y_true, dtype=float)

        bins = np.linspace(0, 1, n_bins + 1)
        bin_midpoints = bins[:-1] + (bins[1] - bins[0]) / 2
        # digitize() sends p == 1.0 to index n_bins (one past the last bin),
        # which used to make the arrays one element longer than the
        # midpoints; clipping folds such values into the outermost bins.
        bin_assignments = np.clip(np.digitize(probas, bins) - 1, 0, n_bins - 1)

        bin_true = np.bincount(bin_assignments, weights=labels, minlength=n_bins)
        bin_total = np.bincount(bin_assignments, minlength=n_bins)

        # Only bins that actually contain predictions are drawn; an empty bin
        # has no positive rate and must not show up as 0 %.
        populated = bin_total > 0
        fraction_of_positives = bin_true[populated] / bin_total[populated]

        plt.figure(figsize=(8, 6))
        plt.plot(bin_midpoints[populated], fraction_of_positives, marker='o', label='Calibration Curve')
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')

        plt.xlabel('Predicted Probability')
        plt.ylabel('Fraction of Positives')
        plt.title('Calibration Curve')
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.legend()
        plt.tight_layout()
        plt.show()

    def plot_prediction_distribution(self, predictions, title="Distribution of Predictions"):
        """Show a histogram with a KDE overlay of ``predictions``."""
        plt.figure(figsize=(8, 6))
        ax = sns.histplot(predictions, kde=True, color='skyblue')
        ax.set_title(title)
        ax.set_xlabel('Predicted Probability')
        ax.set_ylabel('Frequency')
        plt.tight_layout()
        plt.show()

    def plot_roc_curve(self, y_true, y_proba, title="ROC Curve"):
        """Draw the ROC curve of ``y_proba`` against ``y_true``; the AUC is shown in the legend."""
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_proba)
        area = auc(false_pos_rate, true_pos_rate)

        _, ax = plt.subplots(figsize=(8, 6))
        ax.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(area))
        # Diagonal = performance of a random classifier.
        ax.plot([0, 1], [0, 1], color='gray', linestyle='--')
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title(title)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.legend(loc="lower right")
        plt.tight_layout()
        plt.show()

    def train_model_base_xgboost(self):
        model = self.get_xbg()
        self.train_model_cv(model, self.games.fillna(-1))
        self.train_model(model, self.games.fillna(-1))


In [None]:
# Build the predictor and run the whole loading + feature-engineering step;
# afterwards predictor.games, predictor.sub and predictor.teamsResults are set.
predictor = TournamentPredictor(data_dir)
predictor.load_data()

In [None]:
""" 
lags only 3
Cross-validated MSE: 0.1708,0.1827
Cross-validated LogLoss: 0.5104,0.5394
Test cv array:  [0.18181727767832848, 0.18038723020479824, 0.1835351904420142, 0.18512420426450932, 0.18283504670415224]

TODO: test difference features (team 1 minus team 2) instead of separate feat_1 / feat_2 columns

"""

In [None]:
# Experiment settings: same as TournamentPredictor.get_xbg() except for
# early_stopping_rounds (50) and min_child_weight (30).
xgb_params = {
    'max_depth': 5,
    'colsample_bytree': 0.5,
    'subsample': 0.8,
    'n_estimators': 3000,
    'learning_rate': 0.1,
    'early_stopping_rounds': 50,
    'objective': 'reg:logistic',
    'enable_categorical': True,
    'min_child_weight': 30,
    # 'eval_metric': "rmse" was left disabled by the author
}
xgb = XGBRegressor(**xgb_params)

# Missing lag features are encoded as -1 for both runs.
data = predictor.games.fillna(-1)
predictor.train_model_cv(xgb, data)
predictor.train_model(xgb, data)

In [None]:
# Inspect the submission frame built by load_data().
predictor.sub

In [None]:
# Inspect the per-game training frame built by load_data().
predictor.games

In [None]:
# Short alias for the per-team feature table, used by the exploratory cells below.
df = predictor.teamsResults

In [None]:
# Number of team-game rows per DayNum.
df.DayNum.value_counts()

In [None]:
# Show every column, then list team 1103's 2024 rows in chronological order.
pd.set_option('display.max_columns', 1000)
is_team_1103_in_2024 = (df['TeamID'] == 1103) & (df['Season'] == 2024)
df.loc[is_team_1103_in_2024].sort_values(by='DayNum')

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# teamsResults only keeps the lagged feature columns, so a raw 'Margin'
# column does not exist there and df['Margin'] raised a KeyError. The lagged
# 3-game average margin is used instead; for the autocorrelation of raw
# per-game margins, compute them from the detailed results in predictor.data.
margin_history = (
    df.loc[(df['TeamID'] == 1104) & (df['Season'] == 2024)]
    .sort_values(by='DayNum')['Margin_last_3_games_avg']
)
plot_acf(margin_history)
plot_pacf(margin_history);