In [None]:
import glob
import inspect
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, brier_score_loss, mean_squared_error, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import clone
from xgboost import XGBRegressor, XGBClassifier
import lightgbm as lgb
import catboost as cb

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Candidate glob patterns for the competition CSVs, in order of preference.
KAGGLE_DATA_GLOB = '/kaggle/input/march-machine-learning-mania-2025/**'
LOCAL_DATA_GLOB = 'data/**'

# Use the Kaggle dataset when that pattern matches something; otherwise fall
# back to the local copy. (Previously the Kaggle path was assigned and then
# unconditionally overwritten, so it could never take effect.)
data_dir = KAGGLE_DATA_GLOB if glob.glob(KAGGLE_DATA_GLOB) else LOCAL_DATA_GLOB

In [None]:
class TournamentPredictor:
    """Builds matchup features from the competition CSVs and trains/evaluates models.

    Typical use: ``load_data()`` to populate ``games`` (training rows), ``sub``
    (submission rows) and ``col`` (feature column names), then one of the
    ``train_model*`` methods, then ``predict_submission()``.

    The target ``Pred`` is 1.0 when the lower-numbered team (``Team1``) won.
    """

    # Per-game box-score columns of the winning / losing side in the detailed results.
    WIN_STATS = ['WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']
    LOSE_STATS = ['LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF']
    # The same statistics re-oriented to Team1 (lower id) / Team2 (higher id).
    TEAM1_STATS = ['FGM1', 'FGA1', 'FGM31', 'FGA31', 'FTM1', 'FTA1', 'OR1', 'DR1', 'Ast1', 'TO1', 'Stl1', 'Blk1', 'PF1']
    TEAM2_STATS = ['FGM2', 'FGA2', 'FGM32', 'FGA32', 'FTM2', 'FTA2', 'OR2', 'DR2', 'Ast2', 'TO2', 'Stl2', 'Blk2', 'PF2']
    # Numeric encoding of the winner's location used throughout the class.
    WLOC_CODES = {'A': 1, 'H': 2, 'N': 3}

    def __init__(self, data_dir):
        """Store the glob pattern for the data files; nothing is read until ``load_data``."""
        self.data_path = data_dir
        self.data = None
        self.teams = None
        self.seeds = None
        self.games = None
        self.sub = None
        self.gb = None
        self.col = None
        self.imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()
        # Set by train_model(); read by predict_submission().
        self.model = None
        # Optional post-hoc calibrator; predict_submission() applies it only when set.
        self.calibration_model = None

    def load_data(self):
        """Read every CSV matched by ``self.data_path`` and build the modelling frames.

        Populates ``data``, ``teams``, ``seeds``, ``games``, ``gb``, ``sub`` and ``col``.

        Raises:
            FileNotFoundError: if the glob pattern matches no CSV file.
            KeyError: if one of the expected competition tables is missing.
        """
        # Only real CSV files: the glob may also match directories or other files.
        files = [p for p in glob.glob(self.data_path)
                 if os.path.isfile(p) and p.lower().endswith('.csv')]
        if not files:
            raise FileNotFoundError(f"No CSV files matched {self.data_path!r}")
        # os.path.basename handles both '/' and '\\' separators.
        self.data = {os.path.basename(p).split('.')[0]: pd.read_csv(p, encoding='latin-1') for p in files}

        teams = pd.concat([self.data['MTeams'], self.data['WTeams']])
        teams_spelling = pd.concat([self.data['MTeamSpellings'], self.data['WTeamSpellings']])
        teams_spelling = teams_spelling.groupby(by='TeamID', as_index=False)['TeamNameSpelling'].count()
        teams_spelling.columns = ['TeamID', 'TeamNameCount']
        self.teams = pd.merge(teams, teams_spelling, how='left', on=['TeamID'])

        season_dresults = pd.concat([self.data['MRegularSeasonDetailedResults'], self.data['WRegularSeasonDetailedResults']])
        tourney_dresults = pd.concat([self.data['MNCAATourneyDetailedResults'], self.data['WNCAATourneyDetailedResults']])
        season_dresults['ST'] = 'S'
        tourney_dresults['ST'] = 'T'

        # Seed lookup keyed by "<Season>_<TeamID>"; characters 1-2 of the seed string are the number.
        seeds_df = pd.concat([self.data['MNCAATourneySeeds'], self.data['WNCAATourneySeeds']])
        self.seeds = {
            f'{int(season)}_{team_id}': int(seed[1:3])
            for season, seed, team_id in seeds_df[['Season', 'Seed', 'TeamID']].values
        }

        # Copy so that preparing the submission frame never mutates the raw table.
        self.sub = self.data['SampleSubmissionStage1'].copy()

        games = pd.concat((season_dresults, tourney_dresults), axis=0, ignore_index=True)
        # restrict to only tournament data (faster & better); .copy() avoids writing to a slice
        games = games[games['ST'] == 'T'].copy()
        games['WLoc'] = games['WLoc'].map(self.WLOC_CODES)

        winner, loser = games['WTeamID'], games['LTeamID']
        team1 = np.minimum(winner, loser)
        team2 = np.maximum(winner, loser)
        season_s, team1_s, team2_s = games['Season'].astype(str), team1.astype(str), team2.astype(str)

        games['ID'] = season_s + '_' + team1_s + '_' + team2_s
        games['IDTeams'] = team1_s + '_' + team2_s
        games['Team1'] = team1
        games['Team2'] = team2
        games['IDTeam1'] = season_s + '_' + team1_s
        games['IDTeam2'] = season_s + '_' + team2_s
        games['Team1Seed'] = games['IDTeam1'].map(self.seeds).fillna(0)
        games['Team2Seed'] = games['IDTeam2'].map(self.seeds).fillna(0)
        games['ScoreDiff'] = games['WScore'] - games['LScore']
        # team 1 won == (pred = 1)
        games['Pred'] = (winner <= loser).astype(float)
        # Score difference from Team1's point of view.
        games['ScoreDiffNorm'] = np.where(games['Pred'] == 0.0, -games['ScoreDiff'], games['ScoreDiff'])
        games['SeedDiff'] = games['Team1Seed'] - games['Team2Seed']

        # Re-orient the winner/loser box scores to Team1/Team2.
        stat_cols = self.TEAM1_STATS + self.TEAM2_STATS
        games[stat_cols] = games.apply(self.sortStatsByWinTeam, axis=1, result_type='expand')

        # momentum as feature (win loss streaks / wins in a row, wins in last 5 games)
        # we would use the latest data for a team to calculate the momentum
        games = self.create_features(games)

        games = games.drop(columns=['NumOT'] + self.WIN_STATS + self.LOSE_STATS)

        # Aggregate the oriented box-score stats over every meeting of the same pair of teams.
        # NOTE(review): these aggregates include the very games the model is trained and
        # validated on, so validation scores may be optimistic — confirm before relying on them.
        c_score_col = stat_cols
        c_score_agg = ['sum', 'mean', 'median', 'max', 'min', 'std', 'skew', 'nunique']
        self.gb = games.groupby(by=['IDTeams']).agg({k: c_score_agg for k in c_score_col}).reset_index()
        self.gb.columns = [''.join(c) + '_c_score' for c in self.gb.columns]

        id_parts = self.sub['ID'].str.split('_', expand=True)
        self.sub['WLoc'] = self.WLOC_CODES['N']
        self.sub['Season'] = id_parts[0].astype(int)
        self.sub['Team1'] = id_parts[1]
        self.sub['Team2'] = id_parts[2]
        sub_season_s = self.sub['Season'].astype(str)
        self.sub['IDTeams'] = self.sub['Team1'] + '_' + self.sub['Team2']
        self.sub['IDTeam1'] = sub_season_s + '_' + self.sub['Team1']
        self.sub['IDTeam2'] = sub_season_s + '_' + self.sub['Team2']
        self.sub['Team1Seed'] = self.sub['IDTeam1'].map(self.seeds).fillna(0)
        self.sub['Team2Seed'] = self.sub['IDTeam2'].map(self.seeds).fillna(0)
        self.sub['SeedDiff'] = self.sub['Team1Seed'] - self.sub['Team2Seed']
        self.sub = self.sub.fillna(-1)

        self.games = pd.merge(games, self.gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score')
        self.sub = pd.merge(self.sub, self.gb, how='left', left_on='IDTeams', right_on='IDTeams_c_score')

        exclude_cols = ['ID', 'DayNum', 'ST', 'Team1', 'Team2', 'IDTeams', 'IDTeam1', 'IDTeam2', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'NumOT', 'Pred', 'ScoreDiff', 'ScoreDiffNorm', 'WLoc', 'IDTeams_c_score'] + c_score_col
        # A feature must exist in both frames: per-game columns built by create_features()
        # have no counterpart in the submission rows and would break predict_submission().
        sub_columns = set(self.sub.columns)
        self.col = [c for c in self.games.columns if c not in exclude_cols and c in sub_columns]
        print("Data loading and preprocessing completed.")

    def sortStatsByWinTeam(self, row):
        """Return one game's box-score stats ordered Team1 first, Team2 second.

        Args:
            row: a game row holding ``Team1``, ``WTeamID`` and the ``W*``/``L*`` stat columns.

        Returns:
            pandas.Series labelled ``FGM1 ... PF1, FGM2 ... PF2``. The labels are the same
            for every row so that ``DataFrame.apply(..., result_type='expand')`` cannot
            re-align winner/loser columns by their original names and undo the swap.
        """
        # Team1 is a team id, directly comparable with WTeamID
        # (IDTeam1 is a "<Season>_<TeamID>" string and never equals it).
        if row['Team1'] == row['WTeamID']:
            ordered = self.WIN_STATS + self.LOSE_STATS
        else:
            ordered = self.LOSE_STATS + self.WIN_STATS
        return pd.Series(row[ordered].to_numpy(), index=self.TEAM1_STATS + self.TEAM2_STATS)

    def create_features(self, df):
        """Add shooting-efficiency and game-location features to ``df`` in place.

        Requires the Team1/Team2 stat columns (``FGM1``, ``FGA1``, ``FGM31``, ``FGA31`` and
        the ``...2`` equivalents) plus ``Team1``, ``Team2``, ``WTeamID``, ``LTeamID`` and
        ``WLoc``. ``WLoc`` may hold the raw letters ('H'/'A'/'N') or ``WLOC_CODES`` values.

        Returns:
            A copy of ``df`` (including the new columns) without the ``WLoc`` column.
        """
        def ratio(numerator, denominator):
            # 0.0 instead of inf/NaN when there were no attempts.
            return numerator.div(denominator).where(denominator != 0, 0.0)

        # 1. Effective field goal percentage: made three-pointers count 1.5x.
        df['Team1_EFG'] = ratio(df['FGM1'] + 0.5 * df['FGM31'], df['FGA1'])
        df['Team2_EFG'] = ratio(df['FGM2'] + 0.5 * df['FGM32'], df['FGA2'])
        df['EFG_Diff'] = df['Team1_EFG'] - df['Team2_EFG']

        # 2. Plain field goal and three-point percentages, rounded to 3 decimals.
        df['Team1_FGP'] = ratio(df['FGM1'], df['FGA1']).round(3)
        df['Team2_FGP'] = ratio(df['FGM2'], df['FGA2']).round(3)
        df['FGP_Diff'] = df['Team1_FGP'] - df['Team2_FGP']

        df['Team1_FGP3'] = ratio(df['FGM31'], df['FGA31']).round(3)
        df['Team2_FGP3'] = ratio(df['FGM32'], df['FGA32']).round(3)
        df['FGP3_Diff'] = df['Team1_FGP3'] - df['Team2_FGP3']

        # 3. Location flags. WLoc describes the *winner's* location, so the loser is
        #    at home exactly when the winner is away, and vice versa.
        codes = self.WLOC_CODES
        location = df['WLoc'].map(lambda value: codes.get(value, value))
        winner_home = location == codes['H']
        winner_away = location == codes['A']
        neutral = location == codes['N']

        is_winner = {side: df[f'Team{side}'] == df['WTeamID'] for side in ('1', '2')}
        is_loser = {side: df[f'Team{side}'] == df['LTeamID'] for side in ('1', '2')}
        flag_rules = (
            ('Home', winner_home, winner_away),
            ('Away', winner_away, winner_home),
            ('Neutral', neutral, neutral),
        )
        for label, when_winner, when_loser in flag_rules:
            for side in ('1', '2'):
                flag = (is_winner[side] & when_winner) | (is_loser[side] & when_loser)
                df[f'Team{side}_{label}'] = flag.astype(int)

        # Drop the original WLoc column from the returned frame.
        return df.drop(columns=['WLoc'])

    def get_random_forest(self):
        """Return an unfitted RandomForestRegressor with the baseline hyper-parameters."""
        return RandomForestRegressor(
            n_estimators=235,
            random_state=42,
            max_depth=5,
            min_samples_split=2,
            max_features='sqrt',
            n_jobs=-1
        )

    def get_xbg(self):
        """Return an unfitted XGBRegressor with the baseline hyper-parameters.

        Early stopping is configured, so fitting requires an ``eval_set``.
        """
        return XGBRegressor(
            max_depth=5,
            colsample_bytree=0.5,
            subsample=0.8,
            n_estimators=3000,
            learning_rate=0.1,
            early_stopping_rounds=25,
            objective='reg:logistic',
            enable_categorical=True,
            min_child_weight=5
            # eval_metric= "rmse"
        )

    def get_xgb(self):
        """Correctly spelled alias of ``get_xbg`` (kept under both names for existing callers)."""
        return self.get_xbg()

    def scaled_data(self, input_data):
        """Fit ``self.scaler`` on ``input_data`` and return the standardised values."""
        X_scaled = self.scaler.fit_transform(input_data)
        return X_scaled

    def impute_data(self, input_data):
        """Fit ``self.imputer`` on ``input_data`` and return the mean-imputed values."""
        X_imputed = self.imputer.fit_transform(input_data)
        return X_imputed

    @staticmethod
    def _fit_model(model, X_train, y_train, eval_set, verbose=100):
        """Fit ``model``, passing ``eval_set``/``verbose`` only if its ``fit`` accepts them.

        Boosting libraries take an evaluation set; scikit-learn estimators such as
        RandomForestRegressor raise TypeError when given one.
        """
        try:
            fit_params = inspect.signature(model.fit).parameters
        except (TypeError, ValueError):
            fit_params = {}
        fit_kwargs = {}
        if 'eval_set' in fit_params:
            fit_kwargs['eval_set'] = eval_set
            if 'verbose' in fit_params:
                fit_kwargs['verbose'] = verbose
        model.fit(X_train, y_train, **fit_kwargs)
        return model

    def train_model(self, model, df):
        """Fit ``model`` on an 80/20 split of ``df``, print metrics and draw diagnostic plots.

        The fitted model is stored in ``self.model`` for ``predict_submission``.

        Args:
            model: an unfitted regressor with ``fit``/``predict``.
            df: frame holding the feature columns in ``self.col`` and the target ``Pred``.
        """
        X = df[self.col].reset_index(drop=True)
        y = df['Pred'].reset_index(drop=True)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # The last eval_set entry is the held-out split; for estimators configured with
        # early stopping it is also what stops training, so the test metrics below
        # are not a fully independent estimate.
        self._fit_model(model, X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)])
        self.model = model

        train_preds = np.asarray(model.predict(X_train)).clip(0.001, 0.999)
        test_preds0 = np.asarray(model.predict(X_test)).clip(0.001, 0.999)

        print(f'Log Loss (Train/Test): {log_loss(y_train, train_preds):.4f}, {log_loss(y_test, test_preds0):.4f}')
        print(f'Brier Score (Train/Test): {brier_score_loss(y_train, train_preds):.4f}, {brier_score_loss(y_test, test_preds0):.4f}')
        print(f'MSE (TrainTest): {mean_squared_error(y_train, train_preds):.4f}, {mean_squared_error(y_test, test_preds0):.4f}')

        # Plot ROC Curve for the held-out split.
        self.plot_roc_curve(y_test, test_preds0, "Calibration Set ROC Curve")

        if hasattr(model, 'feature_importances_'):
            self.plot_feature_importance(model.feature_importances_, self.col)

        self.plot_calibration_curve(y_test, test_preds0)

        # Plot the distribution of the predictions (not of the true labels).
        self.plot_prediction_distribution(test_preds0, "Distribution of Test Predictions")

    def train_model_cv(self, model, df):
        """Run 5-fold cross-validation with a fresh clone of ``model`` per fold.

        Prints the mean train and validation MSE / log loss and the per-fold validation MSE.
        ``model`` itself is left unfitted.
        """
        X = df[self.col].reset_index(drop=True)
        y = df['Pred'].reset_index(drop=True)

        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        cv_mse_scores = []
        cv_logloss_scores = []
        cv_test_mse_scores = []
        cv_test_logloss_scores = []
        for train_index, val_index in kf.split(X):
            X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[val_index]
            y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[val_index]

            # start with fresh model every time!
            fold_model = clone(model)
            self._fit_model(fold_model, X_train_cv, y_train_cv, eval_set=[(X_test_cv, y_test_cv)])
            train_preds_cv = np.asarray(fold_model.predict(X_train_cv)).clip(0.001, 0.999)
            test_preds_cv = np.asarray(fold_model.predict(X_test_cv)).clip(0.001, 0.999)

            cv_mse_scores.append(mean_squared_error(y_train_cv, train_preds_cv))
            cv_logloss_scores.append(log_loss(y_train_cv, train_preds_cv))
            cv_test_mse_scores.append(mean_squared_error(y_test_cv, test_preds_cv))
            cv_test_logloss_scores.append(log_loss(y_test_cv, test_preds_cv))

        print(f'Cross-validated MSE: {np.mean(cv_mse_scores):.4f},{np.mean(cv_test_mse_scores):.4f}')
        print(f'Cross-validated LogLoss: {np.mean(cv_logloss_scores):.4f},{np.mean(cv_test_logloss_scores):.4f}')
        print("Test cv array: ", cv_test_mse_scores)

    def predict_submission(self, output_file='submission.csv'):
        """Predict every submission row with ``self.model`` and write ``ID,Pred`` to CSV.

        Features get the same treatment as in training (``fillna(-1)``, no imputation or
        scaling). ``self.calibration_model`` is applied afterwards only when it is set.

        Raises:
            RuntimeError: if no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError("No trained model available: call train_model() before predict_submission().")

        sub_X = self.sub[self.col].fillna(-1)
        preds = np.asarray(self.model.predict(sub_X)).clip(0.001, 0.999)
        if self.calibration_model is not None:
            preds = np.asarray(self.calibration_model.predict(preds.reshape(-1, 1))).clip(0.001, 0.999)

        self.sub['Pred'] = preds
        self.sub[['ID', 'Pred']].to_csv(output_file, index=False)
        print(f"Submission file saved to {output_file}")

    def plot_feature_importance(self, importances, feature_names, top_n=20):
        """Draw a horizontal bar chart of the ``top_n`` largest feature importances."""
        feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
        feature_importance_df = feature_importance_df.sort_values('importance', ascending=False).head(top_n)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='importance', y='feature', data=feature_importance_df, palette='viridis')
        plt.title('Top {} Feature Importances'.format(top_n))
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()

    @staticmethod
    def _calibration_bins(y_true, y_proba, n_bins=10):
        """Return (bin midpoints, fraction of positives) for the non-empty probability bins.

        Probabilities are bucketed into ``n_bins`` equal-width bins over [0, 1]; a
        probability of exactly 1.0 is counted in the last bin.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_proba = np.asarray(y_proba, dtype=float)

        edges = np.linspace(0, 1, n_bins + 1)
        midpoints = edges[:-1] + (edges[1] - edges[0]) / 2
        # digitize() returns n_bins + 1 for p == 1.0; clip keeps it inside the last bin.
        bin_index = np.clip(np.digitize(y_proba, edges) - 1, 0, n_bins - 1)

        positives = np.bincount(bin_index, weights=y_true, minlength=n_bins)
        totals = np.bincount(bin_index, minlength=n_bins)
        populated = totals > 0
        return midpoints[populated], positives[populated] / totals[populated]

    def plot_calibration_curve(self, y_true, y_proba, n_bins=10):
        """Plot the observed positive rate per predicted-probability bin.

        Empty bins are left out rather than drawn as a misleading 0.
        """
        bin_midpoints, fraction_of_positives = self._calibration_bins(y_true, y_proba, n_bins)

        plt.figure(figsize=(8, 6))
        plt.plot(bin_midpoints, fraction_of_positives, marker='o', label='Calibration Curve')
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')

        plt.xlabel('Predicted Probability')
        plt.ylabel('Fraction of Positives')
        plt.title('Calibration Curve')
        plt.xlim(0, 1)
        plt.ylim(0, 1)
        plt.legend()
        plt.tight_layout()
        plt.show()

    def plot_prediction_distribution(self, predictions, title="Distribution of Predictions"):
        """Plots the distribution of model predictions."""
        plt.figure(figsize=(8, 6))
        sns.histplot(predictions, kde=True, color='skyblue')
        plt.title(title)
        plt.xlabel('Predicted Probability')
        plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()

    def plot_roc_curve(self, y_true, y_proba, title="ROC Curve"):
        """Plots the Receiver Operating Characteristic (ROC) curve."""
        fpr, tpr, thresholds = roc_curve(y_true, y_proba)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
        plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.show()

    def train_model_base_random_forest(self):
        """Cross-validate and then fit the baseline random forest on ``self.games``."""
        model = self.get_random_forest()
        training_frame = self.games.fillna(-1)
        self.train_model_cv(model, training_frame)
        self.train_model(model, training_frame)

    def train_model_base_xgboost(self):
        """Cross-validate and then fit the baseline XGBoost model on ``self.games``."""
        model = self.get_xbg()
        training_frame = self.games.fillna(-1)
        self.train_model_cv(model, training_frame)
        self.train_model(model, training_frame)

    def run_all(self):
        """Load the data, fit the baseline XGBoost model and write the submission file."""
        self.load_data()
        self.train_model(self.get_xbg(), self.games.fillna(-1))
        self.predict_submission()


In [None]:
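"""Recorded 5-fold CV results. Each metric line reads "train,validation";
"Test cv array" lists the per-fold validation MSE.

only tournament data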
base (auc 0.81)
Cross-validated MSE: 0.1447,0.1763
Cross-validated LogLoss: 0.4481,0.5214
Test cv array:  [0.17609657793288822, 0.17890084203209153, 0.18060527774749116, 0.16694279227517508, 0.1790824236025069]

base + additional features (auc 0.82)
Cross-validated MSE: 0.1441,0.1760
Cross-validated LogLoss: 0.4470,0.5205
Test cv array:  [0.176214665260042, 0.1760310657447056, 0.18180507612797978, 0.1668597346510348, 0.17891750344098578]


base + additional features + sorted features (auc 0.82)
    Cross-validated MSE: 0.1449,0.1758
    Cross-validated LogLoss: 0.4489,0.5204
    Test cv array:  [0.1759090613619681, 0.1767461997335639, 0.18180240042148935, 0.16652015296114905, 0.1780013603607607]

    all data
    Cross-validated MSE: 0.1817,0.2081
    Cross-validated LogLoss: 0.5378,0.5972
    Test cv array:  [0.2072069469355009, 0.20832444654169144, 0.2086487822469951, 0.20843162619147731, 0.20775880339858885]

    
"""

In [None]:
# Build the predictor and load/prepare the data once; the experiment cells
# below reuse `predictor.games` and the feature list in `predictor.col`.
predictor = TournamentPredictor(data_dir)
predictor.load_data()

In [None]:
# Hyper-parameters for this experiment: compared with TournamentPredictor.get_xbg()
# it uses a longer early-stopping patience and a larger min_child_weight.
xgb_params = dict(
    max_depth=5,
    colsample_bytree=0.5,
    subsample=0.8,
    n_estimators=3000,
    learning_rate=0.1,
    early_stopping_rounds=50,
    objective='reg:logistic',
    enable_categorical=True,
    min_child_weight=30,
    # eval_metric="rmse"
)
xgb = XGBRegressor(**xgb_params)

# Missing values are encoded as -1 before training.
data = predictor.games.fillna(-1)

# Cross-validate first (train_model_cv clones the estimator per fold), then fit and plot.
predictor.train_model_cv(xgb, data)
predictor.train_model(xgb, data)