# **Part 1:** NCAA 2024 Men's Bracket Predictions

# Importing, Functions, Team Spelling, and Merging

In [1]:
# Import Libraries and Data
import regex as re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from numpy import random
import random
from tqdm import tqdm

import h2o
from h2o.automl import H2OAutoML

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 300)
pd.options.mode.chained_assignment = None

# MDCM
mdcm = pd.read_csv('/kaggle/input/mdcm-data/NCAA_Tourney_2002_2023.csv')
team_spellings = pd.read_csv('/kaggle/input/mdcm-data/team_spellings.csv')

# CBBData
selection_sunday_resume = pd.read_csv('/kaggle/input/cbbdata/selection_sunday_resume.csv')

# Kaggle
barttovik_home = pd.read_csv('/kaggle/input/march-madness-data/Barttorvik Home.csv')
barttovik_away = pd.read_csv('/kaggle/input/march-madness-data/Barttorvik Away.csv')
kenpom_barttovik = pd.read_csv('/kaggle/input/march-madness-data/KenPom Barttorvik.csv')
shooting_splits = pd.read_csv('/kaggle/input/march-madness-data/Shooting Splits.csv')

In [27]:
# Define Functions
def merge_team_season(df: pd.DataFrame, df_merge_onto: pd.DataFrame, filter_df_merge_onto_year = None, title: str = None):
    """
    
        Function to merge teams and their seasons in college basketball with a bevy of alternative spellings, using team_spellings.csv.
    
        df (pd.DataFrame): The dataframe you'd like to establish as your left, or original df. Must contain 'teamname' and 'season' columns.
        
        df_merge_onto (pd.DataFrame): The dataframe you'd like to left merge onto df. Must contain 'team' and 'year' columns.
        
        filter_df_merge_onto_year (int): If there is a cutoff year for the df_merge_onto, the year of interest. This does not affect the 
                                        merge, just the reporting success messages.
    
    """
    # Print Title of Run For Terminal
    if title:
        print(title,'-----------------------\n')
        
    # If Oncoming Data Filtered By Specifc Year 
    if filter_df_merge_onto_year != None:
        df_post_cutoff = df[df['season'] >= filter_df_merge_onto_year]
        df_pre_cutoff = df[df['season'] < filter_df_merge_onto_year]
        df = df_post_cutoff
    
    # Find Null df Column Identifier For Oncoming DF
    for col_name in df_merge_onto.columns:
        if 'team' not in col_name.lower() and 'year' not in col_name.lower():
            col_null_match_identifier = col_name
            print('Column Null Match Identifier:', col_null_match_identifier,'\n')
            break
     
    # Set Both Teamname Columns To Lowercase Easier Merging
    for team_num in range(1,3):
        df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()
    
    # Loop Through Team 1 and Team 2
    df_both_teams = pd.DataFrame()
    for team_num in range(1, 3):
        
        # Adjust Column Names Due To Team1 and Team2 (Remove 'team1_' or 'team1_'/'team2_')
        if team_num == 2:
            df_merge_onto.columns = df_merge_onto.columns.str[6:]
        df_merge_onto = df_merge_onto.add_prefix(f'team{team_num}_')
        
        # Establish df_merge_onto Team Column Name and Set to Lowercase
        df_merge_onto[f'team{team_num}_team'] = df_merge_onto[f'team{team_num}_team'].str.lower()
        
        # If Second Iteration, Find The Merge Columns and The Spellings Ones To Keep
        if team_num == 2:
            spellings = ['team2_teamname', 'season']
            for col in df.columns:
                if f"team{team_num}_name_spelling" in col: 
                    spellings.append(col)
            df = df[spellings].drop_duplicates()
            
        # Original Merge For Team 1 or 2
        print(f"Team {team_num} Merge ...\n")
        df_merged = pd.merge(df, df_merge_onto, how = 'left', left_on = [f'team{team_num}_teamname', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
        
        # Original Split Up Merged and Unmerged Data
        df_not_merged = df_merged[df_merged[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
        df_merged = df_merged[df_merged[f'team{team_num}_{col_null_match_identifier}'].isna() == False]
        
        print(f'Original Team {team_num} Merge:', len(df), 'total rows.')
        print('Matched During Iteration:', len(df_merged)) 
        print('Unmatched Rows Remaining:', len(df_not_merged), '\n') 
        
        # Remove Columns That Didn't Merge Properly Based On Num of Columns
        # Reduce To Team and Year, Along with Alternate Spellings
        neg_col_count_df_merge_onto = df_merge_onto.shape[1] * -1
        df_not_merged = df_not_merged.iloc[:, :neg_col_count_df_merge_onto]
        
        # Loop Through Columnns To Fix The Merge
        print(f"Correcting Team {team_num} Merge ...\n")
        merge_complete, i = False, 1
        while merge_complete == False:
            
            # Perform Loop Everytime More Unmatched Columns Are Found
            team_season_loop = pd.merge(df_not_merged, df_merge_onto, how = 'left', left_on = [f'team{team_num}_name_spelling_{i}', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
            # print(f'Team {team_num} Season Loop {i}:', len(team_season_loop), 'total rows.')
            
            # Split Up The Matched and Unmatched
            matched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == False]
            # print('Matched During Iteration:', len(matched_df)) 
            unmatched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
            # print('Unmatched Rows Remaining:', len(unmatched_df), '\n') 
            
            # For The DataFrames With Data In The Column From Second DF, Add To team_season
            if len(matched_df) > 0:
                df_merged = pd.concat([df_merged, matched_df])
            # If There Are Still Null Rows, Throw Those Back In The Loop For The Next Iteration
            if len(unmatched_df) > 0:
                df_not_merged = unmatched_df.iloc[:, :neg_col_count_df_merge_onto]
            # If There Aren't Any Null Rows Left, End The Loop
            else:
                print(f'Success! Team {team_num} Merge Completed Early!\n')
                merge_complete = True
            if i == 11:
                merge_complete = True 
            i += 1
   
        # Concat Team 1 or 2 Onto Full DataFrame
        if team_num == 2:
            spellings = []
            for col in df_merged.columns:
                if 'team2_name_spelling' not in col:
                    spellings.append(col)

            df_both_teams = pd.merge(df_both_teams, df_merged[spellings], how = 'left', on = [f'team2_teamname', 'season'])
        else:
            if len(df_not_merged) > 0:
                df_both_teams = pd.concat([df_merged, df_not_merged])
    
    # Inspect Final Results
    print("Filter Views of Resulting DataFrame -------------------------\n")
    for team_num in range(1, 3):
        print(f'Team {team_num}:')
        print('Null Match Identifier Column:', col_null_match_identifier)
        if filter_df_merge_onto_year != None:
            
            # Gather Data About Merge Post Cutoff
            post_cutoff_rows = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == False) & (df_both_teams['season'] >= filter_df_merge_onto_year)]
            post_cutoff_rows_na = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == True) & (df_both_teams['season'] >= filter_df_merge_onto_year)]
            
            if post_cutoff_rows_na.shape[0] > 0:
                print(f'Oh No! There were {post_cutoff_rows.shape[0]} matches and {post_cutoff_rows_na.shape[0]} non matches post {filter_df_merge_onto_year}.')
                unique_unmatched_teams = post_cutoff_rows_na[f'team{team_num}_teamname'].drop_duplicates().sort_values()
                print(f'\n{len(unique_unmatched_teams)} team name(s) from the Team {team_num} Merge that exist(s) in the original df:\n')
                print(unique_unmatched_teams,'\n')
            else:
                print(f'Great! No Null Rows Post {filter_df_merge_onto_year}')
     
            # Gather Data About Merge Pre Cutoff
            pre_cutoff_rows = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == False) & (df_both_teams['season'] < filter_df_merge_onto_year)].shape[0]
            if pre_cutoff_rows == 0:
                print(f'Great! No Matched Rows Pre {filter_df_merge_onto_year}\n')
                                
    # Concat Pre and Post Cutoff If Exists
    if filter_df_merge_onto_year != None:
        df_both_teams = pd.concat([df_both_teams, df_pre_cutoff], ignore_index=True)    
    
    # Fix Rare Situation Where There Is A Duplicate of Team1_Teamname, Team2_Teamname, and Season (If Something To Be Dropped, Then Drop It)
    df_both_teams.drop_duplicates(subset=['team1_teamname', 'team2_teamname', 'season'], keep = 'first', inplace = True)
    
    return df_both_teams.drop(['team1_team','team1_year', 'team2_team', 'team2_year'], axis = 1)

# Ammend List To Remove Duplicate Columns And Retain One Set 
def drop_dup_columns(df: pd.DataFrame, dup_cols_keep: list[str]):
    same_data_columns = []
    for i in range(df.shape[1] - 1):
        for j in range(i + 1, df.shape[1]):
            col1, col2 = df.columns[i], df.columns[j]
            if df[col1].equals(df[col2]):
                same_data_columns.append((col1, col2))
    if not same_data_columns:
        print("No columns have the same data.")
    else:
        for col_pair in same_data_columns:
            if col_pair not in dup_cols_keep:
                df.drop(columns = col_pair[0], axis = 1, inplace = True)
                df.rename(columns = {col_pair[1]: col_pair[1].replace('_away', '')}, inplace = True)
    return df

In [28]:
# Create Round Column
mdcm['round'] = mdcm['slot'].str.extract(r'(\d+)')
mdcm['round'] = pd.to_numeric(mdcm['round'], errors='coerce')

# Adjust Team Spellings
team_spellings = team_spellings.pivot_table(index='team_id', columns=team_spellings.groupby('team_id').cumcount(), values='name_spelling', aggfunc='first')
team_spellings.columns = [f'name_spelling_{i + 1}' for i in range(team_spellings.shape[1])]
team_spellings.reset_index(inplace=True)

# Merge Team Spellings
team_spellings_t1 = team_spellings.add_prefix('team1_')
mdcm = pd.merge(mdcm, team_spellings_t1, how = 'inner', left_on = ['team1_id'], right_on = ['team1_team_id'])
team_spellings_t2 = team_spellings.add_prefix('team2_')
mdcm = pd.merge(mdcm, team_spellings_t2, how = 'inner', left_on = ['team2_id'], right_on = ['team2_team_id'])

# Reduce MDCM To Columns of Interest
# Simple Seed Data, Geographies, Team & Coach Counting Tourney Stats, and Preseason Rankings
mdcm.drop(columns = ['num_ot', 'WLoc', 'team1_region', 'team2_region', 'host','host_lat', 'host_long', 'team1_lat', 'team1_long', 'team2_lat', 'team2_long',  
                     'team1_pt_school_ncaa', 'team1_pt_overall_ncaa','team1_pt_school_s16','team1_pt_overall_s16','team1_pt_school_ff','team1_pt_overall_ff',
                     'team1_pt_career_school_wins','team1_pt_career_school_losses','team1_pt_career_overall_wins','team1_pt_career_overall_losses',
                     'team1_pt_team_season_wins','team1_pt_team_season_losses','team1_pt_coach_season_wins','team1_pt_coach_season_losses','team2_coach_id',
                     'team2_pt_school_ncaa','team2_pt_overall_ncaa','team2_pt_school_s16','team2_pt_overall_s16','team2_pt_school_ff','team2_pt_overall_ff',
                     'team2_pt_career_school_wins', 'team2_pt_career_school_losses','team2_pt_career_overall_wins','team2_pt_career_overall_losses',
                     'team2_pt_team_season_wins', 'team2_pt_team_season_losses','team2_pt_coach_season_wins','team2_pt_coach_season_losses','team1_ap_final',
                     'team1_ap_preseason','team1_coaches_before_final','team1_coaches_preseason','team2_ap_final','team2_ap_preseason','team2_coaches_before_final',
                     'team2_coaches_preseason'], inplace= True)

# **Testing**

In [29]:
# Game Factors - Explore Team Momentum Going Into Tourney
# game_factors = pd.read_csv('../data/cbbdata/game/game_factors.csv').query('year != 2020')
# game_factors['date']= pd.to_datetime(game_factors['date'])
# game_factors[game_factors['type'] != 'post'].head()

# Importing, Functions, Creating Features, Flipping Values, Feature Selection

In [30]:
# Import Data (NOT CURRENT YEAR)
df = pd.read_csv("/kaggle/working/merged_team_season.csv")

# Create Features, Including Difference and Ratio Columns
def calculate_diff_ratio_pythag(df: pd.DataFrame, features = [str], diff_ratio_pythag: str = 'difference'):
    for feature in features:

        # Find Difference, Ratio, or Pythagorean of A Feature Between The Two Teams
        if diff_ratio_pythag == 'difference':
            df[f'{feature}_diff'] = df[f'team1_{feature}'] - df[f'team2_{feature}']
        elif diff_ratio_pythag == 'ratio':
            df[f'{feature}_ratio'] = df[f'team1_{feature}'] / df[f'team2_{feature}']   
        elif diff_ratio_pythag == 'pythag':
            df['team1_pythag'] = (df[f'team1_{feature}'] - (df[f'team1_{feature}']*df[f'team2_{feature}']))/ (df[f'team1_{feature}'] + df[f'team2_{feature}']-(2*df[f'team1_{feature}']*df[f'team2_{feature}']))
    return df

# Flips DF2 Differences and Ratios
def flip_df2_diff_ratio_pythag(df: pd.DataFrame, feature_dict: dict):
    
    # Loop Through Difference and Ratio, Apply Appropriate Transformation
    for diff_ratio_pythag in list(feature_dict.keys()):
        for feat in feature_dict[diff_ratio_pythag]:
            
            if diff_ratio_pythag == 'diff':
                df[f'{feat}_diff'] = df[f'{feat}_diff'] * -1
            elif diff_ratio_pythag == 'ratio':
                df[f'{feat}_ratio'] = 1 / df[f'{feat}_ratio']
            elif diff_ratio_pythag == 'pythag':
                df[f'team1_{diff_ratio_pythag}'] = 1 - df[f'team1_{diff_ratio_pythag}']
                
    # Change Team1 Win, Score, and Game ID
    df['team1_win'] = 0
    t1s, t2s = df['team1_score'], df['team2_score']
    df['team1_score'], df['team2_score'] = t2s, t1s
    df['game_id'] = (df['game_id'].str.split('-', expand=True)[0] + '-' + df['game_id'].str.split('-', expand=True)[2] + '-' + df['game_id'].str.split('-', expand=True)[1])
    
    return df

In [31]:
# Creat List of Features For Ratios
features_ratio = []
for item in list(df.columns):
    if 'team1_' in item or 'team2_' in item:
        features_ratio.append(item[6:])
features_ratio = list(set(features_ratio))

# Drop Irrelevant Features
features_ratio = [feat for feat in features_ratio if feat not in ['coach_id', 'team_id', 'id', 'teamname', 'position']]

# Calculate Differences, Ratios, and Pythag Between Features
features = {'diff': ['seed'],
            'ratio': features_ratio,
            'pythag': ['exp_win']}

# Calculate Pythagorean W/L Based On Adjusted Offensive Efficiency
df['team1_exp_win'] = (df['team1_adjoe']**11.5)/ ((df['team1_adjde']**11.5)+(df['team1_adjoe']**11.5))
df['team2_exp_win'] = (df['team2_adjoe']**11.5)/ ((df['team2_adjde']**11.5)+(df['team2_adjoe']**11.5))

In [33]:
df.head()

Unnamed: 0,team1_id,team1_score,team2_id,team2_score,team1_position,team2_position,team1_seed,team2_seed,strongseed,weakseed,slot,team1_teamname,team2_teamname,season,team1_coach_id,team1_fg2pct,team1_fg3pct,team1_ftpct,team1_blockpct,team1_oppfg2pct,team1_oppfg3pct,team1_oppftpct,team1_oppblockpct,team1_f3grate,team1_oppf3grate,team1_arate,team1_opparate,team1_stlrate,team1_oppstlrate,team2_fg2pct,team2_fg3pct,team2_ftpct,team2_blockpct,team2_oppfg2pct,team2_oppfg3pct,team2_oppftpct,team2_oppblockpct,team2_f3grate,team2_oppf3grate,team2_arate,team2_opparate,team2_stlrate,team2_oppstlrate,team1_tempo,team1_adjtempo,team1_oe,team1_adjoe,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,team2_adjoe,team2_de,team2_adjde,game_id,round,team1_team_id,team2_team_id,team1_net,team1_resume,team1_wab,team1_elo,team1_power,team2_net,team2_resume,team2_wab,team2_elo,team2_power,team1_barthag,team1_avg hgt,team1_eff hgt,team1_exp,team1_talent,team1_elite sos,team2_barthag,team2_avg hgt,team2_eff hgt,team2_exp,team2_talent,team2_elite sos,team1_home_badj em,team1_home_badj o,team1_home_badj d,team1_home_barthag,team1_home_badj t,team1_home_3pt%,team1_home_ft%,team2_home_badj em,team2_home_badj o,team2_home_badj d,team2_home_barthag,team2_home_badj t,team2_home_3pt%,team2_home_ft%,team1_away_badj em,team1_away_badj o,team1_away_badj d,team1_away_barthag,team1_away_badj t,team1_away_3pt%,team1_away_ft%,team2_away_badj em,team2_away_badj o,team2_away_badj d,team2_away_barthag,team2_away_badj t,team2_away_3pt%,team2_away_ft%,team1_team no,team1_team id,team1_dunks fg%,team1_dunks share,team1_dunks fg%d,team1_dunks d share,team1_close twos fg%,team1_close twos share,team1_close twos fg%d,team1_close twos d share,team1_farther twos fg%,team1_farther twos share,team1_farther twos fg%d,team1_farther twos d share,team1_threes fg%,team1_threes share,team1_threes fg%d,team1_threes d share,team2_team no,team2_team id,team2_dunks fg%,team2_dunks share,team2_dunks fg%d,team2_dunks d share,team2_close twos fg%,team2_close twos share,team2_close twos fg%d,team2_close twos d share,team2_farther twos fg%,team2_farther twos share,team2_farther twos fg%d,team2_farther twos d share,team2_threes fg%,team2_threes share,team2_threes fg%d,team2_threes d share,team1_exp_win,team2_exp_win
0,1314,81,1181,77,W08,X02,8,2,2,8,R5WX,north carolina,duke,2022,hubert-davis-1,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,55.844156,36.821192,72.881356,13.25811,46.897038,31.396957,67.160494,8.097785,36.579457,33.769267,56.987116,51.793722,0.094448,0.091014,70.2181,70.1745,109.416,113.035,100.974,96.9911,68.2765,67.4185,117.152,119.357,98.2346,95.6444,2022-1314-1181,5,1314,1181,31.0,48.0,19.0,24.0,26.7,12.0,20.0,7.0,9.0,11.3,0.866,78.254,81.474,1.742,87.453,29.665,0.943,78.372,81.821,0.988,91.521,28.364,18.3,113.0,94.7,0.884,70.2,39.1,74.5,21.4,118.1,96.7,0.909,70.0,38.1,76.5,17.7,114.7,97.0,0.873,72.0,35.4,79.8,34.1,123.9,89.8,0.976,66.1,39.3,68.8,904.0,135.0,86.8,8.6,88.3,4.9,61.5,35.6,59.0,32.4,34.6,26.4,35.7,31.4,35.8,38.0,33.7,36.2,927.0,50.0,89.7,13.4,86.2,7.5,66.7,38.7,55.4,36.5,39.8,25.6,35.9,29.9,36.6,35.7,32.0,33.6,0.85326,0.927374
1,1242,81,1437,65,Y01,Z02,1,2,1,2,R5YZ,kansas,villanova,2022,bill-self-1,54.511559,35.526316,72.439759,10.498883,47.877885,30.08596,70.723104,7.829978,33.777778,34.198922,53.901437,47.479484,0.091219,0.097495,50.348953,35.906433,82.332155,7.177033,48.229665,31.003812,74.509804,10.867398,46.017223,42.958515,49.014778,50.0,0.096876,0.072892,69.5494,69.1275,111.605,119.388,97.191,93.9009,64.1915,62.5758,112.845,117.921,97.6798,93.8099,2022-1242-1437,5,1242,1437,6.0,1.0,1.0,2.0,5.7,8.0,7.0,6.0,3.0,9.0,0.951,77.142,80.111,2.082,77.998,36.398,0.934,77.347,79.315,2.392,69.071,31.835,25.5,117.8,92.3,0.943,70.3,37.1,71.8,28.6,121.8,93.2,0.956,62.6,41.3,88.7,25.6,121.0,95.4,0.939,68.2,34.8,74.1,19.6,114.7,95.1,0.896,65.3,34.0,79.6,918.0,86.0,91.2,8.5,86.6,4.3,64.4,38.9,57.4,34.0,38.7,28.0,34.5,31.3,36.1,33.1,29.8,34.8,881.0,228.0,83.3,5.8,84.3,5.7,57.9,34.3,57.1,30.2,34.6,19.4,38.3,27.6,35.9,46.3,30.8,42.2,0.940565,0.932803
2,1242,72,1314,69,Y01,W08,1,8,1,8,R6CH,kansas,north carolina,2022,bill-self-1,54.511559,35.526316,72.439759,10.498883,47.877885,30.08596,70.723104,7.829978,33.777778,34.198922,53.901437,47.479484,0.091219,0.097495,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,69.5494,69.1275,111.605,119.388,97.191,93.9009,70.2181,70.1745,109.416,113.035,100.974,96.9911,2022-1242-1314,6,1242,1314,6.0,1.0,1.0,2.0,5.7,31.0,48.0,19.0,24.0,26.7,0.951,77.142,80.111,2.082,77.998,36.398,0.866,78.254,81.474,1.742,87.453,29.665,25.5,117.8,92.3,0.943,70.3,37.1,71.8,18.3,113.0,94.7,0.884,70.2,39.1,74.5,25.6,121.0,95.4,0.939,68.2,34.8,74.1,17.7,114.7,97.0,0.873,72.0,35.4,79.8,918.0,86.0,91.2,8.5,86.6,4.3,64.4,38.9,57.4,34.0,38.7,28.0,34.5,31.3,36.1,33.1,29.8,34.8,904.0,135.0,86.8,8.6,88.3,4.9,61.5,35.6,59.0,32.4,34.6,26.4,35.7,31.4,35.8,38.0,33.7,36.2,0.940565,0.85326
3,1242,76,1274,50,Y01,Y10,1,10,1,10,R4Y1,kansas,miami fl,2022,bill-self-1,54.511559,35.526316,72.439759,10.498883,47.877885,30.08596,70.723104,7.829978,33.777778,34.198922,53.901437,47.479484,0.091219,0.097495,54.836066,35.285505,74.31694,8.544028,53.879686,35.167785,69.098712,8.770492,35.890699,39.376321,50.879121,56.590909,0.128379,0.078727,69.5494,69.1275,111.605,119.388,97.191,93.9009,67.5221,67.3101,110.368,114.757,104.983,102.132,2022-1242-1274,4,1242,1274,6.0,1.0,1.0,2.0,5.7,62.0,40.0,31.0,39.0,61.3,0.951,77.142,80.111,2.082,77.998,36.398,0.813,76.713,80.224,2.424,65.462,27.253,25.5,117.8,92.3,0.943,70.3,37.1,71.8,11.2,114.8,103.6,0.765,69.3,36.9,73.1,25.6,121.0,95.4,0.939,68.2,34.8,74.1,20.1,115.4,95.3,0.9,68.0,35.5,77.4,918.0,86.0,91.2,8.5,86.6,4.3,64.4,38.9,57.4,34.0,38.7,28.0,34.5,31.3,36.1,33.1,29.8,34.8,911.0,113.0,89.5,6.3,88.0,7.6,65.6,37.2,63.4,37.3,39.9,27.2,37.0,24.1,33.9,35.6,34.5,38.7,0.940565,0.792545
4,1314,69,1389,49,W08,W15,8,15,8,15,R4W1,north carolina,st peter's,2022,hubert-davis-1,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,45.916515,34.791667,68.46543,14.609053,44.032922,29.422383,73.482428,13.157895,30.34134,36.304063,51.560178,51.607445,0.108634,0.105047,70.2181,70.1745,109.416,113.035,100.974,96.9911,67.0262,65.9923,98.7557,98.7475,92.3289,94.4745,2022-1314-1389,4,1314,1389,31.0,48.0,19.0,24.0,26.7,,,,,,0.866,78.254,81.474,1.742,87.453,29.665,0.604,76.197,79.399,1.895,0.2,12.317,18.3,113.0,94.7,0.884,70.2,39.1,74.5,2.1,91.6,89.5,0.566,67.4,32.6,69.4,17.7,114.7,97.0,0.873,72.0,35.4,79.8,2.5,101.5,99.0,0.571,66.8,35.8,67.4,904.0,135.0,86.8,8.6,88.3,4.9,61.5,35.6,59.0,32.4,34.6,26.4,35.7,31.4,35.8,38.0,33.7,36.2,896.0,176.0,88.4,3.4,85.1,6.0,49.9,43.6,51.4,36.7,38.4,26.1,35.0,27.0,34.6,30.3,29.2,36.3,0.85326,0.624505


In [32]:
# Inspect Correlation and Influence on Team 1 Winning

# Get All Diffs, Ratio, and Pythag
complete_ratio_cols = [col + '_ratio' for col in features_ratio]
complete_ratio_cols.append('team1_pythag')

# Normalize Features For Fair Comparison
scaler = StandardScaler()
df_cols_to_norm = df[complete_ratio_cols]
df_cols_to_norm = df_cols_to_norm.fillna(df_cols_to_norm.mean())
df_norm = pd.DataFrame(scaler.fit_transform(df_cols_to_norm), columns = complete_ratio_cols)
df_norm = pd.concat([df_norm, df['team1_win']], axis = 1)

# Compare Biggest Mean Difference When Team 1 Wins and Losses
mean_values_0 = df_norm[df_norm['team1_win'] == 0][complete_ratio_cols].mean()
mean_values_1 = df_norm[df_norm['team1_win'] == 1][complete_ratio_cols].mean()
team1_win_change = pd.DataFrame(abs(mean_values_1 - mean_values_0).sort_values(ascending=False), columns = ['team1_win_difference'])

# Show Corrleation To Team1 Pythag, The Strongest Indicator
team1_pythag = df_norm[complete_ratio_cols].corr(method='spearman')
team1_pythag = pd.DataFrame(abs(team1_pythag['team1_pythag']))

# View Them Together
win_and_pythag = pd.merge(team1_win_change, team1_pythag, left_index = True, right_index = True, how = 'inner')
win_and_pythag['win_difference_minus_pythag_corr'] = win_and_pythag['team1_win_difference'] - win_and_pythag['team1_pythag']
win_and_pythag[(win_and_pythag['team1_win_difference'] > .15) & (win_and_pythag['win_difference_minus_pythag_corr'] > -.1)].sort_values(by = 'team1_win_difference', ascending=False)

KeyError: "None of [Index(['net_ratio', 'opparate_ratio', 'home_badj d_ratio',\n       'farther twos d share_ratio', 'blockpct_ratio', 'team no_ratio',\n       'home_badj o_ratio', 'oppf3grate_ratio', 'score_ratio', 'barthag_ratio',\n       'avg hgt_ratio', 'talent_ratio', 'home_barthag_ratio',\n       'close twos fg%_ratio', 'farther twos share_ratio', 'dunks share_ratio',\n       'eff hgt_ratio', 'oppfg3pct_ratio', 'home_3pt%_ratio',\n       'threes share_ratio', 'de_ratio', 'resume_ratio', 'oppftpct_ratio',\n       'f3grate_ratio', 'ftpct_ratio', 'home_badj em_ratio', 'home_ft%_ratio',\n       'away_badj d_ratio', 'away_ft%_ratio', 'team id_ratio',\n       'dunks fg%_ratio', 'arate_ratio', 'away_barthag_ratio',\n       'adjtempo_ratio', 'power_ratio', 'away_badj o_ratio',\n       'farther twos fg%_ratio', 'away_badj em_ratio',\n       'farther twos fg%d_ratio', 'wab_ratio', 'close twos d share_ratio',\n       'threes fg%d_ratio', 'exp_ratio', 'dunks d share_ratio',\n       'away_badj t_ratio', 'home_badj t_ratio', 'adjde_ratio', 'seed_ratio',\n       'threes fg%_ratio', 'away_3pt%_ratio', 'close twos fg%d_ratio',\n       'elite sos_ratio', 'fg3pct_ratio', 'oe_ratio', 'oppfg2pct_ratio',\n       'fg2pct_ratio', 'adjoe_ratio', 'stlrate_ratio', 'tempo_ratio',\n       'elo_ratio', 'close twos share_ratio', 'oppstlrate_ratio',\n       'threes d share_ratio', 'dunks fg%d_ratio', 'oppblockpct_ratio',\n       'team1_pythag'],\n      dtype='object')] are in the [columns]"

In [None]:
# Calculate Team Differences, Ratios, and Pythag
df = calculate_diff_ratio_pythag(df, features['diff'], diff_ratio_pythag = 'difference')
df = calculate_diff_ratio_pythag(df, features['ratio'], diff_ratio_pythag = 'ratio')
df = calculate_diff_ratio_pythag(df, features['pythag'], diff_ratio_pythag = 'pythag')

# Establish Training Columns
standard_training_columns = ['game_id','season','team1_win','team1_score','team2_score']
feature_columns = [col for col in df.columns if 'diff' in col or 'ratio' in col or 'pythag' in col]
training_columns = standard_training_columns + feature_columns

#Create a team 1 win column to determine which rows actually have them winning
df['team1_win'] = (df['team1_score'] > df['team2_score']).astype(int)

# Randomly Select Games To Place in Df 1 and Rest in Df 2
df_1 = df.sample(frac=.5, random_state=15)[training_columns]
df_2 = df[~df.index.isin(df_1.index)][training_columns].reset_index(drop=True)

# In Second Df, Flip The Attributes
df_2 = flip_df2_diff_ratio_pythag(df_2, features)

#Combine DataFrames, Create 2019 Holdout Set
df = pd.concat([df_1, df_2]).reset_index(drop=True)

In [None]:
# Produce Correlation Matrix and Correlation Sum
final_features = ['team1_pythag', 'elite sos_ratio', 'blockpct_ratio','threes fg%_ratio']

corr_matrix = df[final_features].corr(method='spearman')
print('Total Correlation Overlap:', round((abs(corr_matrix).sum().sum() - len(corr_matrix))/2,2))
print('Total Correlation Overlap per Feature:', round((abs(corr_matrix).sum().sum() - len(corr_matrix))/2/len(corr_matrix),2))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", mask = mask)
plt.title("Correlation Matrix - Men's March Madness Modeling - Team Statistics as Ratios")
plt.show()

In [None]:
# Export Training Data
df = df[standard_training_columns + final_features]
df[final_features] = df[final_features].fillna(df[final_features].mean())

df.to_csv("/kaggle/working/train.csv", index = False)
df.describe()

# **Part 3 (H2o AutoML XGBoost Training, Fitting Current Data)**

In [None]:
# Import Data
train = pd.read_csv("/kaggle/working/train.csv")

# Initialize H2O
h2o.init()

# Convert DataFrame to H2O Frame
train_h2o = h2o.H2OFrame(train)

# Test Train Split with Validation
train_h2o['team1_win'] = train_h2o['team1_win'].asfactor()
train, test = train_h2o.split_frame(ratios = [0.8], seed = 42)

# Initiate the H20 AutoML Model
aml = H2OAutoML(max_models = 5, seed = 42, verbosity = "info", nfolds = 4, balance_classes = True, 
                include_algos = ['XGBoost'], stopping_metric = 'logloss')

# Train Model with Feature and Target Columns
aml.train(x = train.columns[5:], y = 'team1_win', training_frame = train)
print('Training Columns:', train.columns[5:],'\n','Target Column: "team1_win"')
print('Training Size:', train.shape,'\n','Test Size:', test.shape)

In [None]:
# Predict On Test Data and Evaluate Model Performance
# aml.leaderboard.head(5)
xgb_men = aml.get_best_model(algorithm="xgboost", criterion="logloss")
xgb_men.model_performance(test) 

In [None]:
# Explain Model Performance
h2o.explain(xgb_men,test)

# Prepare Submission

In [None]:
# Import 2024 Data

# MDCM
mdcm_2024 = pd.read_csv('/kaggle/input/mdcm-data/NCAA_Tourney_2024.csv')
team_spellings = pd.read_csv('/kaggle/input/mdcm-data/team_spellings.csv')

# Adjust Team Spellings
team_spellings = team_spellings.pivot_table(index='team_id', columns=team_spellings.groupby('team_id').cumcount(), values='name_spelling', aggfunc='first')
team_spellings.columns = [f'name_spelling_{i + 1}' for i in range(team_spellings.shape[1])]
team_spellings.reset_index(inplace=True)

# Merge Team Spellings
team_spellings_t1 = team_spellings.add_prefix('team1_')
mdcm_2024 = pd.merge(mdcm_2024, team_spellings_t1, how = 'inner', left_on = ['team1_id'], right_on = ['team1_team_id'])
team_spellings_t2 = team_spellings.add_prefix('team2_')
mdcm_2024 = pd.merge(mdcm_2024, team_spellings_t2, how = 'inner', left_on = ['team2_id'], right_on = ['team2_team_id'])

# Merge MDCM and Selection Sunday, Kenpom Barttovik, Home & Away Barttovik, and Shooting Splits
df = merge_team_season(mdcm_2024, selection_sunday_resume, filter_df_merge_onto_year = 2024, title = 'MDCM and Selection Sunday (2024)')
df = merge_team_season(df, kenpom_barttovik, filter_df_merge_onto_year = 2024, title = 'MDCM and Kenpom Barttovik (2024)')
df = merge_team_season(df, barttovik_home, filter_df_merge_onto_year = 2024, title = 'MDCM and Home Barttovic Data (2024)')
df = merge_team_season(df, barttovik_away, filter_df_merge_onto_year = 2024, title = 'MDCM and Away Barttovic Data (2024)')
df = merge_team_season(df, shooting_splits, filter_df_merge_onto_year = 2024, title = 'MDCM and Kenpom Barttovik (2024)')

# Calculate Pythagorean W/L Based On Adjusted Offensive Efficiency
df['team1_exp_win'] = (df['team1_adjoe']**11.5)/ ((df['team1_adjde']**11.5)+(df['team1_adjoe']**11.5))
df['team2_exp_win'] = (df['team2_adjoe']**11.5)/ ((df['team2_adjde']**11.5)+(df['team2_adjoe']**11.5))

# Manually Calculate Features
df['team1_pythag'] = (df['team1_exp_win'] - (df['team1_exp_win']*df['team2_exp_win']))/ (df['team1_exp_win'] + df['team2_exp_win']-(2*df['team1_exp_win']*df['team2_exp_win']))
df['elite sos_ratio'] = df['team1_elite sos'] / (df['team2_elite sos'])
df['avg hgt_ratio'] = df['team1_avg hgt'] / (df['team2_avg hgt'])
df['threes fg%_ratio'] = df['team1_threes fg%'] / (df['team2_threes fg%'])
df['blockpct_ratio'] = df['team1_blockpct'] / (df['team2_blockpct'])

In [None]:
# Normalize
scaler = StandardScaler()
df_cols_to_norm = df[final_features] # TEST PYHTAG W NO NORMALIZE IN TRAIN AND 2024
df_cols_to_norm = df_cols_to_norm.fillna(df_cols_to_norm.mean())
df_norm = pd.DataFrame(scaler.fit_transform(df_cols_to_norm), columns = final_features)
mens_2024 = pd.concat([df_norm, df[['game_id', 'season', 'team1_id', 'team2_id', 'team1_position', 
                                    'team2_position', 'slot', 'strongseed', 'weakseed']]], axis = 1)

In [None]:
# Predict On 2024
mens_2024_predictions = xgb_men.predict(h2o.H2OFrame(mens_2024))
predictions = pd.concat([mens_2024[['game_id','season', 'team1_id', 'team2_id', 'team1_position', 'team2_position', 
                                    'strongseed', 'weakseed', 'slot']], mens_2024_predictions.as_data_frame()], axis = 1)
predictions.rename(columns = {'p1':'Pred'}, inplace = True)

slots = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/MNCAATourneySlots.csv').query('Season == 2024').reset_index(drop = True)
ncaa_tourney = slots[['Slot', 'Season']].merge(predictions, left_on = 'Slot', right_on = 'slot', how = 'left')
ncaa_tourney.drop(columns = ['p0', 'slot', 'Season'], inplace = True)
ncaa_tourney.head()

# import plotly.express as px
# fig = px.scatter(predictions, x="team1_pythag", y="prob")
# fig.show()

ncaa_tourney['ID'] = (ncaa_tourney['season'].astype(str) + "_" + ncaa_tourney['team1_id'].astype(str) + "_" + ncaa_tourney['team2_id'].astype(str))
ncaa_tourney[['ID','Pred']].head()


In [None]:
# Load and filter data
round_slots = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/MNCAATourneySlots.csv')
round_slots = round_slots[round_slots['Season'] == 2024]
round_slots = round_slots[round_slots['Slot'].str.contains('R')] # Filter out First Four

seeds = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv')
seeds_m = seeds[seeds['Tournament'] == 'M']
seeds_w = seeds[seeds['Tournament'] == 'W']

preds = pd.read_csv('/kaggle/input/paris-madness-2023/submission.csv') 
preds['ID'] = preds['ID'].str.split('_')

def prepare_data(seeds, preds):
    # Function preparing the data for the simulation
    seed_dict = seeds.set_index('Seed')['TeamID'].to_dict()
    inverted_seed_dict = {value: key for key, value in seed_dict.items()}
    probas_dict = {}
    
    for teams, proba in zip(preds['ID'], preds['Pred']):
        team1, team2 = teams[1], teams[2]

        probas_dict.setdefault(team1, {})[team2] = proba
        probas_dict.setdefault(team2, {})[team1] = 1 - proba

    return seed_dict, inverted_seed_dict, probas_dict


def simulate(round_slots, seeds, inverted_seeds, probas, random_values, sim=True):
    '''
    Simulates each round of the tournament.

    Parameters:
    - round_slots: DataFrame containing information on who is playing in each round.
    - seeds (dict): Dictionary mapping seed values to team IDs.
    - inverted_seeds (dict): Dictionary mapping team IDs to seed values.
    - probas (dict): Dictionary containing matchup probabilities.
    - random_values (array-like): Array with precomputed random-values.
    - sim (boolean): Simulates match if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - list: List with winning team IDs for each match.
    - list: List with corresponding slot names for each match.
    '''
    winners = []
    slots = []

    for slot, strong, weak, random_val in zip(round_slots.Slot, round_slots.StrongSeed, round_slots.WeakSeed, random_values):
        team1, team2 = seeds[strong], seeds[weak]
        proba = probas[str(team1)][str(team2)]
        if sim:
            winner = team1 if random_val < proba else team2
        else:
            winner = [team1, team2][np.argmax([proba, 1-proba])]
        winners.append(winner)
        slots.append(slot)
        seeds[slot] = winner
    return [inverted_seeds[w] for w in winners], slots


def run_simulation(brackets=1, seeds=None, preds=None, round_slots=None, sim=True):
    '''
    Runs a simulation of bracket tournaments.

    Parameters:
    - brackets (int): Number of brackets to simulate.
    - seeds (pd.DataFrame): DataFrame containing seed information.
    - preds (pd.DataFrame): DataFrame containing prediction information for each match-up.
    - round_slots (pd.DataFrame): DataFrame containing information about the tournament rounds.
    - sim (boolean): Simulates matches if True. Chooses team with higher probability as winner otherwise.

    Returns:
    - pd.DataFrame: DataFrame with simulation results.
    '''
    # Get relevant data for the simulation
    seed_dict, inverted_seed_dict, probas_dict = prepare_data(seeds, preds)
    # Lists to store simulation results
    results = []
    bracket = []
    slots = []
    
    # Precompute random-values
    random_values = np.random.random(size=(brackets, len(round_slots)))

    # Iterate through the specified number of brackets
    for b in tqdm(range(1, brackets+1)):
        # Run single simulation
        r, s = simulate(round_slots, seed_dict, inverted_seed_dict, probas_dict, random_values[b-1], sim)
        
        # Update results
        results.extend(r)
        bracket.extend([b] * len(r))
        slots.extend(s)

    # Create final DataFrame
    result_df = pd.DataFrame({'Bracket': bracket, 'Slot': slots, 'Team': results})

    return result_df

n_brackets = 100
result_m=run_simulation(brackets=n_brackets, seeds=seeds_m, preds=preds, round_slots=round_slots, sim=True)
result_m['Tournament'] = 'M'

In [None]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

# **Part 2:** NCAA 2024 Women's Bracket Predictions

In [None]:
# Kaggle
hist_results = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/WNCAATourneyDetailedResults.csv')
tourney_seeds = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv')
hist_seeds = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/WNCAATourneySeeds.csv')
reg_season = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/WRegularSeasonDetailedResults.csv')

# Total Points Scored In Season
feats = pd.merge(pd.DataFrame(reg_season.groupby(['WTeamID', 'Season'])['WScore'].sum()).reset_index(), pd.DataFrame(reg_season.groupby(['LTeamID', 
    'Season'])['LScore'].sum()).reset_index(), left_on = ['WTeamID', 'Season'], right_on = ['LTeamID', 'Season'], how = 'inner')
feats.drop(columns = ['LTeamID'], inplace = True)
feats.rename(columns = {'WTeamID': 'TeamID'}, inplace = True)
feats['season_points'] = feats['WScore'] + feats['LScore']

# Seed Of Teams That Made Tourney
train = pd.merge(feats, hist_seeds, on = ['TeamID', 'Season'] , how = 'left').dropna()
train.drop(columns = ['WScore', 'LScore'], inplace = True)

# Function
def extract_numeric_part(x):
    numeric_parts = re.findall(r'\d+', str(x))
    if numeric_parts:
        return int(numeric_parts[-1])
    else:
        return None 

In [None]:
# Merge Onto Historical Results
hist = hist_results[['Season', 'WTeamID', 'WScore', 'LTeamID', 'LScore']]
df = hist.merge(train, left_on = ['WTeamID', 'Season'], right_on = ['TeamID', 'Season'], how = 'left')
df = df.merge(train, left_on = ['LTeamID', 'Season'], right_on = ['TeamID', 'Season'], how = 'left')

# Merge Onto Historical Results
df.drop(columns = ['TeamID_x', 'TeamID_y'], inplace = True)
df.columns = ['season', 'team1_id', 'team1_score', 'team2_id', 'team2_score', 'team1_season_points', 'team1_seed', 'team2_season_points', 'team2_seed']

#Create a team 1 win column to determine which rows actually have them winning
df['team1_win'] = (df['team1_score'] > df['team2_score']).astype(int)
df['season_points_ratio'] = df['team1_season_points'] / df['team2_season_points']

df['team2_seed'] = df['team2_seed'].apply(extract_numeric_part)
df['team1_seed'] = df['team1_seed'].apply(extract_numeric_part)
df['seed_diff'] = df['team1_seed'] - df['team2_seed']

# Randomly Select Games To Place in Df 1 and Rest in Df 2
df_1 = df.sample(frac=.5, random_state=15)
df_2 = df[~df.index.isin(df_1.index)].reset_index(drop=True)

# In Second Df, Flip The Attributes
df_2['team1_win'] = 0
t1s, t2s = df_2['team1_score'], df_2['team2_score']
df_2['team1_score'], df_2['team2_score'] = t2s, t1s
df_2['seed_diff'] = df_2['seed_diff'] * -1
df_2['season_points_ratio'] = 1 / df_2['season_points_ratio']

#Combine DataFrames, Create 2019 Holdout Set
train = pd.concat([df_1, df_2]).reset_index(drop=True)
train.head()

In [None]:
# Initialize H2O
h2o.init()

# Convert DataFrame to H2O Frame
train_h2o = h2o.H2OFrame(train)

# Test Train Split with Validation
train_h2o['team1_win'] = train_h2o['team1_win'].asfactor()
train, test = train_h2o.split_frame(ratios = [0.8], seed = 42)

# Initiate the H20 AutoML Model
aml = H2OAutoML(max_models = 5, seed = 42, verbosity = "info", nfolds = 3, balance_classes = True, 
                include_algos = ['XGBoost'], stopping_metric = 'logloss')

# Train Model with Feature and Target Columns
aml.train(x = ['season_points_ratio', 'seed_diff'], y = 'team1_win', training_frame = train)
print('Training Columns:', train.columns[5:],'\n','Target Column: "team1_win"')
print('Training Size:', train.shape,'\n','Test Size:', test.shape)

In [None]:
# Predict On Test Data and Evaluate Model Performance
# aml.leaderboard.head(5)
xgb_women = aml.get_best_model(algorithm="xgboost", criterion="logloss")
xgb_women.model_performance(test) 

In [None]:
# Explain Model Performance
h2o.explain(xgb_women,test)

In [None]:
# Create 2024 Data
tourney_seeds = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/2024_tourney_seeds.csv')
reg_season = pd.read_csv('/kaggle/input/march-machine-learning-mania-2024/WRegularSeasonDetailedResults.csv')

# Total Points Scored In Season
feats = pd.merge(pd.DataFrame(reg_season.groupby(['WTeamID', 'Season'])['WScore'].sum()).reset_index(), pd.DataFrame(reg_season.groupby(['LTeamID', 
    'Season'])['LScore'].sum()).reset_index(), left_on = ['WTeamID', 'Season'], right_on = ['LTeamID', 'Season'], how = 'inner').query('Season == 2024')
feats.drop(columns = ['LTeamID'], inplace = True)
feats.rename(columns = {'WTeamID': 'TeamID'}, inplace = True)
feats['season_points'] = feats['WScore'] + feats['LScore']

# Seed Of Teams That Made Tourney
train = pd.merge(feats, tourney_seeds, on = ['TeamID'] , how = 'left').dropna()
train.drop(columns = ['WScore', 'LScore'], inplace = True)
train['Seed'] = train['Seed'].apply(extract_numeric_part)

In [None]:
# Generate all combinations of IDs
all_id_combinations = list(set(itertools.product(train['TeamID'], repeat=2)))
new_data = {'ID': [pair[0] for pair in all_id_combinations],'Other_ID': [pair[1] for pair in all_id_combinations]}
all_combos_id = pd.DataFrame(new_data)

womens_2024 = train.merge(all_combos_id, left_on = 'TeamID', right_on = 'ID', how = 'left')
womens_2024 = womens_2024.merge(train, left_on = 'Other_ID', right_on = 'TeamID', how = 'left')
womens_2024.drop(columns = ['TeamID_x', 'Tournament_x', 'TeamID_y', 'Tournament_y', "Season_y"], inplace = True)

#Create a team 1 win column to determine which rows actually have them winning
womens_2024.columns = ['season', 'team1_season_points', 'team1_seed', 'team1_id', 'team2_id', 'team2_season_points', 'team2_seed']

womens_2024['season_points_ratio'] = womens_2024['team1_season_points'] / womens_2024['team2_season_points']

womens_2024['team2_position'] = womens_2024['team2_seed']
womens_2024['team2_seed'] = womens_2024['team2_seed'].apply(extract_numeric_part)
womens_2024['team1_seed'] = womens_2024['team1_seed'].apply(extract_numeric_part)
womens_2024['seed_diff'] = womens_2024['team1_seed'] - womens_2024['team2_seed']
womens_2024.head()

In [None]:
# Predict On 2024
womens_2024_predictions = xgb_women.predict(h2o.H2OFrame(womens_2024))
predictions = pd.concat([womens_2024[['season', 'team1_id', 'team2_id']], womens_2024_predictions.as_data_frame()], axis = 1)
predictions.rename(columns = {'p1':'Pred'}, inplace = True)

predictions['ID'] = (predictions['season'].astype(str) + "_" + predictions['team1_id'].astype(str) + "_" + predictions['team2_id'].astype(str))
predictions[['ID','Pred']].head()

In [None]:
result_w=run_simulation(brackets=n_brackets, seeds=seeds_w, preds=preds, round_slots=round_slots, sim=True)
result_w['Tournament'] = 'W'

In [None]:
submission = pd.concat([result_m, result_w])
submission.reset_index(inplace=True, drop=True)
submission.index.names = ['RowId']

submission.to_csv('/kaggle/')