In [303]:
# Import Libraries
import os 
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)

# Import and Merge Historical Team Data 
mdcm = pd.read_csv('../data/mdcm/NCAA_Tourney_2002_2023.csv')
team_spellings = pd.read_csv('../data/mdcm/team_spellings.csv')

ncaa_sheets = pd.read_csv('../data/cbbdata/team/ncaa_sheets.csv').query('year != 2020')
selection_sunday_resume = pd.read_csv('../data/cbbdata/team/selection_sunday_resume.csv').query('year != 2020')

coach_results = pd.read_csv('../data/kaggle/march_madness_data/coach_results.csv')
barttovik_home = pd.read_csv('../data/kaggle/march_madness_data/barttovik_home.csv')
barttovik_away = pd.read_csv('../data/kaggle/march_madness_data/barttovik_away.csv')
kenpom_barttovik = pd.read_csv('../data/kaggle/march_madness_data/kenpom_barttovik.csv')
shooting_splits = pd.read_csv('../data/kaggle/march_madness_data/shooting_splits.csv')
heat_check = pd.read_csv('../data/kaggle/march_madness_data/heat_check.csv')

In [304]:
# Define Functions
def merge_team_season(df: pd.DataFrame, df_merge_onto: pd.DataFrame, filter_df_merge_onto_year_cutoff: bool = False, filter_df_merge_onto_year = None, title: str = None):
    """
    
        Function to merge teams and their seasons in college basketball with a bevy of alternative spellings, using team_spellings.csv.
    
        df (pd.DataFrame): The dataframe you'd like to establish as your left, or original df. Must contain 'teamname' and 'season' columns.
        
        df_merge_onto (pd.DataFrame): The dataframe you'd like to left merge onto df. Must contain 'team' and 'year' columns.
        
        filter_df_merge_onto_year_cutoff (bool): A boolean determining if there is a cutoff year for the anchor data (i.e. Statcast data only 
                                                reaching back to 2015). This does not affect the merge, just the reporting success messages.
        
        filter_df_merge_onto_year (int): If there is a cutoff year for the df_merge_onto, the year of interest. This does not affect the 
                                        merge, just the reporting success messages.
    
    """
    # Print Title of Run For Terminal
    if title:
        print(title,'-----------------------\n')
        
    # If Oncoming Data Filtered By Specifc Year 
    if filter_df_merge_onto_year_cutoff == True:
        df_post_cutoff = df[df['season'] >= filter_df_merge_onto_year]
        df_pre_cutoff = df[df['season'] < filter_df_merge_onto_year]
    df = df_post_cutoff
    
    # Find Null DF Column Identifier For Oncoming DF
    for col_name in df_merge_onto.columns:
        if 'team' not in col_name.lower() and 'year' not in col_name.lower():
            col_null_match_identifier = col_name
            break
    
    # Loop Through Team 1 and Team 2
    df_both_teams = pd.DataFrame()
    for team_num in range(1, 3):
        
        # Adjust Column Names Due To Team1 and Team2
        df_merge_onto.columns = df_merge_onto.columns.str[6:] if team_num == 2 else df_merge_onto.columns
        df_merge_onto = df_merge_onto.add_prefix(f'team{team_num}_')
        
        # Set All The Team and Teamname Columns To Lowercase
        df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()
        df_merge_onto[f'team{team_num}_team'] = df_merge_onto[f'team{team_num}_team'].str.lower()
        
         # Original Merge
        print(f"Team {team_num} Merge ...\n")
        df = pd.merge(df, df_merge_onto, how = 'left', left_on = [f'team{team_num}_teamname', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
        df_not_merged = df[df[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
        print(f'Original Team {team_num} Merge:', len(df), 'total rows.')
        print('Matched During Iteration:', len(df) - len(df_not_merged)) 
        print('Unmatched Rows Remaining:', len(df_not_merged), '\n') 
        
        # Remove Columns That Didn't Merge Properly Based On Num of Columns
        neg_col_count_df_merge_onto = df_merge_onto.shape[1] * -1
        df_not_merged = df_not_merged.iloc[:, :neg_col_count_df_merge_onto]
          
        # Loop Through Columnns To Fix The Merge
        print(f"Correcting Team {team_num} Merge ...\n")
        merge_complete, i = False, 1
        while merge_complete == False:
            
            # Perform Loop Everytime More Unmatched Columns Are Found
            team_season_loop = pd.merge(df_not_merged, df_merge_onto, how = 'left', left_on = [f'team{team_num}_name_spelling_{i}', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
            print(f'Team {team_num} Season Loop {i}:', len(team_season_loop), 'total rows.')
            
            # Split Up The Matched and Unmatched
            matched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == False]
            print('Matched During Iteration:', len(matched_df)) 
            unmatched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
            print('Unmatched Rows Remaining:', len(unmatched_df), '\n') 
            
            # For The DataFrames With Data In The Column From Second DF, Add To team_season
            if len(matched_df) > 0:
                df = pd.concat([df, matched_df])
            # If There Are Still Null Rows, Throw Those Back In The Loop For The Next Iteration
            if len(unmatched_df) > 0:
                df_not_merged = unmatched_df.iloc[:, :neg_col_count_df_merge_onto]
            # If There Aren't Any Null Rows Left, End The Loop
            else:
                print(f'Success! Team {team_num} Merge Completed Early!\n')
                merge_complete = True
            if i == 11:
                merge_complete = True 
            i += 1
        # Concat Team 1 or 2 Onto Full DataFrame
        df_both_teams = pd.concat([df_both_teams, df])
    
    # Inspect Final Results
    print("Filter Views of Resulting DataFrame -------------------------\n")
    for team_num in range(1, 3):
        for na_value in [True, False]:
            print('Team', team_num, 'Filter Null Values In Oncoming DF?', na_value)
            if filter_df_merge_onto_year_cutoff == True:
                print(f'{filter_df_merge_onto_year} and After:', df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == na_value) & (df_both_teams['season'] >= filter_df_merge_onto_year)].shape)
                print(f'Before {filter_df_merge_onto_year}:', df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == na_value) & (df_both_teams['season'] < filter_df_merge_onto_year)].shape, '\n')
            else:
                print('Dataframe Shape:', df_both_teams[df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == na_value].shape)
    
    # Concat Pre and Post Cutoff If Exists
    if filter_df_merge_onto_year_cutoff == True:
        df_both_teams = pd.concat([df_both_teams, df_pre_cutoff], ignore_index=True)
        
    return df_both_teams.drop(['team1_team','team1_year', 'team2_team', 'team2_year'], axis = 1)

# Adjust Team Spellings
team_spellings = team_spellings.pivot_table(index='team_id', columns=team_spellings.groupby('team_id').cumcount(), values='name_spelling', aggfunc='first')
team_spellings.columns = [f'name_spelling_{i + 1}' for i in range(team_spellings.shape[1])]
team_spellings.reset_index(inplace=True)

# Merge Team Spellings
team_spellings_t1 = team_spellings.add_prefix('team1_')
mdcm = pd.merge(mdcm, team_spellings_t1, how = 'inner', left_on = ['team1_id'], right_on = ['team1_team_id'])
team_spellings_t2 = team_spellings.add_prefix('team2_')
mdcm = pd.merge(mdcm, team_spellings_t2, how = 'inner', left_on = ['team2_id'], right_on = ['team2_team_id'])

# Merge MDCM and NCAA_Sheets (2019-2024)
ncaa_sheets.drop(['seed', 'net', 'quad_1a', 'quad_1', 'quad_2', 'quad_1_and_2', 'quad_3', 'quad_4'], axis = 1, inplace = True)
mdcm = merge_team_season(mdcm, ncaa_sheets, filter_df_merge_onto_year_cutoff = True, filter_df_merge_onto_year = 2019, title = 'MDCM and NCAA Sheets (2019-2024)')

# Merge MDCM and Selection Sunday (2008-2024)
selection_sunday_resume = selection_sunday_resume[selection_sunday_resume['year'] != 'Year']
selection_sunday_resume['year'] = selection_sunday_resume['year'].astype(int)
selection_sunday_resume.drop(columns = ['net', 'seed'], axis = 1, inplace = True)
selection_sunday_resume.rename(columns={'score': 'team_score'}, inplace=True)
mdcm = merge_team_season(mdcm, selection_sunday_resume, filter_df_merge_onto_year_cutoff = True, filter_df_merge_onto_year = 2008, title = 'MDCM and Selection Sunday (2008 - 2024)')

MDCM and NCAA Sheets (2019-2024) -----------------------

Team 1 Merge ...

Original Team 1 Merge: 268 total rows.
Matched During Iteration: 214
Unmatched Rows Remaining: 54 

Correcting Team 1 Merge ...

Team 1 Season Loop 1: 54 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 54 

Team 1 Season Loop 2: 54 total rows.
Matched During Iteration: 9
Unmatched Rows Remaining: 45 

Team 1 Season Loop 3: 45 total rows.
Matched During Iteration: 31
Unmatched Rows Remaining: 14 

Team 1 Season Loop 4: 14 total rows.
Matched During Iteration: 6
Unmatched Rows Remaining: 8 

Team 1 Season Loop 5: 8 total rows.
Matched During Iteration: 3
Unmatched Rows Remaining: 5 

Team 1 Season Loop 6: 5 total rows.
Matched During Iteration: 3
Unmatched Rows Remaining: 2 

Team 1 Season Loop 7: 2 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 2 

Team 1 Season Loop 8: 2 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 2 

Team 1 Season Loop 9: 2 total rows.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()


Matched During Iteration: 3
Unmatched Rows Remaining: 3 

Team 2 Season Loop 7: 3 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 3 

Team 2 Season Loop 8: 3 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 3 

Team 2 Season Loop 9: 3 total rows.
Matched During Iteration: 3
Unmatched Rows Remaining: 0 

Success! Team 2 Merge Completed Early!

Filter Views of Resulting DataFrame -------------------------

Team 1 Filter Null Values In Oncoming DF? True
2019 and After: (129, 148)
Before 2019: (0, 148) 

Team 1 Filter Null Values In Oncoming DF? False
2019 and After: (620, 148)
Before 2019: (0, 148) 

Team 2 Filter Null Values In Oncoming DF? True
2019 and After: (427, 148)
Before 2019: (0, 148) 

Team 2 Filter Null Values In Oncoming DF? False
2019 and After: (322, 148)
Before 2019: (0, 148) 

MDCM and Selection Sunday (2008 - 2024) -----------------------

Team 1 Merge ...

Original Team 1 Merge: 1477 total rows.
Matched During Iteration: 1051
Unmatched

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()


Team 1 Season Loop 8: 140 total rows.
Matched During Iteration: 1
Unmatched Rows Remaining: 139 

Team 1 Season Loop 9: 139 total rows.
Matched During Iteration: 14
Unmatched Rows Remaining: 125 

Team 1 Season Loop 10: 125 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 125 

Team 1 Season Loop 11: 125 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 125 

Team 2 Merge ...

Original Team 2 Merge: 1778 total rows.
Matched During Iteration: 1000
Unmatched Rows Remaining: 778 

Correcting Team 2 Merge ...

Team 2 Season Loop 1: 778 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 778 

Team 2 Season Loop 2: 778 total rows.
Matched During Iteration: 54
Unmatched Rows Remaining: 724 

Team 2 Season Loop 3: 724 total rows.
Matched During Iteration: 225
Unmatched Rows Remaining: 499 

Team 2 Season Loop 4: 499 total rows.
Matched During Iteration: 70
Unmatched Rows Remaining: 429 

Team 2 Season Loop 5: 429 total rows.
Matched During Iterat

In [307]:
mdcm.season.value_counts()

2023    668
2019    514
2022    467
2021    445
2012    186
2013    185
2014    179
2018    176
2011    173
2015    171
2016    166
2010    164
2017    160
2009    158
2008    153
2003     64
2005     64
2002     64
2004     64
2006     64
2007     64
Name: season, dtype: int64

In [309]:
mdcm.groupby('season')['team2_teamname'].nunique().reset_index(name='Unique_Count')

Unnamed: 0,season,Unique_Count
0,2002,64
1,2003,64
2,2004,64
3,2005,64
4,2006,64
5,2007,64
6,2008,128
7,2009,128
8,2010,128
9,2011,134


In [70]:
# Merge Kaggle march_madness_data
files = ['coach_results', 'barttovik_home', 'barttovik_away', 'kenpom_barttovik', 'shooting_splits', 'heat_check']
for file in files:
    df = pd.read_csv(f'../data/kaggle/march_madness_data/{file}.csv')
    if file == 'coach_results':
        
        print('coach')
        # need additional param for coach id
        
    elif file in files[1:2]:
        print(df.shape)
        df.columns = df.columns.str.lower()
        mdcm = merge_team_season(mdcm, df, filter_2019 = False)
        mdcm.rename(columns={col: f'{file[-4:]}_' + col for col in df.columns[-85:]}, inplace=True)
        
    elif file in files[:-2]:
        print('kenpom')
    else:
        print('rest')
    # elif file in file[]: 
    
mdcm.head()

coach
(1079, 85)
Merging Team 1 ...
Team Season Loop 1: 2181 total rows.
Matched DataFrame Rows: 0
Unmatched DataFrame Rows: 2181
Not Merged Ending Iteration Shape: (2181, 170) 

Team Season Loop 2: 2181 total rows.
Matched DataFrame Rows: 0
Unmatched DataFrame Rows: 2181
Not Merged Ending Iteration Shape: (2181, 170) 

Team Season Loop 3: 2181 total rows.
Matched DataFrame Rows: 78
Unmatched DataFrame Rows: 2103
Not Merged Ending Iteration Shape: (2103, 170) 

Team Season Loop 4: 2103 total rows.
Matched DataFrame Rows: 102
Unmatched DataFrame Rows: 2001
Not Merged Ending Iteration Shape: (2001, 170) 

Team Season Loop 5: 2001 total rows.
Matched DataFrame Rows: 266
Unmatched DataFrame Rows: 1735
Not Merged Ending Iteration Shape: (1735, 170) 

Team Season Loop 6: 1735 total rows.
Matched DataFrame Rows: 800
Unmatched DataFrame Rows: 935
Not Merged Ending Iteration Shape: (935, 170) 

Team Season Loop 7: 935 total rows.
Matched DataFrame Rows: 8
Unmatched DataFrame Rows: 927
Not Merge

Unnamed: 0,team1_id,team1_score_x,team2_id,team2_score,WLoc,num_ot,team1_position,team2_position,team1_seed_x,team2_seed,strongseed,weakseed,team1_region,team2_region,slot,team1_teamname,team2_teamname,season,host,host_lat,host_long,team1_lat,team1_long,team2_lat,team2_long,team1_coach_id,team1_pt_school_ncaa,team1_pt_overall_ncaa,team1_pt_school_s16,team1_pt_overall_s16,team1_pt_school_ff,team1_pt_overall_ff,team1_pt_career_school_wins,team1_pt_career_school_losses,team1_pt_career_overall_wins,team1_pt_career_overall_losses,team1_pt_team_season_wins,team1_pt_team_season_losses,team1_pt_coach_season_wins,team1_pt_coach_season_losses,team2_coach_id,team2_pt_school_ncaa,team2_pt_overall_ncaa,team2_pt_school_s16,team2_pt_overall_s16,team2_pt_school_ff,team2_pt_overall_ff,team2_pt_career_school_wins,team2_pt_career_school_losses,team2_pt_career_overall_wins,team2_pt_career_overall_losses,team2_pt_team_season_wins,team2_pt_team_season_losses,team2_pt_coach_season_wins,team2_pt_coach_season_losses,team1_ap_final,team1_ap_preseason,team1_coaches_before_final,team1_coaches_preseason,team2_ap_final,team2_ap_preseason,team2_coaches_before_final,team2_coaches_preseason,team1_fg2pct,team1_fg3pct,team1_ftpct,team1_blockpct,team1_oppfg2pct,team1_oppfg3pct,team1_oppftpct,team1_oppblockpct,team1_f3grate,team1_oppf3grate,team1_arate,team1_opparate,team1_stlrate,team1_oppstlrate,team2_fg2pct,team2_fg3pct,team2_ftpct,team2_blockpct,team2_oppfg2pct,team2_oppfg3pct,team2_oppftpct,team2_oppblockpct,team2_f3grate,team2_oppf3grate,team2_arate,team2_opparate,team2_stlrate,team2_oppstlrate,team1_tempo,team1_adjtempo,team1_oe,team1_adjoe,team1_de,team1_adjde,team2_tempo,team2_adjtempo,team2_oe,...,team1_3ptr rank,team1_2ptrd rank,team1_3ptrd rank,team1_badjt rank,team1_avg hgt rank,team1_eff hgt rank,team1_exp rank,team1_talent rank,team1_ft% rank,team1_op ft% rank,team1_pppo rank,team1_pppd rank,team1_elite sos rank,team1_seed,team1_wab,team2_seed_x,team2_wab_x,team2_team no,team2_team id,team2_seed_y,team2_round,team2_badj em,team2_badj o,team2_badj d,team2_barthag,team2_games,team2_w,team2_l,team2_win%,team2_efg%,team2_efg%d,team2_ftr,team2_ftrd,team2_tov%,team2_tov%d,team2_oreb%,team2_dreb%,team2_op oreb%,team2_op dreb%,team2_raw t,team2_2pt%,team2_2pt%d,team2_3pt%,team2_3pt%d,team2_blk%,team2_blked%,team2_ast%,team2_op ast%,team2_2ptr,team2_3ptr,team2_2ptrd,team2_3ptrd,team2_badj t,team2_avg hgt,team2_eff hgt,team2_exp,team2_talent,team2_ft%,team2_op ft%,team2_pppo,team2_pppd,team2_elite sos,team2_wab_y,team2_badj em rank,team2_badj o rank,team2_badj d rank,team2_barthag rank,team2_efg% rank,team2_efgd% rank,team2_ftr rank,team2_ftrd rank,team2_tov% rank,team2_tov%d rank,team2_oreb% rank,team2_dreb% rank,team2_op oreb% rank,team2_op dreb% rank,team2_raw t rank,team2_2pt% rank,team2_2pt%d rank,team2_3pt% rank,team2_3pt%d rank,team2_blk% rank,team2_blked% rank,team2_ast% rank,team2_op ast% rank,team2_2ptr rank,team2_3ptr rank,team2_2ptrd rank,team2_3ptrd rank,team2_badjt rank,team2_avg hgt rank,team2_eff hgt rank,team2_exp rank,team2_talent rank,team2_ft% rank,team2_op ft% rank,team2_pppo rank,team2_pppd rank,team2_elite sos rank
0,1314,81,1181,77,N,0,W08,X02,8,2.0,2,8,W,X,R5WX,North Carolina,Duke,2022,new_orleans,29.9667,-90.05,35.912165,-79.050969,36.00159,-78.94226,hubert-davis-1,0.0,0.0,0.0,0.0,0.0,0.0,24.0,9,24.0,9,24.0,9,24.0,9,mike-krzyzewski-1,35.0,35.0,25.0,25.0,12.0,12.0,1125.0,308,1198.0,367,28.0,6,28.0,6,,19.0,,20.0,9.0,9.0,10.0,9.0,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,55.844156,36.821192,72.881356,13.25811,46.897038,31.396957,67.160494,8.097785,36.579457,33.769267,56.987116,51.793722,0.094448,0.091014,70.2181,70.1745,109.416,113.035,100.974,96.9911,68.2765,67.4185,117.152,...,190.0,190.0,175.0,77.0,29.0,24.0,259.0,2.0,106.0,217.0,44.0,124.0,69.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1277,68,1181,67,N,0,W02,W01,2,1.0,1,2,W,W,R4W1,Michigan St,Duke,2019,washington,38.8951,-77.0367,42.72476,-84.473639,36.00159,-78.94226,tom-izzo-1,21.0,21.0,13.0,13.0,7.0,7.0,602.0,231,602.0,231,28.0,6,28.0,6,mike-krzyzewski-1,34.0,34.0,24.0,24.0,12.0,12.0,1056.0,284,1129.0,343,29.0,5,29.0,5,11.0,10.0,5.0,10.0,4.0,3.0,1.0,4.0,55.140187,38.342697,75.0,13.899614,41.158301,31.781915,67.54386,8.071368,37.6919,36.736688,68.329718,52.331606,0.077123,0.093336,58.302583,30.195599,68.9747,16.361072,44.922426,29.34927,69.097889,7.97048,37.64381,34.684477,52.16972,48.834499,0.128027,0.088267,68.6506,67.6316,114.457,121.681,94.8722,90.5738,73.6764,72.2812,113.184,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,1397,65,1181,52,N,0,W04,W05,4,5.0,4,5,W,W,R2W4,Tennessee,Duke,2023,orlando,28.4158,-81.2989,35.955093,-83.929696,36.00159,-78.94226,rick-barnes-1,5.0,27.0,2.0,8.0,0.0,1.0,175.0,92,779.0,406,25.0,11,25.0,11,jon-scheyer-1,1.0,1.0,0.0,0.0,0.0,0.0,27.0,9,27.0,9,27.0,9,27.0,9,17.0,11.0,16.0,11.0,21.0,7.0,18.0,8.0,50.91225,32.948718,71.356784,11.811024,44.586614,26.206897,72.203947,8.514335,40.393578,41.642734,66.192171,50.07776,0.12442,0.102088,51.462451,33.573487,76.986755,12.037766,46.656176,30.377907,76.483051,9.644269,35.426238,35.119959,56.674208,47.880299,0.086547,0.093239,66.2411,65.5273,108.476,112.675,87.5278,87.9699,65.7055,65.258,110.245,...,121.0,42.0,323.0,260.0,116.0,17.0,211.0,29.0,265.0,158.0,47.0,1.0,30.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,1314,69,1389,49,N,0,W08,W15,8,15.0,8,15,W,W,R4W1,North Carolina,St Peter's,2022,philadelphia,39.95,-75.1667,35.912165,-79.050969,40.727105,-74.071541,hubert-davis-1,0.0,0.0,0.0,0.0,0.0,0.0,24.0,9,24.0,9,24.0,9,24.0,9,shaheen-holloway-1,0.0,0.0,0.0,0.0,0.0,0.0,61.0,53,61.0,53,19.0,11,19.0,11,,19.0,,20.0,,,,,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,45.916515,34.791667,68.46543,14.609053,44.032922,29.422383,73.482428,13.157895,30.34134,36.304063,51.560178,51.607445,0.108634,0.105047,70.2181,70.1745,109.416,113.035,100.974,96.9911,67.0262,65.9923,98.7557,...,190.0,190.0,175.0,77.0,29.0,24.0,259.0,2.0,106.0,217.0,44.0,124.0,69.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,1314,73,1417,66,N,0,W08,W04,8,4.0,4,8,W,W,R3W1,North Carolina,UCLA,2022,philadelphia,39.95,-75.1667,35.912165,-79.050969,34.068892,-118.443901,hubert-davis-1,0.0,0.0,0.0,0.0,0.0,0.0,24.0,9,24.0,9,24.0,9,24.0,9,mick-cronin-1,1.0,12.0,1.0,2.0,1.0,1.0,66.0,29,431.0,200,25.0,7,25.0,7,,19.0,,20.0,11.0,2.0,12.0,2.0,50.714286,36.206897,77.21943,9.814529,48.299845,34.920635,73.696145,8.492063,37.437934,36.878049,53.618421,48.031496,0.076692,0.093829,49.558824,35.14377,73.648649,9.575429,47.33514,32.243685,72.254335,7.867647,31.520645,37.808989,50.0,51.147099,0.102746,0.066214,70.2181,70.1745,109.416,113.035,100.974,96.9911,67.1833,65.5765,111.821,...,190.0,190.0,175.0,77.0,29.0,24.0,259.0,2.0,106.0,217.0,44.0,124.0,69.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [168]:
file = ['coach_results', 'barttovik_home', 'barttovik_away', 'kenpom_barttovik', 'shooting_splits', 'heat_check']
pd.read_csv(f'../data/kaggle/march_madness_data/{file[3]}.csv').head()

Unnamed: 0,YEAR,CONF,CONF ID,QUAD NO,QUAD ID,TEAM NO,TEAM ID,TEAM,SEED,ROUND,K TEMPO,K TEMPO RANK,KADJ T,KADJ T RANK,K OFF,KO RANK,KADJ O,KADJ O RANK,K DEF,KD RANK,KADJ D,KADJ D RANK,KADJ EM,KADJ EM RANK,BADJ EM,BADJ O,BADJ D,BARTHAG,GAMES,W,L,WIN%,EFG%,EFG%D,FTR,FTRD,TOV%,TOV%D,OREB%,DREB%,OP OREB%,OP DREB%,RAW T,2PT%,2PT%D,3PT%,3PT%D,BLK%,BLKED%,AST%,OP AST%,2PTR,3PTR,2PTRD,3PTRD,BADJ T,AVG HGT,EFF HGT,EXP,TALENT,FT%,OP FT%,PPPO,PPPD,ELITE SOS,WAB,BADJ EM RANK,BADJ O RANK,BADJ D RANK,BARTHAG RANK,EFG% RANK,EFGD% RANK,FTR RANK,FTRD RANK,TOV% RANK,TOV%D RANK,OREB% RANK,DREB% RANK,OP OREB% RANK,OP DREB% RANK,RAW T RANK,2PT% RANK,2PT%D RANK,3PT% RANK,3PT%D RANK,BLK% RANK,BLKED% RANK,AST% RANK,OP AST% RANK,2PTR RANK,3PTR RANK,2PTRD RANK,3PTRD RANK,BADJT RANK,AVG HGT RANK,EFF HGT RANK,EXP RANK,TALENT RANK,FT% RANK,OP FT% RANK,PPPO RANK,PPPD RANK,ELITE SOS RANK
0,2024,MAC,17,64,4,1079,2,Akron,14,0,66.5246,292,65.4089,294,109.684,88,109.69,110,97.9075,40,100.646,72,9.0439,90,8.267,108.551,100.284,0.713,21,16,5,76.19048,53.0,48.4,34.6,29.8,16.9,17.9,29.7,74.7,25.3,70.3,66.5,54.4,49.7,34.0,30.8,8.0,8.3,49.1,47.1,58.5,41.5,63.8,36.2,65.326,76.612,79.89,2.564,11.033,73.0,68.6,1.098,0.98,12.238,0.0,95,129,71,95,73,79,132,104,154,130,157,40,40,157,295,43,148,170,37,233,108,212,93,288,77,227,139,298,266,216,19,175,125,46,87,42,277
1,2024,SEC,28,62,2,1078,3,Alabama,3,0,73.0676,24,72.0976,17,123.229,1,125.857,2,102.693,137,99.7332,57,26.1239,7,27.305,126.633,99.328,0.942,22,16,6,72.72727,57.7,48.6,36.3,35.8,16.9,16.9,35.0,71.0,29.0,65.0,73.2,58.1,49.2,38.2,31.7,10.7,11.3,51.4,44.5,52.1,47.9,62.9,37.1,72.011,78.251,82.491,2.07,26.153,79.1,70.6,1.228,1.028,31.268,0.0,5,2,53,6,3,88,93,261,154,205,29,174,174,29,23,9,125,16,70,108,314,140,41,352,13,197,176,18,29,7,162,110,5,138,1,140,7
2,2024,SB,26,63,3,1077,7,Appalachian St.,12,0,68.906,171,67.1149,216,108.813,109,108.259,134,96.0406,24,98.5874,45,9.67158,87,7.98,106.628,98.648,0.71,21,17,4,80.95238,52.0,45.4,32.1,21.0,15.1,15.4,30.1,72.4,27.6,69.9,68.9,54.3,44.1,31.5,31.9,15.6,5.7,50.5,43.3,66.7,33.3,66.3,33.7,66.932,76.968,80.693,2.129,16.566,66.2,69.0,1.088,0.965,12.355,0.0,98,163,47,96,115,16,199,5,61,300,142,115,115,142,176,44,13,279,79,8,3,165,26,88,278,300,65,221,209,102,149,147,328,63,106,29,274
3,2024,P12,24,61,1,1076,8,Arizona,2,0,74.5598,8,72.7631,11,118.039,10,121.455,6,95.9483,23,94.4483,12,27.007,4,26.191,121.091,94.9,0.943,22,17,5,77.27273,54.8,49.3,36.0,24.6,16.3,19.1,37.8,78.1,21.9,62.2,74.6,54.8,47.5,36.5,34.7,9.1,7.7,59.1,54.4,68.1,31.9,60.3,39.7,72.836,78.144,81.69,1.916,70.181,72.3,71.1,1.181,0.956,29.109,0.0,6,8,14,5,31,121,104,16,120,72,9,3,3,9,8,33,77,50,247,186,69,31,276,58,307,99,267,11,40,22,210,11,163,167,10,23,22
4,2024,SEC,28,64,4,1075,12,Auburn,5,0,70.7523,82,69.8715,71,116.363,13,119.147,13,93.071,5,92.3563,5,26.7903,5,27.394,120.062,92.668,0.952,22,18,4,81.81818,53.3,43.3,36.2,36.5,15.0,19.4,34.0,71.0,29.0,66.0,70.8,55.1,42.6,33.6,29.7,15.7,7.8,63.5,45.9,61.0,39.0,64.4,35.6,69.98,77.653,80.941,2.196,44.191,76.3,73.2,1.164,0.931,22.047,0.0,4,12,6,4,62,2,96,276,56,64,42,174,174,42,82,28,3,186,18,7,77,6,63,228,138,244,121,63,101,78,127,64,37,275,14,6,86


In [None]:
# Import Team Ratings By Day (2015-2019)
team_ratings = pd.read_csv('../data/cbbdata/team/team_ratings.csv').query('year != 2020')
team_ratings['date']= pd.to_datetime(team_ratings['date'])

# Split Daily Team Ranknings By Before/After Selection Sunday
ss_dict = {2024: "2024-3-17", 2023: "2023-3-12", 2022: "2022-3-13", 
           2021: "2021-3-14", 2019: "2019-3-17", 2018: "2018-3-11", 
           2017: "2017-3-12", 2016: "2016-3-13", 2015: "2015-3-15"}

# Create A Pre and During NCAA Tournament Day By Day Ratings
team_rating_pre_ncaa = pd.DataFrame(columns = team_ratings.columns)
team_rating_ncaa = pd.DataFrame(columns = team_ratings.columns)
for year, ss_date in ss_dict.items():
    team_rating_pre_ncaa = pd.concat([team_rating_pre_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] < ss_date)]])    
    team_rating_ncaa = pd.concat([team_rating_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] > ss_date)]])
    
# Look At Team Rating By Day and Calculate Rolling Adj Offensive Rank
team_rating_pre_ncaa['rolling_avg_adj_o_rk'] = team_rating_pre_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
df = team_rating_pre_ncaa[(team_rating_pre_ncaa['team'] == 'Kentucky') & (team_rating_pre_ncaa['year'] == 2015)]
# team_rating_ncaa['rolling_avg_adj_o_rk'] = team_rating_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
# df = team_rating_ncaa[(team_rating_ncaa['team'] == 'Kentucky') & (team_rating_ncaa['year'] == 2015)]