In [172]:
# Import Libraries
import os 
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 300)

# Import and Merge Historical Team Data 
mdcm = pd.read_csv('../data/mdcm/NCAA_Tourney_2002_2023.csv')
team_spellings = pd.read_csv('../data/mdcm/team_spellings.csv')

ncaa_sheets = pd.read_csv('../data/cbbdata/team/ncaa_sheets.csv').query('year != 2020')
selection_sunday_resume = pd.read_csv('../data/cbbdata/team/selection_sunday_resume.csv').query('year != 2020')

coach_results = pd.read_csv('../data/kaggle/march_madness_data/coach_results.csv')
barttovik_home = pd.read_csv('../data/kaggle/march_madness_data/barttovik_home.csv')
barttovik_away = pd.read_csv('../data/kaggle/march_madness_data/barttovik_away.csv')
kenpom_barttovik = pd.read_csv('../data/kaggle/march_madness_data/kenpom_barttovik.csv')
shooting_splits = pd.read_csv('../data/kaggle/march_madness_data/shooting_splits.csv')
heat_check = pd.read_csv('../data/kaggle/march_madness_data/heat_check.csv')

In [173]:
# Define Functions
def merge_team_season(df: pd.DataFrame, df_merge_onto: pd.DataFrame, filter_df_merge_onto_year_cutoff: bool = False, filter_df_merge_onto_year = None, title: str = None, home_away: str = None):
    """
    
        Function to merge teams and their seasons in college basketball with a bevy of alternative spellings, using team_spellings.csv.
    
        df (pd.DataFrame): The dataframe you'd like to establish as your left, or original df. Must contain 'teamname' and 'season' columns.
        
        df_merge_onto (pd.DataFrame): The dataframe you'd like to left merge onto df. Must contain 'team' and 'year' columns.
        
        filter_df_merge_onto_year_cutoff (bool): A boolean determining if there is a cutoff year for the anchor data (i.e. Statcast data only 
                                                reaching back to 2015). This does not affect the merge, just the reporting success messages.
        
        filter_df_merge_onto_year (int): If there is a cutoff year for the df_merge_onto, the year of interest. This does not affect the 
                                        merge, just the reporting success messages.
    
    """
    # Print Title of Run For Terminal
    if title:
        print(title,'-----------------------\n')
        
    # If Oncoming Data Filtered By Specifc Year 
    if filter_df_merge_onto_year_cutoff == True:
        df_post_cutoff = df[df['season'] >= filter_df_merge_onto_year]
        df_pre_cutoff = df[df['season'] < filter_df_merge_onto_year]
        df = df_post_cutoff
    
    # Find Null df Column Identifier For Oncoming DF
    for col_name in df_merge_onto.columns:
        if 'team' not in col_name.lower() and 'year' not in col_name.lower():
            col_null_match_identifier = col_name
            break
    
    # Set Both Teamname Columns To Lowercas
    for team_num in range(1,3):
        df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()
    
    # Loop Through Team 1 and Team 2
    df_both_teams = pd.DataFrame()
    for team_num in range(1, 3):
        
        # Adjust Column Names Due To Team1 and Team2
        df_merge_onto.columns = df_merge_onto.columns.str[6:] if team_num == 2 else df_merge_onto.columns
        df_merge_onto = df_merge_onto.add_prefix(f'team{team_num}_')
        
        # Set The Team Column To Lowercase
        df_merge_onto[f'team{team_num}_team'] = df_merge_onto[f'team{team_num}_team'].str.lower()
        
        # If Second Iteration, Find The Merge Columns and The Spellings Ones To Keep
        if team_num == 2:
            spellings = ['team2_teamname', 'season']
            for col in df.columns:
                if f"team{team_num}_name_spelling" in col: 
                    spellings.append(col)
            df = df[spellings]
        
         # Original Merge
        print(f"Team {team_num} Merge ...\n")
        
        df_merged = pd.merge(df, df_merge_onto, how = 'left', left_on = [f'team{team_num}_teamname', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
        
        # Split Up Merged and Unmerged Data
        df_not_merged = df_merged[df_merged[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
        df_merged = df_merged[df_merged[f'team{team_num}_{col_null_match_identifier}'].isna() == False]
        
        print(f'Original Team {team_num} Merge:', len(df), 'total rows.')
        print('Matched During Iteration:', len(df_merged)) 
        print('Unmatched Rows Remaining:', len(df_not_merged), '\n') 
        
        # Remove Columns That Didn't Merge Properly Based On Num of Columns
        neg_col_count_df_merge_onto = df_merge_onto.shape[1] * -1
        df_not_merged = df_not_merged.iloc[:, :neg_col_count_df_merge_onto]
          
        # Loop Through Columnns To Fix The Merge
        print(f"Correcting Team {team_num} Merge ...\n")
        merge_complete, i = False, 1
        while merge_complete == False:
            
            # Perform Loop Everytime More Unmatched Columns Are Found
            team_season_loop = pd.merge(df_not_merged, df_merge_onto, how = 'left', left_on = [f'team{team_num}_name_spelling_{i}', 'season'], right_on = [f'team{team_num}_team', f'team{team_num}_year'])
            print(f'Team {team_num} Season Loop {i}:', len(team_season_loop), 'total rows.')
            
            # Split Up The Matched and Unmatched
            matched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == False]
            print('Matched During Iteration:', len(matched_df)) 
            unmatched_df = team_season_loop[team_season_loop[f'team{team_num}_{col_null_match_identifier}'].isna() == True]
            print('Unmatched Rows Remaining:', len(unmatched_df), '\n') 
            
            # For The DataFrames With Data In The Column From Second DF, Add To team_season
            if len(matched_df) > 0:
                df_merged = pd.concat([df_merged, matched_df])
            # If There Are Still Null Rows, Throw Those Back In The Loop For The Next Iteration
            if len(unmatched_df) > 0:
                df_not_merged = unmatched_df.iloc[:, :neg_col_count_df_merge_onto]
            # If There Aren't Any Null Rows Left, End The Loop
            else:
                print(f'Success! Team {team_num} Merge Completed Early!\n')
                merge_complete = True
            if i == 11:
                merge_complete = True 
            i += 1
   
        # Concat Team 1 or 2 Onto Full DataFrame
        if team_num == 2:
            spellings = []
            for col in df_merged.columns:
                if 'team2_name_spelling' not in col:
                    spellings.append(col)
                
            df_both_teams = pd.merge(df_both_teams, df_merged[spellings], how = 'left', on = [f'team2_teamname', 'season'])
        else:
            df_both_teams = df_merged
    
    # Inspect Final Results
    print("Filter Views of Resulting DataFrame -------------------------\n")
    for team_num in range(1, 3):
        print(f'Team{team_num}:')
        if filter_df_merge_onto_year_cutoff == True:
            # Gather Data About Merge Post Cutoff
            post_cutoff_rows = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == False) & (df_both_teams['season'] >= filter_df_merge_onto_year)].shape[0]
            post_cutoff_rows_na = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == True) & (df_both_teams['season'] >= filter_df_merge_onto_year)].shape[0]
            if post_cutoff_rows_na > 0:
                print(f'Oh No! There were {post_cutoff_rows} matches and {post_cutoff_rows_na} post {filter_df_merge_onto_year} null games.')
                # Get Teamnames That Dont Merge
                if pd.read_csv('../data/sample_spell.csv').shape[0] == pd.merge(pd.read_csv('../data/mdcm/team_spellings.csv'), 
                    pd.read_csv('../data/sample_spell.csv'), how = 'inner', left_on = 'name_spelling', right_on = 'team2_teamname').shape[0]:
                        print("Despite all the mismatches, none of them were in the merged onto Dataframe.")
                else:
                    print("Here are the distinct team names from the 'df' dataframe that weren't merged properly:")
                    print(df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == True) & (df_both_teams['season'] >= filter_df_merge_onto_year)][f'team{team_num}_teamname'].drop_duplicates().sort_values())
            else:
                print(f'Great! No Null Rows Post {filter_df_merge_onto_year}')
     
            # Gather Data About Merge Pre Cutoff
            pre_cutoff_rows = df_both_teams[(df_both_teams[f'team{team_num}_{col_null_match_identifier}'].isna() == False) & (df_both_teams['season'] < filter_df_merge_onto_year)].shape[0]
            if pre_cutoff_rows == 0:
                print(f'Great! No Matched Rows Pre {filter_df_merge_onto_year}\n')
                                
    # Concat Pre and Post Cutoff If Exists
    if filter_df_merge_onto_year_cutoff == True:
        df_both_teams = pd.concat([df_both_teams, df_pre_cutoff], ignore_index=True)    
    
    return df_both_teams.drop(['team1_team','team1_year', 'team2_team', 'team2_year'], axis = 1)

# Adjust Team Spellings
team_spellings = team_spellings.pivot_table(index='team_id', columns=team_spellings.groupby('team_id').cumcount(), values='name_spelling', aggfunc='first')
team_spellings.columns = [f'name_spelling_{i + 1}' for i in range(team_spellings.shape[1])]
team_spellings.reset_index(inplace=True)

# Merge Team Spellings
team_spellings_t1 = team_spellings.add_prefix('team1_')
mdcm = pd.merge(mdcm, team_spellings_t1, how = 'inner', left_on = ['team1_id'], right_on = ['team1_team_id'])
team_spellings_t2 = team_spellings.add_prefix('team2_')
mdcm = pd.merge(mdcm, team_spellings_t2, how = 'inner', left_on = ['team2_id'], right_on = ['team2_team_id'])

# Merge MDCM and NCAA_Sheets (2019-2024)
ncaa_sheets.drop(['seed', 'net', 'quad_1a', 'quad_1', 'quad_2', 'quad_1_and_2', 'quad_3', 'quad_4'], axis = 1, inplace = True)
mdcm = merge_team_season(mdcm, ncaa_sheets, filter_df_merge_onto_year_cutoff = True, filter_df_merge_onto_year = 2019, title = 'MDCM and NCAA Sheets (2019-2024)')

# Merge MDCM and Selection Sunday (2008-2024)
selection_sunday_resume = selection_sunday_resume[selection_sunday_resume['year'] != 'Year']
selection_sunday_resume['year'] = selection_sunday_resume['year'].astype(int)
selection_sunday_resume.drop(columns = ['net', 'seed'], axis = 1, inplace = True)
selection_sunday_resume.rename(columns={'score': 'team_score'}, inplace=True)
mdcm = merge_team_season(mdcm, selection_sunday_resume, filter_df_merge_onto_year_cutoff = True, filter_df_merge_onto_year = 2008, title = 'MDCM and Selection Sunday (2008 - 2024)')

MDCM and NCAA Sheets (2019-2024) -----------------------

Team 1 Merge ...

Original Team 1 Merge: 268 total rows.
Matched During Iteration: 214
Unmatched Rows Remaining: 54 

Correcting Team 1 Merge ...

Team 1 Season Loop 1: 54 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 54 

Team 1 Season Loop 2: 54 total rows.
Matched During Iteration: 9
Unmatched Rows Remaining: 45 

Team 1 Season Loop 3: 45 total rows.
Matched During Iteration: 31
Unmatched Rows Remaining: 14 

Team 1 Season Loop 4: 14 total rows.
Matched During Iteration: 6
Unmatched Rows Remaining: 8 

Team 1 Season Loop 5: 8 total rows.
Matched During Iteration: 3
Unmatched Rows Remaining: 5 

Team 1 Season Loop 6: 5 total rows.
Matched During Iteration: 3
Unmatched Rows Remaining: 2 

Team 1 Season Loop 7: 2 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 2 

Team 1 Season Loop 8: 2 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 2 

Team 1 Season Loop 9: 2 total rows.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()


Team 1 Season Loop 5: 67 total rows.
Matched During Iteration: 6
Unmatched Rows Remaining: 61 

Team 1 Season Loop 6: 61 total rows.
Matched During Iteration: 2
Unmatched Rows Remaining: 59 

Team 1 Season Loop 7: 59 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 59 

Team 1 Season Loop 8: 59 total rows.
Matched During Iteration: 1
Unmatched Rows Remaining: 58 

Team 1 Season Loop 9: 58 total rows.
Matched During Iteration: 6
Unmatched Rows Remaining: 52 

Team 1 Season Loop 10: 52 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 52 

Team 1 Season Loop 11: 52 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 52 

Team 2 Merge ...

Original Team 2 Merge: 998 total rows.
Matched During Iteration: 609
Unmatched Rows Remaining: 389 

Correcting Team 2 Merge ...

Team 2 Season Loop 1: 389 total rows.
Matched During Iteration: 0
Unmatched Rows Remaining: 389 

Team 2 Season Loop 2: 389 total rows.
Matched During Iteration: 21
Unmatched Row

In [174]:
# Merge Kaggle march_madness_data
files = ['coach_results', 'barttovik_home', 'barttovik_away', 'kenpom_barttovik', 'shooting_splits', 'heat_check']
for file in files:
    df = pd.read_csv(f'../data/kaggle/march_madness_data/{file}.csv')
    
    # if file == 'coach_results':
    #     print(file)
        # need additional param for coach id
        
    if file in files[1:3]:
        print(file)
        df.columns = df.columns.str.lower()
        df.drop(columns = ['seed', 'round', 'wab'], axis = 1, inplace = True)
        df.rename(columns={col: f'{file[-4:]}_' + col for col in df.columns[-85:]}, inplace=True)
        print(df.head())
        mdcm = merge_team_season(mdcm, df, filter_df_merge_onto_year_cutoff = True,  filter_df_merge_onto_year = 2008, home_away = file[-4:])
        
    # elif file in files[:-2]:
    #     print(file)
    
mdcm.head()

barttovik_home
   home_year  home_team no  home_team id        home_team  home_badj em  \
0       2024          1079             2            Akron           8.8   
1       2024          1078             3          Alabama          32.4   
2       2024          1077             7  Appalachian St.          14.1   
3       2024          1076             8          Arizona          31.3   
4       2024          1075            12           Auburn          29.4   

   home_badj o  home_badj d  home_barthag  home_games  home_w  home_l  \
0        107.6         98.8         0.727          10      10       0   
1        128.1         95.7         0.966          14      13       1   
2        110.5         96.4         0.828          13      13       0   
3        120.8         89.5         0.969          15      14       1   
4        118.9         89.5         0.963          14      13       1   

   home_win%  home_efg%  home_efg%d  home_ftr  home_ftrd  home_tov%  \
0  100.00000       53.1 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'team{team_num}_teamname'] = df[f'team{team_num}_teamname'].str.lower()


KeyError: 'team1_team'

In [None]:
# Import Team Ratings By Day (2015-2019)
team_ratings = pd.read_csv('../data/cbbdata/team/team_ratings.csv').query('year != 2020')
team_ratings['date']= pd.to_datetime(team_ratings['date'])

# Split Daily Team Ranknings By Before/After Selection Sunday
ss_dict = {2024: "2024-3-17", 2023: "2023-3-12", 2022: "2022-3-13", 
           2021: "2021-3-14", 2019: "2019-3-17", 2018: "2018-3-11", 
           2017: "2017-3-12", 2016: "2016-3-13", 2015: "2015-3-15"}

# Create A Pre and During NCAA Tournament Day By Day Ratings
team_rating_pre_ncaa = pd.DataFrame(columns = team_ratings.columns)
team_rating_ncaa = pd.DataFrame(columns = team_ratings.columns)
for year, ss_date in ss_dict.items():
    team_rating_pre_ncaa = pd.concat([team_rating_pre_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] < ss_date)]])    
    team_rating_ncaa = pd.concat([team_rating_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] > ss_date)]])
    
# Look At Team Rating By Day and Calculate Rolling Adj Offensive Rank
team_rating_pre_ncaa['rolling_avg_adj_o_rk'] = team_rating_pre_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
df = team_rating_pre_ncaa[(team_rating_pre_ncaa['team'] == 'Kentucky') & (team_rating_pre_ncaa['year'] == 2015)]
# team_rating_ncaa['rolling_avg_adj_o_rk'] = team_rating_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
# df = team_rating_ncaa[(team_rating_ncaa['team'] == 'Kentucky') & (team_rating_ncaa['year'] == 2015)]