In [273]:
# Import Libraries
import os 
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 200)

# Import and Merge Historical Team Data 
mdcm = pd.read_csv('../data/mdcm/NCAA_Tourney_2002_2023.csv')
ncaa_sheets = pd.read_csv('../data/cbbdata/team/ncaa_sheets.csv').query('year != 2020')
selection_sunday_resume = pd.read_csv('../data/cbbdata/team/selection_sunday_resume.csv').query('year != 2020')

# Import Team Spellings
team_spellings = pd.read_csv('../data/mdcm/team_spellings.csv')
team_spellings = team_spellings.pivot_table(index='team_id', columns=team_spellings.groupby('team_id').cumcount(), values='name_spelling', aggfunc='first')
team_spellings.columns = [f'name_spelling_{i + 1}' for i in range(team_spellings.shape[1])]
team_spellings.reset_index(inplace=True)

# Merge Team Spellings
team_spellings_t1 = team_spellings.add_prefix('team1_')
team_season = pd.merge(mdcm, team_spellings_t1, how = 'inner', left_on = ['team1_id'], right_on = ['team1_team_id'])
team_spellings_t2 = team_spellings.add_prefix('team2_')
mdcm = pd.merge(team_season, team_spellings_t2, how = 'inner', left_on = ['team2_id'], right_on = ['team2_team_id'])

# Column Adjustments For MDCM and NCAA_Sheets
ncaa_sheets = ncaa_sheets.add_prefix('team1_').drop(columns = ['team1_seed'])

# Merge MDCM and NCAA_Sheets (2019-2023)
team_season = pd.merge(mdcm[mdcm['season'] >= 2019], ncaa_sheets, how = 'left', left_on = ['team1_teamname', 'season'], right_on = ['team1_team', 'team1_year'])
team_season_not_merged = team_season[team_season['team1_team'].isna() == True]
team_season_not_merged = team_season_not_merged.iloc[:, :-17]

# For Unproperly Merged Columns, Merge To A New Spelling Column
merge_complete, i = False, 1
while merge_complete == False:
    team_season_loop = pd.merge(team_season_not_merged, ncaa_sheets, how = 'left', left_on = [f'team1_name_spelling_{i}', 'season'], right_on = ['team1_team', 'team1_year'])
    print(f'Team Season Loop {i}:', len(team_season_loop), 'total rows.')
    
    matched_df = team_season_loop[team_season_loop['team1_team'].isna() == False]
    print('Matched DataFrame Rows:', len(matched_df)) 
    unmatched_df = team_season_loop[team_season_loop['team1_team'].isna() == True]
    print('Unmatched DataFrame Rows:', len(unmatched_df)) 
    
    # For The DataFrames With Data In The Column From Second DF, Add To team_season
    if len(matched_df) > 0:
        team_season = pd.concat([team_season, matched_df])
        
    # If There Are Still Null Rows, Throw Those Back In The Loop For The Next Iteration
    if len(unmatched_df) > 0:
        team_season_not_merged = unmatched_df.iloc[:, :-17]
        print('Not Merged Ending Iteration Shape:', team_season_not_merged.shape, '\n')
    # If There Aren't Any Null Rows Left, End The Loop
    else:
        merge_complete = True
    if i == 11:
        merge_complete = True 
    i += 1

# Concat Pre 2019 with NCAA Data,  As Well As Post With Now Merged
mdcm = pd.concat([mdcm, mdcm[mdcm['season'] < 2019]])

# Inspect That All Went As Planned
print(mdcm[(mdcm['team1_id'].isna() == True) & (mdcm['season'] >= 2019)].shape)
print(mdcm[(mdcm['team1_id'].isna() == False) & (mdcm['season'] >= 2019)].shape)
print(mdcm[(mdcm['team1_id'].isna() == True) & (mdcm['season'] >= 2019)].shape)
print(mdcm[(mdcm['team1_id'].isna() == False) & (mdcm['season'] < 2019)].shape)

mdcm[(mdcm['team1_id'].isna() == False) & (mdcm['season'] >= 2019)]

In [276]:
# Import Team Ratings By Day (2015-2019)
team_ratings = pd.read_csv('../data/cbbdata/team/team_ratings.csv').query('year != 2020')
team_ratings['date']= pd.to_datetime(team_ratings['date'])

# Split Daily Team Ranknings By Before/After Selection Sunday
ss_dict = {2024: "2024-3-17", 2023: "2023-3-12", 2022: "2022-3-13", 
           2021: "2021-3-14", 2019: "2019-3-17", 2018: "2018-3-11", 
           2017: "2017-3-12", 2016: "2016-3-13", 2015: "2015-3-15"}

team_rating_pre_ncaa = pd.DataFrame(columns = team_ratings.columns)
team_rating_ncaa = pd.DataFrame(columns = team_ratings.columns)
for year, ss_date in ss_dict.items():
    team_rating_pre_ncaa = pd.concat([team_rating_pre_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] < ss_date)]])
    team_rating_ncaa = pd.concat([team_rating_ncaa, team_ratings[(team_ratings['year'] == year) & (team_ratings['date'] > ss_date)]])
    
# team_rating_ncaa['year'].drop_duplicates()
team_rating_pre_ncaa[team_rating_pre_ncaa['m']]

Unnamed: 0,rank,team,conf,record,barthag,adj_o,adj_o_rk,adj_d,adj_d_rk,adj_tempo,adj_tempo_rk,proj_record,proj_conf_record,wab,wab_rk,date,year,rec,proj_rec,proj_conf_rec,cur_rk,change
456122,1,Houston,Amer,31-3,0.958828,118.232584,12,89.919807,4,63.26221,343,32-3,17-1,8.409932,5,2023-03-13,2023,,,,,
456123,2,Alabama,SEC,29-5,0.956507,116.975919,15,89.408177,3,72.703653,4,29-5,16-2,10.578411,1,2023-03-13,2023,,,,,
456124,3,UCLA,P12,29-5,0.94652,113.633051,33,88.509161,2,66.345234,226,29.9-5.1,18-2,8.469027,4,2023-03-13,2023,,,,,
456125,4,Connecticut,BE,25-8,0.941662,119.839839,6,94.094149,19,66.980217,189,25.8-8.2,13-7,4.975927,14,2023-03-13,2023,,,,,
456126,5,Tennessee,SEC,23-10,0.941642,112.361609,49,88.22531,1,65.582471,275,23.9-10.1,11-7,3.152275,25,2023-03-13,2023,,,,,


In [287]:
team_rating_pre_ncaa['rolling_avg_adj_o_rk'] = team_rating_pre_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
df = team_rating_pre_ncaa[(team_rating_pre_ncaa['team'] == 'Connecticut') & (team_rating_pre_ncaa['year'] == 2023)]
df.tail(5)

Unnamed: 0,rank,team,conf,record,barthag,adj_o,adj_o_rk,adj_d,adj_d_rk,adj_tempo,adj_tempo_rk,proj_record,proj_conf_record,wab,wab_rk,date,year,rec,proj_rec,proj_conf_rec,cur_rk,change,rolling_avg_adj_o_rk
453948,5,Connecticut,BE,24-7,0.936785,119.766071,9,94.737751,18,67.486369,173,24.8-7.2,13-7,4.855933,13,2023-03-07,2023,,,,,,119.817373
454311,5,Connecticut,BE,24-7,0.93867,119.624204,9,94.360391,17,67.437981,171,24.8-7.2,13-7,4.76956,13,2023-03-08,2023,,,,,,119.758517
454674,5,Connecticut,BE,25-7,0.942048,119.583324,8,93.835235,16,67.236053,178,25.6-7.4,13-7,5.240232,12,2023-03-09,2023,,,,,,119.657866
455037,5,Connecticut,BE,25-8,0.939967,119.707668,7,94.239477,20,66.927943,195,25-8,13-7,4.937605,13,2023-03-10,2023,,,,,,119.638399
455400,5,Connecticut,BE,25-8,0.940839,119.848842,7,94.223159,20,67.004705,189,25-8,13-7,4.943409,13,2023-03-11,2023,,,,,,119.713278


In [292]:
team_rating_ncaa['rolling_avg_adj_o_rk'] = team_rating_ncaa.groupby(['team', 'year'])['adj_o'].transform(lambda x: x.rolling(window=3).mean())
df = team_rating_ncaa[(team_rating_ncaa['team'] == 'Connecticut') & (team_rating_ncaa['year'] == 2023)]
df.tail(5)

Unnamed: 0,rank,team,conf,record,barthag,adj_o,adj_o_rk,adj_d,adj_d_rk,adj_tempo,adj_tempo_rk,proj_record,proj_conf_record,wab,wab_rk,date,year,rec,proj_rec,proj_conf_rec,cur_rk,change,rolling_avg_adj_o_rk
462293,1,Connecticut,BE,29-8,0.959848,121.307492,3,92.048867,11,66.815741,197,29.8-8.2,13-7,4.913699,13,2023-03-30,2023,,,,,,121.297247
462656,1,Connecticut,BE,29-8,0.959848,121.307555,3,92.048846,11,66.815741,197,29.8-8.2,13-7,4.91394,13,2023-03-31,2023,,,,,,121.302814
463019,1,Connecticut,BE,30-8,0.961755,121.191428,3,91.556607,9,66.651999,203,30.7-8.3,13-7,4.962955,13,2023-04-01,2023,,,,,,121.268825
463382,1,Connecticut,BE,30-8,0.961755,121.191428,3,91.556607,9,66.651999,203,30.7-8.3,13-7,4.962955,13,2023-04-02,2023,,,,,,121.230137
463745,1,Connecticut,BE,31-8,0.96428,121.478405,3,91.209361,9,66.72784,200,31-8,13-7,5.03325,13,2023-04-03,2023,,,,,,121.287087


In [None]:
team_season.drop(columns = ['team1_year', 'team1_team'])
selection_sunday_resume.drop(columns = ['net'], inplace = True)
team_season = pd.merge(team_season, selection_sunday_resume, how = 'left', on = ['team', 'year'])