In [1]:
import pandas as pd
import os
import scipy.stats as stats
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases removed the plain "seaborn" name, so a hard-coded
# plt.style.use('seaborn') fails on current installs. Pick whichever name this
# installation provides; fall back to the default style if neither exists.
PLOT_STYLE = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(PLOT_STYLE)

# Load the team-level game data; the path is relative to the notebook's
# working directory.
filepath = os.path.join("raw_data", "team_game_level_data.csv")
team_game_df = pd.read_csv(filepath)
team_game_df.head()

Unnamed: 0,game_id,date,round,region,seed,team,score,opponent_seed,opponent,opponent_score,overtime,score_diff,win,seed_id
0,0,1985-03-14,Round of 64,East,1,Georgetown,68,16,Lehigh,43,,25,1,1_16_fav
1,0,1985-03-14,Round of 64,East,16,Lehigh,43,1,Georgetown,68,,-25,0,1_16_dog
2,1,1985-03-14,Round of 64,East,4,"Loyola, Illinois",59,13,Iona,58,,1,1,4_13_fav
3,1,1985-03-14,Round of 64,East,13,Iona,58,4,"Loyola, Illinois",59,,-1,0,4_13_dog
4,2,1985-03-14,Round of 64,East,5,Southern Methodist,85,12,Old Dominion,68,,17,1,5_12_fav


In [2]:
# Drop play-in games: keep every row whose round is anything other than
# "Opening Round".
no_play_ins = team_game_df[team_game_df["round"].ne("Opening Round")]
no_play_ins.head()

Unnamed: 0,game_id,date,round,region,seed,team,score,opponent_seed,opponent,opponent_score,overtime,score_diff,win,seed_id
0,0,1985-03-14,Round of 64,East,1,Georgetown,68,16,Lehigh,43,,25,1,1_16_fav
1,0,1985-03-14,Round of 64,East,16,Lehigh,43,1,Georgetown,68,,-25,0,1_16_dog
2,1,1985-03-14,Round of 64,East,4,"Loyola, Illinois",59,13,Iona,58,,1,1,4_13_fav
3,1,1985-03-14,Round of 64,East,13,Iona,58,4,"Loyola, Illinois",59,,-1,0,4_13_dog
4,2,1985-03-14,Round of 64,East,5,Southern Methodist,85,12,Old Dominion,68,,17,1,5_12_fav


In [3]:
# Compare the score margins of adjacent seeds (1 v. 2, 2 v. 3, ..., 15 v. 16)
# and flag the pairs whose difference is statistically significant.

ALPHA = 0.05  # p-value cutoff below which a difference is called significant

# score_margins_by_seed[k] holds the score_diff values for seed k + 1
# (list positions are zero-based, seeds run from 1 through 16).
score_margins_by_seed = [
    no_play_ins.loc[no_play_ins["seed"] == seed, "score_diff"]
    for seed in range(1, 17)
]

comparisons = []
pvals = []
stat_significant = []
higher_seeds = []
lower_seeds = []

# Pair each seed's margins with those of the next seed down:
# (seed 1, seed 2), (seed 2, seed 3), ...
adjacent_margins = zip(score_margins_by_seed, score_margins_by_seed[1:])
for seed_number, (higher_margins, lower_margins) in enumerate(adjacent_margins, start=1):
    # equal_var=False selects Welch's t-test, which does not assume the two
    # samples share a variance.
    statistic, pval = stats.ttest_ind(higher_margins, lower_margins, equal_var=False)
    comparisons.append(f"{seed_number} Seed v. {seed_number + 1} Seed")
    pvals.append(pval)
    stat_significant.append(bool(pval < ALPHA))
    higher_seeds.append(higher_margins.mean())
    lower_seeds.append(lower_margins.mean())

# One row per adjacent-seed comparison.
seed_comparisons = pd.DataFrame({
    "Seeds": comparisons,
    "P Value": pvals,
    "Statistically Significant": stat_significant,
    "Higher Seed Avg Score Margin": higher_seeds,
    "Lower Seed Avg Score Margin": lower_seeds,
})

seed_comparisons

Unnamed: 0,Higher Seed Avg Score Margin,Lower Seed Avg Score Margin,P Value,Seeds,Statistically Significant
0,11.392193,7.109302,1.159112e-06,1 Seed v. 2 Seed,True
1,7.109302,4.960452,0.02164111,2 Seed v. 3 Seed,True
2,4.960452,3.313846,0.09320409,3 Seed v. 4 Seed,False
3,3.313846,0.892593,0.01742332,4 Seed v. 5 Seed,True
4,0.892593,0.335793,0.5877796,5 Seed v. 6 Seed,False
5,0.335793,-0.585062,0.3765998,6 Seed v. 7 Seed,False
6,-0.585062,-3.281818,0.01387471,7 Seed v. 8 Seed,True
7,-3.281818,-4.22,0.4272341,8 Seed v. 9 Seed,False
8,-4.22,-3.028571,0.313126,9 Seed v. 10 Seed,False
9,-3.028571,-3.524752,0.6685562,10 Seed v. 11 Seed,False


In [4]:
# Compare every pair of seeds and find groupings of seeds with similar
# performance (pairs whose score margins do NOT differ significantly).

ALPHA = 0.05  # p-value cutoff below which a difference is called significant

# NOTE: `seed_comparisons` and `pvals` are rebound here. An earlier cell uses
# `seed_comparisons` for a DataFrame of adjacent-seed results; from this cell
# on it is a plain list of "AvB" labels. The names are kept as-is so that any
# later cell relying on them keeps working.
# NOTE(review): with 16 seeds this runs 120 pairwise tests at an uncorrected
# 5% level - consider a multiple-comparison correction before reading much
# into any single result.
seed_comparisons = []
pvals = []
significant_difference = []

seed_count = len(score_margins_by_seed)
for first_idx in range(seed_count):
    # Only look at seeds after first_idx so each unordered pair is tested once.
    for second_idx in range(first_idx + 1, seed_count):
        # equal_var=False selects Welch's t-test (no equal-variance assumption).
        statistic, pval = stats.ttest_ind(
            score_margins_by_seed[first_idx],
            score_margins_by_seed[second_idx],
            equal_var=False,
        )
        # List positions are zero-based; seed numbers are position + 1.
        seed_comparisons.append(f"{first_idx + 1}v{second_idx + 1}")
        pvals.append(pval)
        significant_difference.append(bool(pval < ALPHA))

seed_comparison_df = pd.DataFrame({
    "Seed Comparison": seed_comparisons,
    "P Value": pvals,
    "Significant Difference": significant_difference,
})

# Keep only the pairs whose difference was not significant.
similar_seeds = seed_comparison_df.loc[~seed_comparison_df["Significant Difference"]]
similar_seeds

Unnamed: 0,P Value,Seed Comparison,Significant Difference
29,0.093204,3v4,False
54,0.58778,5v6,False
55,0.15954,5v7,False
65,0.3766,6v7,False
84,0.427234,8v9,False
85,0.821109,8v10,False
86,0.83407,8v11,False
87,0.305909,8v12,False
92,0.313126,9v10,False
93,0.568254,9v11,False


In [5]:
# Seed groupings, one list entry per group: a seed on its own is a bare int,
# seeds grouped together are a tuple.
# NOTE(review): presumably read off the pairwise comparison results by hand -
# confirm against the full comparison table.
similar_pairings = [
    1,
    2,
    (3, 4),
    (5, 6, 7),
    (8, 9, 10, 11, 12),
    (13, 14),
    15,
    16,
]
similar_pairings

[1, 2, (3, 4), (5, 6, 7), (8, 9, 10, 11, 12), (13, 14), 15, 16]