In [1]:
import pandas as pd

In [2]:
data = pd.read_parquet('eda_data.parquet')

In [3]:
by_player_off = data.reset_index()[['off_player_name', 'off_player_pos', 'off_height', 'off_weight']].drop_duplicates()
by_player_def = data.reset_index()[['def_player_name', 'def_player_pos', 'def_height', 'def_weight']].drop_duplicates()
by_player_off.columns = by_player_off.columns.str.split('_').to_series().apply(lambda x: '_'.join(x[1:]))
by_player_def.columns = by_player_def.columns.str.split('_').to_series().apply(lambda x: '_'.join(x[1:]))


by_player = pd.concat([by_player_off, by_player_def]).drop_duplicates()
med_height_weight = by_player.groupby('player_pos').median()
med_height_weight.columns = 'med_' + med_height_weight.columns

med_height_weight_off = med_height_weight.copy()
med_height_weight_off.columns = 'off_' + med_height_weight_off.columns 

med_height_weight_def = med_height_weight.copy()
med_height_weight_def.columns = 'def_' + med_height_weight_def.columns 

In [4]:
data = data.reset_index().merge(med_height_weight_off, left_on = 'off_player_pos', right_on='player_pos')
data = data.merge(med_height_weight_def, left_on = 'def_player_pos', right_on='player_pos')



In [5]:
data['tall_off'] = data.off_height >= data.off_med_height
data['large_off'] = data.off_weight >= data.off_med_weight
data['tall_def'] = data.def_height >= data.def_med_height
data['large_def'] = data.def_weight >= data.def_med_weight




In [6]:
data.groupby('off_player_name').matchup_player_pts.mean()

off_player_name
Aaron Brooks       0.789419
Aaron Gordon       1.618163
Aaron Harrison     0.828753
Aaron Holiday      1.389602
Aaron Jackson      0.591014
                     ...   
Zeke Nnaji         1.084672
Zhaire Smith       0.674991
Zhou Qi            0.308748
Zion Williamson    2.426317
Zylan Cheatham     0.559441
Name: matchup_player_pts, Length: 803, dtype: float64

In [7]:
data.avg_matchup_player_pts

0         0.789419
1         1.389602
2         1.083359
3         1.579532
4         1.061140
            ...   
160936    1.243199
160937    0.964852
160938    0.895889
160939    1.108865
160940    1.147547
Name: avg_matchup_player_pts, Length: 160941, dtype: float64

In [8]:
data['player_pts_off_pct_inc'] = 100 * (data.matchup_player_pts - data.avg_matchup_player_pts) / data.avg_matchup_player_pts
data['team_pts_off_pct_inc'] = 100 * (data.matchup_team_pts - data.avg_matchup_team_pts) / data.avg_matchup_team_pts



In [9]:
player_ppm = data.groupby(['off_player_pos', 'def_player_pos']).player_pts_off_pct_inc\
.agg(['mean', 'count']).rename(columns={'mean': 'avg_pct_increase_player_points_per_min', 'count': 'n_matchups'})\
.reset_index()\
.sort_values('avg_pct_increase_player_points_per_min')
player_ppm.to_csv('eda_player_ppm.csv', index=False)
player_ppm.head()

Unnamed: 0,off_player_pos,def_player_pos,avg_pct_increase_player_points_per_min,n_matchups
12,PG,PG,-29.788373,10933
24,SG,SG,-23.852743,11172
0,C,C,-20.816426,10841
6,PF,PF,-19.975052,7707
18,SF,SF,-19.86297,7718


In [10]:
player_ppm_height = data.groupby(['off_player_pos', 'def_player_pos', 'tall_off', 'tall_def']).player_pts_off_pct_inc\
.agg(['mean', 'count']).rename(columns={'mean': 'avg_pct_increase_player_points_per_min', 'count': 'n_matchups'})\
.reset_index()\
.sort_values('avg_pct_increase_player_points_per_min')
player_ppm_height.to_csv('eda_player_ppm_height.csv', index=False)
player_ppm_height.head()

Unnamed: 0,off_player_pos,def_player_pos,tall_off,tall_def,avg_pct_increase_player_points_per_min,n_matchups
48,PG,PG,False,False,-32.021252,1878
50,PG,PG,True,False,-30.844371,2677
49,PG,PG,False,True,-30.39877,2628
51,PG,PG,True,True,-27.488537,3750
96,SG,SG,False,False,-27.153536,2391


In [11]:
team_ppm = data.groupby(['off_player_pos', 'def_player_pos']).team_pts_off_pct_inc\
.agg(['mean', 'count']).rename(columns={'mean': 'avg_pct_increase_team_points_per_min', 'count': 'n_matchups'})\
.reset_index()\
.sort_values('avg_pct_increase_team_points_per_min')
team_ppm.to_csv('eda_team_ppm.csv', index=False)
team_ppm.head()

Unnamed: 0,off_player_pos,def_player_pos,avg_pct_increase_team_points_per_min,n_matchups
0,C,C,-50.773921,10841
12,PG,PG,-38.877442,10933
6,PF,PF,-36.272055,7707
24,SG,SG,-26.884697,11172
18,SF,SF,-23.35321,7718


In [12]:
team_ppm_height = data.groupby(['off_player_pos', 'def_player_pos', 'tall_off', 'tall_def']).team_pts_off_pct_inc\
.agg(['mean', 'count']).rename(columns={'mean': 'avg_pct_increase_team_points_per_min', 'count': 'n_matchups'})\
.reset_index()\
.sort_values('avg_pct_increase_team_points_per_min')
team_ppm_height.to_csv('eda_team_ppm_height.csv', index=False)
team_ppm_height.head()

Unnamed: 0,off_player_pos,def_player_pos,tall_off,tall_def,avg_pct_increase_team_points_per_min,n_matchups
2,C,C,True,False,-51.934084,2547
0,C,C,False,False,-51.414181,2141
1,C,C,False,True,-50.935549,2785
3,C,C,True,True,-49.355908,3368
48,PG,PG,False,False,-47.28836,1878


In [13]:
player_ppm = player_ppm.set_index(['def_player_pos', 'off_player_pos'])
team_ppm = team_ppm.set_index(['def_player_pos', 'off_player_pos'])

In [14]:
offs = ['PG', 'SG', 'SF', 'PF', 'C']
deffs = ['PG', 'SG', 'SF', 'PF', 'C']
import itertools
matchups = list(itertools.product(offs, deffs))



In [15]:
matchups

[('PG', 'PG'),
 ('PG', 'SG'),
 ('PG', 'SF'),
 ('PG', 'PF'),
 ('PG', 'C'),
 ('SG', 'PG'),
 ('SG', 'SG'),
 ('SG', 'SF'),
 ('SG', 'PF'),
 ('SG', 'C'),
 ('SF', 'PG'),
 ('SF', 'SG'),
 ('SF', 'SF'),
 ('SF', 'PF'),
 ('SF', 'C'),
 ('PF', 'PG'),
 ('PF', 'SG'),
 ('PF', 'SF'),
 ('PF', 'PF'),
 ('PF', 'C'),
 ('C', 'PG'),
 ('C', 'SG'),
 ('C', 'SF'),
 ('C', 'PF'),
 ('C', 'C')]

In [16]:
combos = list(itertools.combinations(matchups, 5))

In [17]:
def is_unique(my_list):
    return len(set(my_list)) == len(my_list)

combos_valid = list(filter(lambda x: is_unique(list(zip(*x))[0]) and is_unique(list(zip(*x))[1]), combos))

In [18]:
ppgs = pd.Series({v_combo: player_ppm.loc[list(v_combo)].avg_pct_increase_player_points_per_min.mean() for v_combo in combos_valid})

In [19]:
# What are best ways to matchup for a defense?
# Not unexpected thats its pg-pg, sg-sg, sf-sf, pf-pf, c-c
ppgs.sort_values()

(PG, PG)  (SG, SG)  (SF, SF)  (PF, PF)  (C, C)    -22.859113
          (SG, SF)  (SF, SG)  (PF, PF)  (C, C)    -20.621511
(PG, SG)  (SG, PG)  (SF, SF)  (PF, PF)  (C, C)    -19.163720
(PG, PG)  (SG, SG)  (SF, PF)  (PF, SF)  (C, C)    -17.058390
(PG, SG)  (SG, SF)  (SF, PG)  (PF, PF)  (C, C)    -16.300319
                                                     ...    
(PG, SF)  (SG, C)   (SF, PF)  (PF, PG)  (C, SG)    33.273682
(PG, PF)  (SG, SF)  (SF, C)   (PF, PG)  (C, SG)    33.637074
(PG, C)   (SG, PF)  (SF, SF)  (PF, PG)  (C, SG)    33.651049
(PG, SF)  (SG, PF)  (SF, C)   (PF, PG)  (C, SG)    34.564305
(PG, PF)  (SG, C)   (SF, SF)  (PF, PG)  (C, SG)    35.718387
Length: 120, dtype: float64