In [None]:
import gzip
import json
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

In [None]:
# Load the two CSV exports: `avg` and `game` are later joined on a
# pid/Season key to form `game_rate`.
avg, game = map(
    pd.read_csv,
    (
        "BBGM_TeamMOV_all_seasons_Average_Stats.csv",
        "BBGM_TeamMOV_all_seasons_Game_Stats.csv",
    ),
)

In [None]:
# Build a single string join key "<pid>_<Season>" on the average-stats frame
# (used instead of a ('pid', 'Season') index).
avg_pid_txt = avg['pid'].astype(str)
avg_season_txt = avg['Season'].astype(str)
avg['fakeKey'] = avg_pid_txt + '_' + avg_season_txt
avg

In [None]:
# Same "<pid>_<Season>" string key on the per-game frame so it can be merged
# with `avg`.
game_pid_txt = game['pid'].astype(str)
game_season_txt = game['Season'].astype(str)
game['fakeKey'] = game_pid_txt + '_' + game_season_txt
game

In [None]:
# Inner-join every per-game row with the matching row of `avg` via the
# synthetic pid/Season key. Later cells index this frame's rows by position,
# so the column order produced here matters.
game_rate = game.merge(right=avg, how='inner', on='fakeKey')
game_rate

In [None]:
# Per-(gid, team) accumulators, filled from the merged frame.
#   team_rating  : sum over rows of field 11 * the 17 trailing fields
#   team_minutes : sum over rows of field 11 (the weight; named MP/minutes here)
#   team_score   : first minus second number of the "a-b" string in field 7
#   real_gids    : for each field-1 value, the set of field-5 values seen
# NOTE(review): fields are addressed by position in the itertuples() row
# (index 0 is the DataFrame index); confirm positions if the CSV schema changes.
team_rating = defaultdict(lambda: np.zeros(17))
team_minutes = defaultdict(float)
team_score = {}
real_gids = defaultdict(set)

for rec in game_rate.itertuples():
    # Rows whose fields 5 and 6 are equal are ignored.
    if rec[5] == rec[6]:
        continue

    gid_team = (rec[1], rec[5])
    minutes = rec[11]
    pts_for, pts_against = (int(part) for part in rec[7].split('-'))

    team_rating[gid_team] += minutes * np.array(rec[92:])
    team_minutes[gid_team] += minutes
    team_score[gid_team] = pts_for - pts_against
    real_gids[gid_team[0]].add(gid_team[1])

In [None]:
# Dump the first five merged rows to disk for manual inspection.
game_rate.iloc[:5].to_csv('tmp.csv')

In [None]:
# Normalise each accumulated (weight * ratings) vector by the total weight for
# the same (gid, team) key, giving a minute-weighted average rating vector.
team_rating_n = {
    gid_team: weighted_sum / team_minutes[gid_team]
    for gid_team, weighted_sum in team_rating.items()
}

In [None]:
# Keep only the first component of every normalised rating vector
# (treated as the overall rating by the cells that follow).
team_rating_ovr = {
    gid_team: team_rating_n[gid_team][0]
    for gid_team in team_rating_n
}

In [None]:
# Scratch check: a set of two team abbreviations.
{'SAC', 'LAC'}

In [None]:
# One (rating difference, score margin) pair per game that has exactly two
# teams recorded. The first team is whichever the set yields first; the margin
# is taken from that same team's perspective.
game_res = []
for gid, teams in real_gids.items():
    if len(teams) == 2:
        first, second = teams
        ovr_gap = team_rating_ovr[(gid, first)] - team_rating_ovr[(gid, second)]
        game_res.append((ovr_gap, team_score[(gid, first)]))

In [None]:
# Convert the list of (rating difference, margin) pairs into a 2-column array
# so the columns can be sliced as game_res[:, 0] / game_res[:, 1] below.
game_res = np.array(game_res)

In [None]:
# Scatter of rating difference (x) against margin of victory (y) with a fixed
# reference line through the origin.
# NOTE(review): the 2.07 slope is hard-coded here, presumably copied from the
# through-origin OLS fit run in a later cell - confirm if the data changes.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots()
ax.scatter(game_res[:, 0], game_res[:, 1], s=8, alpha=0.1)
ax.plot([-10, 10], [-20.7, 20.7], c='k', label='Best fit \n(2.07 MOV for 1 OVR)')
ax.set_xlim(-12, 12)
ax.set_ylim(-50, 50)
ax.set_xlabel('minute-weighted OVR difference')
ax.set_ylabel('MOV')
ax.legend()

In [None]:
# Regress margin (column 1) on rating difference (column 0). No constant is
# added, so the fitted line is forced through the origin.
mov_on_ovr = sm.OLS(endog=game_res[:, 1], exog=game_res[:, 0])
mov_on_ovr.fit().summary()

In [None]:
# Load the gzipped JSON export. `gzip` and `json` are imported in the
# notebook's top import cell rather than mid-notebook.
# The file is opened in binary mode and handed straight to json.load, which
# reads the stream and decodes it.
with gzip.open('BBGM_TeamMOV_2045_preseason.json.gz', 'rb') as fp:
    data = json.load(fp)

In [None]:
# Team id -> abbreviation lookup from the export's team list.
tid_to_abbrev = {}
for team_entry in data['teams']:
    tid_to_abbrev[team_entry['tid']] = team_entry['abbrev']

# Negative ids -1 and -2 both map to the placeholder label 'AllStar'.
for special_tid in (-1, -2):
    tid_to_abbrev[special_tid] = 'AllStar'

In [None]:
# gid -> [abbrev of teams[0], abbrev of teams[1]] for every game in the export
# whose two team ids are both non-negative (negative ids are mapped to
# 'AllStar' above and are skipped here).
# Later cells treat the first entry as the home side.
# NOTE(review): confirm that teams[0] is the home team in the export format.
#
# The loop variable is deliberately NOT called `game`: that name holds the
# per-game DataFrame loaded at the top of the notebook, and overwriting it with
# a dict would break any re-run of the earlier cells.
hm_crt = {}
for game_rec in data['games']:
    tids1 = (game_rec['teams'][0]['tid'], game_rec['teams'][1]['tid'])
    if tids1[0] < 0 or tids1[1] < 0:
        continue
    hm_crt[game_rec['gid']] = [tid_to_abbrev[tid] for tid in tids1]

In [None]:
# Rebuild game_res with a fixed orientation (first listed team minus second):
# one row per game holding the full rating-vector difference followed by the
# first team's margin. `gt` records the gid of each row, in row order.
game_res, gt = [], []
for gid, (first_team, second_team) in hm_crt.items():
    rating_gap = team_rating_n[(gid, first_team)] - team_rating_n[(gid, second_team)]
    game_res.append(list(rating_gap) + [team_score[(gid, first_team)]])
    gt.append(gid)

In [None]:
# Look up which gid sits at row position 20949 of the game_res/gt lists.
gt[20949]

In [None]:
# Label the rating-difference columns with the trailing column names of the
# merged frame and call the final column 'MOV'.
rating_cols = list(game_rate.columns[91:])
diff_df = pd.DataFrame(np.array(game_res), columns=rating_cols + ['MOV'])

In [None]:
# Model used by the rest of the notebook: MOV ~ const + Ovr difference.
# The plots below label the constant term as "HFA".
diff_m = sm.OLS(diff_df['MOV'], sm.add_constant(diff_df['Ovr'])).fit()

# Alternative specifications, kept for reference. The first used to be fitted
# and then immediately overwritten by the model above, which was wasted work.
#diff_m = sm.OLS(diff_df['MOV'],sm.add_constant(diff_df.drop('MOV',axis=1))).fit()
#diff_m = sm.OLS(diff_df['MOV'],sm.add_constant(diff_df.drop(['MOV','Ovr','Pot'],axis=1))).fit()


In [None]:
# Show the regression table for the MOV ~ const + Ovr-difference model.
diff_m.summary()

In [None]:
# In-sample predictions of diff_m (x) against the observed margin (y).
fig, ax = plt.subplots()
ax.scatter(diff_m.predict(), diff_df['MOV'], s=8, alpha=0.1)
ax.set_xlabel('Predicted MOV (min-weighted OVR + HFA)')
ax.set_ylabel('MOV')

In [None]:
# Fraction of games where the predicted margin and the actual margin disagree
# on being positive (1 = disagreement, 0 = agreement).
tmp = [
    int((pv > 0) != (rv > 0))
    for pv, rv in zip(diff_m.predict(), diff_df['MOV'])
]
np.array(tmp).mean()

In [None]:
# tmp  : margins of the games where the Ovr difference is not below 3
# tmp2 : one entry per such game with a negative margin
tmp = [
    mov
    for ovr, mov in zip(diff_df['Ovr'], diff_df['MOV'])
    if not ovr < 3
]
tmp2 = [1 for mov in tmp if mov < 0]

In [None]:
# Distribution of margins for the games kept in `tmp`, i.e. where the Ovr
# difference is at least 3 (see the filter in the preceding cell). The title
# previously claimed a threshold of 5, which did not match that filter.
# Win % = 100 minus the share of negative margins (len(tmp2) / len(tmp)).
_ = plt.hist(tmp,20,density=True,alpha=0.6)
plt.title('Home Court, Ovr {} > Opponent, Win {:.0f}%'.format(3,(100-100.0*len(tmp2)/len(tmp))))
plt.xlabel('MOV')

In [None]:
# Distribution of the per-game Ovr difference across all rows of diff_df.
# The axis label typo ("mintute") is corrected so it matches the wording used
# on the earlier scatter plot.
_ = plt.hist(diff_df['Ovr'],20,density=True)
plt.xlabel('minute-weighted OVR difference')
plt.title('Team Differences')

In [None]:
# Rows where the first team lost by more than 40 despite an Ovr edge above 5.
lost_big = diff_df['MOV'] < -40
had_edge = diff_df['Ovr'] > 5
diff_df.loc[lost_big & had_edge]

In [None]:
# Inspect the row at position 18476 of diff_df.
diff_df.iloc[18476]

In [None]:
# Teams for the game shown at row position 18476 of diff_df.
# hm_crt is keyed by gid, while 18476 is a row position; `gt` holds the gid of
# each row in order, so translate position -> gid before the lookup.
hm_crt[gt[18476]]

In [None]:
# Dump every merged row belonging to gid 18490 to disk for manual inspection.
game_rate.loc[game_rate['gid'] == 18490].to_csv('test22.csv')

In [None]:
# Display the merged game/average frame.
game_rate

In [None]:
# Display the merged game/average frame (same output as the previous cell).
game_rate

In [None]:
# gid -> team -> list of field-92 values, one per merged row whose field 11 is
# positive. Field 92 is the first of the trailing rating columns (used as the
# overall rating elsewhere in this notebook); field 11 is the minutes weight.
# The loop variable keeps the name `g_p` because later cells inspect the last
# row it was bound to.
gid_pids = defaultdict(lambda: defaultdict(list))
for g_p in game_rate.itertuples():
    if not g_p[11] > 0:
        continue
    team_lists = gid_pids[g_p[1]]
    team_lists[g_p[5]].append(g_p[92])

In [None]:
# Inspect the last row tuple left bound to `g_p` by the gid_pids loop.
g_p

In [None]:
# Inspect the last row tuple again (same output as the previous cell).
g_p

In [None]:
# The four positional fields the gid_pids loop relies on: outer key (1),
# inner key (5), appended value (92) and the positivity-filter field (11).
g_p[1],g_p[5],g_p[92],g_p[11]

In [None]:
# Spot-check the per-team value lists stored under key 22.
# Note: gid_pids is a defaultdict, so this lookup inserts an empty entry if 22
# is not already present.
gid_pids[22]

In [None]:
# For every game in hm_crt, compare the two teams' per-player values sorted
# from highest to lowest:
#   dm_re2 : element-wise differences by rank (truncated to the shorter list)
#   dm_re  : difference of the weighted sums over the top 10 ranks, with
#            weight 0.4417 * exp(-0.1905 * rank) (the "current formula" that
#            the later coefficient plot compares against)
#   ans    : the first team's margin for the same game
dm_re, dm_re2, ans = [], [], []
for gm, teams in hm_crt.items():
    first_sorted = sorted(gid_pids[gm][teams[0]], reverse=True)
    second_sorted = sorted(gid_pids[gm][teams[1]], reverse=True)

    dm_re2.append([h - a for h, a in zip(first_sorted, second_sorted)])

    hmt = sum(
        0.4417 * np.exp(-0.1905 * i) * v
        for i, v in enumerate(first_sorted[:10])
    )
    awt = sum(
        0.4417 * np.exp(-0.1905 * i) * v
        for i, v in enumerate(second_sorted[:10])
    )
    dm_re.append(hmt - awt)
    ans.append(team_score[(gm, teams[0])])

In [None]:
# Force every per-game difference list to exactly 10 entries: append ten
# zeros, then keep the first ten.
dm_re2 = np.array([
    (rank_diffs + [0] * 10)[:10]
    for rank_diffs in dm_re2
])

In [None]:
# Regress the margin on a constant plus the ten rank-wise differences.
rank_diff_design = sm.add_constant(dm_re2)
dmr_clf = sm.OLS(ans, rank_diff_design).fit()
dmr_clf.summary()

In [None]:
# Fitted per-rank coefficients (constant excluded) next to the exponential
# weights 0.4417 * exp(-0.1905 * rank) for ranks 0..9.
formula_weights = [0.4417 * np.exp(-0.1905 * i) for i in range(10)]

fig, ax = plt.subplots()
ax.plot(dmr_clf.params[1:], label='best fit diffs')
ax.plot(formula_weights, label='current formula')
ax.set_xlabel('position')
ax.set_ylabel('coeff')
ax.set_title('Sorted Team OVR diff \n for predicting team results')
ax.legend()

In [None]:
# Number of in-sample predictions from the rank-difference model.
dmr_clf.predict().shape

In [None]:
# Number of in-sample predictions from the Ovr-difference model; the next cell
# plots the two prediction arrays against each other.
diff_m.predict().shape

In [None]:
# Compare the two models' in-sample predictions game by game; the black
# diagonal marks where both models would agree.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(dmr_clf.predict(), diff_m.predict(), s=8, alpha=0.4)
ax.plot([-30, 30], [-30, 30], c='k')
ax.set_xlim(-30, 30)
ax.set_ylim(-30, 30)
ax.set_xlabel('Team Rating Formula')
ax.set_ylabel('Minute-Weighted OVR')

In [None]:
def _mov_panel(position, predicted, xlabel):
    """Draw one predicted-vs-actual MOV panel and return its calibration fit.

    Scatters `predicted` against the observed margins in `ans`, fits
    ``ans ~ const + predicted`` with OLS, overlays that fitted line between
    x = -30 and x = 30, and puts the fit's r-squared in the panel title.

    Parameters
    ----------
    position : int
        1-based slot in the 1x2 subplot grid.
    predicted : array-like
        Model predictions, one per entry of `ans`.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    The fitted statsmodels OLS results object.
    """
    plt.subplot(1, 2, position)
    plt.scatter(predicted, ans, s=8, alpha=0.1)
    fit = sm.OLS(ans, sm.add_constant(predicted)).fit()
    # add_constant prepends the constant, so params[0] is the intercept.
    intercept, slope = fit.params[0], fit.params[1]
    plt.plot([-30, 30], [-30 * slope + intercept, 30 * slope + intercept], c='k')
    plt.xlabel(xlabel)
    plt.ylabel('MOV')
    plt.title('r-sq: {:.3f}'.format(fit.rsquared))
    plt.xlim(-30, 30)
    plt.ylim(-60, 60)
    return fit


plt.figure(figsize=(12, 6))

_mov_panel(1, dmr_clf.predict(), 'Predicted MOV (Team Rating Diff + HFA)')
# `tc` stays bound to the right-hand panel's fit; later cells inspect it.
tc = _mov_panel(2, diff_m.predict(), 'Predicted MOV (min-weighted OVR Diff + HFA)')

In [None]:
# Regression table for `tc`, the last fit made in the two-panel cell
# (observed margin on the diff_m predictions).
tc.summary()

In [None]:
# r-squared of that same fit (the value shown in the right-hand panel title).
tc.rsquared

In [None]:
# First fitted parameter of `tc`; the plotting cell uses params[0] as the
# intercept of the overlaid line.
tc.params[0]