In [None]:
import json

In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import sklearn.linear_model as linear
from collections import defaultdict

In [None]:
# Names of the player rating fields used as model features, held in
# alphabetical order.
ratings_list = [
    'hgt', 'stre', 'spd', 'jmp', 'endu',
    'ins', 'dnk', 'ft', 'fg', 'tp',
    'diq', 'oiq', 'drb', 'pss', 'reb',
]
ratings_list.sort()

In [None]:
def _rating_key(file_index, pid, season):
    """Return the ``pr`` lookup key for one player-season of one input file.

    The key is ``"<file index>_<pid>_<season>"`` so that identical pids from
    different files do not overwrite each other.
    """
    return str(file_index) + '_' + str(pid) + '_' + str(season)


def _team_profile(players, file_index, season, ratings_by_key):
    """Summarise one team's box-score rows for a single game.

    Parameters
    ----------
    players : list of dict
        Box-score rows; each row is read for its 'pid', 'pts' and 'min' fields.
    file_index : int
        Index of the input file the game came from (part of the rating key).
    season : value of ``game['season']``
        Season used to pick the matching rating row for every player.
    ratings_by_key : dict
        Mapping from rating key to that player-season's rating vector.

    Returns
    -------
    (points, profile) : tuple
        ``points`` is the sum of the players' 'pts'; ``profile`` is the
        minutes-weighted average of the players' rating vectors.

    Raises
    ------
    KeyError
        If a player has no rating row for ``season`` in this file.
    """
    points = sum(player['pts'] for player in players)
    minutes = sum(player['min'] for player in players)
    weighted = [
        ratings_by_key[_rating_key(file_index, player['pid'], season)] * player['min']
        for player in players
    ]
    return points, np.array(weighted).sum(0) / minutes


# X: one row per game, team 0 minus team 1 minutes-weighted rating profile.
# Y: one value per game, team 0 minus team 1 total points.
# pr: rating vector (ordered like ratings_list) per "<file>_<pid>_<season>" key.
X = []
Y = []
pr = {}

GAME_FILES = ['../gs_beta1.json', '../gs_random1.json', '../gs_rpd1.json', '../gs_rpd2.json']

for fi, path in enumerate(GAME_FILES):
    # json.load accepts a binary file object; the handle is only needed for parsing.
    with open(path, 'rb') as fp:
        data = json.load(fp)

    # get player ratings
    for player in data['players']:
        for rating_row in player['ratings']:
            key = _rating_key(fi, player['pid'], rating_row['season'])
            pr[key] = np.array([rating_row[name] for name in ratings_list])

    # get differences
    for game in data['games']:
        season = game['season']
        pts0, profile0 = _team_profile(game['teams'][0]['players'], fi, season, pr)
        pts1, profile1 = _team_profile(game['teams'][1]['players'], fi, season, pr)
        # Append target and features together so X and Y cannot drift out of
        # step if a rating lookup fails part-way through a game.
        Y.append(pts0 - pts1)
        X.append(profile0 - profile1)

In [None]:
#from sklearn.preprocessing import PolynomialFeatures
#clf_ft = PolynomialFeatures(include_bias=True)
#X2 = clf_ft.fit_transform(X)[:,1:]

In [None]:
# Ridge regression of Y on X; the penalty is picked by 3-fold cross-validation
# from a log-spaced grid of 8 candidates between 1e-2 and 1e6.
ridge_alphas = np.logspace(-2, 6, 8)
reg = linear.RidgeCV(alphas=ridge_alphas, cv=3)
#reg = linear.ElasticNetCV(cv=3,l1_ratio=np.logspace(-8,0,9),alphas=np.logspace(-4,4,80))
reg.fit(X, Y)
#min_v = reg.coef_[reg.coef_ >0].min()
#min_v = abs(reg.coef_).mean()/7
#reg.coef_[reg.coef_ < min_v] = min_v
# Score on the same data the model was fitted on.
in_sample_score = reg.score(X, Y)
print(in_sample_score) # 0.146

In [None]:
# Fitted values against the observed targets; small, faint markers keep
# dense regions readable.
predicted_y = reg.predict(X)
plt.scatter(predicted_y, Y, s=5, alpha=0.1)

In [None]:
# Fitted intercept and the base-10 log of the selected ridge penalty.
log10_alpha = np.log10(reg.alpha_)
(reg.intercept_, log10_alpha)  #,reg.l1_ratio_

In [None]:
# Print each fitted coefficient (3 decimals) next to its rating name and keep
# the rounded values in a dict keyed by rating name.
tmp = {}
for name, coef in zip(ratings_list, reg.coef_):
    rounded = np.round(coef, 3)
    print(rounded, name)
    tmp[name] = rounded

In [None]:
# pr maps each key to one rating vector, so the raw frame has one column per
# key; transpose it to get one row per key and label the rating columns.
ratings_by_column = pd.DataFrame(pr)
avg = ratings_by_column.transpose()
avg.columns = ratings_list

In [None]:
# print('Intercept: \n', reg.intercept_)
# print('Coefficients: \n', reg.coef_)

# Adjust old ovrs for the ratings we're skipping
# Recompute Ovr because we want the unscaled version, so scaling can be applied on top in JS
# Weights of the old formula; the divisor is their sum (38), so OvrOld is a
# weighted average of the ratings.
old_ovr_weights = {
    'hgt': 5, 'stre': 1, 'spd': 4, 'jmp': 2, 'endu': 1,
    'ins': 1, 'dnk': 2, 'ft': 1, 'fg': 1, 'tp': 3,
    'oiq': 7, 'diq': 3, 'drb': 3, 'pss': 3, 'reb': 1,
}
avg['OvrOld'] = (
    sum(weight * avg[name] for name, weight in old_ovr_weights.items())
    / sum(old_ovr_weights.values())
)

# Scale to match old ovr
mean_old = avg.OvrOld.mean()
std_old = avg.OvrOld.std()  # pandas Series.std defaults to ddof=1

# Linear part of the fitted model only (intercept removed); the constant is
# re-introduced through factor_add below. A plain array is passed because the
# model was fitted on unnamed data.
ovr_new_unscaled = reg.predict(avg[ratings_list].values) - reg.intercept_
mean_new = ovr_new_unscaled.mean()
# ndarray.std defaults to ddof=0; use ddof=1 so both spreads are measured with
# the same estimator and the rescaled values really match std_old.
std_new = ovr_new_unscaled.std(ddof=1)

# Affine map new -> old scale: OvrNew = unscaled * factor_mult + factor_add has
# mean mean_old and standard deviation std_old.
factor_mult = std_old / std_new
factor_add = mean_old - mean_new * factor_mult
print('factor_mult: \n', factor_mult)
print('factor_add: \n', factor_add)
avg['OvrNew'] = ovr_new_unscaled * factor_mult + factor_add
# print(dataset.Ovr)
# print(dataset.OvrNew)

def formatThree(num):
    """Return ``num`` as positional (non-scientific) text with 3 significant digits."""
    text = np.format_float_positional(
        num,
        precision=3,
        unique=False,
        fractional=False,  # precision counts significant digits, not decimals
        trim='k',          # keep trailing zeros and the decimal point
    )
    return str(text)

# Show the old and the rescaled new overall rating side by side.
ovr_columns = ['OvrOld', 'OvrNew']
print(avg[ovr_columns])


In [None]:
# Output
# Print the rescaled linear formula, one "<coef> * ratings.<name>" term per
# line, followed by the additive constant.
print('(')
alt_comp = 0
last_index = len(ratings_list) - 1
for idx, name in enumerate(ratings_list):
    scaled_coef = factor_mult * reg.coef_[idx]
    # Every term except the last is followed by ' +'.
    suffix = '' if idx == last_index else ' +'
    print('    ' + formatThree(scaled_coef) + ' * ratings.' + name + suffix)
    # Same formula evaluated on the ratings table, term by term.
    alt_comp = alt_comp + scaled_coef * avg[name]
print(') + ' + formatThree(factor_add))
alt_comp += factor_add

# Plot
# Density of old vs. new overall ratings with the identity line for reference.
avg.plot.hexbin(x='OvrOld', y='OvrNew', gridsize=40)
plt.xlabel('Old Ovr')
plt.ylabel('New Ovr')
plt.xlim(15, 85)
plt.ylim(15, 85)

plt.plot([0, 100], [0, 100])

plt.show()

In [None]:
# Rescale the mean-centred unscaled values.
# NOTE(review): factor_add is defined as mean_old - mean_new * factor_mult, so
# centring here removes the mean shift a second time; the result is not the
# same as avg['OvrNew'] -- confirm whether this scratch cell is still needed.
centered_unscaled = ovr_new_unscaled - mean_new
centered_unscaled * factor_mult + factor_add

In [None]:
# Rescale with the mean shift folded into the additive constant.
# NOTE(review): factor_add already includes -mean_new * factor_mult, so the
# shift is applied twice here -- confirm whether this scratch cell is still needed.
shifted_add = factor_add - mean_new * factor_mult
ovr_new_unscaled * factor_mult + shifted_add

In [None]:
# Overlaid, semi-transparent 40-bin histograms of the new and old ratings.
for column, legend_label in (('OvrNew', 'new'), ('OvrOld', 'old')):
    plt.hist(avg[column], bins=40, alpha=0.5, label=legend_label)
plt.legend()

In [None]:
def _is_file2_season_2017(key):
    """True for index keys that start with '2_' and end with '2017'."""
    return key.startswith('2_') and key.endswith('2017')


# Change in overall rating caused by the new formula.
avg['diff'] = avg['OvrNew'] - avg['OvrOld']
# Keep only the rows whose key has the '2_' prefix and the '2017' suffix.
avg2 = avg[avg.index.map(_is_file2_season_2017)]

In [None]:
# Rows with an old overall rating above 60, largest rating change first.
# axis/ascending are passed by keyword: pandas 2.x made every sort_values
# argument after `by` keyword-only, so the positional form raises TypeError.
avg2[avg2.OvrOld > 60].sort_values('diff', axis=0, ascending=False)

In [None]:
import statsmodels.api as sm
# Ordinary least squares on the same X/Y with an explicit constant column.
ols_design = sm.add_constant(pd.DataFrame(X, columns=ratings_list))
ols_fit = sm.OLS(Y, ols_design).fit()
ols_fit.summary()