In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, cdist

## Load the historical player data

In [None]:
# Read the player dump. The context manager closes the file handle, which the
# previous bare `open()` call left open for the lifetime of the kernel.
with open('real-player.json','rb') as fp:
    data = json.load(fp)

In [None]:
# Tabulate the 'ratings' entry of the dump.
df = pd.DataFrame(data=data['ratings'])

In [None]:
# Drop the two columns that are not used below. `columns=` replaces the
# positional axis argument (`drop(labels, 1)`), which pandas 2.x rejects.
df = df.drop(columns=['fuzz','abbrev_if_new_row'])#.set_index(['slug','season'])

In [None]:
# Round-trip through the index so that 'slug' and 'season' become the first
# two columns; the cells below rely on that ordering.
df = (
    df
    .set_index(['slug', 'season'])
    .reset_index()
)

In [None]:
# Rating column names: everything after the two leading key columns.
cols = df.columns[2:].tolist()

In [None]:
# (slug, season) -> list of rating values in `cols` order.
ratings = {
    (slug, season): values
    for slug, season, *values in df.itertuples(index=False, name=None)
}

In [None]:
# Spot-check: display one bio record from the dump.
data['bios']['abdulka01']

In [None]:
# Spot-check: display the ratings list stored for one (slug, season) key.
ratings[('jordami01',1985)]

In [None]:
# only use recent-ish players
from collections import defaultdict

# slug -> {age: ratings vector}, where age = season - bornYear.
player_year_rate = defaultdict(dict)
for (slug, season), vals in ratings.items():
    born = data['bios'][slug].get('bornYear')
    # The rookie-modelling cell further down guards against bios whose
    # bornYear is missing or None; apply the same guard here instead of
    # raising on `None < 1956`.
    if born is None or born < 1956:
        continue
    if season >= 2019:
        continue
    player_year_rate[slug][season - born] = np.array(vals)

In [None]:
# smooth their ratings
plt.style.use('fivethirtyeight')
# A plain `import scipy` is not guaranteed to expose `scipy.ndimage` on every
# SciPy release; importing the submodule explicitly still binds `scipy`.
import scipy.ndimage
SMOOTHING_STD = 1.0
key = 'malonka01' # greendr01 jamesle01 hardeja01 malonka01
play = player_year_rate[key]
minY = min(play.keys())
maxY = max(play.keys())
res = []
for i in range(minY,maxY+1):
    # ages with no entry reuse the previous age's ratings
    res.append(play[i] if i in play else res[-1])

i = 8  # index into `cols` of the rating being plotted
age_span = range(minY,maxY+1)
res_arr = np.array(res)  # converted once instead of once per plot call
res_smooth = scipy.ndimage.gaussian_filter1d(res_arr.astype(float),SMOOTHING_STD,mode='nearest',axis=0,truncate=10)
plt.plot(age_span,res_arr[:,i],label='orig')
plt.plot(age_span,res_smooth[:,i],label='smooth')
plt.legend()
plt.title(key + ' ' + cols[i])

In [None]:
# Smooth every player's age curve along the age axis, then keep only the ages
# that were actually present in the unsmoothed data.
play_year_rateSmooth = {}
for key, play in player_year_rate.items():
    minY, maxY = min(play), max(play)
    filled = []
    for age in range(minY, maxY + 1):
        # ages with no entry reuse the previous age's ratings
        filled.append(play[age] if age in play else filled[-1])
    smoothed = scipy.ndimage.gaussian_filter1d(
        np.array(filled).astype(float),
        SMOOTHING_STD,
        mode='nearest',
        axis=0,
        truncate=10,
    )
    play_year_rateSmooth[key] = {
        age: smoothed[idx]
        for idx, age in enumerate(range(minY, maxY + 1))
        if age in play
    }

In [None]:
def TRANS_FUNC(x):
    """Square-root transform applied to ratings before they are differenced."""
    return x ** 0.5


def INV_FUNC(x):
    """Square the input; undoes TRANS_FUNC for non-negative values."""
    return x ** 2
# r1: year-over-year change of each player's smoothed ratings in transformed
# space; r2: the age the change starts from. Changes starting after age 36
# are left out.
r1 = []
r2 = []
for play in play_year_rateSmooth.values():
    for age in play:
        prev_age = age - 1
        if prev_age not in play or prev_age > 36:
            continue
        r1.append(TRANS_FUNC(play[age]) - TRANS_FUNC(play[prev_age]))
        r2.append(prev_age)
r1 = np.array(r1)
r2 = np.array(r2)


## Model development

In [None]:
# Mean transformed delta per starting age, drawn as one line per rating.
unique_ages = sorted(np.unique(r2))
age_res = np.array([r1[r2 == age].mean(0) for age in unique_ages])
for i in range(15):
    plt.plot(unique_ages, age_res[:, i], label=cols[i], c=plt.cm.tab20(i))
plt.xlim(right=35)
plt.legend()
#plt.ylim(-0.2,0.2)

In [None]:
import sklearn.linear_model as linear_model
from scipy.stats import pearsonr
TIMES_TO_FIT = 1

# Shared aging trend: one ridge regression of every rating's transformed delta
# on age. r1 is flattened row by row, so each age is repeated once per rating
# column to line up with r1.ravel().
trend_X = np.repeat(r2,15)[:,None]
trend_y = r1.ravel()
clf_models = []
for i in range(TIMES_TO_FIT):
    clf = linear_model.RidgeCV(np.logspace(-5,5,11),cv=5)#SGDRegressor('epsilon_insensitive',alpha=1e-5,epsilon=0,max_iter=10000,tol=1e-9,eta0=1e-5)
    clf.fit(trend_X,trend_y)
    #score = clf.score(np.repeat(r2,15)[:,None],r1.ravel())
    score = pearsonr(clf.predict(trend_X),trend_y)
    clf_models.append((score,i,clf))
# Keep the fit with the highest correlation coefficient. The earlier
# `sorted(clf_models)[0]` picked the LOWEST-scoring fit (the per-rating cell
# below uses `[-1]`) and relied on ordering pearsonr result objects.
best_model = max(clf_models,key=lambda m: m[0][0])
clf = best_model[2]
print(best_model[0])
# (slope, intercept) of the shared trend
main_model = (clf.coef_[0] , clf.intercept_) # 0.0855008819536307 # 0.003181372399186653
# (0.28535187287328784, 0.0) linear
# (0.27993511134791604, 0.0) log/exp
# (0.2882236878163602, 0.0) squared
# (0.28735272142114926, 0.0) cubed

In [None]:
# Shared trend evaluated at each observed age; the dashed line marks zero change.
trend_ages = np.unique(r2)
trend_line = trend_ages*main_model[0] + main_model[1]
plt.plot(trend_ages, trend_line)
plt.plot([19,35], [0,0], c='k', ls='--')
plt.grid(True)

In [None]:
# Per-rating linear corrections, each fitted to what the shared trend leaves
# unexplained for that rating.
models = []
ages_col = np.array(r2)[:, None]
shared_trend = main_model[0]*r2 + main_model[1]
for i in range(r1.shape[1]):
    residual = r1[:, i] - shared_trend
    clf_models = []
    for j in range(TIMES_TO_FIT):
        clf = linear_model.RidgeCV(np.logspace(-5,5,11), cv=5)
        clf.fit(ages_col, residual)
        clf_models.append((clf.score(ages_col, residual), j, clf))
    # highest score sorts last
    best_model = sorted(clf_models)[-1]
    clf = best_model[2]
    print(cols[i], best_model[0])
    models.append((clf.coef_[0], clf.intercept_))

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides.
plt.style.use('seaborn-white' if 'seaborn-white' in plt.style.available else 'seaborn-v0_8-white')
# One line per rating: its fitted correction as a function of age.
for i in range(r1.shape[1]):
    plt.plot(np.unique(r2),np.unique(r2)*models[i][0]+models[i][1],label=cols[i],c=plt.cm.tab20(i))
plt.legend()
#plt.xlim(19,34)
#plt.ylim(-4,4)
plt.grid(True)

In [None]:
# Expected delta for every sample and rating: the shared aging trend PLUS the
# rating-specific correction. The entries of `models` were fitted to
# `r1[:, i] - shared_trend`, so the two parts add; the rookie cell combines
# them by addition as well. Multiplying them (as this cell used to) does not
# reconstruct the fitted mean.
shared_trend = main_model[0]*r2 + main_model[1]
means_expected = [
    (models[i][0]*r2 + models[i][1]) + shared_trend
    for i in range(r1.shape[1])
]

In [None]:
# rank1 approximations of this would be really cool
# but sampling multivariate Gaussians seems... annoying?
# Deltas with the expected mean subtracted; means_expected is stacked so that
# ratings run along the columns, matching r1.
removed_means = r1 - np.stack(means_expected, axis=1)

In [None]:
# One covariance heat-map of the de-meaned deltas per starting age.
plt.figure(figsize=(20,20))
panel = 0
for age in sorted(np.unique(r2)):
    at_age = r2 == age
    if at_age.sum() < 2:
        # skip ages with fewer than two samples
        continue
    panel += 1
    plt.subplot(4,6,panel)
    covar = np.cov(removed_means[at_age],rowvar=False)
    plt.imshow(covar)
    plt.xticks(np.arange(15),cols,rotation=45)
    plt.yticks(np.arange(15),cols)
    plt.title('age={}  max={:.1f}'.format(age,covar.max()))
plt.tight_layout(pad=0.1,h_pad=0)
plt.gcf().subplots_adjust(hspace=-0.6)

In [None]:
# Per-age sample counts (normalised to mean 1, used as regression weights)
# and per-age standard deviation of the de-meaned deltas for every rating.
ages = sorted(np.unique(r2))
age_stds = np.array([removed_means[r2 == age].std(axis=0) for age in ages])
age_counts = np.array([(r2 == age).sum() for age in ages])
age_w = age_counts/age_counts.mean()

In [None]:
# Shared noise trend: ridge fit of the per-age std (all ratings pooled) on age,
# weighted by the per-age sample weights.
std_X = np.repeat(ages,15)[:,None]
std_w = np.repeat(age_w,15)
clf = linear_model.RidgeCV().fit(std_X, age_stds.ravel(), sample_weight=std_w)
# [mean slope, mean intercept, std slope, std intercept]
base_model = [*main_model, clf.coef_[0], clf.intercept_]

In [None]:
# Fitted shared std trend over the observed ages.
std_ages = np.unique(r2)
plt.plot(std_ages, std_ages*clf.coef_[0] + clf.intercept_, lw=3)

In [None]:
# Per-rating std corrections: fit whatever std exceeds the shared std trend
# (negative excess is clipped to zero before fitting).
std_models = []
ages_arr = np.array(ages)
base_std = ages_arr*base_model[2] + base_model[3]
for i in range(15):
    clf = linear_model.RidgeCV()
    excess_std = np.maximum(0, age_stds[:, i] - base_std)
    clf.fit(ages_arr[:, None], excess_std, sample_weight=age_w)
    std_models.append((clf.coef_[0], clf.intercept_))

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides.
plt.style.use('seaborn-white' if 'seaborn-white' in plt.style.available else 'seaborn-v0_8-white')
# One line per rating: its fitted std correction as a function of age.
for i in range(r1.shape[1]):
    plt.plot(np.unique(r2),np.unique(r2)*std_models[i][0] + std_models[i][1],label=cols[i],c=plt.cm.tab20(i),lw=3)
plt.legend()
plt.xlim(19,34)
plt.grid(True)

In [None]:
# Display the fitted per-rating (slope, intercept) pairs.
models

In [None]:
# NOTE(review): `clf` is whichever estimator was bound most recently; on a
# top-to-bottom run that is the last fit of the std_models loop — confirm this
# is the value meant to be inspected.
clf.intercept_

In [None]:
# rating name -> (mean slope, mean intercept, std slope, std intercept), rounded to 4 places.
dat_print = {
    name: tuple(np.round(params, 4))
    for name, params in zip(cols, np.hstack([models, std_models]))
}

In [None]:
# Print the fitted parameters as text; 'hgt' is left out of the per-rating list.
print('{} {},'.format("base", list(np.round(base_model, 4))))
for k, v in dat_print.items():
    if k != 'hgt':
        print('{}: {},'.format(k, list(v)))

In [None]:
# Mean of the 99% and 1% quantiles (taken along axis 0) of the expected deltas.
tuple(np.quantile(means_expected, q, axis=0).mean() for q in (0.99, 0.01))

In [None]:
# Same two quantile summaries for the observed deltas.
tuple(np.quantile(r1, q, axis=0).mean() for q in (0.99, 0.01))

## Model Rookies

In [None]:
# Weight of each rating in the overall-score sum used below.
ovr_weights = dict(
    diq=0.093,
    dnk=0.0424,
    drb=0.0968,
    endu=0.0075,
    fg=-0.0093,
    ft=0.049,
    hgt=0.225,
    ins=-0.0143,
    jmp=0.0505,
    oiq=0.0971,
    pss=0.0657,
    reb=0.0534,
    spd=0.156,
    stre=0.0962,
    tp=0.105,
)
# Weights arranged in `cols` order so they broadcast against rating rows.
ovr_v = np.array([ovr_weights[name] for name in cols])

In [None]:
# Array form of `models`: column 0 holds slopes, column 1 intercepts.
models_np = np.asarray(models)


In [None]:
# Build the rookie sample: for each qualifying draftee, take the ratings of the
# season after the draft year and subtract the modelled one-year change in
# transformed space, then map back and clip to [1, 100].
youth = []
names = []
positions = []
for k, p in data['bios'].items():
    if p.get('bornYear') is None:
        continue
    # NOTE(review): draftYear / draftPick are compared numerically below;
    # presumably they are never None for these bios — verify against the dump.
    yr = p['draftYear']
    age = yr - p['bornYear']
    qualifies = (
        2000 <= yr < 2019
        and (k, yr+1) in ratings
        and age <= 23
        and p['draftPick'] <= 45
    )
    if not qualifies:
        continue
    update = main_model[0]*age + main_model[1]
    rt = ratings[(k, yr+1)]
    rt_n = TRANS_FUNC(np.array(rt)+0.1) - (update + (models_np[:,0]*age + models_np[:,1]))
    rt_n = np.clip(INV_FUNC(rt_n), 1, 100)
    youth.append([age, *rt_n])
    names.append(k)
    positions.append(p['pos'])
youth = np.array(youth)

In [None]:
# Every ratings row whose season lies strictly between 2000 and 2019.
all_play = np.array([
    vals
    for (slug, season), vals in ratings.items()
    if 2000 < season < 2019
])

In [None]:
# Histogram of every rookie value divided by its column mean.
youth_rel = youth / youth.mean(axis=0)
_ = plt.hist(youth_rel.ravel(), bins=50)

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Mask on the weighted overall score. The threshold is 0 * median, i.e. zero.
ovr_scores = (youth[:,1:]*ovr_v).sum(1)
s1 = ovr_scores > 0*np.median(ovr_scores)

# PCA is fitted on all rookies; only the masked rows are projected.
clf_pca = PCA(whiten=False).fit(youth[:,1:])#TSNE(perplexity=55)
emb = clf_pca.transform(youth[s1,1:].astype(np.float32))

In [None]:
# Position labels in the order used to colour-code players.
pos_set = 'PG G SG GF SF F PF FC C'.split()
# First two principal components, coloured by position index.
pos_codes = [pos_set.index(pos) for pos, keep in zip(positions, s1) if keep]
plt.scatter(emb[:,0], emb[:,1], c=pos_codes, cmap='RdBu')

In [None]:
fix_mean = np.array([5., 0., 0., 5., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0., 0.])
# Print the PCA mean of every rating, rounded to one decimal.
pca_mean_rounded = np.round(clf_pca.mean_, 1)
for rating_name, rating_mean in zip(cols, pca_mean_rounded):
    print(rating_name, ':', rating_mean, ',')

In [None]:
# Display the share of variance attributed to each principal component.
clf_pca.explained_variance_ratio_

In [None]:
COMP = 3  # number of leading principal components that get modelled

# Real rookie heights next to 1500 random draws (unseeded global RNG).
hgt = youth[:, 1+cols.index('hgt')]
plt.hist(hgt, 25, density=True)
rand_hgt = np.random.randn(1500)*13.6 + 47.5
plt.hist(rand_hgt, 25, alpha=0.5, density=True)
f_hgt = np.concatenate([hgt, rand_hgt])

# Regress each of the first COMP component scores on height, then predict the
# scores for the combined real + random heights.
X_hgt = hgt[s1, None]# np.vstack([hgt,hgt**2]).T
pred_res = []
hgt_models = []
for comp in range(COMP):
    target = emb[:, comp]
    clf = linear_model.RidgeCV(cv=3, alphas=np.logspace(-5,3,9))
    clf.fit(X_hgt, target)
    clf_s = clf.score(X_hgt, target)
    pred_res.append(clf.predict(f_hgt[:, None]))
    print(clf_s)
    hgt_models.append([*clf.coef_, clf.intercept_])
pred_res = np.array(pred_res).T

In [None]:
# Suppress scientific notation, then show the height models to two decimals.
np.set_printoptions(suppress=True)
np.around(hgt_models, decimals=2)

In [None]:
# Loadings of the first COMP components, drawn as a heat-map and then displayed.
top_components = clf_pca.components_[:COMP, :]
plt.imshow(top_components, cmap='RdBu', vmin=-0.5, vmax=0.5)
_ = plt.xticks(np.arange(15), cols, rotation=45)
top_components

In [None]:
# Generator parameters: per-component noise scale (set1), per-component offset
# (set2), and the lower bound / width of the multiplicative jitter (set3).
set1 = np.array([8.9, 19.9, 7.8])
set2 = np.array([-0.77, -11.06, 1.77])
set3 = np.array([0.81, 0.41])

n_gen = f_hgt.shape[0]
mul_low = set3[0]
mul_high = set3[0] + set3[1]
ADD_VAR = set1*(np.random.randn(n_gen, COMP)) + set2
MUL_VAR = np.random.uniform(mul_low, mul_high, size=(n_gen, 15))

# Noisy component scores -> rating space, jittered, with height overwritten by
# the heights the scores were predicted from.
pred_vec = ((ADD_VAR+pred_res) @ clf_pca.components_[:COMP,:]) + clf_pca.mean_
pred_vec *= MUL_VAR
pred_vec[:, cols.index('hgt')] = f_hgt


In [None]:
# Overlay the value distributions of generated and real rookies.
for sample in (pred_vec, youth[:,1:]):
    _ = plt.hist(sample.ravel(), 50, alpha=0.5, density=True)

In [None]:
# Disabled: collects, from 'beta_sim_p.json', the first ratings entry of every
# player whose first rated season equals the draft year.
if False:
    with open('beta_sim_p.json','rb') as fp:
        beta = json.load(fp)
    pV = np.array([
        [p['ratings'][0][c] for c in cols]
        for p in beta['players']
        if p['ratings'][0]['season'] == p['draft']['year']
    ])

In [None]:
# Weighted overall score (shifted by 6.4) of real vs generated rookies.
real_ovr_shifted = (youth[:,1:]*ovr_v).sum(1) - 6.4
gen_ovr_shifted = (pred_vec*ovr_v).sum(1) - 6.4
_ = plt.hist(real_ovr_shifted, 20, alpha=0.5, density=True, label='rpd')
_ = plt.hist(gen_ovr_shifted, 20, alpha=0.5, density=True, label='gen')
#_ = plt.hist((beta_p2*ovr_v).sum(1)-6.4,20,alpha=0.5,density=True,label='beta')

plt.legend()
# spread of the per-player mean rating, real then generated
print(youth[:,1:].mean(1).std(), pred_vec.mean(1).std())

In [None]:
#(pV*ovr_v).sum(1).mean(),(pV*ovr_v).sum(1).std()

In [None]:
# Mean and std of the generated players' weighted overall score.
gen_ovr_total = (pred_vec*ovr_v).sum(1)
gen_ovr_total.mean(), gen_ovr_total.std()

In [None]:
# Rating covariance of real vs generated rookies, side by side on one colour scale.
cov_panels = [('real players', youth[:,1:]), ('generated', pred_vec)]
for slot, (panel_title, sample) in enumerate(cov_panels, start=1):
    plt.subplot(1,2,slot)
    plt.imshow(np.cov(sample,rowvar=False),vmin=-130,vmax=130,cmap='RdBu')
    plt.title(panel_title)
    plt.xticks(np.arange(15),cols,rotation=45)
    _ = plt.yticks(np.arange(15),cols)

In [None]:
PC = 50  # percentile of the weighted overall score used as the cut-off

# Masks selecting the players above the PC-th percentile in each population.
real_scores = (youth[:,1:]*ovr_v).sum(1)
s1 = real_scores > np.percentile(real_scores,PC)
gen_scores = (pred_vec*ovr_v).sum(1)
s2 = gen_scores > np.percentile(gen_scores,PC)

# Same covariance comparison as before, restricted to the selected players.
top_panels = [('real players', youth[s1,1:]), ('generated', pred_vec[s2])]
for slot, (panel_title, sample) in enumerate(top_panels, start=1):
    plt.subplot(1,2,slot)
    plt.imshow(np.cov(sample,rowvar=False),vmin=-130,vmax=130,cmap='RdBu')
    plt.title(panel_title)
    plt.xticks(np.arange(15),cols,rotation=45)
    _ = plt.yticks(np.arange(15),cols)

In [None]:
# (mean, max) absolute gap between real and generated rookies for: overall
# means, selected-subset means, selected-subset stds.
moment_gaps = [
    youth[:,1:].mean(0) - pred_vec.mean(0),
    youth[s1,1:].mean(0) - pred_vec[s2].mean(0),
    youth[s1,1:].std(0) - pred_vec[s2].std(0),
]
[(abs(gap).mean(), abs(gap).max()) for gap in moment_gaps]

In [None]:
# Compare the overall-score distributions of the selected real (v1) and
# generated (v2) rookies on a shared set of bin edges spanning v1's range.
v1 = (ovr_v*youth[s1,1:]).sum(1)
v2 = (ovr_v*pred_vec[s2]).sum(1)
tb = int(np.ceil(v1.max()))
bb = int(np.floor(v1.min()))
sN = tb-bb
hist_set = np.linspace(bb,tb,sN)
# the 1e-6 offset keeps empty bins out of log(0) and division by zero
h1 = np.histogram(v1,hist_set,density=True)[0]+1e-6
h2 = np.histogram(v2,hist_set,density=True)[0]+1e-6
kl1 = (np.log(h1/h2)*h1).sum()
kl2 = (np.log(h2/h1)*h2).sum()
plt.hist(v1,alpha=0.5,density=True)
plt.hist(v2,alpha=0.5,density=True)
# Symmetric divergence: both directions summed. The cell previously ended with
# `kl2+kl2`, which counted one direction twice and ignored kl1.
kl1+kl2

In [None]:
def eval_f(params):
    """Objective used to tune the rookie generator's noise parameters.

    `params` is a flat vector read as:
      * params[:COMP]        -> log of `set1` (per-component noise scale)
      * params[COMP:COMP+2]  -> log of `set3` (lower bound and width of the
                                multiplicative jitter)
      * params[-COMP:]       -> `set2` (per-component offset)

    For 60 fixed seeds a synthetic population is generated and compared with
    the real rookies in `youth` on covariance, mean, std and the histogram of
    the weighted overall score, for all players and for those above the PC-th
    percentile. The per-seed error terms are multiplied together, averaged
    over the seeds, and combined with penalty terms on the parameters.

    Returns 1e20 as soon as one seed produces a NaN error.

    Side effect: reseeds NumPy's global RNG on every iteration.
    Reads the notebook globals COMP, PC, f_hgt, pred_res, clf_pca, cols,
    youth and ovr_v.
    """
    set1 = np.exp(params[:COMP])
    set2 = np.array(params[-COMP:])
    set3 = np.exp(params[COMP:COMP+2])

    # ---- real-player statistics --------------------------------------------
    # None of these depend on the random draw, so they are computed once here
    # rather than in each of the 60 iterations.
    s1 = (youth[:,1:]*ovr_v).sum(1)
    s1 = s1 > np.percentile(s1,PC)
    cov_real_top = np.cov(youth[s1,1:],rowvar=False)
    cov_real_all = np.cov(youth[:,1:],rowvar=False)
    mean_real_top = youth[s1,1:].mean(0)
    mean_real_all = youth[:,1:].mean(0)
    std_real_top = youth[s1,1:].std(0)
    std_real_all = youth[:,1:].std(0)

    v1 = (ovr_v*youth[s1,1:]).sum(1)
    tb = int(np.ceil(v1.max()))
    bb = int(np.floor(v1.min()))
    hist_set1 = np.linspace(bb,tb,tb-bb)
    # the 1e-6 offset keeps empty bins out of log(0) and division by zero
    h1_real = np.histogram(v1,hist_set1,density=True)[0]+1e-6

    res = []
    for i in range(60):
        np.random.seed(242+i)
        ADD_VAR = set1*(np.random.randn(f_hgt.shape[0],COMP))+set2

        MUL_VAR = np.random.uniform(set3[0],set3[0]+set3[1],size=(f_hgt.shape[0],15))
        pred_vec = ((ADD_VAR+pred_res) @ clf_pca.components_[:COMP,:]) + clf_pca.mean_
        pred_vec *= MUL_VAR
        pred_vec[:,cols.index('hgt')] = f_hgt

        # filter to only the top half with good stats
        s2 = (pred_vec*ovr_v).sum(1)
        s2 = s2 > np.percentile(s2,PC)

        cov_err = ((cov_real_top-np.cov(pred_vec[s2],rowvar=False))**2).mean()
        cov_err2 = ((cov_real_all-np.cov(pred_vec,rowvar=False))**2).mean()

        v2 = (ovr_v*pred_vec[s2]).sum(1)
        tb = int(np.ceil(v2.max()))
        bb = int(np.floor(v2.min()))
        hist_set2 = np.linspace(bb,tb,tb-bb)

        # divergence in both directions, on bin edges from the real range ...
        h2 = np.histogram(v2,hist_set1,density=True)[0]+1e-6
        kl1 = (np.log(h1_real/h2)*h1_real).sum()
        kl2 = (np.log(h2/h1_real)*h2).sum()
        # ... and again on bin edges from the generated range
        h1 = np.histogram(v1,hist_set2,density=True)[0]+1e-6
        h2 = np.histogram(v2,hist_set2,density=True)[0]+1e-6
        kl12 = (np.log(h1/h2)*h1).sum()
        kl22 = (np.log(h2/h1)*h2).sum()
        mean_err = kl1+kl2+kl12+kl22

        mean_err2 = ((mean_real_top-pred_vec[s2].mean(0))**2).sum()
        mean_err3 = ((mean_real_all-pred_vec.mean(0))**2).sum()
        std_err1 = ((std_real_top-pred_vec[s2].std(0))**2).sum()
        std_err2 = ((std_real_all-pred_vec.std(0))**2).sum()

        # whole-population terms enter with exponent 0.1, the others at full weight
        res.append( std_err1*(std_err2**0.1)*cov_err*mean_err*mean_err2*(mean_err3**0.1)*(cov_err2**0.1 )) # *mean_err3*cov_err2
        if np.isnan(res[-1]):
            return 1e20
    # Penalty terms: lower for larger set3[0] and sum(set1), higher for larger
    # set3[1] and norm(set2).
    return np.mean(sorted(res)) -10*set3[0]+10*set3[1] -np.sum(set1)+ np.linalg.norm(set2)

# Starting point for the optimiser, laid out the way eval_f reads it.
x0 = np.array([
    2.18455945, 2.99055438, 2.05511653,      # log(set1)
    -0.2050731, -0.88629717,                 # log(set3)
    -0.76933933, -11.05606385, 1.76523402,   # set2
])
eval_f(x0)
#plt.hist(eval_f(x0)[1],20,alpha=0.5,density=True)
#plt.hist((ovr_v*youth[:,1:]).sum(1),20,density=True,alpha=0.5)

In [None]:
import cma

# Run CMA-ES on eval_f starting from x0 with an initial step size of 0.5.
# NOTE(review): popsize is given as a float (25.5) — confirm how cma rounds it.
cma_opts = {'popsize':25.5}
es = cma.CMAEvolutionStrategy(x0, 0.5, cma_opts)
es.optimize(eval_f)

In [None]:
with open('beta_t.json','rb') as fp:
    rand_data = json.load(fp)

# First ratings entry (in `cols` order) of every player whose draft year
# equals the season of that first entry.
beta_p2 = np.array([
    np.array([p['ratings'][0][c] for c in cols])
    for p in rand_data['players']
    if p['draft']['year'] == p['ratings'][0]['season']
])