In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Load the historical player data

In [None]:
# Parse the whole player file into memory. The context manager closes the
# file handle as soon as parsing finishes instead of leaking it.
with open('real-player.json','rb') as player_file:
    data = json.load(player_file)

In [None]:
# Tabulate the 'ratings' section of the loaded JSON.
df = pd.DataFrame(data['ratings'])

In [None]:
# Discard two columns that the analysis does not use.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects from 2.0 onwards.
df = df.drop(columns=['fuzz','abbrev_if_new_row'])

In [None]:
# Round-trip through the index: set_index + reset_index leaves 'slug' and
# 'season' as the first two columns, which the positional slicing in the
# following cells (columns[2:], row[1], row[2]) relies on.
df = df.set_index(['slug','season']).reset_index()

In [None]:
# Names of every column after the first two (expected to be 'slug' and 'season').
cols = list(df.columns[2:])

In [None]:
# Lookup table: (slug, season) -> list of the remaining column values of that row.
ratings = {
    (slug, season): values
    for slug, season, *values in df.itertuples(index=False, name=None)
}

In [None]:
# Spot-check: display the bio record stored under one slug.
data['bios']['abdulka01']

In [None]:
# Spot-check: display the rating values stored for one (slug, season) key.
ratings[('jordami01',1985)]

In [None]:
# only use recent-ish players
from collections import defaultdict

# slug -> {age: ratings array}, where age = season - bornYear
player_year_rate = defaultdict(dict)
for (slug, season), rating_row in ratings.items():
    born = data['bios'][slug]['bornYear']
    # drop players born before 1956 and any season after 2019
    if born < 1956 or season > 2019:
        continue
    player_year_rate[slug][season - born] = np.array(rating_row)

In [None]:
# smooth their ratings
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.ndimage` is loaded on every SciPy version.
import scipy.ndimage
SMOOTHING_STD = 0.65
play = player_year_rate['malonka01'] # greendr01 jamesle01 hardeja01 malonka01
minY = min(play.keys())
maxY = max(play.keys())
# One row per age between the first and last age with data; an age without
# data repeats the previous row (forward fill).
res = []
for i in range(minY,maxY+1):
    res.append(play[i] if i in play else res[-1])
res = np.array(res).astype(float)
# Gaussian smoothing along the age axis; mode='nearest' pads with edge values.
smoothed = scipy.ndimage.gaussian_filter1d(res,SMOOTHING_STD,mode='nearest',axis=0,truncate=10)

# Compare raw vs. smoothed values for one rating column.
i = 8
age_axis = range(minY,maxY+1)
plt.plot(age_axis,res[:,i],label='orig')
plt.plot(age_axis,smoothed[:,i],label='new')
plt.legend()
plt.title(cols[i])

In [None]:
# Smooth every player's rating history along the age axis, then keep only the
# ages that actually had data for that player.
play_year_rateSmooth = {}
for key, play in player_year_rate.items():
    minY, maxY = min(play), max(play)
    span = range(minY, maxY + 1)
    res = []
    for age in span:
        # forward-fill ages with no data from the previous row
        res.append(play[age] if age in play else res[-1])
    res = np.array(res).astype(float)
    reS = scipy.ndimage.gaussian_filter1d(res,SMOOTHING_STD,mode='nearest',axis=0,truncate=10)
    play_year_rateSmooth[key] = {
        age: reS[idx] for idx, age in enumerate(span) if age in play
    }

In [None]:
# Build one sample per pair of consecutive ages from the smoothed curves:
#   r1 - rating change from age-1 to age
#   r2 - the starting age (age-1) of that change
#   r5 - play[age-2] - play[age-1], or zeros when age-2 has no data
r1 = []
r2 = []
r5 = []
for play in play_year_rateSmooth.values():
    for age in play:
        prev_age = age - 1
        if prev_age not in play:
            continue
        r1.append(play[age] - play[prev_age])
        r2.append(prev_age)
        if age - 2 in play:
            r5.append(play[age - 2] - play[prev_age])
        else:
            r5.append(0 * play[age])
r1, r2, r5 = np.array(r1), np.array(r2), np.array(r5)

## Model development

In [None]:
# Mean rating change at each starting age, drawn as one line per rating column.
ages_seen = np.unique(r2)  # np.unique already returns the ages in ascending order
age_res = np.array([r1[r2 == a].mean(0) for a in ages_seen])
for i in range(15):
    plt.plot(ages_seen, age_res[:, i], label=cols[i], c=plt.cm.tab20(i))
plt.xlim(right=35)
plt.legend()

In [None]:
import sklearn.linear_model as linear_model

TIMES_TO_FIT = 35

# Shared aging trend: a single line (slope, intercept) of rating change vs.
# age, fitted on all rating columns pooled together.
X_all = np.repeat(r2, r1.shape[1])[:, None]  # each age repeated once per rating column
y_all = r1.ravel()

# The SGD fit varies from run to run, so fit several times and keep the
# best-scoring run.
clf_models = []
for i in range(TIMES_TO_FIT):
    # `loss` must be passed by keyword: current scikit-learn rejects it positionally.
    clf = linear_model.SGDRegressor(loss='epsilon_insensitive',alpha=1e-5,epsilon=0,max_iter=10000,tol=1e-9)
    clf.fit(X_all, y_all)
    clf_models.append((clf.score(X_all, y_all), clf))
# Compare on the score alone: sorting the (score, estimator) tuples falls back
# to comparing estimators when two scores tie, which raises TypeError.
best_model = max(clf_models, key=lambda pair: pair[0])
clf = best_model[1]
print(best_model[0])
main_model = (clf.coef_[0] , clf.intercept_[0])

In [None]:
# Draw the shared aging line over the observed ages.
trend_slope, trend_intercept = main_model
ages_seen = np.unique(r2)
plt.plot(ages_seen, ages_seen*trend_slope + trend_intercept)
plt.grid(True)

In [None]:
# Per-rating adjustment: for each rating column, fit a line to what is left
# after subtracting the shared aging trend (main_model).
X_age = np.array(r2)[:, None]
shared_trend = main_model[0]*r2 + main_model[1]
models = []
for i in range(r1.shape[1]):
    residual = r1[:, i] - shared_trend
    clf_models = []
    for _ in range(TIMES_TO_FIT):
        # `loss` must be passed by keyword: current scikit-learn rejects it positionally.
        clf = linear_model.SGDRegressor(loss='epsilon_insensitive',alpha=1e-5,epsilon=0,max_iter=10000,tol=1e-9)
        clf.fit(X_age, residual)
        clf_models.append((clf.score(X_age, residual), clf))
    # Pick by score only; sorting (score, estimator) tuples raises TypeError
    # when two scores tie because estimators are not orderable.
    best_model = max(clf_models, key=lambda pair: pair[0])
    clf = best_model[1]
    print(best_model[0])
    models.append((clf.coef_[0],clf.intercept_[0]))

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever name this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Per-rating adjustment lines (fitted on top of the shared trend) over the observed ages.
ages_seen = np.unique(r2)
for i in range(r1.shape[1]):
    plt.plot(ages_seen,ages_seen*models[i][0]+models[i][1],label=cols[i],c=plt.cm.tab20(i))
plt.legend()
plt.xlim(19,34)
plt.ylim(-4,4)
plt.grid(True)

In [None]:
# Expected change per sample for every rating column: the column's own line
# plus the shared aging line, both evaluated at the sample's age.
shared_part = main_model[0]*r2 + main_model[1]
means_expected = [
    col_slope*r2 + col_intercept + shared_part
    for col_slope, col_intercept in models
]

In [None]:
# rank1 approximations of this would be really cool
# but sampling multivariate Gaussians seems... annoying?
# Residuals: observed change minus expected change. means_expected holds one
# array per rating column, so it is transposed to line up with r1's rows.
removed_means = r1 - np.array(means_expected).T

In [None]:
# One covariance heat-map of the residuals per age, laid out on a 4x6 grid.
plt.figure(figsize=(20,20))
tick_pos = np.arange(15)
i = 1
for age in np.unique(r2):
    at_age = r2 == age
    # a covariance needs at least two samples
    if at_age.sum() < 2:
        continue
    plt.subplot(4,6,i)
    i += 1
    covar = np.cov(removed_means[at_age],rowvar=False)
    plt.imshow(covar)
    plt.xticks(tick_pos,cols,rotation=45)
    plt.yticks(tick_pos,cols)
    plt.title('age={}  max={:.0f}'.format(age,covar.max()))
plt.tight_layout(pad=0.1,h_pad=0)
plt.gcf().subplots_adjust(hspace=-0.6)

In [None]:
# Per-age sample counts (rescaled so their mean is 1) and the per-age standard
# deviation of the residuals for each rating column.
ages = sorted(np.unique(r2))
age_masks = [r2 == age for age in ages]
age_stds = np.array([removed_means[mask].std(axis=0) for mask in age_masks])
age_w = np.array([mask.sum() for mask in age_masks])
age_w = age_w/age_w.mean()

In [None]:
# Shared spread model: one RidgeCV line of residual std vs. age, pooled over
# all 15 rating columns and weighted by the per-age sample weights.
clf = linear_model.RidgeCV()#SGDRegressor(loss='epsilon_insensitive',alpha=0,epsilon=0)
clf.fit(np.repeat(ages,15)[:,None],age_stds.ravel(),sample_weight=np.repeat(age_w,15))
# base_model = [trend slope, trend intercept, std slope, std intercept]
base_model = list(main_model) + [clf.coef_[0],clf.intercept_]

In [None]:
# Draw the fitted line of the most recently fitted clf over the observed ages.
plt.plot(np.unique(r2),np.unique(r2)*clf.coef_[0] +clf.intercept_,lw=3)

In [None]:
# Per-rating spread adjustment: for each rating column, fit a line to the part
# of its per-age std that exceeds the shared std line (clipped at zero).
age_arr = np.array(ages)
shared_std = age_arr*base_model[2] + base_model[3]
std_models = []
for i in range(15):
    excess_std = np.maximum(0,age_stds[:,i]-shared_std)
    clf = linear_model.RidgeCV()
    clf.fit(age_arr[:,None],excess_std,sample_weight = age_w)
    std_models.append((clf.coef_[0],clf.intercept_))

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever name this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)
# Per-rating std adjustment lines over the observed ages.
ages_seen = np.unique(r2)
for i in range(r1.shape[1]):
    plt.plot(ages_seen,ages_seen*std_models[i][0] + std_models[i][1],label=cols[i],c=plt.cm.tab20(i),lw=3)
plt.legend()
plt.xlim(19,34)
plt.grid(True)

In [None]:
# Display the fitted (slope, intercept) pairs, one per rating column.
models

In [None]:
# Display the intercept of whichever estimator `clf` currently refers to
# (presumably the last one fitted in the std_models loop, if cells ran in order).
clf.intercept_

In [None]:
# Per rating column: (trend slope, trend intercept, std slope, std intercept),
# each rounded to 3 decimals.
dat_print = {cols[i]:tuple(np.round(row,3)) for i,row in enumerate(np.hstack([models,std_models]))}

In [None]:
# Dump the fitted coefficients as text: the shared base model first, then one
# line per rating column, leaving out 'hgt'.
base_rounded = list(np.round(base_model,3))
print('{} {},'.format("base",base_rounded))
for name, coeffs in dat_print.items():
    if name != 'hgt':
        print('{}: {},'.format(name,list(coeffs)))

In [None]:
# 99th and 1st percentiles of the expected changes along axis 0, averaged.
# NOTE(review): means_expected is a list with one array per rating column, so
# axis 0 here runs over rating columns, whereas axis 0 of r1 runs over
# samples — confirm the two summaries are meant to be comparable.
np.quantile(means_expected,0.99,axis=0).mean(),np.quantile(means_expected,0.01,axis=0).mean()

In [None]:
# 99th and 1st percentiles of the observed changes along axis 0 (one value
# per rating column), averaged over the columns.
np.quantile(r1,0.99,axis=0).mean(),np.quantile(r1,0.01,axis=0).mean()

## Model Rookies

In [None]:
# Peek at one bio record to see which fields are available.
# (A bare `p` here only worked after a later cell had already been run; on a
# fresh kernel it raises NameError.)
next(iter(data['bios'].values()))

In [None]:
# Collect rookies: players drafted in 2000-2019 at an age below 25 who have
# ratings for the season after their draft year.
# Each youth row is [age at draft] followed by that season's ratings.
youth = []
names = []
positions = []
for k,p in data['bios'].items():
    born = p.get('bornYear')
    yr = p.get('draftYear')
    # Skip bios missing either year; draftYear gets the same guard as
    # bornYear so a missing or None value cannot crash the subtraction.
    if born is None or yr is None:
        continue
    age = yr-born
    if 2000 <= yr < 2020 and age < 25 and (k,yr+1) in ratings:# and p['draftPick'] < 45:
        youth.append([age] + ratings[(k,yr+1)])
        names.append(k)
        positions.append(p['pos'])
youth = np.array(youth)

In [None]:
# Histogram (50 bins) of every youth value divided by its column mean;
# assigning to `_` suppresses the cell's return-value output.
_ = plt.hist((youth/youth.mean(0)).ravel(),50)

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# PCA over the rating columns only (column 0 of youth is the age and is skipped).
clf_pca = PCA(whiten =False)#TSNE(perplexity=55)
emb = clf_pca.fit_transform(youth[:,1:].astype(np.float32))

In [None]:
# Scatter the first two PCA components, coloured by each player's position
# index within pos_set.
pos_set = ['PG','G','SG',"GF",'SF','F','PF','FC',"C"]
pos_codes = list(map(pos_set.index, positions))
plt.scatter(emb[:,0],emb[:,1],c=pos_codes,cmap='RdBu')

In [None]:
# Print the PCA mean of each rating column, rounded to one decimal.
for c,v in zip(cols,np.round(clf_pca.mean_,1)):
    print(c,':',v,',')

In [None]:
# Fraction of total variance captured by each principal component.
clf_pca.explained_variance_ratio_

In [None]:
# Predict each of the first COMP PCA components from height alone with a
# cross-validated ridge regression; keep the predictions and the coefficients.
COMP = 3
hgt_col = 1 + cols.index('hgt')  # +1 skips the leading age column of youth
hgt = youth[:, hgt_col]
X_hgt = hgt.reshape(-1, 1)
pred_res = []
hgt_models = []
for comp_idx in range(COMP):
    target = emb[:, comp_idx]
    clf = linear_model.RidgeCV(cv=3,alphas=np.logspace(-5,3,9)).fit(X_hgt,target)
    print(clf.score(X_hgt,target))
    pred_res.append(clf.predict(X_hgt))
    hgt_models.append([*clf.coef_, clf.intercept_])
# rows = players, columns = components
pred_res = np.array(pred_res).T

In [None]:
# Display the height-model coefficients (coef..., intercept) per component, rounded to 2 decimals.
np.round(hgt_models,2)

In [None]:
# Display the loadings of the first COMP principal components.
clf_pca.components_[:COMP,:]

In [None]:
# Simulate rookie ratings: take the height-predicted component scores, add
# Gaussian noise (std 10) per component, project back to rating space, then
# scale every rating by a uniform factor in [0.6, 1.4).
# A seeded generator makes the simulated sample reproducible across runs.
rng = np.random.default_rng(0)
n_players = X_hgt.shape[0]
n_ratings = clf_pca.components_.shape[1]  # replaces the hard-coded 15
ADD_VAR = 10*rng.standard_normal((n_players,COMP))
MUL_VAR = 1+0.8*(rng.random((n_players,n_ratings))-0.5)
pred_vec = ((ADD_VAR+pred_res) @ clf_pca.components_[:COMP,:]) + clf_pca.mean_
pred_vec *= MUL_VAR

In [None]:
# Mean absolute difference between simulated and actual ratings, per rating column.
abs(pred_vec - youth[:,1:]).mean(0)

In [None]:
# Compare the distribution of the per-player mean rating: actual vs. simulated.
real_mean = youth[:,1:].mean(1)
sim_mean = pred_vec.mean(1)
for sample in (real_mean, sim_mean):
    plt.hist(sample,50,alpha=0.5,density=True)
print(real_mean.std(),sim_mean.std())