In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist, cdist
from collections import defaultdict

## Load the historic player data

In [None]:
# Read the historic player export. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage collection.
with open('real-player.json','rb') as player_file:
    data = json.load(player_file)

In [None]:
# Build the ratings table from the 'ratings' entry of the loaded JSON.
df = pd.DataFrame(data['ratings'])

In [None]:
# Remove the two columns that are not used as rating inputs. The explicit
# `columns=` keyword replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.x no longer accepts.
df = df.drop(columns=['fuzz','abbrev_if_new_row'])

In [None]:
# Round-trip through the index so that 'slug' and 'season' become the two
# leading columns; later cells rely on that position (row[1], row[2], df.columns[2:]).
df = df.set_index(['slug','season']).reset_index()

In [None]:
# Rating names: every column after the first two.
cols = list(df.columns[2:])

# Weight of each rating in the weighted-sum "overall" used throughout the notebook.
ovr_weights = dict(
    diq=0.093,
    dnk=0.0424,
    drb=0.0968,
    endu=0.0075,
    fg=-0.0093,
    ft=0.049,
    hgt=0.225,
    ins=-0.0143,
    jmp=0.0505,
    oiq=0.0971,
    pss=0.0657,
    reb=0.0534,
    spd=0.156,
    stre=0.0962,
    tp=0.105,
)

# The same weights as a vector ordered like `cols`.
ovr_v = np.array([ovr_weights[rating] for rating in cols])

In [None]:
# ratings: slug -> {age: [rating values]}. `stats` is created here but not filled in this cell.
ratings = defaultdict(dict)
stats = defaultdict(dict)

for row in df.itertuples():
    name, r_yr = row[1], row[2]
    bio = data['bios'][name]
    yr = bio['draftYear']
    born = bio['bornYear']
    # Skip players without a usable draft year or birth year.
    if yr is None or yr == 0 or born is None or born == 0:
        continue
    # Skip the draft-year row itself and seasons outside 1984-2018.
    if yr == r_yr or r_yr >= 2019 or r_yr <= 1983:
        continue
    # A player's record starts with the season right after the draft; once it
    # has started, every later row in range is added as well.
    if yr+1 == r_yr or name in ratings:
        ratings[name][r_yr-born] = list(row[3:])

In [None]:
# Number of players that ended up with at least one stored season.
len(ratings)

In [None]:
# Peek at one player. `ratings` is a defaultdict, so plain indexing would insert
# an empty record for a missing slug and break the later `min(v.keys())` calls;
# `.get` inspects without mutating (and shows nothing if the slug is absent).
ratings.get('malonka01')

In [None]:
# Every age that occurs for any player (sorted, unique).
# Flattening with a nested comprehension avoids the quadratic cost of
# `sum(list_of_lists, [])` while producing the same elements in the same order.
ages = np.unique([age for by_age in ratings.values() for age in by_age])

# One row per player-season: [age, rating_1, ..., rating_n].
all_play = np.array([
    [age] + vals
    for by_age in ratings.values()
    for age, vals in by_age.items()
])


In [None]:
# (first recorded age, that player's seasons) for every player.
first_seasons = [(min(by_age), by_age) for by_age in ratings.values()]

# youth: [age, ratings...] of each player's first recorded season, kept only when age < 24.
youth = np.array([[age] + by_age[age] for age, by_age in first_seasons])
youth = youth[youth[:,0] < 24]

# rookie_progs: [age, rating change from the first to the second season...].
# The age is prepended as its own column. The previous `[age] + np.array(...)`
# broadcast the age onto every rating delta, so column 0 was not an age and the
# `< 24` filter below tested the wrong quantity.
rookie_progs = np.array([
    [age] + list(np.array(by_age[age+1]) - np.array(by_age[age]))
    for age, by_age in first_seasons
    if age+1 in by_age
])
rookie_progs = rookie_progs[rookie_progs[:,0]<24]

# Distribution of the weighted-sum overall (minus the 6.4 offset used throughout) for first seasons.
plt.hist((youth[:,1:] * ovr_v).sum(1)-6.4,12)

In [None]:
# One ratings-covariance heatmap per age on a 4x5 grid; ages above 38 are skipped.
for age in ages:
    if age <= 38:
        plt.subplot(4,5,age-18)
        r_age = np.array([by_age[age] for by_age in ratings.values() if age in by_age])
        plt.imshow(np.cov(r_age,rowvar=False),cmap='RdBu',vmin=-150,vmax=150)
        plt.axis('off')
plt.tight_layout(h_pad=0.1,w_pad=0)

## model classes

In [None]:
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, MiniBatchKMeans

In [None]:
# Ratings only (column 0 of all_play is the age), as float32.
class_points = all_play[:,1:].astype(np.float32)

# Divide every row by its own weighted-sum overall (minus the 6.4 offset), so
# clustering sees the rating profile rather than the overall level.
row_ovr = (class_points * ovr_v).sum(1)-6.4
class_points /= row_ovr[:,None]

# Three-cluster k-means; its centres seed the Gaussian mixture fitted next.
clfk = MiniBatchKMeans(n_clusters=3,n_init=100)
clfk.fit(class_points)

In [None]:
# Full-covariance 3-component mixture, initialised at the k-means centres.
clfg = GaussianMixture(
    n_components=3,
    covariance_type='full',
    means_init=clfk.cluster_centers_,
)
_ = clfg.fit(class_points)

In [None]:
# Component means relative to the overall mean profile.
class_scale = clfg.means_/class_points.mean(0)

# Name the components by ascending relative height: smallest -> 'guard', largest -> 'big'.
# fix_c maps the mixture's component index to that ordered class index.
des = ['guard','wing','big']
tmp = ['a']*3
fix_c = {}
hgt_order = np.argsort(class_scale[:,cols.index('hgt')])
for rank, comp in enumerate(hgt_order):
    fix_c[comp] = rank
    tmp[comp] = des[rank]

c_df = pd.DataFrame(class_scale,columns=cols,index=tmp).round(2)
print(fix_c)
c_df

In [None]:
# Multipliers read off the fitted mixture: per class, every rating whose
# rounded scale differs from 1 by more than 0.001.
types_new = {}
for row in c_df.itertuples():
    types_new[row[0]] = {c: v for c, v in zip(cols, row[1:]) if abs(v-1) > 0.001}

# Hand-written per-class multipliers kept for comparison (selected as `types` further down).
types_og = {
    "guard": dict(jmp=1.65, spd=1.65, drb=1.5, pss=1.5, ft=1.4, fg=1.4, tp=1.4, oiq=1.2, endu=1.4),
    "wing": dict(drb=1.2, dnk=1.5, jmp=1.4, spd=1.4, ft=1.2, fg=1.2, tp=1.2),
    "big": dict(stre=1.2, ins=1.6, dnk=1.5, reb=1.4, ft=0.8, fg=0.8, tp=0.8, diq=1.2),
}


In [None]:
# Mixture labels remapped to the height-ordered class index (0, 1, 2).
clabel = np.array([fix_c[raw] for raw in clfg.predict(class_points)])

chistbin = np.linspace(0.25,1.75,18)
cmean = all_play[:,1:].mean(0)[6]

# Per class: member count and density histogram of column 6 of the normalised
# ratings (with a small floor so later logs and ratios stay finite).
c_n = []
chist = []
for cls in range(3):
    in_cls = clabel==cls
    c_n.append(in_cls.sum())
    chist.append(np.histogram(class_points[in_cls,6],chistbin,density=True)[0]+1e-4)
    plt.hist(class_points[in_cls,6],chistbin,alpha=0.5,density=True,label=str(cls))
plt.legend()

In [None]:
# Class-assignment parameters, used below as x[0]*all_play[:,7] + x[1] plus
# x[2]-scaled Gaussian noise.
x0_c = [  0.23, -11,  1.2]
x = x0_c
def eval_c(x):
    """Score class-assignment parameters `x = [slope, intercept, noise_std]`.

    A noisy linear function of all_play[:,7] is rounded and clipped to a class
    in {0, 1, 2}. For each class, the density histogram of that column (divided
    by `cmean`) is compared with the reference histogram `chist` using a
    symmetric KL sum. The total is multiplied by the squared difference between
    the simulated class counts and the reference counts `c_n`.

    The noise is re-seeded with 42 on every call, so the score is deterministic
    in `x`. Returns 1e9 when the KL total is NaN.
    """
    np.random.seed(42)
    hgt = all_play[:,7]
    pred_c = x[0]*hgt+x[1]
    pred_c = pred_c + x[2]*np.random.randn(all_play.shape[0])
    rnd = np.clip(np.round(pred_c),0,2)

    kl = 0
    c_t = []
    for cls in range(3):
        picked = rnd==cls
        c_t.append(picked.sum())
        phist = np.histogram(hgt[picked]/cmean,chistbin,density=True)[0]+1e-4
        ref = chist[cls]
        kl += (phist * np.log(phist/ref)).sum()
        kl += (ref * np.log(ref/phist)).sum()

    if np.isnan(kl):
        return 1e9
    count_gap = np.array(c_t)-np.array(c_n)
    return kl*(count_gap**2).sum()
# Score the starting parameters.
print(eval_c(x0_c))

#import cma
#es = cma.CMAEvolutionStrategy(x0_c,0.1)
#res = es.optimize(eval_c)

In [None]:
#es.mean

In [None]:
# Noisy linear class score from column 7 of all_play, shown against that column
# and coloured by the rounded class (0, 1 or 2).
hgt_all = all_play[:,7]
pred_cf = x0_c[0]*hgt_all+x0_c[1] + x0_c[2]*np.random.randn(all_play.shape[0])
pred_c = np.clip(pred_cf,0,2)
rnd = np.clip(np.round(pred_c),0,2)
plt.scatter(hgt_all,pred_cf,c=rnd)

In [None]:
# Density histogram of all_play[:,7] for each predicted class, on the
# reference bins rescaled by cmean.
height_bins = cmean*chistbin
for cls in range(3):
    plt.hist(all_play[rnd==cls,7],height_bins,alpha=0.5,density=True)

## model features

In [None]:
# Alternative per-class rating multipliers, stored as literals.
types_opt = {
    'guard': dict(
        diq=1.2, dnk=1.8, drb=1.5, endu=1.5, fg=1.5, ft=1.3,
        ins=1.2, jmp=1.6, oiq=1.3, pss=1.6, spd=1.3, tp=1.3,
    ),
    'wing': dict(
        diq=0.9, dnk=2.1, drb=1.2, endu=1.1, fg=1.3, ft=1.2, ins=1.2,
        jmp=1.4, oiq=1.2, pss=0.8, reb=1.2, spd=1.1, tp=1.2,
    ),
    'big': dict(
        dnk=2.1, drb=1.2, endu=1.3, fg=1.2, ins=1.4, jmp=1.3,
        oiq=1.2, pss=1.1, reb=1.2, spd=1.1, stre=1.2,
    ),
}

In [None]:
# Rating groups that share one random multiplier each; 'ins' is a group of its own.
athleticismRatings = ["stre", "spd", "jmp", "endu", "dnk"]
shootingRatings = ["ft", "fg", "tp"]
skillRatings = ["oiq", "diq", "drb", "pss", "reb"]


def _group_indicator(group):
    """Return a 0/1 vector over `cols` that marks the ratings listed in `group`."""
    return np.array([int(rating in group) for rating in cols])


v1 = _group_indicator(athleticismRatings)
v2 = _group_indicator(shootingRatings)
v3 = _group_indicator(skillRatings)
v4 = _group_indicator(['ins'])

# One row per group, one column per rating.
vmul = np.array([v1,v2,v3,v4])

# Parameter presets for the generator. Only the final `else` branch runs; the
# `if False` / `elif False` branches are earlier presets kept for reference.
# Each preset sets:
#   mean_v - per-rating base mean, one entry per rating (index 6 is 0 because
#            that column is overwritten with the sampled height further down)
#   r_std  - one-element list: std of the per-rating Gaussian noise
#   v_std  - stds of the four group multipliers (rows of vmul)
#   types  - which per-class multiplier table to use
if False:
    mean_v = np.array([22, 27, 37, 17, 32, 32, 0, 27, 40, 22, 37, 37, 40, 37, 32])
    x0_c = [  0.23, -11,  1.2]
    r_std = [3]
    v_std = [0.2,0.2,0.2,0.2]
    types = types_og
elif False:
    mean_v = np.array([43, 48, 50, 38, 46, 43, 0, 44, 57, 40, 47, 46, 52, 47, 46])
    r_std = [2.9]
    v_std = [0.12,0.17,0.12,0.22]
    types = types_new
elif False:
    mean_v = np.array([42, 43, 38, 28, 36, 36, 0, 40, 36, 37, 37, 46, 36, 49, 38])
    r_std = [3.3]
    v_std = [0.104,0.115,0.104,0.13]
    types = types_og
else:
    mean_v = np.array([42, 43, 38, 29, 36, 35, 0, 40, 36, 37, 37, 45, 36, 49, 38])
    r_std = [3.9]
    v_std = [0.10,0.14,0.11,0.20]
    types = types_og

# Class-by-rating multiplier matrix (rows: guard, wing, big); ratings a class
# does not list get a multiplier of 1.
c_mul = np.array([[types[t].get(c,1) for c in cols] for t in ['guard','wing','big']])


In [None]:
# Print the base means as "name : value ," lines, leaving out height.
for rating, base_mean in zip(cols,mean_v):
    if rating == 'hgt':
        continue
    print(rating,':',base_mean,',')

In [None]:
# Generate a synthetic population with the current preset.
# Heights: column 7 of `youth` plus 12500 Gaussian draws (mean 47.5, std 13.6).
rand_hgt = np.random.randn(12500)*13.6 + 47.5
f_hgt = np.array(list(youth[:,7]) + list(rand_hgt))

# Class per simulated player: noisy linear function of height, rounded and clipped to {0, 1, 2}.
simN = f_hgt.shape[0]
pred_c = x0_c[0]*f_hgt+x0_c[1]
pred_cf = pred_c + x0_c[2]*np.random.randn(simN)
pred_c = np.clip(pred_cf,0,2)
rnd = np.clip(np.round(pred_c),0,2).astype(int)

# v_m: per-player, per-rating multiplier = 1 + the group-level Gaussian draws
# spread onto each group's ratings through vmul.
v_m = (((np.random.randn(simN,4) * np.array(v_std) ) @ vmul) + 1)
# Base ratings: mean plus independent noise, then scaled by the group and class multipliers.
pred_vec = (mean_v  + r_std[0]*np.random.randn(simN,15))
pred_vec[:,6] = f_hgt
pred_vec = v_m * c_mul[rnd] * pred_vec
# Column 6 is reset to the sampled height so it is not scaled by the multipliers.
pred_vec[:,6] = f_hgt


In [None]:
# Overlay the weighted-sum overall distributions of `youth` ('rpd') and the generated sample ('gen').
real_ovr = (youth[:,1:]*ovr_v).sum(1)-6.4
gen_ovr = (pred_vec*ovr_v).sum(1)-6.4
_ = plt.hist(real_ovr,25,alpha=0.5,density=True,label='rpd')
_ = plt.hist(gen_ovr,25,alpha=0.5,density=True,label='gen')

plt.legend()
# Spread of the per-player mean rating in each sample.
print(youth[:,1:].mean(1).std(),pred_vec.mean(1).std())

In [None]:
# Side-by-side rating covariance matrices: `youth` versus the generated sample.
cov_panels = (('real players', youth[:,1:]), ('generated', pred_vec))
for slot, (panel_title, sample) in enumerate(cov_panels, start=1):
    plt.subplot(1,2,slot)
    plt.imshow(np.cov(sample,rowvar=False),vmin=-130,vmax=130,cmap='RdBu')
    plt.title(panel_title)
    plt.xticks(np.arange(15),cols,rotation=45)
    plt.yticks(np.arange(15),cols)

In [None]:
# Same covariance comparison, restricted to players above the PC-th percentile
# of weighted-sum overall within their own sample.
PC = 50
s1 = (youth[:,1:]*ovr_v).sum(1)
s1 = s1 > np.percentile(s1,PC)
s2 = (pred_vec*ovr_v).sum(1)
s2 = s2 > np.percentile(s2,PC)

top_panels = (('real players', youth[s1,1:]), ('generated', pred_vec[s2]))
for slot, (panel_title, sample) in enumerate(top_panels, start=1):
    plt.subplot(1,2,slot)
    plt.imshow(np.cov(sample,rowvar=False),vmin=-130,vmax=130,cmap='RdBu')
    plt.title(panel_title)
    plt.xticks(np.arange(15),cols,rotation=45)
    plt.yticks(np.arange(15),cols)

In [None]:
def eval_f(params):
    """Score one candidate parameter vector for the ratings generator.

    `params` is in log space: entries 0-14 are the per-rating base means,
    entry 15 the per-rating noise std, entries 16-19 the stds of the four
    group multipliers (rows of `vmul`). The class multipliers come from the
    module-level `c_mul`; entries past index 19 are ignored.

    For each of 30 fixed seeds a synthetic population is generated and
    compared with `youth`. The comparison covers covariance, mean, std and
    overall-histogram (symmetric KL) discrepancies, computed on the whole
    sample, on the top half by weighted overall, and per predicted class.
    The per-seed score is the product of these terms.

    Returns the mean per-seed score raised to the power 1/5.3, or 1e20 as
    soon as any per-seed score is NaN. Prints the decoded parameters on every
    call.
    """
    #np.random.seed(43)
    mean_v = np.exp(params[:15])
    r_std = np.exp(params[15:16])
    v_std = np.exp(params[16:20])
    cmul2 = c_mul#np.exp(params[20:]).reshape((3,15))
    res = []
    print(mean_v,r_std,v_std,cmul2)

    # Statistics of the reference sample do not depend on the seed, so they
    # are computed once instead of on every trial.
    real_all = youth[:,1:]
    s1 = (real_all*ovr_v).sum(1)
    s1 = s1 > np.percentile(s1,50)  # top half of the reference sample by overall
    real_top = youth[s1,1:]
    real_top_ovr = (ovr_v*real_top).sum(1)
    real_top_cov = np.cov(real_top,rowvar=False)
    real_all_cov = np.cov(real_all,rowvar=False)

    for trial in range(30):
        np.random.seed(542+trial)

        # --- generate a synthetic population (same recipe as the simulation cell) ---
        rand_hgt = np.random.randn(1500)*13.6 + 47.5
        f_hgt = np.array(list(youth[:,7]) + list(rand_hgt))

        simN = f_hgt.shape[0]
        pred_c = x0_c[0]*f_hgt+x0_c[1]
        pred_cf = pred_c + x0_c[2]*np.random.randn(simN)
        pred_c = np.clip(pred_cf,0,2)
        rnd = np.clip(np.round(pred_c),0,2).astype(int)

        v_m = (((np.random.randn(simN,4) * np.array(v_std) ) @ vmul) + 1)
        pred_vec = (mean_v  + r_std[0]*np.random.randn(simN,15))
        pred_vec[:,6] = f_hgt
        pred_vec = v_m * cmul2[rnd] * pred_vec
        pred_vec[:,6] = f_hgt

        # top half of the generated sample by overall
        s2 = (pred_vec*ovr_v).sum(1)
        s2 = s2 > np.percentile(s2,50)
        gen_top = pred_vec[s2]

        cov_err = ((real_top_cov-np.cov(gen_top,rowvar=False))**2).mean()
        cov_err2 = ((real_all_cov-np.cov(pred_vec,rowvar=False))**2).mean()

        # Symmetric KL between the overall histograms of the two top halves,
        # on bins spanning the combined range (clamped to [1, 100]).
        gen_top_ovr = (ovr_v*gen_top).sum(1)
        tb = min(100,int(np.ceil(max(real_top_ovr.max(),gen_top_ovr.max()))))
        bb = max(1,int(np.floor(min(real_top_ovr.min(),gen_top_ovr.min()))))
        sN = tb-bb
        hist_set = np.linspace(bb,tb,sN)
        h1 = np.histogram(real_top_ovr,hist_set,density=True)[0]+1e-4
        h2 = np.histogram(gen_top_ovr,hist_set,density=True)[0]+1e-4
        kl1 = (np.log(h1/h2)*h1).sum()
        kl2 = (np.log(h2/h1)*h2).sum()
        mean_err = kl1+kl2

        mean_err2 = ((real_top.mean(0)-gen_top.mean(0))**2).sum()
        mean_err3 = ((real_all.mean(0)-pred_vec.mean(0))**2).sum()
        std_err1 = ((real_top.std(0)-gen_top.std(0))**2).sum()
        std_err2 = ((real_all.std(0)-pred_vec.std(0))**2).sum()

        # Assign the reference players to classes with the same noisy rule,
        # then compare mean / std / covariance class by class (top halves only).
        youth_pred_pos =  x0_c[0]*youth[:,7]+x0_c[1]+ x0_c[2]*np.random.randn(youth.shape[0])
        youth_pred_pos = np.clip(np.round(youth_pred_pos),0,2).astype(int)
        errs_pos = []
        # `pos` (not `i`) so the class loop no longer shadows the trial index.
        for pos in range(3):
            m1 = youth[s1 & (youth_pred_pos==pos),1:]
            m2 = pred_vec[s2 & (rnd==pos)]
            errs_pos.append(np.linalg.norm(m1.mean(0)-m2.mean(0)))
            errs_pos.append(np.linalg.norm(m1.std(0)-m2.std(0)))
            errs_pos.append(((np.cov(m1,rowvar=False)-np.cov(m2,rowvar=False))**2).mean())

        err_pos = np.prod(errs_pos)#**(1/len(errs_pos))

        res.append( err_pos* std_err1*(std_err2**0.1)*cov_err*mean_err*mean_err2*(mean_err3**0.1)*(cov_err2**0.1 ) )
        if np.isnan(res[-1]):
            return 1e20
    # sorted() is kept so the summation order (and thus the float result) is unchanged.
    return ( np.mean(sorted(res)) ) ** (1/5.3) # 11.3 ?
# Starting point assembled from the current preset (1e-2 is added to the means
# so the zero entry survives the log transform). It is immediately replaced by
# the literal vector below.
x0 = list(mean_v+1e-2) + r_std + v_std

#x0 = np.hstack([x0,c_mul.ravel()])
# Literal starting point: 15 means, then r_std, then the four v_std values.
x0 = np.array([4.23519324e+01, 4.29759128e+01, 3.80452913e+01, 2.85411070e+01,
       3.58534876e+01, 3.52766767e+01, 9.03919879e-04, 4.03432693e+01,
       3.64751848e+01, 3.70790930e+01, 3.65710473e+01, 4.53851179e+01,
       3.61714860e+01, 4.91546420e+01, 3.76236452e+01, 3.90696845e+00,
       9.88179754e-02, 1.44887541e-01, 1.07581577e-01, 1.97162278e-01])
# eval_f expects log-space parameters.
eval_f(np.log(x0))

In [None]:
import cma
# CMA-ES search over the log-space parameters, started at log(x0) with an
# initial step size of 0.001. eval_f prints on every evaluation, so expect
# a lot of output.
es = cma.CMAEvolutionStrategy(np.log(x0),0.001)
es.optimize(eval_f)

In [None]:
# Optimiser's current mean, mapped back from log space.
np.exp(es.mean)

In [None]:
#types_opt = {}
#for c,vec in zip(["guard","wing","big"],np.exp(es.best.x)[20:].reshape((3,15))):
#    types_opt[c] = {c2:round(r,1) for c2,r in zip(cols,vec) if round(r,1) != 1.0 and c2 != 'hgt'}
#types_opt

## Model development

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.ndimage` is loaded on every SciPy version.
import scipy.ndimage
from collections import defaultdict

# (slug, season) -> list of rating values
ratings2 = {(row[1], row[2]): list(row[3:]) for row in df.itertuples()}

# only use recent-ish players
# slug -> {age: rating vector}
player_year_rate = defaultdict(dict)
for (slug, season), r in ratings2.items():
    born = data['bios'][slug]['bornYear']
    # Bios may lack a birth year (the ratings-building cell guards the same
    # case); comparing None with an int would raise TypeError.
    if born is None or born < 1956:
        continue
    if season >= 2019:
        continue
    player_year_rate[slug][season-born] = np.array(r)

SMOOTHING_STD = 1.0
# Same structure as player_year_rate, with each player's ratings smoothed across ages.
play_year_rateSmooth = {}
for key,play in player_year_rate.items():
    minY = min(play)
    maxY = max(play)
    # Dense age-by-rating matrix; ages with no record repeat the previous age's
    # ratings so the filter sees a contiguous series.
    res = []
    for i in range(minY,maxY+1):
        res.append(play[i] if i in play else res[-1])
    res = np.array(res).astype(float)
    reS = scipy.ndimage.gaussian_filter1d(res,SMOOTHING_STD,mode='nearest',axis=0,truncate=10)
    # Keep smoothed values only for ages that were actually recorded.
    play_year_rateSmooth[key] = {
        age: reS[age-minY]
        for age in range(minY,maxY+1)
        if age in play
    }

In [None]:
# Changes are measured on square-root-transformed ratings; INV_FUNC undoes the transform.
TRANS_FUNC = lambda x: x**(1/2)#np.sqrt(x)
INV_FUNC = lambda x: x**2#x**2

# r1: year-over-year change in transformed ratings; r2: the age at the start
# of that year. Starting ages above 36 are dropped.
r1 = []
r2 = []
for play in play_year_rateSmooth.values():
    for age in play:
        prev_age = age-1
        if prev_age not in play or prev_age > 36:
            continue
        r1.append(TRANS_FUNC(play[age]) -TRANS_FUNC(play[prev_age]))
        r2.append(prev_age)
r1 = np.array(r1)
r2 = np.array(r2)


In [None]:
# Mean transformed-rating change per starting age, one line per rating.
age_axis = sorted(np.unique(r2))
age_res = np.array([r1[r2==age].mean(0) for age in age_axis])
for i in range(15):
    plt.plot(age_axis,age_res[:,i],label=cols[i],c=plt.cm.tab20(i))
plt.xlim(right=35)
plt.legend()
#plt.ylim(-0.2,0.2)

In [None]:
import sklearn.linear_model as linear_model

TIMES_TO_FIT = 1

# Pooled linear trend: every rating's yearly change regressed on starting age.
# Each age is repeated 15 times to line up with the row-major flattening of r1.
pooled_age = np.repeat(r2,15)[:,None]
pooled_delta = r1.ravel()

clf_models = []
for i in range(TIMES_TO_FIT):
    clf = linear_model.RidgeCV(alphas=np.logspace(-5,5,11),cv=5)#SGDRegressor('epsilon_insensitive',alpha=1e-5,epsilon=0,max_iter=10000,tol=1e-9,eta0=1e-5)
    clf.fit(pooled_age,pooled_delta)
    clf_models.append((clf.score(pooled_age,pooled_delta),i,clf))

# Keep the best-scoring fit and store it as (slope, intercept).
best_model = sorted(clf_models)[-1]
clf = best_model[2]
print(best_model[0])
main_model = (clf.coef_[0] , clf.intercept_) # 0.0855008819536307

In [None]:
# Pooled trend line over the observed ages, with a dashed zero reference.
age_axis = np.unique(r2)
plt.plot(age_axis,age_axis*main_model[0] +main_model[1])
plt.plot([19,35],[0,0],c='k',ls='--')
plt.grid(True)

In [None]:
# Per-rating linear trend fitted to what is left after subtracting the pooled
# trend; stored as one (slope, intercept) pair per rating.
age_col = np.array(r2)[:,None]
pooled_trend = main_model[0]*r2+main_model[1]

models = []
for i in range(r1.shape[1]):
    residual = r1[:,i]-pooled_trend
    clf_models = []
    for j in range(TIMES_TO_FIT):
        clf = linear_model.RidgeCV(alphas=np.logspace(-5,5,11),cv=5)#SGDRegressor('epsilon_insensitive',alpha=1e-5,epsilon=0,max_iter=10000,tol=1e-9,eta0=1e-5)
        clf.fit(age_col,residual)
        clf_models.append((clf.score(age_col,residual),j,clf))
    best_model = sorted(clf_models)[-1]
    clf = best_model[2]
    print(cols[i],best_model[0])
    models.append((clf.coef_[0],clf.intercept_))

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; use whichever name this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# Per-rating residual trend lines over the observed ages.
for i in range(r1.shape[1]):
    plt.plot(np.unique(r2),np.unique(r2)*models[i][0]+models[i][1],label=cols[i],c=plt.cm.tab20(i))
plt.legend()
#plt.xlim(19,34)
#plt.ylim(-4,4)
plt.grid(True)

In [None]:
# Expected change per rating for every sample: the per-rating trend times the
# pooled trend, both evaluated at the sample's age.
# NOTE(review): the per-rating models were fitted to `r1 - pooled trend`, which
# suggests the two trends should be added, not multiplied. Confirm against the
# formula used by whatever consumes these coefficients before changing it.
pooled_trend = main_model[0]*r2+main_model[1]
means_expected = [
    (models[i][0]*r2 + models[i][1]) * pooled_trend
    for i in range(r1.shape[1])
]

In [None]:
# rank1 approximations of this would be really cool
# but sampling multivariate Gaussians seems... annoying?
# Residual change after subtracting the expected change; means_expected is
# per-rating by sample, so it is transposed to match r1 (sample by rating).
removed_means = r1 - np.array(means_expected).T

In [None]:
# Per age: sample count (age_w, rescaled so its mean is 1) and the per-rating
# std of the residual changes (age_stds, one row per age).
ages = sorted(np.unique(r2))
age_w = np.array([(r2==age).sum() for age in ages])
age_stds = np.array([removed_means[r2==age].std(axis=0) for age in ages])
age_w = age_w/age_w.mean()

In [None]:
# [pooled slope, pooled intercept, mean residual std over all ages and ratings]
base_model = list(main_model) + [age_stds.mean()]

In [None]:
# One number per rating: the spread across ages of that rating's residual std,
# taken after subtracting the shared base std.
std_models = [(age_stds[:,i]-base_model[2]).std() for i in range(15)]

In [None]:
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names; use whichever name this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(white_style)

# Each rating's std term drawn as a flat line over the observed ages.
for i in range(r1.shape[1]):
    plt.plot(np.unique(r2),np.ones_like(np.unique(r2))*std_models[i],label=cols[i],c=plt.cm.tab20(i),lw=3)
plt.legend()
plt.xlim(19,34)
plt.grid(True)

In [None]:
# Display the fitted per-rating (slope, intercept) pairs.
models

In [None]:
# rating name -> (slope, intercept, std term), rounded to 4 decimals.
model_table = np.hstack([models,np.array(std_models)[:,None]])
dat_print = {cols[i]: tuple(np.round(row,4)) for i,row in enumerate(model_table)}

In [None]:
# Print the base model, then one "name: [slope, intercept, std]," line per
# rating, leaving out height.
print('{} {},'.format("base",list(np.round(base_model,4))))
for k,v in dat_print.items():
    if k != 'hgt':
        print('{}: {},'.format(k,list(v)))

In [None]:
# 99% and 1% quantiles of the expected changes, taken across ratings (axis 0) and averaged over samples.
np.quantile(means_expected,0.99,axis=0).mean(),np.quantile(means_expected,0.01,axis=0).mean()

In [None]:
# 99% and 1% quantiles of the observed changes, taken across samples (axis 0) and averaged over ratings.
np.quantile(r1,0.99,axis=0).mean(),np.quantile(r1,0.01,axis=0).mean()