In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.linear_model import LinearRegression,Ridge,ElasticNet,ElasticNetCV,LassoCV,SGDRegressor,RidgeCV
from collections import defaultdict
import json
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import lightgbm as lgb
import os, sys

import fnmatch
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names; prefer the new name, fall back on old installs.
_white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
plt.style.use(_white_style)

In [None]:
# League export (JSON) that the rest of the notebook analyses.
file = 'no_inj.json'
# Read through a context manager so the handle is closed as soon as parsing
# finishes (the bare open() call used before left it open).
with open(file,'rt',encoding='utf-8-sig') as league_fh:
    data = json.load(league_fh)

# Lookup tables filled by the per-player loop that follows.
player_ratings = {}                 # (pid, season) -> {rating name: int value}
player_picks = {}                   # pid -> (overall pick number, draft year)
replacement_ovrs = []               # 'ovr' values sampled before a salary change to 750
player_born = {}                    # pid -> birth year
player_seasons = defaultdict(int)   # pid -> number of rating rows
team_rosters = defaultdict(list)
contracts = {}                      # (pid, season) -> (amount, following seasons at the same amount)
draft_picks = defaultdict(list)     # (original tid, draft year) -> overall pick numbers
draft_salaries = {}                 # overall pick number -> salary in the season after the draft

team_players = defaultdict(set)     # (tid, season) -> pids that have a stats row for it
# Single pass over every player: fills the lookup tables declared above.
for p in data['players']:
    player_born[p['pid']] = p['born']['year']
    # Overall pick number, computed with 30 picks per round.
    dp = p['draft']['pick']+ (p['draft']['round']-1)*30
    # Negative results are clamped to 0 (presumably undrafted players -- verify against the export format).
    if dp < 0:
        dp = 0
    player_picks[p['pid']] = (dp,p['draft']['year'])
    dtid = p['draft']['originalTid']
    if dtid >= 0:
        draft_picks[(dtid,p['draft']['year'])].append(dp)
    
    for r in p['ratings']:
        player_seasons[p['pid']] += 1
        # Keep only the integer-valued rating fields, dropping the season itself.
        player_ratings[(p['pid'],r['season'])] ={k:v for k,v in r.items() if type(v) == int and k != 'season'}
    for r in p['stats']:
        team_players[(r['tid'],r['season'])].add(p['pid'])
        
    # season -> salary amount for this player.
    td = {s['season']: s['amount'] for s in p['salaries']}
    # Seasons after which the salary changes, mapped to the NEXT season's amount.
    new_c = {s:td[s+1] for s in td if s+1 in td and td[s+1]!=td[s]}

    for s,c in new_c.items():
        # 750 is presumably the league-minimum salary -- TODO confirm.
        # The rating sampled is the one from season s, i.e. just before the change.
        if c == 750:
            replacement_ovrs.append(player_ratings[(p['pid'],s)]['ovr'])
            
    for k,a in td.items():
        yl = 0
        ko = k + 0  # remember the starting season; k is advanced by the scan below

        # Count how many directly following seasons carry the same amount.
        while k+1 in td:
            if td[k+1] == a:
                yl += 1
            else:
                break
            k+=1
        contracts[(p['pid'],ko)] = (a,yl)

    # Salary in the season right after the draft year, keyed by overall pick number
    # (later players with the same pick number overwrite earlier ones).
    for k,a in td.items():
        if p['draft']['year']+1 == k:
            draft_salaries[dp] = a
            break

#for p in data['players']:
#    for r in p['stats']:
#        if r['min'] > 0:
#            pass

In [None]:
# Salaries for pick numbers 1..60, re-keyed to 0..59 (key i holds the entry for pick i+1).
{i:draft_salaries[i+1] for i in range(60)}

In [None]:
# Distribution of the 'ovr' values collected in replacement_ovrs.
plt.hist(replacement_ovrs)
# RL: their mean, used below as the replacement-level overall rating.
RL = np.mean(replacement_ovrs)
RL

In [None]:
# (tid, season) -> average point differential per game over non-playoff stat rows.
team_movs = {}

for t in data['teams']:
    tid = t['tid']
    for ts in t['stats']:
        # Skip playoff rows and rows without any games played.
        if ts['playoffs'] or not ts['gp'] > 0:
            continue
        season = ts['season']
        mov = (ts['pts'] - ts['oppPts']) / ts['gp']
        team_movs[(tid,season)] = mov


In [None]:
# won_cup:        (tid, season) -> 1.0 when the team won at least 3 playoff rounds, else 0.0
# season_ratings: season -> list of (wins, MOV) tuples, one per team
won_cup = {}
season_ratings = defaultdict(list)

for team in data['teams']:
    tid = team['tid']
    for season in team['seasons']:
        s = season['season']
        # Only seasons with at least 82 games on record are used.
        if not season['gp'] >= 82:
            continue
        # Threshold is 3 rounds won; as the original note says, that is not
        # really the finals (that would be 4).
        won_cup[(tid,s)] = float(season['playoffRoundsWon'] >= 3)
        season_ratings[s].append((season['won'],team_movs[(tid,s)]))


In [None]:
# Columns: [team MOV, 0/1 flag from won_cup] for every (tid, season) in won_cup.
won_cup2 = np.array([[team_movs[k],won_cup[k]] for k in won_cup])
plt.scatter(won_cup2[:,0],won_cup2[:,1],s=5,alpha=0.5)
# Logistic regression of the flag on MOV. add_constant puts the intercept column
# first, so cB is the intercept and cA the MOV slope.
cB,cA = sm.Logit(won_cup2[:,1],sm.add_constant(won_cup2[:,0])).fit().params

xp = np.linspace(-20,20)
# Fitted probability curve: sigmoid(cA*x + cB).
win_p = 1.0/(1+np.exp(-xp*cA -cB))
plt.plot(xp,win_p,c='r')
cA,cB

In [None]:
# Per season, sort the (wins, MOV) tuples ascending and record (rank index, MOV).
mov_to_pos = []
for k in season_ratings:
    mov_to_pos += [(i,_[1]) for i,_ in enumerate(sorted(season_ratings[k]))]
arr = np.array(mov_to_pos)
# Linear fit of rank index (column 0) on MOV (column 1), with intercept.
clf3 = sm.OLS(arr[:,0],sm.add_constant(arr[:,1])).fit()
plt.scatter(arr[:,0],arr[:,1])
clf3.summary()

In [None]:
# Logistic fit of rank index / 29 on MOV (with intercept); clf_pos.predict is
# reused further down to turn a projected MOV into a rank.
clf_pos = sm.Logit(arr[:,0]/29,sm.add_constant(arr[:,1])).fit()

xp = np.linspace(-15,15)
# NOTE(review): this reference curve uses hard-coded coefficients rather than
# clf_pos.params -- confirm they are still the ones meant to be shown.
yp = 1/(1+np.exp(8.416 - 0.9239*(xp)))
plt.plot(xp,yp)
# Red points: in-sample predictions of clf_pos.
plt.scatter(arr[:,1],clf_pos.predict(),c='r',alpha=0.1)

clf_pos.summary()


In [None]:
# Number of forward years that get their own projection/regression
# (every `range(YEARS_TO_MODEL)` loop below).
YEARS_TO_MODEL = 12
# Longest look-ahead in seasons: bounds the lag loop `range(1, YEARS_TO_USE)` and
# is the number of trailing seasons dropped when valid_seasons is built.
YEARS_TO_USE = 20

In [None]:
# lag i (in seasons) -> list of [MOV in season s, MOV of the same team in season s+i]
offset_years = defaultdict(list)
for (tid, s), v in team_movs.items():
    for i in range(1,YEARS_TO_USE):
        later = (tid, s+i)
        if later in team_movs:
            offset_years[i].append([v, team_movs[later]])

In [None]:
# tmov_mul[lag]: multiplier that carries a team's current MOV forward by `lag`
# seasons (lag 0 is the identity). Used later as tmov_mul[i]*itmov.
tmov_mul = {0:1}
for k,v in offset_years.items():
    arr = np.array(v)   # column 0: MOV now, column 1: MOV k seasons later
    plt.figure()
    plt.scatter(arr[:,0],arr[:,1])
    plt.title(k)
    # Robust no-intercept fit of FUTURE MOV (endog) on CURRENT MOV (exog).
    # RLM takes (endog, exog); the arguments used to be swapped, which fitted
    # current-from-future even though the slope is drawn and used as a
    # future-from-current multiplier.
    clf3 = sm.RLM(arr[:,1],arr[:,0]).fit()
    xp = np.linspace(-15,15)
    plt.plot(xp,xp*clf3.params[0],c='r')
    tmov_mul[k] = clf3.params[0]
    print(k,clf3.params,)

In [None]:
# Summary of whichever model was bound to clf3 last (the final lag fitted above).
clf3.summary()

In [None]:
# (tid, season) -> up to ten highest 'ovr' ratings among that team-season's players, best first.
team_ovrs = {k: sorted([player_ratings[(p,k[1])]['ovr'] for p in team_players[k]],reverse=True)[:10] for k in team_players}

In [None]:
# Spot check for one hard-coded team-season (tid 23, season 2023).
sorted(team_ovrs[(23,2023)],reverse=True)[:10]

In [None]:
# Regression inputs, one row per (tid, season) in team_movs:
#   Xs1 -> that team's sorted top 'ovr' list, ys1 -> its MOV.
Xs1 = [team_ovrs[key] for key in team_movs]
ys1 = [team_movs[key] for key in team_movs]

In [None]:
import statsmodels.api as sm

In [None]:
# OLS of team MOV on the sorted top-player ratings, with an intercept column added.
res = sm.OLS(ys1,sm.add_constant(Xs1)).fit()

In [None]:
import scipy.optimize as opt
def best_fit_func_exp(x):
    """Residual norm of an exponential slot-weight model of team MOV.

    x[0] is the per-slot exponential rate, x[1] a scale and x[2] an offset.
    Each of the ten roster slots gets weight exp(x[0]*slot); the prediction is
    (Xs1 @ weights) * x[1] - x[2]. Returns the Euclidean norm of
    prediction - ys1. Reads the module-level Xs1 and ys1.
    """
    rate, scale, shift = x[0], x[1], x[2]
    slot_weights = np.exp(rate * np.arange(10))
    predicted = (Xs1 @ slot_weights) * scale - shift
    return np.linalg.norm(predicted - ys1)
# Starting point: a straight-line fit to log(OLS slot coefficients) vs slot index
# gives a decay rate `a` and a log-scale `b`.
# NOTE(review): np.log yields NaN for non-positive coefficients -- confirm
# res.params[1:] are all positive for this data.
a,b = np.polyfit(np.arange(10),np.log(np.array(res.params[1:])),1)
# Nelder-Mead refinement of (rate, scale, offset); -125 is the hand-picked initial offset.
res2 = opt.minimize(best_fit_func_exp,[a,np.exp(b),-125],method='Nelder-Mead')
print(res2.x)

In [None]:
# OLS coefficient of each roster slot ...
plt.plot(res.params[1:])
# ... against the fitted exponential curve scale*exp(rate*slot).
plt.plot( np.exp(res2.x[0]*np.arange(10))*res2.x[1])
plt.xlabel('value of Nth best player')
plt.grid(True)
# Tick labels run 1..10 while the underlying slot index is 0..9.
_ = plt.xticks(np.arange(10),np.arange(10)+1)

In [None]:
# In-sample OLS predictions against the observed team MOV.
plt.figure(figsize=(5,5))
plt.scatter(ys1,res.predict(),alpha=0.1,s=10)
plt.xlabel('true MOV')
plt.ylabel('predicted MOV')
plt.xlim(-15,15)
plt.ylim(-15,15)
plt.grid(True)

In [None]:
# Full statsmodels report for the top-player OLS fit.
res.summary()

In [None]:
# (mean squared residual of the OLS fit, mean squared MOV) -- the second value is
# the error of always predicting 0.
((res.predict()-np.array(ys1))**2).mean(), (np.array(ys1)**2).mean()

In [None]:
def get_rating(x):
    """Mean squared error of a sum-of-tanh rating model against team MOV.

    `x` is read as three equal groups of len(x)//3 values: centers, weights and
    widths. Values are rounded to 2 decimals, then the first and last groups
    are multiplied by 10 on a copy (the caller's `x` is left untouched). Each
    roster in the module-level Xs1 scores
        sum_i weight_i * sum_over_players(tanh((ovr - center_i) / width_i) + 1),
    scores are mean-centred, and the mean squared difference to the
    module-level ys1 is returned.
    """
    n_terms = len(x)//3
    params = np.round(x,2)
    params[:n_terms] *= 10
    params[-n_terms:] *= 10

    def roster_score(roster):
        total = 0
        for j in range(n_terms):
            center = params[j]
            weight = params[n_terms+j]
            width = params[2*n_terms+j]
            total += weight*sum(np.tanh((ovr-center)/width)+1.0 for ovr in roster)
        return total

    scores = np.array([roster_score(roster) for roster in Xs1])
    scores -= scores.mean()
    return ((scores-ys1)**2).mean()
#es.best.x

# Hand-recorded starting points from successive experiments. Each assignment
# overwrites the previous one, so only the final 3-element `iv` takes effect.
iv = [78.2,62.3,40.4, 4.73,1.63,0.85, 13.9,8.8,9.5]
iv = [72.67,61.14,39.29, 4.27,1.03,0.80, 13.26,9.45,8.9]
iv = [73.4, 59.3, 53.9, 36.7,  3.9,  2. , -0.8,  1.3, 11.7, 10.7, 13.7,
        10.5]
iv = [ 7.59,  6.1 ,  5.59,  4.18,  4.26,  2.09, -0.89,  1.03,  1.27,
         0.95,  1.47,  0.94]
iv = [ 7.61,  6.03,  5.56,  4.31,  4.29,  2.19, -0.9 ,  0.88,  1.05,
         0.83,  1.48,  0.96]
# Effective value, in get_rating's compact units (it multiplies the first and
# last thirds by 10 internally).
iv = [7.13, 6.65, 1.74]
# Loss at the starting point.
r2 = get_rating(iv)
#plt.scatter(ys,r2[1],alpha=0.1,s=10)
r2

In [None]:
import scipy.optimize as opt
import cma

# CMA-ES search over the rating-curve parameters, starting from `iv` with an
# initial step size of 0.5 and capped at 1000 loss evaluations.
# The fixed 'seed' makes this stochastic search repeatable across kernel restarts;
# without it every run lands on slightly different parameters.
es = cma.CMAEvolutionStrategy(iv,0.5, {'tolx':1e-6,'tolfun':1e-6,'maxfevals':1000,'seed':12345})
es.optimize(get_rating)

In [None]:
# Loss at the best point the optimiser has seen.
get_rating(es.best.x)

In [None]:
# Best parameters rounded to 2 decimals (get_rating rounds the same way), with their loss.
tv = np.round(es.best.x,2)
tv,get_rating(tv)

In [None]:
# Convert `iv` from the compact units used by get_rating to real units by
# applying the same scaling get_rating does: first and last thirds times 10.
# NOTE: this rebinds and rescales `iv` in place, so run the cell exactly once.
iv = np.array(iv)
_iv_third = len(iv)//3
iv[:_iv_third] *=10
# The previous `iv[-len(iv)//3:]` parsed as `(-len(iv))//3`, which floors toward
# minus infinity and picks a different slice than get_rating's `-offset`
# whenever len(iv) is not a multiple of 3.
iv[-_iv_third:] *=10

In [None]:
# Parameters after rescaling.
iv

In [None]:
def get_rating_vec(x):
    """Raw sum-of-tanh score for every roster in the module-level Xs1.

    `x` is read as three equal groups of len(x)//3 values (centers, weights,
    widths) and is used as given: no rounding, no rescaling, no mean-centring.
    Returns a numpy array with one score per roster:
        sum_i weight_i * sum_over_players(tanh((ovr - center_i) / width_i) + 1).
    """
    n_terms = len(x)//3

    def roster_score(roster):
        total = 0
        for j in range(n_terms):
            center, weight, width = x[j], x[n_terms+j], x[2*n_terms+j]
            total += weight*sum(np.tanh((ovr-center)/width)+1.0 for ovr in roster)
        return total

    return np.array([roster_score(roster) for roster in Xs1])
# Mean-centred team scores from the tanh model against the observed MOV.
plt.figure(figsize=(5,5))
vt = get_rating_vec(iv)
# vtm is one tenth of the mean raw team score; get_mov uses -vtm as its constant term.
vtm = vt.mean()/10
plt.scatter(ys1,vt-vtm*10,alpha=0.1,s=10)
#plt.scatter(ys,res.predict(),alpha=0.1,s=10)

plt.xlabel('true MOV')
plt.ylabel('predicted MOV')
plt.xlim(-15,15)
plt.ylim(-15,15)
plt.grid(True)
print(-vtm)

In [None]:
def get_mov(x):
    """Value of a single rating `x` under the fitted tanh curve.

    Uses the module-level `iv` (three equal groups: centers, weights, widths)
    and `vtm`. Starts from -vtm and adds
    weight_i * (tanh((x - center_i) / width_i) + 1) for each term i.
    """
    n_terms = len(iv)//3
    # sum() with start=-vtm accumulates in the same order as a running total.
    return sum(
        (iv[n_terms+j]*(np.tanh((x-iv[j])/iv[2*n_terms+j])+1.0) for j in range(n_terms)),
        -vtm,
    )
# Shape of the rating -> value curve across the 0..101 range.
xp = np.linspace(0,101)
yp = [get_mov(_) for _ in xp]
plt.plot(xp,yp)
plt.grid(True)
print(vtm)

In [None]:
# r_lvl: curve value at the replacement-level rating RL (rounded to an integer).
r_lvl = get_mov(int(round(RL)))
r_lvl

In [None]:
# Lookup table: integer rating 0..100 -> get_mov value. Rebinds the name `mov`.
mov = {i:get_mov(i) for i in range(101)}

In [None]:
# pid -> list of seasons for which that player has a ratings entry.
players_years = defaultdict(list)
for pid, s in player_ratings:
    players_years[pid].append(s)

In [None]:
# (overall pick number, seasons since draft year) -> 'ovr' ratings observed there.
pick_res = defaultdict(list)
for (pid, s), v in player_ratings.items():
    # Players with fewer than 8 rated seasons are left out ("the forgettable ones").
    if len(players_years[pid]) >= 8:
        pr, ps = player_picks[pid]
        pick_res[(pr, s-ps)].append(v['ovr'])

In [None]:
# draft_value[pick, yr]: mean of the ratings above RL recorded for `pick`
# yr seasons after the draft, with one RL sample always included so that an
# empty bucket evaluates to RL instead of NaN. Shape is (61, YEARS_TO_MODEL).
# An earlier 75th-percentile variant (np.quantile(..., 0.75)) used to be computed
# first and immediately overwritten; it was dropped because it had no effect and
# np.quantile is not well defined on the empty buckets.
draft_value = np.array([[np.mean([v for v in pick_res[(p,yr)] if v > RL] + [RL] ) for p in range(61)] for yr in range(YEARS_TO_MODEL)]).T


In [None]:
# Switches the global matplotlib style; every later figure is affected too.
plt.style.use('fivethirtyeight')
# Heat map of draft_value: rows are pick numbers, columns are years since the draft.
plt.imshow(np.array(draft_value), aspect='auto',cmap='Greys_r')
plt.xlabel('years since drafted')
plt.ylabel('draft pick')
plt.title('draft value')
plt.colorbar()
plt.grid(False)
#plt.ylim(30,0)

In [None]:
# Rating above replacement summed over the modelled years, per pick (pick 0 dropped).
# draft_value holds 'ovr' ratings, so the baseline must be the replacement rating
# RL (as in the curve fit further down), not r_lvl, which is get_mov(RL) and lives
# on the MOV scale.
a = (np.array(draft_value)-RL).sum(1)[1:]
# Average for picks 1-30 vs picks 31-60.
a[:30].mean(),a[30:].mean()

In [None]:
# draft_value summed across all modelled years, for picks 1..60.
# NOTE(review): the sum spans YEARS_TO_MODEL columns of 'ovr' ratings, so the
# '5year value' title and 'MOVaR' label look stale -- confirm the intended wording.
plt.plot((np.array(draft_value)).sum(1)[1:])
plt.xlabel('pick')
plt.ylabel('draft pick MOVaR')
plt.title('5year value')

In [None]:
# tA: draft_value without the pick-0 row, so row n corresponds to pick n+1 (60 rows).
tA = np.array(draft_value)#-weights[-1]
tA = tA[1:]#np.roll(tA,-1,0)

In [None]:
# One figure per year since the draft: column i of tA across the 60 picks.
for i in range(YEARS_TO_MODEL):
    plt.figure()
    plt.xlabel('pick')
    plt.ylabel('MOV')
    plt.title('year {}'.format(i+1))
    plt.plot(tA[:,i])

In [None]:
# For every year since the draft, fit the curve x[1]*exp(-x[0]*n**x[2]) over
# n = 0..59 to tA[:,i] - RL; coeffs[i] holds the fitted x for that year.
coeffs = []
for i in range(YEARS_TO_MODEL):
    x = [0.10,10,0.7]  # initial guess, same for every year
    def testX(x):
        # Residual norm; reads the loop variable i, which is safe here because
        # opt.fmin calls it within the same iteration.
        return np.linalg.norm(x[1]*np.exp(-x[0]*np.arange(60)**x[2]) - tA[:,i]+RL)
    coeffs.append(opt.fmin(testX,x))

In [None]:
# Fitted curve parameters as plain lists, one per year.
[list(_) for _ in coeffs]

In [None]:
# All fitted curves (shifted back up by RL) on one axis, drawn from the last
# modelled year to the first.
for i,x in enumerate(coeffs[::-1]):
    #plt.figure()
    plt.plot( x[1]*np.exp(-x[0]*np.arange(61)**x[2]) + RL,label='year {}'.format(YEARS_TO_MODEL-i))
plt.legend()
#plt.plot([0,60],[0,0],c='k',lw=1.5,ls='--')
plt.xlabel('draft pick')
plt.ylabel('MOV above replacement')

In [None]:
# One figure per year: the tA column against its fitted curve (+RL).
# The curve is evaluated on 61 points while tA has 60 rows.
for i,x in enumerate(coeffs[::-1]):
    plt.figure()
    plt.plot(tA[:,YEARS_TO_MODEL-1-i])
    plt.plot(x[1]*np.exp(-x[0]*np.arange(61)**x[2])+RL)
    plt.title('year {}'.format(YEARS_TO_MODEL-i))
    plt.xlabel('draft pick')
    plt.ylabel('MOV above replacement')

In [None]:
# age -> list of year-over-year 'ovr' changes (next season minus this season),
# taken only from players with at least 14 rated seasons.
age_loop = defaultdict(list)
for k in player_ratings:
    pid, s = k
    if player_seasons[pid] >= 14:
        nk = (pid, s+1)
        age = s-player_born[pid]
        if nk in player_ratings:
            delta = player_ratings[nk]['ovr']-player_ratings[k]['ovr']
            age_loop[age].append(delta)
    #player_ratings[k]['ovr']

In [None]:
from scipy.stats import trim_mean
# age -> trimmed mean (1% cut from each tail) of the yearly 'ovr' change, for ages below 36.
age_shift = {k:trim_mean(age_loop[k],.01) for k in sorted(age_loop.keys()) if k < 36}

In [None]:
# Same table rounded to one decimal for display.
{k:round(v,1) for k,v in age_shift.items()}

In [None]:
# Seasons usable as a starting point: all seasons with MOV data minus the last
# YEARS_TO_USE, so the look-ahead stays inside the data.
valid_seasons = sorted(set([s for tid,s in team_movs]))
valid_seasons = valid_seasons[:-YEARS_TO_USE]
# Most negative yearly change; used as the fallback for ages missing from age_shift.
max_shift = min(age_shift.values())

In [None]:
# ovr_salaries: 'ovr' -> salary amounts of player-seasons that have a contracts entry
# sal_ovrd:     salary amount -> 'ovr' ratings of those player-seasons
ovr_salaries = defaultdict(list)
sal_ovrd = defaultdict(list)
for p,s in player_ratings:
    v = player_ratings[(p,s)]
    if (p,s) in contracts:
        c = contracts[(p,s)][0]
        ovr_salaries[v['ovr']].append(c)
        sal_ovrd[c].append(v['ovr'])
# Rows: [salary, value above r_lvl of the mean rating at that salary].
sal_ovr = np.array([[s,mov[round(np.mean(c))]-r_lvl] for s,c in sal_ovrd.items()])
# 30000 is presumably the maximum salary (the axis label says '% of max salary') -- confirm.
sal_ovr[:,0]/=30000

# Robust no-intercept fit of value on salary share; sA is the slope.
sO = sm.RLM(sal_ovr[:,1],sal_ovr[:,0]).fit()
plt.scatter(sal_ovr[:,0],sal_ovr[:,1])
plt.scatter(sal_ovr[:,0],sO.predict())
plt.xlabel('% of max salary')
plt.ylabel('+/- above replacement')
sA = sO.params[0]
sA

In [None]:
# Build the per-horizon regression data.
#   Xs[i] / ys[i]: one row per (team, start season) describing the team at the
#                  start season, and that team's actual MOV i seasons later.
#   term_data_x/y: features at the last modelled year and a discounted sum of
#                  cup probabilities over years YEARS_TO_MODEL..YEARS_TO_USE-1.
Xs = defaultdict(list)
ys = defaultdict(list)

term_data_x = []
term_data_y = []

for tid, s in team_movs:
    if s in valid_seasons:
        itmov = team_movs[(tid,s)]

        pars = defaultdict(list)    # horizon -> projected value of players still under contract
        tss = defaultdict(int)      # horizon -> salary committed to those players

        dpars = defaultdict(list)   # horizon -> projected value of future draft picks
        dtss = defaultdict(int)     # horizon -> salary owed to those picks
        for i in range(YEARS_TO_MODEL):
            for pid in team_players[(tid,s)]:
                # Only players whose current amount runs at least i more seasons.
                if (pid,s) in contracts and contracts[(pid,s)][1] >= i:
                    tss[i] += contracts[(pid,s)][0]
                    age = s-player_born[pid]
                    ovr = player_ratings[(pid,s)]['ovr']
                    # Age the rating forward year by year with the aging table.
                    for j in range(i):
                        ovr+=age_shift.get(age+j,max_shift)
                    pars[i].append(mov[round(np.clip(ovr,0,100))])
                # (An unreachable `elif False` branch sketching a 50% re-sign
                # estimate for expired contracts was removed; it never ran.)
        for i in range(1,YEARS_TO_MODEL):
            # Projected standing (0-based, 0 = bottom of the table) i seasons out,
            # from the decayed current MOV; one pick per round at that slot.
            ppos = int(round(29*clf_pos.predict([1,tmov_mul[i]*itmov])[0]))
            dpicks = [ppos,ppos+30]
            #dpicks = draft_picks[(tid,s+i)]
            for p in dpicks:
                # draft_salaries is keyed by pick NUMBER (1..60; 0 is the bucket for
                # clamped picks), while p is a 0-based slot, hence p+1. Indexing with p
                # charged every slot the salary of the pick before it.
                dsal = draft_salaries[p+1]
                for k,j in enumerate(range(i,YEARS_TO_MODEL)):
                    dtss[j] += dsal
                    x = coeffs[k]   # curve for k years after the draft (0-based pick index)
                    dpars[j].append(x[1]*np.exp(-x[0]*(p**x[2])))

        for i in range(YEARS_TO_MODEL):
            tmov = team_movs[(tid,s+i)]
            play = [p for p in pars[i] if p >= r_lvl]
            lp = len(play)
            # Roster spots that have to be filled with replacement-level players.
            # Clamped at 0: with more than ten qualifying players the old
            # `(10-lp)*750` term went negative and reduced the cap hit.
            open_slots = max(10-lp,0)
            play = play + open_slots*[r_lvl]
            play = sorted(play,reverse=True)[:10]
            play_s = sum(play)
            play_d = sum([_ for _ in dpars[i]])

            cap_hit = tss[i] + open_slots*750 #+ dtss[i]

            diff = (90000-cap_hit)/30000
            # Positive cap space counts fully, overspend only at 10%.
            cap_space = np.maximum(diff,0.1*diff)
            draft_pick_max_slots = dtss[i]/30000

            Xs[i].append([sA*cap_space,play_s,itmov*int(i!=0),play_d])
            ys[i].append(tmov)
        # Uses the values left over from the final horizon of the loop above.
        term_data_x.append([draft_pick_max_slots,play_d,itmov ])
        total_reward = []
        for i in range(YEARS_TO_MODEL,YEARS_TO_USE):
            # Cup probability from the MOV logit, discounted by 0.9 per extra year.
            tmov = 1.0/(1+np.exp(-team_movs[(tid,s+i)]*cA -cB))

            total_reward.append(tmov * (0.9 ** (i-YEARS_TO_MODEL)))
        term_data_y.append(sum(total_reward))


In [None]:
# (number of team-seasons, 3 terminal features).
np.array(term_data_x).shape


In [None]:
# OLS of the discounted terminal reward on the terminal features, with intercept.
predT = sm.OLS(term_data_y,sm.add_constant(term_data_x)).fit()
predT.summary()

In [None]:
# Distribution of the draft-pick feature (column 3) at horizon 1.
plt.hist(np.array(Xs[1])[:,3])

In [None]:
# Terminal reward against the in-sample prediction of predT.
plt.scatter(term_data_y,predT.predict())

In [None]:
# Per-column standard deviation and mean of the horizon-1 features.
np.std(np.array(Xs[1]),axis=0),np.mean(np.array(Xs[1]),axis=0)

In [None]:
 # The first statement of this cell used to carry a stray leading space, which is an IndentationError outside IPython.
# Print floats in plain decimal notation rather than scientific.
np.set_printoptions(suppress=True)
# Robust (RLM) no-intercept fit of the MOV i seasons later on the four features
# [sA*cap_space, play_s, current MOV (zeroed at horizon 0), play_d].
for i in range(YEARS_TO_MODEL):
    pred = sm.RLM(ys[i],(Xs[i])).fit()
    print(i,":",list(np.round(pred.params,3)),',')#,',',pred.rsquared)

In [None]:
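# Output recorded from an earlier run of the per-year fit, in the form
#   year : [coefficients] , trailing value (presumably pred.rsquared).
# Kept as comments: the raw text is not valid Python and would stop a Run All.
# 0 : [-0.173, 0.986, 0.0, 0.0] , 0.6569516133476782
# 1 : [0.617, 0.571, 0.362, 0.097] , 0.3644063232746716
# 2 : [0.765, 0.631, 0.18, 0.029] , 0.14830963454692692
# 3 : [0.897, 0.745, 0.05, 0.06] , 0.047642853317683254
# 4 : [0.779, 0.731, 0.045, 0.057] , 0.018632355348438878
# 5 : [0.113, 0.401, 0.041, 0.081] , 0.0027072676702022402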

In [None]:
# Report for whichever model is currently bound to `pred` (the last horizon fitted).
pred.summary()

In [None]:
# Ratings above 45 that occur in ovr_salaries, in ascending order.
xp = sorted([k for k in ovr_salaries.keys() if k > 45])
# Mean salary at each rating.
yp = [np.mean(ovr_salaries[k]) for k in xp]
# Value above r_lvl at each rating, scaled by 1000.
yp2 = [1000*(mov[k]-r_lvl) for k in xp]

plt.plot(xp,yp)


In [None]:
# Ratio of scaled value above replacement to mean salary, per rating.
vvec = np.array(yp2)/np.array(yp)
plt.plot(xp,vvec)
#plt.plot(xp,0.11*np.log(np.array(xp)-43))
plt.xlabel('ovr')
plt.ylabel('value over replacement per $1M')
#def log_fit(x):
#    return np.linalg.norm(x[0]*np.log(np.array(xp)-43)-vvec)
#opt.optimize.fmin(log_fit,[1])

In [None]:
# Refit every horizon with OLS on re-ordered columns and keep the in-sample
# predictions in yp[i] (this rebinds `yp`, which was a list in an earlier cell).
# NOTE(review): column 0 of Xs[i] is already sA*cap_space, so the
# (1.0 - v[:,0])/3 transform below may be left over from an earlier feature
# layout -- confirm before relying on these coefficients.
yp = {}
for i in range(YEARS_TO_MODEL):
    v = np.array(Xs[i])
    diff = (1.0-v[:,0])/3
    cap_space = np.maximum(diff,0.3*diff)
    mov_from_cap = cap_space*sA
    #pred_mov = v[:,1] + mov_from_cap
    # Column order here: play_s, current MOV, play_d, cap-derived term.
    RV = np.array([v[:,1], v[:,2],v[:,3],mov_from_cap]).T
    #if i==0:
    #    RV[:,1] *= 0
    #if i== 4:
    #    RV[:,0] *= 0
    pred = sm.OLS(ys[i],RV).fit()
    print(i,":",[0] + list(np.round(pred.params,2)),",",pred.rsquared)
    yp[i] = pred.predict()
    #print(i,mov_from_cap.mean(),v[:,1].mean())

In [None]:
# Per-column standard deviation and mean of the horizon-1 features (same check as earlier).
np.std(np.array(Xs[1]),axis=0),np.mean(np.array(Xs[1]),axis=0)

In [None]:
# Distribution of the horizon-0 in-sample predictions.
plt.hist(yp[0])

In [None]:
# Distribution of the roster-value feature (column 1, play_s) at horizon 0.
plt.hist(np.array(Xs[0])[:,1])

In [None]:
# One figure per horizon: observed MOV against the OLS in-sample prediction.
for i in range(YEARS_TO_MODEL):
    plt.figure()
    plt.scatter(ys[i],yp[i])
