In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.linear_model import LinearRegression,Ridge,ElasticNet,ElasticNetCV,LassoCV,SGDRegressor,RidgeCV
from collections import defaultdict
import json
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import lightgbm as lgb
import os, sys
import scipy.optimize as opt


import fnmatch
# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names were removed in 3.8, so pick whichever name this install has.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

In [None]:
file = 'equal_bud.json'
# Use a context manager so the file handle is closed as soon as parsing is done
# (the bare open() inside json.load() left it to the garbage collector).
with open(file,'rt',encoding='utf-8-sig') as fh:
    data = json.load(fh)

# Containers filled by the per-player loop below.
player_ratings = {}               # (pid, season) -> {rating name: int value}
player_picks = {}                 # pid -> (overall draft slot, draft year)
replacement_ovrs = []             # ovr of players whose next-season salary changed to 750
player_born = {}                  # pid -> birth year
player_seasons = defaultdict(int) # pid -> number of rating rows
contracts = {}                    # (pid, season) -> (salary, following seasons at the same salary)
draft_picks = defaultdict(list)   # (original tid, draft year) -> list of draft slots
player_draft_ageovr = {}          # pid -> (draft ovr, draft pot, age at draft)
draft_salaries = {}               # draft slot -> salary in the season after the draft

draft_pos_avg = {}                # pid -> draft slot (only players with an original team)

team_players = defaultdict(set)   # (tid, season) -> set of pids with a stats row
# Walk every player once and populate the lookup tables declared above.
for p in data['players']:
    player_born[p['pid']] = p['born']['year']
    # Overall draft slot: pick within the round plus 30 slots per earlier round.
    # A negative result (e.g. round 0) is clamped to 0.
    dp = p['draft']['pick']+ (p['draft']['round']-1)*30
    if dp < 0:
        dp = 0
    player_picks[p['pid']] = (dp,p['draft']['year'])
    dtid = p['draft']['originalTid']
    # Only players with a non-negative original team id are recorded as drafted.
    if dtid >= 0:
        draft_picks[(dtid,p['draft']['year'])].append(dp)
        draft_pos_avg[p['pid']] = dp
    # Keep the draft-day profile only when the first rating row is from the draft year.
    # NOTE(review): raises IndexError if a player has no rating rows — confirm the export never does that.
    if p['ratings'][0]['season'] ==  p['draft']['year']:
        player_draft_ageovr[p['pid']] = (p['draft']['ovr'],p['draft']['pot'],p['draft']['year']-p['born']['year'])
    for r in p['ratings']:
        player_seasons[p['pid']] += 1
        # Keep only the integer-valued fields of the rating row, minus the season itself.
        player_ratings[(p['pid'],r['season'])] ={k:v for k,v in r.items() if type(v) == int and k != 'season'}
    
    # Roster membership: any stats row for (tid, season) puts the player on that roster.
    for r in p['stats']:
        team_players[(r['tid'],r['season'])].add(p['pid'])
        
    # season -> salary amount for this player.
    td = {s['season']: s['amount'] for s in p['salaries']}
    # season -> next season's salary, for seasons where the salary changes.
    new_c = {s:td[s+1] for s in td if s+1 in td and td[s+1]!=td[s]}

    # A change to 750 marks the player as replacement level in the season before.
    # (750 is presumably the league-minimum salary — TODO confirm.)
    for s,c in new_c.items():
        if c == 750:
            replacement_ovrs.append(player_ratings[(p['pid'],s)]['ovr'])
            
    # For each salaried season, count how many consecutive following seasons
    # carry the same amount (yl).
    for k,a in td.items():
        yl = 0
        # Remember the starting season; k itself is advanced by the while loop.
        ko = k + 0

        while k+1 in td:
            if td[k+1] == a:
                yl += 1
            else:
                break
            k+=1
        contracts[(p['pid'],ko)] = (a,yl)

    # Record the salary paid in the season after the draft, keyed by draft slot.
    # Later players with the same slot overwrite earlier ones.
    for k,a in td.items():
        if p['draft']['year']+1 == k:
            draft_salaries[dp] = a
            break

#for p in data['players']:
#    for r in p['stats']:
#        if r['min'] > 0:
#            pass

In [None]:
# Distribution of ovr among players whose salary changed to 750.
plt.hist(replacement_ovrs)
# Replacement level = plain average of those ratings.
RL = np.asarray(replacement_ovrs).mean()
RL

In [None]:
# (tid, season) -> average margin of victory per game, non-playoff rows only.
team_movs = {}

for t in data['teams']:
    tid = t['tid']
    for ts in t['stats']:
        # Skip playoff rows and rows without any games played.
        if ts['playoffs'] or not ts['gp'] > 0:
            continue
        team_movs[(tid,ts['season'])] = (ts['pts'] - ts['oppPts']) / ts['gp']


In [None]:
won_cup = {}                        # (tid, season) -> 1.0 if playoffRoundsWon >= 3 else 0.0
season_ratings = defaultdict(list)  # season -> [(wins, MOV), ...]
draft_pos_win = defaultdict(list)   # draft slot -> [was this player the top-ovr player on the roster?]
draft_age_ovr_winX = []             # (draft ovr, draft pot, draft age) per qualifying-roster player
draft_age_ovr_winy = []             # matching "top-ovr player on the roster" flags
draft_age_ovr_winp = []             # matching draft slot minus one

for team in data['teams']:
    tid = team['tid']
    for season in team['seasons']:
        s = season['season']
        # Only seasons with at least 82 games played are used.
        if season['gp'] < 82:
            continue
        they_won = float(season['playoffRoundsWon'] >= 3) # not really finals -- 4
        won_cup[(tid,s)] = they_won
        season_ratings[s].append((season['won'],team_movs[(tid,s)]))

        roster = team_players[(tid,s)]
        # Nothing below records anything for other teams, so skip building and
        # sorting their rosters (the original re-tested they_won per player).
        if not they_won:
            continue

        # Rank drafted players by ovr (best first) and record, per draft slot,
        # whether the player was the roster's best.
        by_pick = sorted(
            ((player_ratings[(pid,s)]['ovr'],draft_pos_avg[pid])
             for pid in roster if pid in draft_pos_avg),
            reverse=True)
        for rank,(_ovr,pick) in enumerate(by_pick):
            draft_pos_win[pick].append(rank == 0)

        # Same ranking, restricted to players that also have a draft-day profile.
        by_profile = sorted(
            ((player_ratings[(pid,s)]['ovr'],player_draft_ageovr[pid],draft_pos_avg[pid])
             for pid in roster if pid in player_draft_ageovr and pid in draft_pos_avg),
            reverse=True)
        for rank,(_ovr,profile,pick) in enumerate(by_profile):
            draft_age_ovr_winX.append(profile)
            draft_age_ovr_winy.append(rank == 0)
            draft_age_ovr_winp.append(pick-1)


In [None]:
# Logistic model: P(best player on a qualifying roster) from the draft-day
# (ovr, pot, age) profile.
winX = np.asarray(draft_age_ovr_winX)
winY = np.asarray(draft_age_ovr_winy)
draft_design = sm.add_constant(winX)
dp_ovr = sm.Logit(winY,draft_design).fit()

# Spread of the fitted probabilities, then the coefficient table.
plt.hist(dp_ovr.predict())
dp_ovr.summary()

In [None]:
# Sanity check: rebuild the fitted probabilities by hand from the linear predictor.
# The coefficients are read from the fit itself rather than pasted from an earlier
# summary, so the check stays valid when the data (and thus the fit) changes.
xv = dp_ovr.model.exog @ dp_ovr.params
1/(1+np.exp(-xv)),dp_ovr.predict()

In [None]:
# Draft profile with the smallest fitted probability, and that probability in percent.
win_probs = dp_ovr.predict()
winX[np.argmin(win_probs)],100*min(win_probs)

In [None]:
# Column 0: regular-season MOV, column 1: the won_cup flag for the same team-season.
won_cup2 = np.array([[team_movs[k],won_cup[k]] for k in won_cup])
mov_col = won_cup2[:,0]
flag_col = won_cup2[:,1]
plt.scatter(mov_col,flag_col,s=5,alpha=0.5)

# Logistic fit of the flag on MOV; params come back as (intercept, slope).
cup_fit = sm.Logit(flag_col,sm.add_constant(mov_col)).fit()
cB,cA = cup_fit.params

# Overlay the fitted curve across a +/-20 MOV range.
xp = np.linspace(-20,20)
win_p = 1.0/(1+np.exp(-xp*cA -cB))
plt.plot(xp,win_p,c='r')
cA,cB

In [None]:
plt.style.use('fivethirtyeight')
# Rows of (draft slot, share of times that slot produced the roster's best player).
tv = np.array(sorted([(k,np.mean(v)) for k,v in draft_pos_win.items()]))
# Snapshot what the fit needs: `tv` is rebound by a later cell, and the number of
# observed draft slots is not guaranteed to be exactly 60.
draft_pick_idx = np.arange(len(tv))
draft_win_rate = tv[:,1]

def testX(x):
    """Residual norm of the curve x[1]*exp(-x[0]*idx**x[2]) against the observed rates.

    x is (decay, amplitude, exponent); idx is the 0-based position of each
    observed draft slot in sorted order.
    """
    return np.linalg.norm(x[1]*np.exp(-x[0]*draft_pick_idx**x[2]) - draft_win_rate)
draftP = opt.fmin(testX,[.3,.3,.6])
    
plt.plot(draft_win_rate,label='pos raw')
plt.plot(draftP[1]*np.exp(-draftP[0]*draft_pick_idx**draftP[2]),label='pos exp fit')
# Mean fitted logit probability grouped by (draft slot - 1).
plt.plot(pd.DataFrame([draft_age_ovr_winp,dp_ovr.predict()]).T.groupby(0).mean(),label='ovr/pot/age logit')
# Reference line at 1/15.
plt.plot([0,len(tv)],[1/15,1/15],c='k',ls='--')
plt.legend()
plt.xlabel('draft pick')
plt.ylabel('chance of being the best player\non a finals roster')

In [None]:
# Within each season, order teams by (wins, MOV) ascending and pair each team's
# 0-based rank with its MOV.
mov_to_pos = []
for k in season_ratings:
    ranked = sorted(season_ratings[k])
    mov_to_pos.extend((i,rec[1]) for i,rec in enumerate(ranked))
arr = np.array(mov_to_pos)

# Linear fit of rank on MOV (with intercept), plus a quick look at the raw points.
rank_design = sm.add_constant(arr[:,1])
clf3 = sm.OLS(arr[:,0],rank_design).fit()
plt.scatter(arr[:,0],arr[:,1])
clf3.summary()

In [None]:
# Fractional logit of normalised rank (rank / 29, i.e. 0..1 when a season has
# 30 ranked teams) on MOV.
clf_pos = sm.Logit(arr[:,0]/29,sm.add_constant(arr[:,1])).fit()

# Draw the fitted curve from the model's own coefficients instead of numbers
# copied from a previous summary, so the line always matches the current fit.
xp = np.linspace(-15,15)
pos_b0, pos_b1 = clf_pos.params
yp = 1/(1+np.exp(-(pos_b0 + pos_b1*xp)))
plt.plot(xp,yp)
plt.scatter(arr[:,1],clf_pos.predict(),c='r',alpha=0.1)

clf_pos.summary()


In [None]:
# Number of horizons (season s, s+1, ...) for which features and targets are built below.
YEARS_TO_MODEL = 3
# Upper bound (exclusive) for the lagged-MOV offsets, and the number of trailing
# seasons dropped from valid_seasons.
YEARS_TO_USE = 3

In [None]:
# offset -> list of [MOV in season s, MOV of the same team in season s+offset].
offset_years = defaultdict(list)
for (tid,s),v in team_movs.items():
    for i in range(1,YEARS_TO_USE):
        later = (tid,s+i)
        if later in team_movs:
            offset_years[i].append([v,team_movs[later]])

In [None]:
# offset -> multiplier m such that (MOV at s+offset) ~= m * (MOV at s).
tmov_mul = {0:1}
for k in offset_years:
    arr = np.array(offset_years[k])   # column 0: MOV at s, column 1: MOV at s+k
    plt.figure()
    plt.scatter(arr[:,0],arr[:,1])
    plt.title(k)
    # statsmodels takes (endog, exog). The future MOV is the response and the
    # current MOV the regressor; the original call had them swapped, so the red
    # line drawn below was not the fitted model.
    clf3 = sm.RLM(arr[:,1],arr[:,0]).fit()
    xp = np.linspace(-15,15)
    plt.plot(xp,xp*clf3.params[0],c='r')
    tmov_mul[k] = clf3.params[0]
    print(k,clf3.params,)

In [None]:
# Summary of whichever model was assigned to clf3 most recently
# (the last offset fitted in the loop above when cells run in order).
clf3.summary()

In [None]:
def _top10_ovrs(key):
    """Return up to ten ovr ratings, highest first, for the roster at key = (tid, season)."""
    ovrs = [player_ratings[(p,key[1])]['ovr'] for p in team_players[key]]
    ovrs.sort(reverse=True)
    return ovrs[:10]

# (tid, season) -> the roster's ten best ovr values in descending order.
team_ovrs = {k: _top10_ovrs(k) for k in team_players}

In [None]:
# Spot check one roster; team_ovrs values are already sorted descending and capped at ten.
team_ovrs[(23,2023)][:10]

In [None]:
# One row per team-season with a MOV: the top-ovr list as features, MOV as target.
Xs1 = [team_ovrs[key] for key in team_movs]
ys1 = [team_movs[key] for key in team_movs]

In [None]:
import statsmodels.api as sm

In [None]:
# Ordinary least squares of MOV on the sorted roster ratings, with an intercept.
roster_design = sm.add_constant(Xs1)
res = sm.OLS(ys1,roster_design).fit()

In [None]:
def best_fit_func_exp(x):
    """Residual norm of an exponentially weighted roster model against ys1.

    x = (rate, scale, shift): the n-th best player gets weight exp(rate*n),
    and the weighted sum is multiplied by scale and reduced by shift.
    """
    rate, scale, shift = x[0], x[1], x[2]
    weights = np.exp(rate*np.arange(10))
    predicted = (Xs1 @ weights)*scale - shift
    return np.linalg.norm(predicted - ys1)

# Seed the search with a log-linear fit through the OLS slot coefficients.
a,b = np.polyfit(np.arange(10),np.log(np.array(res.params[1:])),1)
res2 = opt.minimize(best_fit_func_exp,[a,np.exp(b),101],method='Nelder-Mead')
print(res2.x)

In [None]:
# Compare the free OLS coefficient of each roster slot with the exponential fit.
slot = np.arange(10)
plt.plot(res.params[1:])
plt.plot(np.exp(res2.x[0]*slot)*res2.x[1])
plt.xlabel('value of Nth best player')
plt.grid(True)
_ = plt.xticks(slot,slot+1)

In [None]:
# OLS predictions against observed MOV on a square +/-15 window.
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(ys1,res.predict(),alpha=0.1,s=10)
ax.set_xlabel('true MOV')
ax.set_ylabel('predicted MOV')
ax.set_xlim(-15,15)
ax.set_ylim(-15,15)
ax.grid(True)

In [None]:
# pid -> list of seasons for which the player has a rating row.
players_years = defaultdict(list)
for pid,s in player_ratings:
    players_years[pid].append(s)

In [None]:
# age -> list of one-season ovr changes (next season minus this season),
# restricted to players with at least 15 rating rows.
age_loop = defaultdict(list)
for pid,s in player_ratings:
    if player_seasons[pid] >= 15:
        age = s-player_born[pid]
        following = player_ratings.get((pid,s+1))
        if following is not None:
            age_loop[age].append(following['ovr']-player_ratings[(pid,s)]['ovr'])

In [None]:
from scipy.stats import trim_mean
# age -> trimmed-mean ovr change at that age (ages below 36 only, ascending order).
age_shift = {}
for k in sorted(age_loop.keys()):
    if k < 36:
        age_shift[k] = trim_mean(age_loop[k],.0001)

In [None]:
# Show the per-age ovr change rounded to one decimal.
{k:round(v,1) for k,v in age_shift.items()}

In [None]:
# All seasons with a team MOV, minus the last YEARS_TO_USE of them.
valid_seasons = sorted({s for tid,s in team_movs})[:-YEARS_TO_USE]
# Smallest (most negative) per-age change; used as the fallback for ages missing from age_shift.
max_shift = min(age_shift.values())

In [None]:
def get_rating(x, X=None, y=None):
    """Mean squared error of the tanh roster model against team MOV.

    x is laid out as three equal thirds, [centers | scales | widths], in
    optimizer units: after rounding to two decimals, the centers and widths are
    multiplied by 10. Each roster rating r contributes
    scale * (tanh((r - center) / width) + 1) per term; contributions are summed
    over the roster and over terms, and the predictions are mean-centred before
    being compared with the targets.

    Parameters
    ----------
    x : sequence of float, length a positive multiple of 3
    X : rectangular (teams, players) ratings; defaults to the notebook's Xs1
    y : per-team targets; defaults to the notebook's ys1

    Raises
    ------
    ValueError
        If len(x) is not a positive multiple of 3. The original silently
        scaled and read the wrong entries in that case.
    """
    X = np.asarray(Xs1 if X is None else X, dtype=float)
    y = np.asarray(ys1 if y is None else y, dtype=float)
    offset = len(x)//3
    if offset == 0 or len(x) != 3*offset:
        raise ValueError("x must hold equal-length centers, scales and widths")
    # np.round returns a new array, so the caller's x is never modified.
    x = np.round(np.asarray(x, dtype=float),2)
    x[:offset] *= 10
    x[-offset:] *= 10
    centers = x[:offset]
    scales = x[offset:2*offset]
    widths = x[2*offset:]
    # Broadcast to (teams, players, terms) instead of looping in Python; this
    # function is evaluated on every optimizer step.
    activation = np.tanh((X[:, :, None]-centers)/widths)+1.0
    vec = (activation.sum(axis=1)*scales).sum(axis=1)
    vec -= vec.mean()
    return ((vec-y)**2).mean()

# Starting parameters (center, scale, width) in the units get_rating expects:
# get_rating multiplies the first and last third by 10 internally.
iv = [7.27, 7.28, 1.81]
# Loss at the starting point.
r2 = get_rating(iv)
r2

In [None]:
import scipy.optimize as opt
import cma

# CMA-ES is stochastic; fix the seed so Restart & Run All reproduces the same optimum.
es = cma.CMAEvolutionStrategy(iv,0.5, {'tolx':1e-6,'tolfun':1e-6,'maxfevals':1000,'seed':12345})
es.optimize(get_rating)

In [None]:
# Loss at the best point found by the optimizer.
get_rating(es.best.x)

In [None]:
# Best parameters rounded to two decimals, and the loss at that rounded point.
# NOTE(review): this rebinds `tv`, which an earlier cell uses for the draft-slot
# table read by testX — re-running that fit after this cell would use the wrong data.
tv = np.round(es.best.x,2)
tv,get_rating(tv)

In [None]:
# Convert the parameters to natural units: first and last third times 10.
# NOTE(review): not idempotent — running this cell twice scales iv twice.
iv = np.array(iv)
for part in (iv[:len(iv)//3], iv[-len(iv)//3:]):
    part *= 10   # slices are views, so this updates iv in place
iv

In [None]:
def get_rating_vec(x, X=None):
    """Uncentred tanh-model prediction for every roster.

    x is [centers | scales | widths] in natural units (no rounding or x10
    scaling is applied here). Each rating r contributes
    scale * (tanh((r - center) / width) + 1) per term, summed over the roster
    and over terms.

    Parameters
    ----------
    x : sequence of float, three equal-length thirds
    X : rectangular (teams, players) ratings; defaults to the notebook's Xs1

    Returns
    -------
    numpy.ndarray of shape (teams,)
    """
    X = np.asarray(Xs1 if X is None else X, dtype=float)
    x = np.asarray(x, dtype=float)
    offset = len(x)//3
    centers = x[:offset]
    scales = x[offset:2*offset]
    widths = x[2*offset:3*offset]
    # Broadcast to (teams, players, terms) rather than looping in Python.
    activation = np.tanh((X[:, :, None]-centers)/widths)+1.0
    return (activation.sum(axis=1)*scales).sum(axis=1)
# Raw model output per team, and its mean divided by 10 (the roster size kept
# in team_ovrs); get_mov later uses -vtm as a per-player constant.
vt = get_rating_vec(iv)
vtm = vt.mean()/10

# Centred predictions against observed MOV on a square +/-15 window.
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(ys1,vt-vtm*10,alpha=0.1,s=10)
ax.set_xlabel('true MOV')
ax.set_ylabel('predicted MOV')
ax.set_xlim(-15,15)
ax.set_ylim(-15,15)
ax.grid(True)
print(-vtm)

In [None]:
def get_mov(x):
    """Contribution of a single rating x under the fitted tanh model.

    Reads the notebook-level parameters iv ([centers | scales | widths]) and
    the constant vtm; returns -vtm plus scale*(tanh((x-center)/width)+1)
    summed over the terms.
    """
    offset = len(iv)//3
    terms = [iv[offset+i]*(np.tanh((x-iv[i])/iv[2*offset+i])+1.0) for i in range(offset)]
    # Start the sum at -vtm so terms accumulate in the same order as a running total.
    return sum(terms,-vtm)
# Per-player value curve over the rating range 0..101.
xp = np.linspace(0,101)
yp = list(map(get_mov,xp))
plt.plot(xp,yp)
plt.grid(True)
print(vtm)

In [None]:
# Model value of a player at the (rounded) replacement-level rating RL.
r_lvl = get_mov(int(round(RL)))
r_lvl

In [None]:
# Lookup table: integer ovr 0..100 -> model value from get_mov.
mov = dict((ovr,get_mov(ovr)) for ovr in range(101))

In [None]:
ovr_salaries = defaultdict(list)   # ovr -> salaries paid at that rating
sal_ovrd = defaultdict(list)       # salary -> ratings of players paid that amount
for key,v in player_ratings.items():
    if key in contracts:
        c = contracts[key][0]
        ovr_salaries[v['ovr']].append(c)
        sal_ovrd[c].append(v['ovr'])

# One row per distinct salary: [salary, value above replacement of the mean ovr at that salary].
sal_ovr = np.array([[s,mov[round(np.mean(c))]-r_lvl] for s,c in sal_ovrd.items()])
# Express salary as a fraction of 30000.
sal_ovr[:,0]/=30000

# Robust through-the-origin fit of value above replacement on the salary fraction.
salary_frac = sal_ovr[:,0]
value_above = sal_ovr[:,1]
sO = sm.RLM(value_above,salary_frac).fit()
plt.scatter(salary_frac,value_above)
plt.scatter(salary_frac,sO.predict())
plt.xlabel('% of max salary')
plt.ylabel('+/- above replacement')
sA = sO.params[0]
sA

In [None]:
# Build, for each horizon i in 0..YEARS_TO_MODEL-1, one feature row
# [cap_space, play_s] and one target (the team's MOV in season s+i) per team-season.
Xs = defaultdict(list)
ys = defaultdict(list)

for tid, s in team_movs:
    if s in valid_seasons:
        itmov = team_movs[(tid,s)]
        
        # pars[i]: projected ovr of players still under contract at horizon i.
        # tss[i]: summed salary of those players.
        pars = defaultdict(list)
        tss = defaultdict(int)

        dpars = []
        for i in range(YEARS_TO_MODEL):
            for pid in team_players[(tid,s)]:
                # contracts[...][1] counts following seasons at the same salary.
                if (pid,s) in contracts and contracts[(pid,s)][1] >= i:
                    tss[i] += contracts[(pid,s)][0]
                    age = s-player_born[pid]
                    ovr = player_ratings[(pid,s)]['ovr']
                    # NOTE(review): this applies i-1 aging steps at horizon i (none at i=1);
                    # confirm that range(i) was not intended.
                    for j in range(max(0,i-1)):
                        ovr+=age_shift.get(age+j,max_shift)
                    pars[i].append(ovr)
        # Contracts that outlast the modelled window: value the years beyond it and
        # spread that surplus evenly across the modelled years as a salary adjustment.
        for pid in team_players[(tid,s)]:
            if (pid,s) in contracts and contracts[(pid,s)][1] >= YEARS_TO_MODEL:
                age = s-player_born[pid]
                
                ovr2 = player_ratings[(pid,s)]['ovr']
                povrs = [ovr2]
                # Projected ovr for each later contract year, aging one step at a time.
                for j in range(contracts[(pid,s)][1]-1):
                    ovr2+=age_shift.get(age+j,max_shift)
                    povrs.append(ovr2)
                # Model value per projected year (rating clipped to the 0..100 table).
                pmovs = [mov[int(np.clip(np.round(ovr2),0,100))] for ovr2 in povrs]
                # Salary implied by that value via the salary fit (sA), then surplus over actual salary.
                ccont = [30000*(pmov-r_lvl)/sA for pmov in pmovs]
                cvals = [c-contracts[(pid,s)][0] for c in ccont]
                amount_to_add = sum(cvals[YEARS_TO_MODEL:])/YEARS_TO_MODEL
                for i  in range(YEARS_TO_MODEL):
                    tss[i] -= amount_to_add
                    
        for i in range(YEARS_TO_MODEL):
            # NOTE(review): raises KeyError if the team has no MOV for season s+i.
            tmov = team_movs[(tid,s+i)]
            # Keep players at or above replacement level, pad to ten with RL, take the top ten.
            play = [p for p in pars[i] if p >= RL]
            lp = len(play)
            if lp < 10:
                play= play + (10-lp)*[RL]
            play = sorted(play,reverse=True)[:10]
            # Roster strength under the exponential slot-weight model (res2).
            play_s = sum([np.exp(i*res2.x[0])*p for i,p in enumerate(play)])*res2.x[1] -res2.x[2]
            #play_d = sum([_ for _ in dpars[i]])
            
            # Add 750 per empty slot below ten.
            # NOTE(review): when lp > 10 this term is negative and lowers cap_hit — confirm intended.
            cap_hit = tss[i] + (10-lp)*750 #+ dtss[i]
            
            # Fraction of 90000 left unspent (presumably the salary cap — TODO confirm);
            # a negative value is scaled down to a tenth.
            diff = (90000-cap_hit)/90000
            cap_space = np.maximum(diff,0.1*diff)
            
            Xs[i].append([cap_space,play_s,])
            ys[i].append(tmov)


In [None]:
np.set_printoptions(suppress=True)
scales=[]   # per horizon: std of the targets over std of the fitted values
rsq = []    # per horizon: R-squared of the fit
for i in range(YEARS_TO_MODEL):
    # No-intercept OLS of future MOV on [cap_space, play_s].
    pred = sm.OLS(ys[i],(Xs[i])).fit()
    scale = np.std(ys[i])/np.std(pred.predict())
    scales.append(scale)
    rsq.append(pred.rsquared)
    print(i,": (",list(np.round(pred.params,3)),',', np.round(scale,2),'),')
rsq

In [None]:
# Literal list of three values; presumably the rsq output of an earlier run kept
# for comparison — TODO confirm.
[0.6356287420262297, 0.2814157259822434, 0.1303198654171429]

In [None]:
# Fitted draft-curve parameters and the salary-to-value slope.
draftP,sA

In [None]:
# Summary of the model currently bound to pred (the last horizon fitted in the
# loop above when cells run in order).
pred.summary()

In [None]:
# Ratings above 45, the mean salary paid at each, and 1000x the model value above replacement.
xp = sorted(k for k in ovr_salaries if k > 45)
yp = [np.mean(ovr_salaries[k]) for k in xp]
yp2 = [1000*(mov[k]-r_lvl) for k in xp]

plt.plot(xp,yp)


In [None]:
# Value above replacement per unit of mean salary, by rating.
vvec = np.divide(yp2,yp)
plt.plot(xp,vvec)
plt.xlabel('ovr')
plt.ylabel('value over replacement per $1M')
# Abandoned experiment, kept for reference: fit x[0]*log(ovr-43) to vvec.
#plt.plot(xp,0.11*np.log(np.array(xp)-43))
#def log_fit(x):
#    return np.linalg.norm(x[0]*np.log(np.array(xp)-43)-vvec)
#opt.optimize.fmin(log_fit,[1])

In [None]:
# Column-wise spread and mean of the horizon-1 features [cap_space, play_s].
horizon1 = np.array(Xs[1])
np.std(horizon1,axis=0),np.mean(horizon1,axis=0)

In [None]:
# Distribution of the horizon-0 roster-strength feature (column 1 = play_s).
plt.hist(np.array(Xs[0])[:,1])