In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.linear_model import *
from collections import defaultdict
import json

import os, sys
import scipy.optimize as opt


import fnmatch
# 'seaborn-white' was renamed to 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# alias was removed in 3.8; use whichever name this matplotlib install provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')

In [None]:
# League export to analyse (previously tried alternatives are kept in the trailing comment).
file = 'mid2021.json' #'equal_bud.json' # eq_fa_fix
# 'utf-8-sig' strips a leading BOM if the export has one; the context manager closes
# the file handle even when JSON parsing fails.
with open(file,'rt',encoding='utf-8-sig') as fh:
    data = json.load(fh)

# Step 1: Get a bunch of stats about contracts, players, etc.

In [None]:
# Lookup tables filled by a single pass over data['players'].
player_ratings = {}                      # (pid, season) -> {rating name: int value}, 'season' excluded
player_picks = {}                        # pid -> (overall pick number, draft year)
replacement_ovrs = []                    # ovr of players in the season before their salary changes to 750
player_born = {}                         # pid -> birth year
player_seasons = defaultdict(int)        # pid -> number of rating rows
contracts = {}                           # (pid, season) -> (amount, count of following consecutive seasons at that amount)
full_contracts =  {}                     # (pid, season) -> salary amount
player_teams = {}                        # (pid, season) -> tid of the last non-playoff stats row seen
draft_picks = defaultdict(list)          # (original tid, draft year) -> overall pick numbers
draft_picks_avg_ovr = defaultdict(list)  # overall pick number -> draft-time ovr values
draft_picks_avg_age = defaultdict(list)  # overall pick number -> age at draft (draft year - birth year)
player_draft_ageovr = {}                 # pid -> (draft ovr, draft pot, age at draft)
draft_salaries = {}                      # overall pick number -> salary in the season after the draft year (last player wins)

draft_pos_avg = {}                       # pid -> overall pick number, only when draft originalTid >= 0
made_all_nba = []                        # rows of [ovr, 1 if 'First Team All-League' that season else 0]

team_players = defaultdict(set)          # (tid, season) -> pids with a non-playoff stats row
for p in data['players']:
    player_born[p['pid']] = p['born']['year']
    # Overall pick number across rounds; presumably 30 picks per round -- TODO confirm.
    dp = p['draft']['pick']+ (p['draft']['round']-1)*30
    # Negative results are clamped to 0.
    # NOTE(review): presumably these are undrafted players -- confirm against the export format.
    if dp < 0:
        dp = 0
    player_picks[p['pid']] = (dp,p['draft']['year'])
    dtid = p['draft']['originalTid']
    if dtid >= 0:
        draft_picks[(dtid,p['draft']['year'])].append(dp)
        draft_pos_avg[p['pid']] = dp
    # Only record draft-time ovr/pot/age when the first ratings row is from the draft year.
    if p['ratings'][0]['season'] ==  p['draft']['year']:
        player_draft_ageovr[p['pid']] = (p['draft']['ovr'],p['draft']['pot'],p['draft']['year']-p['born']['year'])
        draft_picks_avg_ovr[dp].append(p['draft']['ovr'])
        draft_picks_avg_age[dp].append(p['draft']['year']-p['born']['year'])

    for r in p['ratings']:
        player_seasons[p['pid']] += 1
        # Keep only values whose type is exactly int (bools and floats are dropped), minus 'season'.
        player_ratings[(p['pid'],r['season'])] ={k:v for k,v in r.items() if type(v) == int and k != 'season'}

    for r in p['stats']:
        # Playoff rows are ignored for roster membership.
        if r['playoffs']:
            continue
        team_players[(r['tid'],r['season'])].add(p['pid'])
        # Several non-playoff rows in one season overwrite each other; the last one wins.
        player_teams[(p['pid'],r['season'])] = r['tid']

    # season -> salary amount for this player (a later row for the same season overwrites an earlier one)
    td = {s['season']: s['amount'] for s in p['salaries']}
    for k,a in td.items():
        full_contracts[(p['pid'],k)] = a
    # season s -> salary in s+1, only where the salary changes between s and s+1
    new_c = {s:td[s+1] for s in td if s+1 in td and td[s+1]!=td[s]}

    for s,c in new_c.items():
        # 750 is treated as the minimum contract (see the replacement-level cell that consumes this list).
        if c == 750:
            replacement_ovrs.append(player_ratings[(p['pid'],s)]['ovr'])

    for k,a in td.items():
        yl = 0          # number of following consecutive seasons paid the same amount
        ko = k + 0      # remember the starting season, because k is advanced below

        while k+1 in td:
            if td[k+1] == a:
                yl += 1
            else:
                break
            k+=1
        contracts[(p['pid'],ko)] = (a,yl)

    for k,a in td.items():
        # Salary in the season right after the draft year, stored per overall pick number.
        if p['draft']['year']+1 == k:
            draft_salaries[dp] = a
            break
    all_nba_years = set([_['season'] for _ in p['awards'] if _['type'] == 'First Team All-League'])
    # One [ovr, made-first-team flag] row per ratings row of this player.
    all_nba = [[r['ovr'],int(r['season'] in all_nba_years)] for r in p['ratings']] 
    made_all_nba += all_nba

In [None]:
# Replacement level (RL) is the mean ovr of players about to move onto a minimum contract.
RL = np.mean(replacement_ovrs)
plt.hist(replacement_ovrs)
RL

# Step 2: Get a player value curve
Here the curve is just a sigmoid (tanh) mapping ovr to a blend of VORP/EWA/WS, constrained so that replacement level always maps to 0.

In [None]:
# One row per player-season: [ovr, blended value metric, minutes played].
res = []
for p in data['players']:
    # Regular-season rows only. Playoff rows share the same 'season' key, so without this
    # filter they would overwrite the regular-season row in the dict (the roster loop in
    # the stats-gathering cell skips playoff rows the same way).
    # If a season still has several regular-season rows, the last one wins.
    ss = {row['season']: row for row in p['stats'] if not row['playoffs']}
    rs = {row['season']: row for row in p['ratings']}
    for k in ss:
        # average of EWA/VORP/WS
        # 2 * VORP and 1.5 * WS is to get on same scale as EWA
        res.append([rs[k]['ovr'],0.333*(ss[k]['ewa'] + 2*ss[k]['vorp'] + 1.5*(ss[k]['ows'] + ss[k]['dws'])),ss[k]['min']])
res = np.array(res)

In [None]:
# Plot only player-seasons with more than 800 minutes; marker size scales with sqrt(minutes).
res2 = res[res[:, 2] > 800]
plt.scatter(
    res2[:, 0],
    res2[:, 1],
    s=0.3 * np.sqrt(res2[:, 2]),
    alpha=0.05,
)
# Starting tanh parameters, indexed as [offset, ovr shift, slope, amplitude].
xpm = np.array([6.411, -61.762, 0.094, 6.89])
xp = np.linspace(30, 90)
yp = xpm[0] + xpm[3] * np.tanh((xp + xpm[1]) * xpm[2])
plt.plot(xp, yp, c='r')

In [None]:
def err_fit(xpm):
    """Minutes-weighted mean squared error of the tanh value curve against ``res``.

    ``xpm`` is indexed as [offset, (unused), slope, amplitude]. The ovr shift in
    ``xpm[1]`` is ignored: it is re-derived here so that the curve evaluates to 0
    at the replacement level ``RL``.
    """
    offset, slope, amplitude = xpm[0], xpm[2], xpm[3]
    # Shift that places the curve's zero crossing at ovr == RL.
    INT = np.arctanh(-offset / amplitude) / slope - RL
    if np.isnan(INT):
        # arctanh produced NaN (e.g. |offset/amplitude| > 1); fall back to a fixed shift.
        INT = 60
    curv = amplitude * np.tanh(slope * (res[:, 0] + INT)) + offset
    weighted_sq_err = res[:, 2] * abs(curv - res[:, 1]) ** 2
    return weighted_sq_err.mean()
# Evaluate the objective once at the starting point (result discarded), then minimise from it.
err_fit(xpm)
opt_res = opt.minimize(fun=err_fit, x0=xpm)

In [None]:
# Curve at the initial guess.
yp = xpm[0] + xpm[3] * np.tanh((xp + xpm[1]) * xpm[2])
plt.plot(xp, yp, label='init')
# Curve at the optimum; the shift is recomputed so that the value at RL is exactly 0.
best = opt_res.x
INT = np.arctanh(-best[0] / best[3]) / best[2] - RL
yp = best[3] * np.tanh(best[2] * (xp + INT)) + best[0]
plt.plot(xp, yp, label='opt')
plt.xlabel('ovr')
plt.ylabel('war')
plt.grid(True)
plt.legend()
def value_predict(overall, params=(6.411, -61.762, 0.094, 6.89)):
    """Map an overall rating to a value figure with a tanh curve.

    Parameters
    ----------
    overall : scalar or array-like
        Overall rating(s) to evaluate.
    params : sequence of four floats, optional
        ``(offset, ovr shift, slope, amplitude)`` of the curve
        ``amplitude * tanh(slope * (overall + shift)) + offset``.
        The default reproduces the previously hard-coded constants, so existing
        single-argument calls behave exactly as before.

    Returns
    -------
    numpy scalar or ndarray with the curve value for each input.
    """
    # Convert to a float array so list inputs still broadcast, as they did when the
    # parameters lived in an ndarray inside the function.
    offset, shift, slope, amplitude = np.asarray(params, dtype=float)
    return amplitude*np.tanh(slope*(overall+shift))+offset

In [None]:
# Fitted parameters rounded to 3 decimals; slot 1 is replaced by the derived shift INT,
# because the objective never reads the optimiser's own second component.
res_params = [*np.round(opt_res.x, 3)]
res_params[1] = round(INT, 3)
print(res_params)

# Time for some formulas
Ovr to All-NBA, Draft pick to OVR, etc.

In [None]:
made_all_nba = np.array(made_all_nba)

# Column 0 is ovr (kept 2-D as the feature matrix); column 1 is the first-team flag.
all_nba_X = made_all_nba[:, :1]
all_nba_y = made_all_nba[:, 1]
clf = LogisticRegression(C=1).fit(all_nba_X, all_nba_y)
clf.score(all_nba_X, all_nba_y)

In [None]:
# Slope and intercept of the fitted logistic curve.
fTA = clf.coef_.ravel()[0]
fTB = clf.intercept_[0]
xp = np.linspace(0, 100)
make_all_nba = 1.0 / (1 + np.exp(-xp*fTA - fTB))
plt.plot(xp, make_all_nba, c='r')
plt.scatter(made_all_nba[:, 0], made_all_nba[:, 1], s=5, alpha=0.5)
(fTA, fTB)

In [None]:
# 'seaborn-white' was renamed to 'seaborn-v0_8-white' in matplotlib 3.6 and the old
# alias was removed in 3.8; use whichever name this matplotlib install provides.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
# Mean draft-time ovr per pick number, ordered by pick; [1:] drops the lowest pick key
# (presumably pick 0, the bucket that negative pick numbers are clamped to -- confirm).
dp_ovr = np.array(sorted([(k,np.mean(v)) for k,v in draft_picks_avg_ovr.items()]))[1:,1]

# Per-pick fit weights: the first 15 picks count 4x as much as the remaining 45.
WEG = np.ones(60)
WEG[:15] = 4
# Exponent applied to the pick index inside the exponential (0.5 -> square root).
FACTOR = 0.5
def best_fit_func_exp(x):
    """Weighted L2 misfit of ``x[1]*exp(x[0]*pick**FACTOR) + x[2]`` against ``dp_ovr``.

    ``pick`` runs over 0..59 and each residual is scaled by ``WEG``.
    NOTE: a later cell defines a different function under this same name.
    """
    picks = np.arange(60)
    model = np.exp(x[0] * picks**FACTOR) * x[1] + x[2]
    return np.linalg.norm(WEG * (model - dp_ovr))
# Nelder-Mead fit of [rate, scale, floor] from a hand-picked starting point.
res2 = opt.minimize(best_fit_func_exp, x0=[-0.05, 22, 16], method='Nelder-Mead')
dp_ovr_x = res2.x

# Observed mean ovr per pick, with the fitted curve on top.
plt.plot(dp_ovr)
plt.plot(dp_ovr_x[1] * np.exp(dp_ovr_x[0] * np.arange(60)**FACTOR) + dp_ovr_x[2])

dp_ovr_x

In [None]:
# (tid, season) -> regular-season margin of victory per game.
team_movs = {}

for t in data['teams']:
    tid = t['tid']
    for ts in t['stats']:
        # Skip playoff rows and rows without games played (the latter would divide by zero).
        if ts['playoffs'] or not (ts['gp'] > 0):
            continue
        team_movs[(tid, ts['season'])] = (ts['pts'] - ts['oppPts']) / ts['gp']


In [None]:
won_cup = {}                          # (tid, season) -> 1.0 if the team won at least 2 playoff rounds, else 0.0
season_ratings = defaultdict(list)    # season -> [(wins, regular-season MOV), ...] one entry per team
draft_pos_win = defaultdict(list)     # overall pick number -> [was this player the top-ovr player on a "winning" roster?]
draft_age_ovr_winX = []               # features per player on a "winning" roster: (draft ovr, draft pot, age at draft)
draft_age_ovr_winy = []               # target: True when that player had the highest ovr among the players considered
draft_age_ovr_winp = []               # overall pick number minus 1, aligned with the two lists above

for team in data['teams']:
    tid = team['tid']
    for season in team['seasons']:
        s = season['season']
        # Only seasons with at least 82 games played are used.
        if season['gp'] >= 82:
            they_won = float(season['playoffRoundsWon'] >= 2) # not really finals -- 4
            #they_won = float(season['won'] >= 52)
            won_cup[(tid,s)] = they_won
            # Requires a team_movs entry for this (tid, season); raises KeyError otherwise.
            season_ratings[s].append((season['won'],team_movs[(tid,s)]))
            # Roster players that have a draft position, sorted by (ovr, pick) descending,
            # so index 0 is the highest-rated one.
            for i,v in enumerate(sorted([(player_ratings[(_,s)]['ovr'],draft_pos_avg[_]) for _ in team_players[(tid,s)] if _ in draft_pos_avg],reverse=True)):
                r,p = v
                if they_won:
                    draft_pos_win[p].append(i == 0)
            # Same ranking, restricted to players that also have draft-time ovr/pot/age recorded.
            for i,v in enumerate(sorted([(player_ratings[(_,s)]['ovr'],player_draft_ageovr[_],draft_pos_avg[_]) for _ in team_players[(tid,s)] if _ in player_draft_ageovr and _ in draft_pos_avg],reverse=True)):
                if they_won:
                    draft_age_ovr_winX.append(v[1])
                    draft_age_ovr_winy.append(i == 0)
                    draft_age_ovr_winp.append(v[2]-1)


In [None]:
# Features are (draft ovr, draft pot, age at draft); the target flags the roster's top-ovr player.
winX = np.asarray(draft_age_ovr_winX, dtype=float)
winY = np.array(draft_age_ovr_winy)
dp_ovrW = LogisticRegression(C=0.1).fit(winX, winY)

best_player_prob = dp_ovrW.predict_proba(winX)[:, 1]
plt.hist(best_player_prob)
# Intercept first, then one coefficient per feature.
print([dp_ovrW.intercept_[0], *dp_ovrW.coef_[0]])

In [None]:
# Feature row with the highest predicted probability, and that probability in percent.
win_probs = dp_ovrW.predict_proba(winX)[:, 1]
winX[np.argmax(win_probs)], 100*max(win_probs)

In [None]:
# Rows of [regular-season MOV, won flag] for every team-season present in won_cup.
won_cup2 = np.array([[team_movs[key], flag] for key, flag in won_cup.items()])
plt.scatter(won_cup2[:, 0], won_cup2[:, 1], s=5, alpha=0.5)
clf_win = LogisticRegression().fit(won_cup2[:, :1], won_cup2[:, 1])
cA = clf_win.coef_[0, 0]
cB = clf_win.intercept_[0]

# Fitted logistic curve over a MOV range of -20..20.
xp = np.linspace(-20, 20)
win_p = 1.0 / (1 + np.exp(-xp*cA - cB))
plt.plot(xp, win_p, c='r')
cA, cB

In [None]:
plt.style.use('fivethirtyeight')
# Rows of (overall pick number, share of appearances as the roster's top-ovr player), by pick.
tv = np.array(sorted((pick, np.mean(flags)) for pick, flags in draft_pos_win.items()))

def testX(x):
    """L2 misfit of ``x[1]*exp(-x[0]*pick**x[2])`` (pick = 0..59) against the rates in ``tv``."""
    picks = np.arange(60)
    fitted = x[1] * np.exp(-x[0] * picks**x[2])
    return np.linalg.norm(fitted - tv[:, 1])
# Simplex fit of [rate, scale, exponent] from a hand-picked starting point.
draftP = opt.fmin(testX, [.2, .2, .65])

plt.plot(tv[:, 1], label='pos raw')
plt.plot(draftP[1] * np.exp(-draftP[0] * np.arange(60)**draftP[2]), label='pos exp fit')
# Mean logit-predicted probability grouped by (pick number - 1).
logit_by_pick = pd.DataFrame([draft_age_ovr_winp, dp_ovrW.predict_proba(winX)[:, 1]]).T.groupby(0).mean()
plt.plot(logit_by_pick, label='ovr/pot/age logit')
# Reference line at 1/15.
plt.plot([0, 60], [1/15, 1/15], c='k', ls='--')
plt.xlabel('draft pick')
plt.ylabel('chance of being the best player\non a finals roster')
plt.legend()
draftP

In [None]:
# Per season, rank teams ascending by (wins, MOV) and record (rank, MOV) for each.
mov_to_pos = []
for season_rows in season_ratings.values():
    ranked = sorted(season_rows)
    mov_to_pos.extend((rank, row[1]) for rank, row in enumerate(ranked))
arr = np.array(mov_to_pos)
clf3 = LinearRegression().fit(arr[:, :1], arr[:, 1])
plt.scatter(arr[:, 0], arr[:, 1])

In [None]:
import statsmodels.api as sm
# Logistic fit of the standing rank scaled by 1/29 on MOV, with an intercept
# (presumably 30 teams, so the scaled rank spans 0..1 -- confirm).
standing_frac = arr[:, 0] / 29
clf_pos = sm.Logit(standing_frac, sm.add_constant(arr[:, 1])).fit()

xp = np.linspace(-15, 15)
# Reference curve with hard-coded coefficients; these are NOT read from clf_pos.
yp = 1 / (1 + np.exp(0.0009 - 0.3430 * xp))
plt.plot(xp, yp)
plt.scatter(arr[:, 1], clf_pos.predict(), c='r', alpha=0.1)

clf_pos.summary()


In [None]:
# Number of year offsets (0 .. YEARS_TO_MODEL-1) projected and modelled per team-season.
YEARS_TO_MODEL = 10
# Look-ahead window: year-over-year MOV pairs use offsets 1 .. YEARS_TO_USE-1, and the
# most recent YEARS_TO_USE seasons are dropped when valid_seasons is built.
YEARS_TO_USE = 10

In [None]:
# offset_years[i] holds [MOV in season s, MOV of the same team in season s+i] pairs.
offset_years = defaultdict(list)
for (tid, s), v in team_movs.items():
    for i in range(1, YEARS_TO_USE):
        future_key = (tid, s + i)
        if future_key in team_movs:
            offset_years[i].append([v, team_movs[future_key]])

In [None]:
# tmov_mul[k]: no-intercept slope taking a team's current MOV to its MOV k seasons later
# (offset 0 is the identity).
tmov_mul = {0:1}
for k,pairs in offset_years.items():
    # column 0: current MOV, column 1: MOV k seasons later
    arr = np.array(pairs)
    plt.figure()
    plt.scatter(arr[:,0],arr[:,1],alpha=0.1,s=5)
    plt.title(k)
    # RLM takes (endog, exog). The future MOV is the response and the current MOV the
    # regressor, matching the scatter axes and the line drawn below; the arguments were
    # previously passed the other way round, which fitted current-on-future instead.
    clf3 = sm.RLM(arr[:,1],arr[:,0]).fit()
    xp = np.linspace(-15,15)
    plt.plot(xp,xp*clf3.params[0],c='r')
    tmov_mul[k] = clf3.params[0]
    print(k,clf3.params,)

In [None]:
# Summary of whichever model was last assigned to clf3 (the final iteration of the loop above).
clf3.summary()

In [None]:
# (tid, season) -> ovr ratings of that roster's ten best players, highest first.
team_ovrs = {
    (tid, yr): sorted((player_ratings[(pid, yr)]['ovr'] for pid in roster), reverse=True)[:10]
    for (tid, yr), roster in team_players.items()
}

In [None]:
# Spot check for team 23 in season 2023. The list is already sorted descending and cut
# to ten entries when team_ovrs is built, so the sort/slice here does not change it.
sorted(team_ovrs[(23,2023)],reverse=True)[:10]

In [None]:
# Design rows (top-10 ovr lists) and targets (MOV), aligned by team-season.
Xs1, ys1 = [], []
for key, mov in team_movs.items():
    Xs1.append(team_ovrs[key])
    ys1.append(mov)

In [None]:
# Preview only: printing the full list would emit one row per team-season into the output.
Xs1[:5]

In [None]:
# OLS of MOV on the ten sorted ovr values plus an intercept: one coefficient per roster rank.
res_ltr = sm.OLS(endog=ys1, exog=sm.add_constant(Xs1)).fit()

In [None]:
def best_fit_func_exp(x):
    """L2 error of MOV predicted from the sorted top-10 ovr rows in ``Xs1``.

    Prediction: ``(Xs1 @ exp(x[0]*rank)) * x[1] + x[2]`` with rank = 0..9,
    compared against ``ys1``.
    NOTE: this re-uses the name of a function defined in an earlier cell.
    """
    rank_weights = np.exp(x[0] * np.arange(10))
    predicted = Xs1 @ rank_weights * x[1] + x[2]
    return np.linalg.norm(predicted - ys1)
# Seed the exponential fit with a straight line through the log of the OLS rank coefficients.
log_coefs = np.log(np.array(res_ltr.params[1:]))
a, b = np.polyfit(np.arange(10), log_coefs, 1)
res_tr = opt.minimize(best_fit_func_exp, x0=[a, np.exp(b), -101], method='Nelder-Mead')
print(list(res_tr.x))

In [None]:
# Per-rank OLS coefficients against the fitted exponential weights.
ranks = np.arange(10)
plt.plot(res_ltr.params[1:])
plt.plot(res_tr.x[1] * np.exp(res_tr.x[0] * ranks))
plt.xlabel('value of Nth best player')
plt.grid(True)
_ = plt.xticks(ranks, ranks + 1)

In [None]:
# True vs. OLS-predicted MOV on a square -15..15 canvas.
plt.figure(figsize=(5, 5))
plt.scatter(ys1, res_ltr.predict(), alpha=0.1, s=10)
plt.xlim(-15, 15)
plt.ylim(-15, 15)
plt.xlabel('true MOV')
plt.ylabel('predicted MOV')
plt.grid(True)

In [None]:
# pid -> seasons that have a ratings row, in player_ratings iteration order.
players_years = defaultdict(list)
for pid, s in player_ratings:
    players_years[pid].append(s)

In [None]:
# age -> one-season ovr changes (next season minus this season), using only players
# with at least 7 rating rows.
age_loop = defaultdict(list)
for (pid, s), current in player_ratings.items():
    if player_seasons[pid] < 7:
        continue
    age = s - player_born[pid]
    following = player_ratings.get((pid, s + 1))
    if following is not None:
        age_loop[age].append(following['ovr'] - current['ovr'])

In [None]:
from scipy.stats import trim_mean
# 10%-trimmed mean ovr change per age, for ages below 40, in ascending age order.
age_shift = {
    age: trim_mean(age_loop[age], proportiontocut=.1)
    for age in sorted(age_loop)
    if age < 40
}

In [None]:
# Flatten age_loop into rows of [age, ovr change], preserving bucket and within-bucket order.
big_age = np.array([[age, delta] for age, deltas in age_loop.items() for delta in deltas])

In [None]:
# Trimmed-mean change per age, rounded to one decimal for display.
{age: round(delta, 1) for age, delta in age_shift.items()}

In [None]:
# Empirical curve: one (age, trimmed-mean change) row per age.
age_curve = np.array([list(item) for item in age_shift.items()])
plt.plot(age_curve[:, 0], age_curve[:, 1])
# Starting point for the tanh aging fit, indexed as [offset, age shift, slope, amplitude].
progs_vec = [4, -21, -0.112, 9]
xp = np.linspace(15, 45)

def best_fit_func_exp2(x):
    """L2 error of ``x[3]*tanh(x[2]*(age + x[1])) + x[0]`` against the changes in ``big_age``."""
    ages, deltas = big_age[:, 0], big_age[:, 1]
    predicted = x[3] * np.tanh(x[2] * (ages + x[1])) + x[0]
    return np.linalg.norm(predicted - deltas)
# Nelder-Mead fit starting from progs_vec, which is then overwritten with the optimum.
res_progs = opt.minimize(best_fit_func_exp2, x0=progs_vec, method='Nelder-Mead')
progs_vec = res_progs.x
def get_shift(age):
    """Evaluate the tanh aging curve stored in ``progs_vec`` at ``age`` (scalar or array)."""
    offset, shift, slope, amplitude = progs_vec[0], progs_vec[1], progs_vec[2], progs_vec[3]
    return amplitude * np.tanh(slope * (age + shift)) + offset
# Draw the fitted curve over the same axes, then display the optimiser result object.
plt.plot(xp,get_shift(xp))
res_progs

In [None]:
# Seasons that have MOV data, oldest first, with the most recent YEARS_TO_USE dropped.
all_seasons = sorted({s for _tid, s in team_movs})
valid_seasons = all_seasons[:-YEARS_TO_USE]

In [None]:
ovr_salaries = defaultdict(list)   # ovr -> contract amounts seen at that ovr
sal_ovrd = defaultdict(list)       # contract amount -> ovrs seen at that amount
for key, v in player_ratings.items():
    if key not in contracts:
        continue
    c = contracts[key][0]
    ovr_salaries[v['ovr']].append(c)
    sal_ovrd[c].append(v['ovr'])
# Rows of [salary, mean predicted value of the players paid that salary].
sal_ovr = np.array([[amount, value_predict(np.array(ovrs)).mean()] for amount, ovrs in sal_ovrd.items()])
# Salary rescaled by 30000 (presumably the max salary, per the x-axis label -- confirm).
sal_ovr[:, 0] /= 30000

# Robust no-intercept regression of value on the rescaled salary.
sO = sm.RLM(sal_ovr[:, 1], sal_ovr[:, 0]).fit()
plt.scatter(sal_ovr[:, 0], sal_ovr[:, 1], alpha=0.5)
plt.scatter(sal_ovr[:, 0], sO.predict())
plt.xlabel('% of max salary')
plt.ylabel('wins above replacement')
sA = sO.params[0]
sA

In [None]:
# Training rows per year offset i (0 .. YEARS_TO_MODEL-1), aligned across the four dicts.
Xs = defaultdict(list)   # i -> [[cap_space]]
ys = defaultdict(list)   # i -> actual MOV minus roster-implied MOV

PV = defaultdict(list)   # i -> roster-implied MOV
WC = defaultdict(list)   # i -> won_cup flag of the team in season s+i

for tid, s in team_movs:
    if s in valid_seasons:
        pars = defaultdict(list)   # i -> projected ovrs of players still on this team in s+i
        tss = defaultdict(int)     # i -> payroll of those players in s+i

        for pid in team_players[(tid,s)]:
            age = s-player_born[pid]
            ovr = player_ratings[(pid,s)]['ovr']
            pars[0].append(ovr)
            tss[0] += full_contracts.get((pid,s),750)
            for j in range(1,YEARS_TO_MODEL):
                # The aging data stores, under age a, the change from the season at age a to
                # the next one, so stepping from offset j-1 to j uses the shift at age+j-1
                # (previously age+j, one year late).
                ovr += get_shift(age+j-1)
                if (pid,s+j) in player_teams and player_teams[(pid,s+j)] == tid:
                    pars[j].append(ovr)
                    tss[j]+=full_contracts.get((pid,s+j),750)

        for i in range(YEARS_TO_MODEL):
            season_key = (tid,s+i)
            # Skip offsets where the team has no MOV or no won_cup entry instead of raising
            # KeyError; all four output lists are appended together, so they stay aligned.
            if season_key not in team_movs or season_key not in won_cup:
                continue
            tmov = team_movs[season_key]
            play = [p for p in pars[i] if p >= RL]
            lp = len(play)
            # Number of replacement-level fillers needed to reach 10 players; never negative.
            missing = max(10-lp,0)
            play = sorted(play + missing*[RL],reverse=True)[:10]
            # Roster-implied MOV using the same form as the rank-weight fit:
            # (sum_rank exp(x0*rank)*ovr) * x1 + x2. The intercept was previously subtracted.
            play_s = sum(np.exp(rank*res_tr.x[0])*p for rank,p in enumerate(play))*res_tr.x[1] + res_tr.x[2]

            # Fillers are charged 750 each; with more than 10 qualifying players the old
            # (10-lp)*750 term went negative and reduced the payroll.
            cap_hit = tss[i] + missing*750

            # Cap room relative to 90000; negative room is damped to 10% of its size.
            diff = (90000-cap_hit)/90000
            cap_space = np.maximum(diff,0.1*diff)

            Xs[i].append([cap_space])
            PV[i].append(play_s)
            ys[i].append(tmov-play_s)
            WC[i].append(won_cup[season_key])


In [None]:
# Debug: players carried at each year offset for the last team-season handled by the loop above.
{k:len(v) for k,v in pars.items()}

In [None]:
# Debug: loop variables left over from the projection loop above.
tid,s,j

In [None]:
from sklearn.metrics import r2_score
import copy 
np.set_printoptions(suppress=True)
scales = []
means = []
rsq = []   # R^2 of (OLS residual prediction + roster value) against true MOV, per offset
llf = []   # log-likelihood of the win logit, per offset
plt.figure(figsize=(9, 3))
for i in range(YEARS_TO_MODEL):
    play_v = np.array(PV[i])

    # OLS of the MOV residual on cap space (with intercept).
    pred = sm.OLS(ys[i], sm.add_constant(Xs[i])).fit()
    pred_y = pred.predict() + play_v
    true_y = np.array(ys[i]) + play_v
    rsq.append(r2_score(true_y, pred_y))

    plt.subplot(1, YEARS_TO_MODEL, i + 1)

    # Logit features: [cap space, roster-implied MOV]; cap space is zeroed for offset 0.
    ALT = np.hstack([np.array(Xs[i]), play_v[:, None]])
    if i == 0:
        print(ALT.shape)
        ALT[:, 0] *= 0
    predWC = sm.Logit(WC[i], sm.add_constant(ALT)).fit(disp=0, method='minimize')
    llf.append(predWC.llf)

    plt.ylim(-0.1, 1)
    plt.scatter(true_y, predWC.predict(), s=5, alpha=0.5)
    predWC.summary()
    print(i, ": (", list(np.round(predWC.params, 3)), '),')

    plt.title('{}'.format(i))

rsq, llf

In [None]:
# Display the fitted draft-position curve parameters and the salary-to-value slope.
draftP,sA

In [None]:
# Summary of the last win logit fitted in the per-offset loop (the final year offset).
predWC.summary()

In [None]:
# ovr values above 40 that have salary data, ascending.
xp = sorted(k for k in ovr_salaries if k > 40)
# Mean contract amount at each ovr, and 1000x the predicted value at that ovr.
yp = [np.mean(ovr_salaries[k]) for k in xp]
yp2 = [1000 * value_predict(k) for k in xp]

plt.plot(xp, yp)


In [None]:
# Ratio of scaled predicted value to mean salary at each ovr.
vvec = np.asarray(yp2) / np.asarray(yp)
plt.plot(xp, vvec)
# (An earlier experiment fitted x[0]*log(ovr - 43) to this curve; it is not run here.)
plt.xlabel('ovr')
plt.ylabel('value over replacement per $1M')

In [None]:
# Spread and mean of the cap-space feature at year offset 1.
cap_space_year1 = np.array(Xs[1])
np.std(cap_space_year1, axis=0), np.mean(cap_space_year1, axis=0)