In [None]:
import glob
import json
import matplotlib.pyplot as plt  
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# (position, number of depth-chart slots), in the order the output columns appear.
_DEPTH_CHART = (
    ('QB', 1), ('RB', 2), ('TE', 2), ('WR', 5), ('OL', 5), ('CB', 3),
    ('S', 3), ('LB', 4), ('DL', 4), ('K', 1), ('P', 1),
)


def _get_ovrs(players, tid, season):
    """Return the `ovr` ratings of everyone with a stats row for `tid` in `season`.

    The result maps each position to its list of ratings sorted best-first.

    Raises:
        Exception: a player has a matching stats row but no ratings row for
            that season.
        KeyError: a ratings row names a position that is not in the depth chart.
    """
    ovrs_by_pos = {pos: [] for pos, _depth in _DEPTH_CHART}

    for p in players:
        if tid not in p['statsTids']:
            continue
        for ps in p['stats']:
            if ps['season'] == season and ps['tid'] == tid:
                # The first ratings row of that season decides position and ovr.
                for pr in p['ratings']:
                    if pr['season'] == season:
                        ovrs_by_pos[pr['pos']].append(pr['ovr'])
                        break
                else:
                    raise Exception("No ratings found")
                break
            elif ps['season'] > season:
                # Stop scanning once a later season shows up
                # (assumes stats rows are in season order).
                break

    for ovrs in ovrs_by_pos.values():
        ovrs.sort(reverse=True)

    return ovrs_by_pos


def get_cols(pattern='data*.json', default_ovr=20):
    """Build regression inputs from every JSON export matching `pattern`.

    Each file is expected to hold `teams` (with `tid` and per-season `stats`
    rows carrying `playoffs`, `gp`, `season`, `pts`, `oppPts`) and `players`
    (with `statsTids`, `stats` and `ratings`). One observation is produced per
    non-playoff team stats row with at least one game played.

    Args:
        pattern: glob pattern of the files to read; matches are processed in
            sorted order.
        default_ovr: rating used for a depth-chart slot the team cannot fill.

    Returns:
        (cols, cols2):
        cols  -- dict of equal-length lists, one per depth-chart slot
                 ('QB', 'RB1', 'RB2', ..., 'K', 'P') followed by 'mov'
                 (average point margin per game).
        cols2 -- list with one dict per observation: position -> all ratings
                 sorted best-first, plus 'mov'.
    """
    # Expand the depth chart into (column name, position, rank) triples.
    # Single-slot positions keep the bare position name.
    slots = []
    for pos, depth in _DEPTH_CHART:
        for rank in range(depth):
            name = pos if depth == 1 else pos + str(rank + 1)
            slots.append((name, pos, rank))

    cols = {name: [] for name, _pos, _rank in slots}
    cols['mov'] = []
    cols2 = []

    files = sorted(glob.glob(pattern))
    print(files)

    for file in files:
        with open(file, "r", encoding='utf-8-sig') as read_file:
            data = json.load(read_file)

        for t in data['teams']:
            tid = t['tid']
            for ts in t['stats']:
                if not ts['playoffs'] and ts['gp'] > 0:
                    season = ts['season']
                    mov = (ts['pts'] - ts['oppPts']) / ts['gp']

                    ovrs = _get_ovrs(data['players'], tid, season)

                    for name, pos, rank in slots:
                        ranked = ovrs[pos]
                        cols[name].append(ranked[rank] if len(ranked) > rank else default_ovr)
                    cols['mov'].append(mov)

                    ovrs['mov'] = mov
                    cols2.append(ovrs)

    return cols, cols2

# Run the extraction once: `cols` holds one list per depth-chart slot plus 'mov',
# `cols2` holds the full per-position rating lists of every team-season.
cols,cols2 = get_cols()

# One row per team-season; the columns follow the key order of `cols`.
dataset = pd.DataFrame(cols)

In [None]:
# Peek at the first few team-season records instead of dumping the whole list.
cols2[:3]

In [None]:

# Ordinary least squares of margin of victory on the depth-chart ratings.
# The former `normalize=True` argument is gone: scikit-learn deprecated the keyword
# in 1.0 and removed it in 1.2 (passing it now raises TypeError). For an unpenalised
# LinearRegression it only rescaled the inputs internally and reported coefficients
# on the original scale, so the fit is unchanged.
reg = LinearRegression()
fit_cols = ['QB', 'RB1', 'RB2', 'TE1', 'TE2', 'WR1', 'WR2', 'WR3', 'WR4', 'WR5', 'OL1', 'OL2', 'OL3', 'OL4', 'OL5', 'CB1', 'CB2', 'CB3', 'S1', 'S2', 'S3', 'LB1', 'LB2', 'LB3', 'LB4', 'DL1', 'DL2', 'DL3', 'DL4', 'K', 'P']

reg.fit(dataset[fit_cols], dataset['mov'])
dataset['mov_predicted'] = reg.predict(dataset[fit_cols])

# Negative coefficients are clipped to zero before being turned into percentage shares.
alt_v = np.maximum(reg.coef_, 0)
print('Intercept: \n', reg.intercept_)
print('Coefficients: \n', reg.coef_)
print('r2: ', r2_score(dataset['mov'], dataset['mov_predicted']))
for l, a in zip(fit_cols, 100 * alt_v / alt_v.sum()):
    print(l, '\t', round(a, 1))

print(dataset)

# Density of predicted vs. actual MOV, with the y = x line as a reference.
dataset.plot.hexbin(x='mov', y='mov_predicted', gridsize=20)
plt.xlabel('Actual MOV')
plt.ylabel('Predicted MOV')

plt.plot([-20, 20], [-20, 20])

plt.show()


In [None]:
# Position group of every fitted column: the slot name with its non-letter
# characters removed (e.g. 'RB2' -> 'RB').
fit_col_I = [''.join(ch for ch in slot if ch.isalpha()) for slot in fit_cols]

In [None]:
from collections import defaultdict
import scipy.optimize as opt

# Bucket the regression coefficients by position group, keeping column order
# inside each bucket.
init_s = defaultdict(list)
for group_name, coef in zip(fit_col_I, reg.coef_):
    init_s[group_name].append(coef)

# A group with a single coefficient gets a synthetic second point at a quarter
# of the first, so that every group has at least two values.
for coefs in init_s.values():
    if len(coefs) == 1:
        coefs.append(coefs[0] / 4)

In [None]:
plt.style.use('fivethirtyeight')
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8
# removed the old names, so use whichever spelling this installation provides.
for style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# For every position group, fit x[0] * (tanh(x[1] * k + x[2]) / 2 + 0.5) over the
# depth index k to the clipped, descending regression coefficients, then plot both.
group_x = {}
for i, group in enumerate(init_s, start=1):
    plt.subplot(4, 3, i)
    data = init_s[group]
    data = sorted(np.maximum(data, 0), reverse=True)

    def sig_fit(x):
        """Sum of squared errors between the tanh curve and `data`."""
        return sum([(x[0]*(np.tanh(x[1]*k+x[2])/2+0.5)-c)**2 for k, c in enumerate(data)])

    # Start from the largest coefficient as amplitude and a decreasing curve.
    resx = opt.minimize(sig_fit, [data[0], -1, 0])
    optx = resx.x
    group_x[group] = optx

    plt.title(group)
    plt.plot(data)
    plt.grid(True)
    plt.ylim(bottom=0)
    plt.plot([optx[0]*(np.tanh(optx[1]*k+optx[2])/2+0.5) for k in range(len(data))])

In [None]:
from sklearn.metrics import r2_score
def rsquared(x, y):
    """ Return R^2 where x and y are array-like.

    R^2 is the squared correlation coefficient of the least-squares line
    fitted by `scipy.stats.linregress(x, y)`.
    """
    # Imported here so the function does not depend on a later cell binding the
    # name `scipy`, nor on `scipy.stats` having been loaded as a side effect.
    from scipy import stats

    return stats.linregress(x, y).rvalue ** 2

In [None]:
# Flatten the per-group parameter triples into one vector, groups in sorted order.
pos_key = sorted(group_x)
num_x = 3 * len(pos_key)
init_xO = np.hstack([group_x[name] for name in pos_key])
# Independent working copy; `init_xO` keeps the values from the per-group fits.
init_x = init_xO.copy()

In [None]:
# Actual margin of victory of every team-season, in the same order as `cols2`.
mov_target = [record['mov'] for record in cols2]

In [None]:
import scipy
def fit_coef(x):
    """Objective to minimise: negative R^2 between actual and modelled MOV.

    `x` is a flat array that is reshaped into rows of three parameters, one row
    per entry of the global `pos_key`. A player at depth index k of a position
    contributes `ovr * x0 * (tanh(x1 * k + x2) / 2 + 0.5)` to the modelled
    score of the team-season. Reads the globals `pos_key`, `cols2`,
    `mov_target` and `rsquared`.
    """
    params_by_pos = dict(zip(pos_key, x.reshape((-1, 3))))

    predicted = []
    for record in cols2:
        total = 0
        for pos in pos_key:
            for depth, ovr in enumerate(record.get(pos, [])):
                scale, slope, shift = params_by_pos[pos]
                weight = scale * (np.tanh(slope * depth + shift) / 2 + 0.5)
                total += weight * ovr
        predicted.append(total)

    return -rsquared(mov_target, predicted)
# Objective value (negative R^2) at the current `init_x` parameter vector.
fit_coef(init_x)

In [None]:
#opt_res = opt.minimize(fit_coef,init_x)

In [None]:
import cma

In [None]:
# CMA-ES is a stochastic search; fix its seed so that Restart & Run All
# reproduces the same optimisation trajectory and result.
CMA_SEED = 42
es = cma.CMAEvolutionStrategy(init_x, 0.01, {'seed': CMA_SEED})
es.optimize(fit_coef)

In [None]:
# Evaluate each parameter vector once (the objective loops over every
# team-season), then show: start score, best score, and whether the search improved.
score_init = fit_coef(init_x)
score_best = fit_coef(es.best.x)
score_init, score_best, score_init > score_best

In [None]:
# Split the optimiser's best vector back into one row of three parameters per group.
xR = es.best.x.reshape((-1, 3))
best_group = dict(zip(pos_key, xR))
# Rebind init_x to the optimum; any cell run after this one sees the new value.
init_x = es.best.x

In [None]:
plt.style.use('fivethirtyeight')
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8
# removed the old names, so use whichever spelling this installation provides.
for style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# One panel per position group: regression coefficients vs. the optimised tanh
# curve (drawn one depth index past the last coefficient).
plt.figure(figsize=(8, 8))
for i, group in enumerate(init_s, start=1):
    plt.subplot(4, 3, i)
    data = init_s[group]
    optx = best_group[group]
    plt.title(group)
    plt.plot(data)
    plt.grid(True)
    alt_vec = np.array([optx[0]*(np.tanh(optx[1]*k+optx[2])/2+0.5) for k in range(len(data)+1)])
    plt.plot(alt_vec)
    plt.ylim(bottom=0)

# Lay the figure out once, after all panels exist, instead of on every iteration.
plt.tight_layout()

In [None]:
# NOTE: `data` is whatever the last iteration of the plotting loop in the previous
# cell left behind (the coefficient list of the final position group), so this
# only works after that cell has been run.
max(data)

In [None]:
# Hard-coded 33-element parameter vector; it is reshaped below into rows of three
# and paired with `pos_key`. Presumably saved from an earlier optimisation run --
# TODO confirm provenance.
old_init_x = np.array([ 0.65275177, -0.89241521,  1.59863404,  2.18819211, -0.34037103,
        0.33929506,  0.48856575, -6.19370476,  0.41388857,  0.44215849,
       -3.15943921,  4.97916601,  1.12291262, -0.58793127,  2.17572672,
        2.80326994, -4.54268599, -1.19648499,  1.47707633, -5.20456951,
        1.76582943,  0.3279489 , -0.24147471, -0.325921  ,  0.37174943,
       -2.29992604,  4.81809201,  0.28041435, -2.72713033,  3.58258538,
        0.17499069, -0.29672423,  0.76752692])

In [None]:
# Score every team-season with the tanh depth-weight model, using the
# hard-coded `old_init_x` parameters (one row of three per position group).
xR = old_init_x.reshape((-1, 3))
x_g = dict(zip(pos_key, xR))

pmovs = []
for record in cols2:
    total = 0
    for pos in pos_key:
        for depth, ovr in enumerate(record.get(pos, [])):
            scale, slope, shift = x_g[pos]
            total += scale * (np.tanh(slope * depth + shift) / 2 + 0.5) * ovr
    pmovs.append(total)

In [None]:
# Mean and standard deviation of the linear-regression predictions; the model
# scores are shifted and scaled to this distribution before plotting.
ms = dataset['mov_predicted'].mean()
us = dataset['mov_predicted'].std()
pmovs = np.array(pmovs)
centered = pmovs - pmovs.mean()
plt.scatter(mov_target, ms + us * centered / pmovs.std())

In [None]:
# Overlay both predictors against the actual MOV; labels and a legend make the
# two point clouds distinguishable.
rescaled_pmovs = ms + us * (pmovs - pmovs.mean()) / pmovs.std()
plt.scatter(mov_target, dataset['mov_predicted'], s=5, alpha=0.5, label='Linear regression')
plt.scatter(mov_target, rescaled_pmovs, s=5, alpha=0.5, label='tanh depth model (rescaled)')
plt.xlabel('Actual MOV')
plt.ylabel('Predicted MOV')
plt.legend();

In [None]:
# Show the current parameter vector (an earlier cell rebinds init_x to es.best.x).
init_x