In [None]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.linear_model import LinearRegression,Ridge,ElasticNet,ElasticNetCV,LassoCV,SGDRegressor,RidgeCV
from collections import defaultdict
import json
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import os, sys

import fnmatch

In [None]:
# Rating keys used as model features.
# Offensive features: one column per rating listed here.
off_cols = ['hgt', 'spd', 'hpw', 'con', 'eye']

# Defensive features are assembled from four groups of rating keys:
dbs_cols = ['hgt', 'spd']                 # body ratings (same keys also appear in off_cols)
fld_cols = ['gnd', 'fly', 'thr']          # fielding ratings
cat_cols = ['cat']                        # catcher rating
ptc_cols = ['ppw', 'ctl', 'mov', 'endu']  # pitcher ratings

# Flat, ordered list of every defensive column ...
def_cols = []
for group in (dbs_cols, fld_cols, cat_cols, ptc_cols):
    def_cols.extend(group)

# ... and the position of each defensive rating key inside that list.
def_idx = dict(zip(def_cols, range(len(def_cols))))

In [None]:
def load_player_ratings(league):
    """Index the integer ratings of every player by ``(pid, season)``.

    Parameters
    ----------
    league : dict
        Parsed league export; ``league['players']`` is iterated and each
        player's ``'pid'`` and ``'ratings'`` entries are read.

    Returns
    -------
    dict
        ``{(pid, season): {rating_key: value}}``. Only values whose type is
        exactly ``int`` are kept, and the ``'season'`` key itself is dropped.
    """
    ratings = {}
    for player in league['players']:
        for rating in player['ratings']:
            ratings[(player['pid'], rating['season'])] = {
                k: v for k, v in rating.items() if type(v) is int and k != 'season'
            }
    return ratings


def home_away_points(game):
    """Return ``(points of game['teams'][0], points of game['teams'][1])``.

    The game record stores the scores under ``'won'`` / ``'lost'``; this
    re-orders them so they line up with the order of ``game['teams']``.
    """
    if game['won']['tid'] == game['teams'][0]['tid']:  # home team won
        return (game['won']['pts'], game['lost']['pts'])
    return (game['lost']['pts'], game['won']['pts'])


def team_feature_row(players, player_ratings, season):
    """Build the feature row of one team for one game.

    Each player's ratings are averaged with playing-time weights:

    * offense (``off_cols``): weight ``pa / 4`` for every player with ``pa > 0``;
    * fielding (``dbs_cols + fld_cols``): same weight, but only for players
      with ``pa > 0`` whose ``pos`` is neither ``'P'`` nor ``'DH'``;
    * catching (``cat_cols``): same weight, only for ``pos == 'C'``;
    * pitching (``ptc_cols``): weight ``bf / 27`` for players with ``bf > 0``.

    The divisors 4 and 27 are presumably "plate appearances per game" and
    "batters faced per nine innings" -- TODO confirm. Because every column is
    divided by the sum of its own weights, they only rescale the weights.

    Parameters
    ----------
    players : list of dict
        Box-score entries; ``'pid'``, ``'pos'``, ``'pa'`` and ``'bf'`` are read.
    player_ratings : dict
        Mapping ``(pid, season) -> {rating_key: value}``.
    season : hashable
        Season used for the ratings lookup.

    Returns
    -------
    list
        ``len(off_cols)`` weighted offensive means followed by the *negated*
        ``len(def_cols)`` weighted defensive means. A defensive column that
        received no weight comes out as NaN (0/0); the model-fitting cell
        replaces NaNs with 0.

    Raises
    ------
    KeyError
        If a player has no ratings entry for ``season``.
    """
    off_w = []
    off_r = []
    def_w = []
    def_r = []

    for p in players:
        rt = player_ratings[(p['pid'], season)]
        pos = p['pos']
        pa = p['pa'] / 4
        bf = p['bf'] / 27
        # Per-player weighted rating sums and weights, one slot per def_cols entry.
        def_r_p = np.zeros(len(def_cols))
        def_w_p = np.zeros(len(def_cols))

        if pa > 0:  # offensive contribution
            off_w.append(pa)
            off_r.append([pa * rt[c] for c in off_cols])
            if pos not in ('P', 'DH'):  # fielding contribution
                credited = dbs_cols + fld_cols
                if pos == 'C':
                    credited = credited + cat_cols
                for c in credited:
                    i = def_idx[c]
                    def_w_p[i] += pa
                    def_r_p[i] += pa * rt[c]
        if bf > 0:  # pitching contribution
            for c in ptc_cols:
                i = def_idx[c]
                def_w_p[i] += bf
                def_r_p[i] += bf * rt[c]
        if def_w_p.sum() > 0:
            def_w.append(def_w_p)
            def_r.append(def_r_p)

    off_vec = np.array(off_r).sum(axis=0) / np.sum(off_w)
    def_vec = np.array(def_r).sum(axis=0) / np.sum(def_w, axis=0)
    return list(off_vec) + list(-def_vec)


# Xso collects one feature row per team per game; scores holds the points that
# same team scored in that game.
# NOTE(review): each row pairs a team's points with its *own* defensive ratings,
# not the opponent's -- confirm this is the intended model.
Xso = []
scores = []
for file in sorted(os.listdir('.')):
    if not fnmatch.fnmatch(file, 'ZGMB_League*.json'):
        continue
    print(file)
    # Context manager so the file handle is closed right after parsing.
    with open(file, 'rt', encoding='utf-8-sig') as fh:
        data = json.load(fh)

    player_ratings = load_player_ratings(data)

    for g in data['games']:
        pt_t = home_away_points(g)
        for tid_l in range(2):
            row_vec = team_feature_row(g['teams'][tid_l]['players'], player_ratings, g['season'])
            Xso.append(row_vec)
            scores.append(pt_t[tid_l])

In [None]:
# Peek at the first two raw feature rows. `Xs` is only created in the
# model-fitting cell further down, so read from `Xso` (filled by the loading
# cell) to keep the notebook runnable top to bottom on a fresh kernel.
Xso[0], Xso[1]

In [None]:
# Peek at the first two targets. `y` is only created in the model-fitting cell
# further down, so read from `scores` (filled by the loading cell) to keep the
# notebook runnable top to bottom on a fresh kernel.
scores[:2]

In [None]:
# Replace NaNs (feature columns that received no playing-time weight) with 0.
# `nan=` is passed by keyword: the second positional argument of
# np.nan_to_num is `copy`, not the fill value.
Xs = np.nan_to_num(Xso, nan=0.0)

# Standardise the features for the penalised regression.
fx = StandardScaler()
X2 = fx.fit_transform(Xs)
y = np.array(scores).astype(float)

# Elastic net with alpha and l1_ratio chosen by 3-fold cross-validation.
# max_iter is an int: scikit-learn's parameter validation rejects the float 2e4.
reg = ElasticNetCV(
    l1_ratio=[.1, .4, .5, .7, .725, .75, .775, .8, .9],
    cv=3,
    n_alphas=250,
    positive=False,
    max_iter=20000,
    fit_intercept=True,
)
reg.fit(X2, y)

# In-sample R^2 on the data the model was fitted to.
print(X2.shape, reg.score(X2, y))

In [None]:

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever this install provides.
for style_name in ('seaborn-v0_8-white', 'seaborn-white'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Predicted vs. actual target. `y` holds the points scored by each team in
# each game, so the axes are labelled as points, not margins.
plt.scatter(reg.predict(X2), y, s=5, alpha=0.1)
plt.xlabel('predicted points scored')
plt.ylabel('actual points scored')

In [None]:
# Column labels in the same order as the feature rows: every offensive rating
# prefixed with 'off_', then every defensive rating prefixed with 'def_'.
exp_lbl = []
for side, cols in (('off_', off_cols), ('def_', def_cols)):
    exp_lbl.extend(side + col for col in cols)
filt_lbl = exp_lbl

# Elastic-net coefficients rounded to 3 decimals, largest first.
ranked = list(zip(np.round(reg.coef_, 3), exp_lbl))
ranked.sort(reverse=True)
ranked

In [None]:
# Ordinary least squares on the unscaled features (Xs, not the standardised
# X2), with an explicit constant column, to obtain per-feature p-values.
X3f = pd.DataFrame(data=Xs, columns=filt_lbl)
X3 = sm.add_constant(X3f)
est = sm.OLS(endog=y, exog=X3)
est2 = est.fit()
est2.summary()

In [None]:
# The elastic net has no home/away indicator, and it was fitted on standardised
# (zero-mean) features, so its intercept is the mean of y: the average points
# a team scores in a game -- not a home-field advantage.
print('average points per team per game is {:.1f}'.format(reg.intercept_))

In [None]:
# Collect and print the OLS coefficients worth keeping, grouped by side.
# params[1:] / pvalues[1:] skip the leading constant term so that the
# remaining entries line up with filt_lbl.
future_use = defaultdict(dict)
ols_terms = list(zip(filt_lbl, est2.params[1:], est2.pvalues[1:]))
for pos in ['off', 'def']:
    tag = pos + '_'
    # Largest coefficient first.
    res = sorted([(coef, name, pval) for name, coef, pval in ols_terms if tag in name], reverse=True)
    print(pos+' : { ')
    for coef, name, pval in res:
        # Keep only terms that are non-negligible and significant at the 5% level.
        if not (abs(coef) > 1e-3 and pval < 0.05):
            continue
        key = name.split('_')[1]
        future_use[pos][key] = coef
        print('\t{}: [{:.3f}, 1],'.format(key, coef))
    print('},')

In [None]:
# Features whose OLS p-value is below 0.1 (the constant term is excluded).
significant = est2.pvalues < 0.1
ratings_to_use = [name for name, keep in significant.items() if keep and name != 'const']

# Refit a cross-validated elastic net, constrained to non-negative
# coefficients, on just those columns of the unscaled feature matrix.
selected_cols = [filt_lbl.index(name) for name in ratings_to_use]
reg_small = ElasticNetCV(positive=True, cv=3)
reg_small.fit(Xs[:, selected_cols], y)

In [None]:
# Reduced-model coefficients by absolute size, largest first, printed as
# tab-separated "<empty>, name, value" rows.
res = [(abs(coef), name) for name, coef in zip(ratings_to_use, reg_small.coef_)]
res.sort(reverse=True)
for magnitude, name in res:
    print('{}\t{}\t{:.3f}'.format('', name, magnitude))
print()

In [None]:
# Distinct label prefixes (the text before the first '_') among the selected features.
valid_pos = {name.split('_')[0] for name in ratings_to_use}

In [None]:
# Collect and print the positive elastic-net coefficients for every side that
# still has a selected feature. The mapping is filled in the same way as in the
# OLS-based cell above, instead of being reset and left empty.
future_use = defaultdict(dict)
for pos in sorted(valid_pos):
    prefix = pos + '_'
    # Largest coefficient first; non-positive coefficients are dropped.
    res = sorted(
        [(r, n) for n, r in zip(exp_lbl, reg.coef_) if n.startswith(prefix) and r > 0],
        reverse=True,
    )
    print(pos+' : { ')
    for coef, label in res:
        if coef > 1e-3:  # skip coefficients that are effectively zero
            key = label.split('_')[1]
            future_use[pos][key] = coef
            print('\t{}: [{:.3f}, 1],'.format(key, coef))
    print('},')