In [1]:
import pandas as pd
import numpy as np
import time
import os, platform
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
if platform.system()=='Darwin':
    directory = '/Users/phil/Google Drive/projects/nfl/data/all'
elif os.environ.get("USERNAME")=='phil':
    directory = 'C:/Users/phil/Google Drive/nfl/data/all'
elif os.environ.get("USERNAME")=='lyncp010':
    directory = 'C:/Users/lyncp010/projects/nfl/data/all'
print directory

/Users/phil/Google Drive/projects/nfl/data/all


# import data

In [3]:
df = pd.read_csv(directory + '/offense_player_ff_points_s02w01_s16w16.csv', low_memory=False)

#### only players from current season

In [4]:
df = df[df.season == 2016].copy()

In [5]:
df.head()

Unnamed: 0,season,week,bsID,team,player_id,position,ff_points_total,home,opponent,passAtt,...,recLong,rushAtt,rushYds,rushTd,rushLong,fumbles,fumblesLost,passYdsOver300,rushYdsOver100,recYdsOver100
71153,2016,1,201609080den,car,BenjKe00,WR,15.1,0,den,0.0,...,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
71154,2016,1,201609080den,car,BrowPh00,WR,1.1,0,den,0.0,...,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
71155,2016,1,201609080den,car,FuncDe00,WR,0.9,0,den,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
71156,2016,1,201609080den,car,GinnTe00,WR,2.5,0,den,0.0,...,5.0,1.0,20.0,0.0,20.0,0.0,0.0,0,0,0
71157,2016,1,201609080den,car,NewtCa00,QB,22.16,0,den,33.0,...,0.0,11.0,54.0,1.0,12.0,0.0,0.0,0,0,0


# matrix factorization w/ gradient descent

### creating ids

In [6]:
from sklearn import preprocessing

#### team ids

In [7]:
# create index for team names
team_list = np.sort(df['team'].unique())
team_le = preprocessing.LabelEncoder()
team_le.fit(team_list)

LabelEncoder()

In [8]:
# add team ids
df['opponent_id'] = team_le.transform(df['opponent'])

#### player ids

In [9]:
# create index for team names
player_list = np.sort(df['player_id'].unique())
player_le = preprocessing.LabelEncoder()
player_le.fit(player_list)

LabelEncoder()

In [10]:
# add team ids
df['player_n_id'] = player_le.transform(df['player_id'])

### splitting data

In [11]:
# create train testing index for each game.
np.random.seed(7)

In [12]:
nfolds = 5

In [45]:
df['kfold'] = np.random.choice(nfolds, size=len(df), )

In [None]:
df['ff_points_total'].mean()

In [None]:
np.c_[df['ff_points_total'].mean(), np.random.rand(1)]

In [None]:
np.random.rand(2)

In [None]:
foo = np.random.rand(2)
bar = np.c_[df['ff_points_total'].mean(), np.random.rand(1)]
print foo.shape
print bar.reshape(2,)

## model functions

In [46]:
def player_oppo_mfsgd(dtrain, dtest=False, latent_features=2, max_iter=500, alpha=0.0001, beta=0.01, mu=0.8, seed=0):
    
    np.random.seed(seed)
    team_size = len(team_list)
    player_size = len(player_list)
    rmse = []
    rmse_test = []
    
    eps = 1e-5
    err_lim = 1e6
    
    alpha0 = alpha # learning rate start point
    vB, vP, vO = 0, 0, 0 # velocity starts 0
    
    # expansion matrix
    player_expansion = np.eye(player_size)
    oppo_expansion = np.eye(team_size)
    
    # initialize weights
#     B = np.random.rand(2) # bias + home_indicator
    B = np.c_[df['ff_points_total'].mean(), np.random.rand(1)].reshape(2,) # global_bias + home_indicator
    P = np.random.rand(player_size, latent_features) # player weights
    P = np.maximum(P, eps)
    O = np.random.rand(team_size, latent_features) # oppo weights
    O = np.maximum(O, eps)
    
    # training data
    X = np.c_[np.ones(len(dtrain)), dtrain['home'].as_matrix()]
    y = dtrain['ff_points_total'].as_matrix()
    
    # testing data
    if type(dtest)!=bool:
        X_test = np.c_[np.ones(len(dtest)), dtest['home'].as_matrix()]
        y_test = dtest['ff_points_total'].as_matrix()
    
    for i in xrange(max_iter):
        #print '\rStep {}/{}'.format(i+1, max_iter),
        #alpha = alpha0/(1 + .005*i)
        
        # embeddings for mf
        player_embed = P[dtrain['player_n_id'].as_matrix(),:]
        oppo_embed = O[dtrain['opponent_id'].as_matrix(),:]

        # calc y_hat
        y_hat = np.dot(X, B) + np.einsum('ij,ji->i', player_embed, oppo_embed.T)

        # calc errors
        e = y - y_hat

        # ---- derivatives ----
        # betas
        B_deriv = (np.c_[(-e),(-e)] * X).sum(axis=0)

        # latent factors
#         print 'e', np.array([(-e) for _ in range(latent_features)]).shape
#         print 'p', player_expansion[dtrain['player_n_id'].as_matrix(),:].shape
#         print 'O', O.shape
#         print 'eO', 
        P_deriv = np.dot(np.array([(-e) for _ in range(latent_features)]) * oppo_embed.T,
                         player_expansion[dtrain['player_n_id'].as_matrix(),:]
                        ).T# * O
        O_deriv = np.dot(np.array([(-e) for _ in range(latent_features)]) * player_embed.T,
                         oppo_expansion[dtrain['opponent_id'].as_matrix(),:]
                        ).T# * T
        
        # ???? add regularization ????
        B_deriv -= beta * B
        P_deriv -= beta * P
        O_deriv -= beta * O
        
        # ???? use MacKays quick n' dirty variance ratio ????
        var_resid = np.var(e)
        beta_P = P.var()/var_resid
        beta_O = O.var()/var_resid
        # print beta_T, beta_O
        P_deriv -= (beta_P * P) + (beta * P)
        O_deriv -= (beta_O * O) + (beta * O)
        
        # ---- parameter updates ----
        B -= alpha * B_deriv
        P -= alpha * P_deriv
        O -= alpha * O_deriv
        
        # ???? add momentum ????
        vB = mu * vB - alpha * B_deriv
        vP = mu * vP - alpha * P_deriv
        vO = mu * vO - alpha * O_deriv
        B += mu * vB
        P += mu * vP
        O += mu * vO
        
        # save error
        # - training
        rmse_i = np.sqrt(np.mean(e**2))
        rmse.append(rmse_i)
        
        # - test error
        if type(dtest)!=bool:
            player_embed_test = P[dtest['player_n_id'].as_matrix(),:]
            oppo_embed_test = O[dtest['opponent_id'].as_matrix(),:]
            y_hat_test = np.dot(X_test, B) + np.einsum('ij,ji->i', player_embed_test, oppo_embed_test.T)
            e_test = y_test - y_hat_test
            rmse_test_i = np.sqrt(np.mean(e_test**2))
            rmse_test.append(rmse_test_i)
        
        if rmse_i > err_lim:
            #print '\nError limit reached :('
            rmse += [np.nan]*(max_iter - i - 1)
            rmse_test += [np.nan]*(max_iter - i - 1)
            break
    
    # return results if not using test set
    if type(dtest)==bool:
        print '\rDone!'
        #print 'RMSE min {}'.format(min(rmse))
        #print 'RMSE end {}'.format(rmse_i)
        return B, P, O, rmse
    else:
        rmse_test = pd.Series(rmse_test, index=range(1, max_iter + 1))
        rmse_test.index.names = ['iter']
        return rmse_test

In [47]:
def split_data_to_train_test(df, index, fold):
    df_train = df[(df[index]!=fold)]
    df_test =  df[(df[index]==fold)]
    return df_train, df_test

In [48]:
def kfold_cv(data, nfolds, lf, alpha, beta, mu):
    # results df
    model_i = pd.DataFrame()
    
    # loop over folds
    for fold in range(nfolds):
        # create train test set
        trn, tst = split_data_to_train_test(data, 'kfold', fold)
        # train matrix factorization
        model_i['rmse_fold_{}'.format(fold)] = player_oppo_mfsgd(trn, tst,
                                                                 latent_features=lf,
                                                                 alpha=alpha, beta=beta, mu=mu
                                                                )
        
    # set hyper param indices
    model_i['position'] = p
    model_i['latent_features'] = lf
    model_i['alpha'] = alpha
    model_i['beta'] = beta
    model_i['mu'] = mu
    model_i.reset_index(inplace=True)
    model_i.set_index(['position', 'latent_features', 'alpha', 'beta', 'mu', 'iter'], inplace=True)
    
    # return results
    return model_i

## testing on ALL hyper parameters

In [49]:
import itertools

In [50]:
# index of results parameters
offense_positions = ['QB', 'RB', 'TE', 'WR']

max_latent_features = 5
latent_features_list = range(1, max_latent_features+1)

max_iter = 300
max_iter_list = range(1, max_iter+1)

In [51]:
# list of hyper param
grid = {
    'lf':latent_features_list,
    'alpha':[0.0001, 0.0005, 0.001],
    'beta':[0.005, 0.01, 0.02],
    'mu':[0.3, 0.5, 0.7, 0.9]
}

# number of grid combincations
grid_n = 1
for key in grid.keys():
    grid_n *= len(grid[key])
print '{} combinations'.format(grid_n)
grid

180 combinations


{'alpha': [0.0001, 0.0005, 0.001],
 'beta': [0.005, 0.01, 0.02],
 'lf': [1, 2, 3, 4, 5],
 'mu': [0.3, 0.5, 0.7, 0.9]}

In [53]:
# train
model_results = pd.DataFrame()
print 'Training...'
for p in offense_positions:
    #data = df[df['position']==p].copy()
    data = df[df['position']==p].copy()
    for i, gs in enumerate(itertools.islice(itertools.product(
        grid['lf'],
        grid['alpha'],
        grid['beta'],
        grid['mu']
    ), grid_n)):
        lf = gs[0]
        alpha = gs[1]
        beta = gs[2]
        mu = gs[3]
        print '\r({}/{}) Position {} - lf {} - alpha {} - beta {} - mu {}'.format(i+1, grid_n,
                                                                                  p, lf, alpha, beta, mu),
        
        n_folds = len(data['kfold'].unique())
        
        model_results_i = kfold_cv(data, n_folds, lf, alpha, beta, mu)        
        model_results = pd.concat([model_results, model_results_i])
print '\rDone!'

 Training...
Done!


In [56]:
model_results.dropna(inplace=True)
model_results['rmse_fold_avg'] = model_results.mean(axis=1)

In [85]:
model_results.loc[idx['QB'],:].tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,rmse_fold_0,rmse_fold_1,rmse_fold_2,rmse_fold_3,rmse_fold_4,rmse_fold_avg
latent_features,alpha,beta,mu,iter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5,0.001,0.02,0.9,158,18.406133,45.02206,11.949913,23.663461,19.672798,23.74287
5,0.001,0.02,0.9,159,18.713126,51.4911,12.078652,24.23944,20.000302,25.30452
5,0.001,0.02,0.9,160,19.03622,484.8908,12.216987,24.86367,20.351281,112.2718
5,0.001,0.02,0.9,161,19.376954,52855.26,12.366241,25.542908,20.7287,10586.66
5,0.001,0.02,0.9,162,19.737074,4.677659e+17,12.528,26.285238,21.136044,9.355318e+16


In [59]:
idx = pd.IndexSlice
best_param = {}
for p in offense_positions:
    best_param_pos = model_results.loc[idx[p,:,:,:,:,:],]['rmse_fold_avg'].idxmin()
    best_param[best_param_pos[0]] = {'lf'   : best_param_pos[1],
                                     'alpha': round(best_param_pos[2], 4),
                                     'beta' : round(best_param_pos[3], 2),
                                     'mu'   : round(best_param_pos[4], 1),
                                     'iter' : best_param_pos[5]
                                    }
best_param

{'QB': {'alpha': 0.0001, 'beta': 0.01, 'iter': 500, 'lf': 5, 'mu': 0.3},
 'RB': {'alpha': 0.0001, 'beta': 0.01, 'iter': 285, 'lf': 2, 'mu': 0.3},
 'TE': {'alpha': 0.0001, 'beta': 0.01, 'iter': 252, 'lf': 4, 'mu': 0.3},
 'WR': {'alpha': 0.0001, 'beta': 0.01, 'iter': 257, 'lf': 4, 'mu': 0.3}}

In [64]:
# foo = {'QB': {'alpha': 0.0001, 'beta': 0.01, 'iter': 500, 'lf': 5, 'mu': 0.3},
#        'RB': {'alpha': 0.0001, 'beta': 0.01, 'iter': 285, 'lf': 2, 'mu': 0.3},
#        'TE': {'alpha': 0.0001, 'beta': 0.01, 'iter': 252, 'lf': 4, 'mu': 0.3},
#        'WR': {'alpha': 0.0001, 'beta': 0.01, 'iter': 257, 'lf': 4, 'mu': 0.3}}

In [60]:
gb = model_results.groupby(level=[0])['rmse_fold_avg']
gb.nsmallest(5)

position  position  latent_features  alpha   beta   mu   iter
QB        QB        5                0.0001  0.005  0.3  500     7.270726
                                                         499     7.270734
                                                         498     7.270747
                                                         497     7.270764
                                                         496     7.270787
RB        RB        2                0.0001  0.005  0.3  285     5.855207
                                                         284     5.855211
                                                         286     5.855219
                                                         283     5.855229
                                                         287     5.855245
TE        TE        4                0.0001  0.005  0.3  252     4.309197
                                                         253     4.309199
                                                  

# testing

In [None]:
player_oppo_mfsgd

In [81]:
p='WR'

In [2]:
# training with all data
print best_param[p]
B, P, O, rmse = player_oppo_mfsgd(data[data['position']=='QB'],
                                  latent_features = best_param[p]['lf'],
                                  max_iter = best_param[p]['iter'],
                                  alpha = best_param[p]['alpha'],
                                  beta = best_param[p]['beta'],
                                  mu = best_param[p]['mu']
                                 )
# print B
# print P
# print O

array([ 6.69717512,  0.54920555])

In [73]:
data

Unnamed: 0,season,week,bsID,team,player_id,position,ff_points_total,home,opponent,passAtt,...,rushLong,fumbles,fumblesLost,passYdsOver300,rushYdsOver100,recYdsOver100,opponent_id,player_n_id,tt_fold_index,kfold
71153,2016,1,201609080den,car,BenjKe00,WR,15.1,0,den,0.0,...,0.0,0.0,0.0,0,0,0,9,33,4,3
71154,2016,1,201609080den,car,BrowPh00,WR,1.1,0,den,0.0,...,0.0,0.0,0.0,0,0,0,9,66,1,1
71155,2016,1,201609080den,car,FuncDe00,WR,0.9,0,den,0.0,...,0.0,0.0,0.0,0,0,0,9,170,3,3
71156,2016,1,201609080den,car,GinnTe00,WR,2.5,0,den,0.0,...,20.0,0.0,0.0,0,0,0,9,178,3,1
71167,2016,1,201609080den,den,NorwJo00,WR,0.0,1,car,0.0,...,0.0,0.0,0.0,0,0,0,2,352,0,1
71168,2016,1,201609080den,den,SandEm00,WR,4.9,1,car,0.0,...,0.0,0.0,0.0,0,0,0,2,415,3,0
71170,2016,1,201609080den,den,ThomDe03,WR,4.8,1,car,0.0,...,0.0,0.0,0.0,0,0,0,2,465,3,2
71174,2016,1,201609110atl,atl,HardJu00,WR,0.0,1,tam,0.0,...,0.0,0.0,0.0,0,0,0,30,206,0,4
71176,2016,1,201609110atl,atl,JoneJu02,WR,12.6,1,tam,0.0,...,0.0,0.0,0.0,0,0,0,30,264,3,3
71178,2016,1,201609110atl,atl,SanuMo00,WR,14.0,1,tam,0.0,...,0.0,0.0,0.0,0,0,0,30,416,2,4
