In [1]:
import pandas as pd
import numpy as np
import time
import os, platform
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Resolve the folder that holds the season csv files.
# An explicit NFL_DATA_DIR environment variable takes precedence so the notebook
# can run on machines other than the three hard-coded below.
directory = os.environ.get('NFL_DATA_DIR')
if directory is None:
    if platform.system()=='Darwin':
        directory = '/Users/phil/Google Drive/projects/nfl/data/current_season'
    elif os.environ.get("USERNAME")=='phil':
        directory = 'C:/Users/phil/Google Drive/nfl/data/current_season'
    elif os.environ.get("USERNAME")=='lyncp010':
        directory = 'C:/Users/lyncp010/projects/nfl/data/current_season'
    else:
        # previously `directory` was simply left undefined here and the print
        # below failed with a NameError that did not explain what to do
        raise RuntimeError('Unknown machine: set the NFL_DATA_DIR environment '
                           'variable to the folder containing the season csv files')
print(directory)

/Users/phil/Google Drive/projects/nfl/data/current_season


# import data

In [3]:
# table (csv) names; `dfl` keeps them in a fixed order and each name also gets
# a short alias that is used to look the table up in `d`
dfl = [
    'gameInfo',
    'injuryStatus',
    'passDirections',
    'snapCounts',
    'starters',
    'statsDefense',
    'statsKicking',
    'statsOffense',
    'statsReturns',
    'statsTeam',
    'teamRoster',
]
gmin, inst, psdr, snct, strt, sdef, skck, soff, srtn, stm, tmrs = dfl

In [9]:
# index columns for every table, keyed by table name
_player_game_index = ['season', 'week', 'bsID', 'team', 'player_id']

# most tables are keyed per player per game ...
dfIn = dict(
    (table, list(_player_game_index))
    for table in ('passDirections', 'snapCounts', 'starters', 'statsDefense',
                  'statsKicking', 'statsOffense', 'statsReturns')
)
# ... the rest carry only a subset of those key columns
dfIn['gameInfo'] = ['season', 'week', 'bsID']
dfIn['injuryStatus'] = ['season', 'week', 'team', 'player_id']
dfIn['statsTeam'] = ['season', 'week', 'bsID', 'team']
dfIn['teamRoster'] = ['season', 'team', 'player_id']
del _player_game_index

In [11]:
# import csvs into dataframes
csv_names = '_s16w01_s16w16'

# d maps table name -> DataFrame indexed by (and sorted on) that table's key columns
d = {}
for key in dfIn:
    csv_path = os.path.join(directory, '{}{}.csv'.format(key, csv_names))
    # sort_index replaces the deprecated sortlevel; a sorted MultiIndex is what
    # makes label slicing with pd.IndexSlice work further down
    d[key] = (pd.read_csv(csv_path, low_memory=False)
                .set_index(dfIn[key])
                .sort_index())
print('tables imported')

tables imported


In [13]:
d[gmin].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,weekday,startTime,home,away,winner,homeScore,awayScore,line,overUnder,roof,surface,temp,relHumidity,windChill,windMPH
season,week,bsID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016,1,201609080den,2016-09-08,Thursday,8:40pm,den,car,car,20,21,-3.0,40.5,outdoors,grass,85.0,12.0,0.0,10
2016,1,201609110atl,2016-09-11,Sunday,1:05pm,atl,tam,atl,31,24,-2.5,46.5,dome,fieldturf,70.0,,70.0,0
2016,1,201609110clt,2016-09-11,Sunday,4:27pm,clt,det,clt,39,35,-2.5,51.0,retractable roof (open),fieldturf,70.0,,70.0,0
2016,1,201609110crd,2016-09-11,Sunday,7:30pm,crd,nwe,crd,23,21,-9.0,44.5,retractable roof (closed),grass,70.0,,70.0,0
2016,1,201609110dal,2016-09-11,Sunday,4:27pm,dal,nyg,dal,20,19,-1.0,47.5,retractable roof (closed),matrixturf,70.0,,70.0,0


#### fixing data

In [14]:
idx = pd.IndexSlice

In [15]:
# start with offense stats
df = d[soff].copy().reset_index()

In [16]:
# change new team abbreviations to old/consistent ones (i.e. lar > ram)
new_team_name_dict = {
    'ari':'crd',
    'bal':'rav',
    'hou':'htx',
    'ind':'clt',
    'lar':'ram',
    'oak':'rai',
    'ten':'oti'
}

# assign the result back: an inplace replace on `df['team']` operates on a
# column selection and is not guaranteed to write through to `df`
df['team'] = df['team'].replace(new_team_name_dict)

In [17]:
# home indicator: 1 when the last three characters of the boxscore id are the row's team
bsid_suffix = df['bsID'].str[-3:]
df['home'] = bsid_suffix.eq(df['team']).astype(int)

In [19]:
# add opponent
# Look every row's game up in gameInfo with one merge instead of a per-row
# MultiIndex .loc lookup inside iterrows.
game_teams = (d[gmin].reset_index()[['season', 'week', 'bsID', 'home', 'away']]
                     .rename(columns={'home': 'home_team', 'away': 'away_team'}))
matchups = pd.merge(df[['season', 'week', 'bsID', 'team']], game_teams,
                    how='left', on=['season', 'week', 'bsID'])
# a left merge keeps row order; the row count only changes if gameInfo repeats a game
assert len(matchups) == len(df), 'gameInfo has duplicate (season, week, bsID) rows'

# the opponent is the away side when the row's team is the home side, else the home side
# (rows whose game is missing from gameInfo get NaN instead of raising)
df['opponent'] = np.where(matchups['team'].values == matchups['home_team'].values,
                          matchups['away_team'].values,
                          matchups['home_team'].values)

#### player positions

In [None]:
# merge in player positions
# teamRoster is keyed by (season, team, player_id); reduce it to one row per
# (season, player_id) first, otherwise a player listed under two teams would
# duplicate every one of his stat rows in the merge (first listed position wins).
roster_positions = (d[tmrs].reset_index()[['season', 'player_id', 'position']]
                           .drop_duplicates(subset=['season', 'player_id']))

# drop=True: the old positional index carries no information and would
# otherwise be added as a stray 'index' column
df = pd.merge(df.reset_index(drop=True),
              roster_positions,
              how='left',
              on = ['season', 'player_id']
             )

In [None]:
# set index (if needed)
# df.set_index(['season','week','bsID','team','player_id'], inplace=True)

#### fix missing positions

In [None]:
# get players with missing positions
df[df['position'].isnull()]['player_id'].unique()

In [None]:
# missing positions from pfr
missing_player_positions = {
    'GreeVi00':'TE',
    'AbbrJa00':'WR',
    'JohnAn02':'WR',
    'JohnMa06':'RB',
    'HuffJo00':'WR',
    'SalaGr00':'WR',
    'FostAr00':'RB',
    'DaviKe01':'TE',
    'PeriJu00':'TE',
    'PeadIs00':'RB',
    'MageTe00':'RB',
    'BellJo01':'RB',
    'LeexKh00':'TE',
    'PruiMy00':'TE',
    'HamlCo01':'TE',
    'WhitCh02':'QB',
    'SmitRo06':'RB',
    'WhalGr00':'WR',
    'StreDe00':'WR',
    'RidlSt00':'RB',
}

# adding their position to the dataframe; assigned back rather than filled
# in place on a column selection, which may not write through to `df`
df['position'] = df['position'].fillna(df['player_id'].map(missing_player_positions))

# Terrelle Pryor's position was listed as QB; override every one of his rows
df.loc[df['player_id']=='PryoTe00', 'position'] = 'WR'

#### list of positions

In [None]:
df['position'].unique()

In [None]:
# convert fullback to runningback (assigned back; an inplace replace on a
# column selection may not write through to `df`)
df['position'] = df['position'].replace({'FB':'RB'})

In [None]:
df['position'].unique()

#### calc fantasy points

In [None]:
# 0/1 flags for the fantasy yardage bonuses (the threshold itself counts as reached)
for yds_col, flag_col, threshold in [('passYds', 'passYdsOver300', 300),
                                     ('rushYds', 'rushYdsOver100', 100),
                                     ('recYds',  'recYdsOver100',  100)]:
    df[flag_col] = np.where(df[yds_col]>=threshold, 1, 0)

In [None]:
# standard offense ff points calculator
# Maps a stat column of `df` to the fantasy points awarded per unit of that stat.
# A weight of 0.0 keeps the column in the calculation but awards nothing for it
# (here: the yardage-bonus flags and receptions).
# NOTE(review): 'rushTd' is singular while 'passTds' / 'recTds' are plural --
# confirm the key matches the actual statsOffense column name.
ffPtsDict = {
    'passTds':        ( 4.0),
    'passYds':        ( 0.04),
    'passYdsOver300': ( 0.0),
    'passInt':        (-1.0),
    'rushYds':        ( 0.1),
    'rushTd':         ( 6.0),
    'rushYdsOver100': ( 0.0),
    'fumbles':        (-1.0),
    'recYds':         ( 0.1),
    'rec':            ( 0.0),
    'recTds':         ( 6.0),
    'recYdsOver100':  ( 0.0)
}

In [None]:
# calculate fantasy points
# One sorted column list drives both the weight vector and the stat matrix so
# the two are guaranteed to line up.
stat_cols = sorted(ffPtsDict.keys())
B = np.array([ffPtsDict[stat] for stat in stat_cols])
# .values instead of .as_matrix(), which newer pandas no longer provides
X = df[stat_cols].values
df['ff_PtsTot'] = np.dot(X, B)

In [None]:
df.head(3)

#### setting up matrix factorization data

In [None]:
# teams index
teams = df['team'].unique()
teams.sort()

In [None]:
# offense positions
offense_positions = ['QB', 'RB', 'TE', 'WR']

In [None]:
# get total points for each position each game
game_pos_avg = df.groupby(by=['season', 'week', 'team', 'opponent', 'home', 'position'])['ff_PtsTot'].sum().reset_index()

# average games for teams that played each other twice (not needed when adding home indicator)
# team_opp_pos_avg = game_pos_avg.groupby(by=['team', 'opponent', 'position', 'home']).mean().reset_index()
# team_opp_pos_avg.sortlevel(inplace=True)

# drop non offense positions
# .copy(): this frame gets a 'fold' column written into it later; without the
# copy those writes target a filtered slice of game_pos_avg
team_opp_pos_avg = game_pos_avg[game_pos_avg['position'].isin(offense_positions)].copy()

In [None]:
team_opp_pos_avg.head(3)

# matrix factorization w/ masked NMF (multiplicative updates)

In [None]:
from scipy import sparse
from scipy import linalg
from sklearn.metrics import mean_squared_error

In [None]:
def nmf(X, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, seed=7):
    """
    Decompose X to A*Y with masked multiplicative-update NMF.

    Entries of X equal to 0 are treated as unobserved and are ignored by the
    updates and by the reported residuals.

    Parameters
    ----------
    X : scipy sparse matrix or array-like, shape (rows, columns)
    latent_features : int, inner dimension of the factorisation
    max_iter : int, maximum number of update sweeps
    error_limit : float, stop when the masked Frobenius residual drops below this
    fit_error_limit : float, stop when the estimate changed by less than this
        between two evaluations
    seed : int, seed for numpy's global random state (initial A)

    Returns
    -------
    A : ndarray (rows, latent_features), Y : ndarray (latent_features, columns)

    Convergence is only evaluated (and printed) on iteration 1, every 100th
    iteration and the last iteration.
    """
    np.random.seed(seed)

    eps = 1e-5
    print('Starting NMF decomposition with {} latent features and {} iterations.'.format(latent_features, max_iter))
    # accept the scipy sparse matrix callers pass in, but also a plain array
    X = X.toarray() if sparse.issparse(X) else np.asarray(X, dtype=float)

    # observation mask: 1 where X holds a value, 0 where it is missing.
    # (np.sign(X) gave -1 for negative entries, which flipped the sign of both
    # the target and the estimate inside the updates)
    mask = (X != 0).astype(float)

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    Y = linalg.lstsq(A, X)[0]
    Y = np.maximum(Y, eps)

    masked_X = mask * X
    X_est_prev = np.dot(A, Y)
    for i in range(1, max_iter + 1):
        # ===== updates =====
        # Matlab: A=A.*(((W.*X)*Y')./((W.*(A*Y))*Y'));
        top = np.dot(masked_X, Y.T)
        bottom = (np.dot((mask * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        # keep the factors strictly positive so later updates cannot get stuck at 0
        A = np.maximum(A, eps)

        # Matlab: Y=Y.*((A'*(W.*X))./(A'*(W.*(A*Y))));
        top = np.dot(A.T, masked_X)
        bottom = np.dot(A.T, mask * np.dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)

        # ==== evaluation ====
        if i % 100 == 0 or i == 1 or i == max_iter:
            X_est = np.dot(A, Y)
            # change of the estimate since the previous evaluation (observed cells only)
            err = mask * (X_est_prev - X_est)
            fit_residual = np.sqrt(np.sum(err ** 2))
            X_est_prev = X_est

            curRes = linalg.norm(mask * (X - X_est), ord='fro')
            # single print call: same text as before, valid on python 2 and 3
            print('Iteration {}: fit residual {} total residual {}'.format(
                i, np.round(fit_residual, 4), np.round(curRes, 4)))
            if curRes < error_limit or fit_residual < fit_error_limit:
                break

    return A, Y

In [None]:
def nmf2(X_train, X_test, latent_features, max_iter=100, error_limit=1e-6, fit_error_limit=1e-6, seed=7):
    """
    Fit masked multiplicative-update NMF (X ~ A*Y) on X_train and track the
    error on X_test after every iteration.

    Parameters
    ----------
    X_train, X_test : DataFrames of identical shape; NaN (or 0) marks an
        unobserved cell
    latent_features : int, inner dimension of the factorisation
    max_iter : int, number of update sweeps (there is no early stopping)
    error_limit, fit_error_limit : accepted for signature compatibility with
        `nmf`; not used here
    seed : int, seed for numpy's global random state (initial A)

    Returns
    -------
    pandas.Series of length max_iter, index named 'iter' running 1..max_iter,
    holding the RMSE over the observed cells of X_test after each iteration
    (NaN when X_test has no observed cell).
    """
    np.random.seed(seed)

    test_rmse_list = []
    eps = 1e-5
    # dense float arrays with missing cells set to 0 (input frames hold NaN)
    X_train = np.asarray(X_train.fillna(0).values, dtype=float)
    X_test  = np.asarray(X_test.fillna(0).values, dtype=float)

    # observation masks: 1 where a value is present, 0 where it is missing
    mask_train = (X_train != 0).astype(float)
    mask_test = (X_test != 0).astype(float)
    n_test = mask_test.sum()

    # initial matrices. A is random [0,1] and Y is A\X.
    rows, columns = X_train.shape
    A = np.random.rand(rows, latent_features)
    A = np.maximum(A, eps)

    masked_X_train = mask_train * X_train

    Y = linalg.lstsq(A, masked_X_train)[0]
    Y = np.maximum(Y, eps)

    for i in range(1, max_iter + 1):
        # ===== updates =====
        # Matlab: A=A.*(((W.*X)*Y')./((W.*(A*Y))*Y'));
        top = np.dot(masked_X_train, Y.T)
        bottom = (np.dot((mask_train * np.dot(A, Y)), Y.T)) + eps
        A *= top / bottom

        A = np.maximum(A, eps)

        # Matlab: Y=Y.*((A'*(W.*X))./(A'*(W.*(A*Y))));
        top = np.dot(A.T, masked_X_train)
        bottom = np.dot(A.T, mask_train * np.dot(A, Y)) + eps
        Y *= top / bottom
        Y = np.maximum(Y, eps)

        # ==== evaluation ====
        X_est = np.dot(A, Y)
        # root of the mean squared error over the observed test cells only.
        # (the previous mean(sqrt(err**2)) was a mean absolute error averaged
        # over every cell of the matrix, observed or not)
        sq_err = (mask_test * (X_test - X_est)) ** 2
        test_rmse = np.sqrt(sq_err.sum() / n_test) if n_test > 0 else np.nan
        test_rmse_list.append(test_rmse)

    test_rmse_list = pd.Series(test_rmse_list, index=range(1, max_iter + 1))
    test_rmse_list.index.names = ['iter']
    return test_rmse_list

#### hyperparam k-fold cv (4 folds)

#### create fold index

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# create a cross-validation fold index for each game.
N_FOLDS = 4
np.random.seed(7)

# one row per (team, game); every position of that team-game shares its fold
game_keys = ['season', 'week', 'team', 'opponent', 'home']
games = team_opp_pos_avg[game_keys].drop_duplicates().reset_index(drop=True)

# draw a fold in 0..N_FOLDS-1 for every team-game (independent draws)
games['fold'] = np.random.choice(N_FOLDS, size=len(games))

# attach the fold by key instead of relying on row order and a forward fill;
# dropping a previous 'fold' column first keeps the cell re-runnable
if 'fold' in team_opp_pos_avg.columns:
    team_opp_pos_avg = team_opp_pos_avg.drop('fold', axis=1)
team_opp_pos_avg = pd.merge(team_opp_pos_avg, games, how='left', on=game_keys)

In [None]:
team_opp_pos_avg.head()

#### model functions

In [None]:
# function to create empty dataframe for a position
def team_opp_matrix(team_names=None):
    """
    Return an all-NaN float DataFrame with one row and one column per team.

    team_names : optional sequence of team labels; defaults to the
        module-level `teams` array.

    The labels are passed as a flat index. (Wrapping them in a list, as in
    `index=[teams]`, makes pandas build a one-level MultiIndex.)
    """
    if team_names is None:
        team_names = teams
    labels = list(team_names)
    return pd.DataFrame(index=labels, columns=labels, dtype=float)

In [None]:
def fill_team_opp_matrix(df):
    """
    Pivot rows of `df` into a team x opponent matrix of 'ff_PtsTot'.

    df : DataFrame with 'team', 'opponent' and 'ff_PtsTot' columns.

    Returns the frame built by team_opp_matrix() with cell [team, opponent]
    set from each row; pairs that never occur stay NaN. When a
    (team, opponent) pair occurs more than once the last row wins.
    """
    m = team_opp_matrix()
    for team, opponent, pts in zip(df['team'], df['opponent'], df['ff_PtsTot']):
        # one .loc call with both labels; the chained m.loc[team][opponent] = ...
        # form assigns into an intermediate row object that may be a copy
        m.loc[team, opponent] = pts
    return m

In [None]:
def split_to_tt_matrix(df, index, fold):
    """
    Split `df` on column `index` and pivot both parts.

    Rows whose `index` value equals `fold` form the test set, all other rows
    the training set. Returns (train_matrix, test_matrix), each built with
    fill_team_opp_matrix.
    """
    held_out = df[index] == fold
    train_matrix = fill_team_opp_matrix(df[~held_out])
    test_matrix = fill_team_opp_matrix(df[held_out])
    return train_matrix, test_matrix

In [None]:
def kfold_nmf(data, folds, latent_features, max_iter=100, position=None):
    """
    Run nmf2 once per fold and collect the per-iteration test RMSE.

    data : DataFrame with 'team', 'opponent', 'ff_PtsTot', 'fold' and
        'position' columns
    folds : int, folds are the values 0..folds-1 of data['fold']
    latent_features : int, rank passed to nmf2
    max_iter : int, iterations passed to nmf2
    position : label stored in the result index; defaults to the single
        position present in `data`

    Returns a DataFrame indexed by (position, latent_features, iter) with one
    'rmse_fold_<k>' column per fold.

    Raises ValueError when `position` is not given and `data` does not hold
    exactly one position.
    """
    # label the results from the arguments, not from the notebook globals
    # `p` / `lf`, which only happened to match inside the training loop
    if position is None:
        positions = data['position'].unique()
        if len(positions) != 1:
            raise ValueError('data must hold exactly one position (or pass position=...)')
        position = positions[0]

    # results df
    model_i = pd.DataFrame()

    # loop over folds
    for fold in range(folds):
        # create train test set
        trn, tst = split_to_tt_matrix(data, 'fold', fold)
        # train matrix factorization
        model_i['rmse_fold_{}'.format(fold)] = nmf2(trn, tst,
                                                    latent_features = latent_features,
                                                    max_iter = max_iter,
                                                    error_limit=1e-6, fit_error_limit=1e-6)

    # make sure the iteration index carries the name set_index expects below
    model_i.index.name = 'iter'

    # set hyper param indices
    model_i['position'] = position
    model_i['latent_features'] = latent_features
    model_i.reset_index(inplace=True)
    model_i.set_index(['position', 'latent_features', 'iter'], inplace=True)

    # return results
    return model_i

In [None]:
data = team_opp_pos_avg[team_opp_pos_avg['position']=='QB'].copy()
trn, tst = split_to_tt_matrix(data, 'fold', 0)

In [None]:
nmf2(trn, tst, latent_features=1, max_iter=5, error_limit=1e-6, fit_error_limit=1e-6)

# training

In [None]:
# index of results parameters
offense_positions = ['QB', 'RB', 'TE', 'WR']

max_latent_features = 3
latent_features_list = range(1, max_latent_features+1)

max_iter = 1000
max_iter_list = range(1, max_iter+1)

In [None]:
# train
model_results = pd.DataFrame()
print 'Training...'
for p in offense_positions:
    for lf in latent_features_list:
        print '\rPosition {} - lf {}'.format(p, lf),
        data = team_opp_pos_avg[team_opp_pos_avg['position']==p]
        n_folds = len(team_opp_pos_avg.fold.unique())

        model_results_i = kfold_nmf(data,
                                    n_folds,
                                    latent_features=lf,
                                    max_iter=max_iter
                                   )
        
        model_results = pd.concat([model_results, model_results_i])
print '\rDone!'

In [None]:
model_results['rmse_fold_avg'] = model_results.mean(axis=1)

In [None]:
# per position: the (latent features, iteration) pair with the lowest fold-averaged rmse
best_param = {}
for p in offense_positions:
    pos_rmse = model_results.loc[idx[p,:,:],]['rmse_fold_avg']
    best_pos, best_lf, best_iter = pos_rmse.idxmin()
    best_param[best_pos] = {'lf':best_lf, 'iter':best_iter}
best_param

In [None]:
gb = model_results.groupby(level=[0])['rmse_fold_avg']
gb.nsmallest(5)

In [None]:
gb.nsmallest(5)

#### train on whole dataset

In [None]:
test_pos = 'WR'

In [None]:
data = team_opp_pos_avg[team_opp_pos_avg['position']==test_pos].copy()
m = fill_team_opp_matrix(data)
# .values replaces .as_matrix(); the explicit float cast guards against an object-dtype matrix
sm_train = sparse.csr_matrix(m.fillna(0).values.astype(float))

# use nmf to get A, Y
# rank and iteration count both come from the cross-validation result; they
# were selected as a pair, so a hard-coded rank does not match the tuned 'iter'
A, Y = nmf(sm_train
          ,best_param[test_pos]['lf']
          ,max_iter = best_param[test_pos]['iter']
          ,error_limit=1e-6, fit_error_limit=1e-6)

# create team opp df with expected values (flat index: rows = team, cols = opponent)
X_est = pd.DataFrame(columns=teams, index=teams, data=np.dot(A,Y).round(2))

# add expected points to original df
data['mf_expected_ff_pts'] = [X_est.at[team, opponent]
                              for team, opponent in zip(data['team'], data['opponent'])]
print('rmse {}'.format(np.sqrt(mean_squared_error(data['ff_PtsTot'], data['mf_expected_ff_pts']))))

In [None]:
X_est.loc['den']['jax']

In [None]:
X_est.loc['rav']['mia']

In [None]:
print A[0,:], Y[:,0]

In [None]:
np.dot(A[0,:], Y[:,0])

# matrix factorization w/ gradient descent

#### creating team ids

In [None]:
from sklearn import preprocessing

In [None]:
# teams index
teams = df['team'].unique()
teams.sort()

In [None]:
# offense positions
offense_positions = ['QB', 'RB', 'TE', 'WR']

In [None]:
# get total points for each position each game
game_pos_avg = df.groupby(by=['season', 'week', 'team', 'opponent', 'home', 'position'])['ff_PtsTot'].sum().reset_index()

# average games for teams that played each other twice
team_opp_pos_avg = game_pos_avg.groupby(by=['team', 'opponent', 'position', 'home']).mean().reset_index()
#team_opp_pos_avg.sortlevel(inplace=True)

# drop non offense positions
# .copy(): team ids and fold columns are written into this frame afterwards;
# without the copy those writes target a filtered slice
team_opp_pos_avg = team_opp_pos_avg[team_opp_pos_avg['position'].isin(offense_positions)].copy()

In [None]:
# create index for team names
team_list = np.sort(df['team'].unique())
team_le = preprocessing.LabelEncoder()
team_le.fit(team_list)

In [None]:
# add team ids
team_opp_pos_avg['team_id'] = team_le.transform(team_opp_pos_avg['team'])
team_opp_pos_avg['opponent_id'] = team_le.transform(team_opp_pos_avg['opponent'])

In [None]:
# create a cross-validation fold index for each (team, opponent, home) row.
N_FOLDS = 4
np.random.seed(7)

# one row per matchup; every position of that matchup shares its fold
game_keys = ['team', 'opponent', 'home']
games = team_opp_pos_avg[game_keys].drop_duplicates().reset_index(drop=True)

# draw a fold in 0..N_FOLDS-1 for every matchup (independent draws)
games['fold'] = np.random.choice(N_FOLDS, size=len(games))

# attach by key: rows here are ordered team / opponent / position / home, so a
# forward fill from the QB rows gives the wrong fold to teams that met twice
if 'fold' in team_opp_pos_avg.columns:
    team_opp_pos_avg = team_opp_pos_avg.drop('fold', axis=1)
team_opp_pos_avg = pd.merge(team_opp_pos_avg, games, how='left', on=game_keys)

In [None]:
team_opp_pos_avg.head()

## model functions

In [None]:
def home_team_oppo_mfsgd(dtrain, dtest=False, latent_features=1, max_iter=1000, alpha=0.0001, beta=0.01, mu=0.8, seed=0, n_teams=None):
    """
    Matrix factorisation with bias and home effect, fitted by full-batch
    gradient descent with momentum.

    Model:  y_hat = B[0] + B[1]*home + dot(T[team_id], O[opponent_id])

    Parameters
    ----------
    dtrain : DataFrame with 'home', 'ff_PtsTot', 'team_id', 'opponent_id'
        columns (ids are integer row numbers into T / O)
    dtest : DataFrame with the same columns, or False for no test set
    latent_features : int, width of T and O
    max_iter : int, number of gradient steps
    alpha : float, learning rate
    beta : float, L2 penalty weight
    mu : float, momentum coefficient
    seed : int, seed for numpy's global random state (initial weights)
    n_teams : int, number of rows of T / O; defaults to len(team_list)

    Returns
    -------
    Without dtest: (B, T, O, rmse) where rmse is the list of training RMSEs,
        each measured before that iteration's update.
    With dtest: Series of test RMSEs measured after each update, index named
        'iter' running 1..max_iter (NaN-padded when training is aborted).

    Changes from the previous version:
    * the gradient of T[team] now sums -e * O[opponent of that game] (and the
      mirror image for O); before, the summed error of a team was multiplied
      by the O row with the same index as the team
    * the penalty terms are added to the gradients (weight decay); they were
      subtracted, which pushed the weights away from zero
    * beta is applied once per matrix rather than twice
    """
    np.random.seed(seed)
    team_size = len(team_list) if n_teams is None else n_teams
    # a test frame was supplied unless dtest is still the boolean placeholder
    has_test = not isinstance(dtest, bool)
    rmse = []
    rmse_test = []

    eps = 1e-5
    err_lim = 1e6  # abort once the training rmse exceeds this (diverged)

    vB, vT, vO = 0, 0, 0 # velocity starts 0

    # initialize weights (draw order B, T, O is part of the seeded behaviour)
    B = np.random.rand(2) # bias + home_indicator
    T = np.random.rand(team_size, latent_features) # team weights
    T = np.maximum(T, eps)
    O = np.random.rand(team_size, latent_features) # oppo weights
    O = np.maximum(O, eps)

    # training data
    X = np.c_[np.ones(len(dtrain)), dtrain['home'].values]
    y = dtrain['ff_PtsTot'].values
    team_ids = dtrain['team_id'].values
    oppo_ids = dtrain['opponent_id'].values

    # testing data
    if has_test:
        X_test = np.c_[np.ones(len(dtest)), dtest['home'].values]
        y_test = dtest['ff_PtsTot'].values
        team_ids_test = dtest['team_id'].values
        oppo_ids_test = dtest['opponent_id'].values

    for i in range(max_iter):
        # embeddings for mf: one row per game
        team_embed = T[team_ids, :]
        oppo_embed = O[oppo_ids, :]

        # calc y_hat (row-wise dot product of the two embeddings)
        y_hat = np.dot(X, B) + np.sum(team_embed * oppo_embed, axis=1)

        # calc errors
        e = y - y_hat

        # ---- derivatives of 0.5 * sum(e**2) ----
        # betas
        B_deriv = np.dot(X.T, -e)

        # latent factors: accumulate each game's contribution onto its team /
        # opponent row (np.add.at also handles ids that occur more than once)
        T_deriv = np.zeros_like(T)
        O_deriv = np.zeros_like(O)
        np.add.at(T_deriv, team_ids, -e[:, None] * oppo_embed)
        np.add.at(O_deriv, oppo_ids, -e[:, None] * team_embed)

        # ---- regularization (weight decay) ----
        B_deriv += beta * B

        # extra data-driven penalty on the latent factors (variance ratio)
        # NOTE(review): the ratio is kept as written (weight variance over
        # residual variance); confirm this is the intended direction
        var_resid = np.var(e)
        if var_resid > 0:
            beta_T = T.var()/var_resid
            beta_O = O.var()/var_resid
        else:
            beta_T = beta_O = 0.0
        T_deriv += (beta + beta_T) * T
        O_deriv += (beta + beta_O) * O

        # ---- parameter updates ----
        # a plain step followed by mu times the new velocity; this is the
        # usual rewritten form of a Nesterov momentum update
        B -= alpha * B_deriv
        T -= alpha * T_deriv
        O -= alpha * O_deriv

        vB = mu * vB - alpha * B_deriv
        vT = mu * vT - alpha * T_deriv
        vO = mu * vO - alpha * O_deriv
        B += mu * vB
        T += mu * vT
        O += mu * vO

        # save error
        # - training
        rmse_i = np.sqrt(np.mean(e**2))
        rmse.append(rmse_i)

        # - test error
        if has_test:
            team_embed_test = T[team_ids_test, :]
            oppo_embed_test = O[oppo_ids_test, :]
            y_hat_test = np.dot(X_test, B) + np.sum(team_embed_test * oppo_embed_test, axis=1)
            e_test = y_test - y_hat_test
            rmse_test_i = np.sqrt(np.mean(e_test**2))
            rmse_test.append(rmse_test_i)

        # stop on divergence (a non-finite rmse also counts) and pad both
        # histories to max_iter entries
        if not np.isfinite(rmse_i) or rmse_i > err_lim:
            rmse += [np.nan]*(max_iter - i - 1)
            rmse_test += [np.nan]*(max_iter - i - 1)
            break

    # return results if not using test set
    if not has_test:
        print('\rDone!')
        return B, T, O, rmse
    else:
        rmse_test = pd.Series(rmse_test, index=range(1, max_iter + 1))
        rmse_test.index.names = ['iter']
        return rmse_test

In [None]:
def split_data_to_train_test(df, index, fold):
    """
    Split `df` on column `index`.

    Returns (train, test): test holds the rows whose `index` value equals
    `fold`, train holds every other row. Row order and index labels are kept.
    """
    held_out = df[index] == fold
    return df[~held_out], df[held_out]

In [None]:
def kfold_cv(data, folds, lf, alpha, beta, mu, position=None, max_iter=1000):
    """
    Run home_team_oppo_mfsgd once per fold and collect the per-iteration test RMSE.

    data : DataFrame with the model columns plus 'fold' and 'position'
    folds : int, folds are the values 0..folds-1 of data['fold']
    lf, alpha, beta, mu : hyper parameters handed to home_team_oppo_mfsgd
        (latent features, learning rate, L2 weight, momentum)
    position : label stored in the result index; defaults to the single
        position present in `data`
    max_iter : int, iterations per fit (1000 matches the model's own default)

    Returns a DataFrame indexed by
    (position, latent_features, alpha, beta, mu, iter) with one
    'rmse_fold_<k>' column per fold.

    Raises ValueError when `position` is not given and `data` does not hold
    exactly one position.
    """
    # take the label from the data / argument rather than the notebook global `p`
    if position is None:
        positions = data['position'].unique()
        if len(positions) != 1:
            raise ValueError('data must hold exactly one position (or pass position=...)')
        position = positions[0]

    # results df
    model_i = pd.DataFrame()

    # loop over folds
    for fold in range(folds):
        # create train test set
        trn, tst = split_data_to_train_test(data, 'fold', fold)
        # train matrix factorization
        model_i['rmse_fold_{}'.format(fold)] = home_team_oppo_mfsgd(trn, tst,
                                                                    latent_features=lf,
                                                                    max_iter=max_iter,
                                                                    alpha=alpha, beta=beta, mu=mu
                                                                   )

    # make sure the iteration index carries the name set_index expects below
    model_i.index.name = 'iter'

    # set hyper param indices
    model_i['position'] = position
    model_i['latent_features'] = lf
    model_i['alpha'] = alpha
    model_i['beta'] = beta
    model_i['mu'] = mu
    model_i.reset_index(inplace=True)
    model_i.set_index(['position', 'latent_features', 'alpha', 'beta', 'mu', 'iter'], inplace=True)

    # return results
    return model_i

#### running mfsgd kfold cv

In [None]:
# # add cv fold col
# n_folds = 3
# np.random.seed(7)
# team_opp_pos_avg['fold'] = np.random.choice(n_folds, size=len(team_opp_pos_avg), )

In [None]:
# # train
# model_results = pd.DataFrame()
# print 'Training...'
# for p in offense_positions:
#     for lf in latent_features_list:
#         print '\rPosition {} - lf {}'.format(p, lf),
#         data = df[df['position']==p].copy()
#         n_folds = len(team_opp_pos_avg.fold.unique())

#         model_results_i = kfold_cv(data, n_folds)
        
#         model_results = pd.concat([model_results, model_results_i])
# print '\rDone!'

In [None]:
# model_results['rmse_fold_avg'] = model_results.mean(axis=1)
# best_param = {}
# for p in offense_positions:
#     best_param_pos = model_results.loc[idx[p,:,:],]['rmse_fold_avg'].idxmin()
#     best_param[best_param_pos[0]] = {'lf':best_param_pos[1], 'iter':best_param_pos[2]}
# best_param

## testing on ALL hyper parameters

In [None]:
import itertools

In [None]:
# index of results parameters
offense_positions = ['QB', 'RB', 'TE', 'WR']

max_latent_features = 5
latent_features_list = range(1, max_latent_features+1)

max_iter = 300
max_iter_list = range(1, max_iter+1)

In [None]:
# list of hyper param
grid = {
    'lf':latent_features_list,
    'alpha':[0.0001, 0.0005, 0.001],
    'beta':[0.005, 0.01, 0.02],
    'mu':[0.3, 0.5, 0.7, 0.9]
}

# number of grid combincations
grid_n = 1
for key in grid.keys():
    grid_n *= len(grid[key])
print '{} combinations'.format(grid_n)
grid

In [None]:
# train
model_results = pd.DataFrame()
print 'Training...'
for p in offense_positions:
    #data = df[df['position']==p].copy()
    data = team_opp_pos_avg[team_opp_pos_avg['position']==p].copy()
    for i, gs in enumerate(itertools.islice(itertools.product(
        grid['lf'],
        grid['alpha'],
        grid['beta'],
        grid['mu']
    ), grid_n)):
        lf = gs[0]
        alpha = gs[1]
        beta = gs[2]
        mu = gs[3]
        print '\r({}/{}) Position {} - lf {} - alpha {} - beta {} - mu {}'.format(i+1, grid_n,
                                                                                  p, lf, alpha, beta, mu),
        
        n_folds = len(data.fold.unique())
        
        model_results_i = kfold_cv(data, n_folds, lf, alpha, beta, mu)        
        model_results = pd.concat([model_results, model_results_i])
print '\rDone!'

In [None]:
# keep only iterations where every fold produced an rmse (diverged runs are
# NaN-padded), then average across the fold columns
model_results = model_results.dropna()
model_results['rmse_fold_avg'] = model_results.mean(axis=1)

In [None]:
# per position: the hyper parameter combination (and iteration) with the
# lowest fold-averaged rmse; float levels are rounded for tidy values
best_param = {}
for p in offense_positions:
    pos_rmse = model_results.loc[idx[p,:,:,:,:,:],]['rmse_fold_avg']
    best_pos, best_lf, best_alpha, best_beta, best_mu, best_iter = pos_rmse.idxmin()
    best_param[best_pos] = {
        'lf'   : best_lf,
        'alpha': round(best_alpha, 4),
        'beta' : round(best_beta, 2),
        'mu'   : round(best_mu, 1),
        'iter' : best_iter,
    }
best_param

In [None]:
# # manual best params
# best_param = {'QB': {'alpha': 0.0001,
#                      'beta': 0.005,
#                      'iter': 46,
#                      'lf': 3,
#                      'mu': 0.9},
#               'RB': {'alpha': 0.0001,
#                      'beta': 0.02,
#                      'iter': 44,
#                      'lf': 4,
#                      'mu': 0.9},
#               'TE': {'alpha': 0.0001,
#                      'beta': 0.02,
#                      'iter': 28,
#                      'lf': 1,
#                      'mu': 0.9},
#               'WR': {'alpha': 0.0005,
#                      'beta': 0.02,
#                      'iter': 25,
#                      'lf': 3,
#                      'mu': 0.7}
#              }

In [None]:
gb = model_results.groupby(level=[0])['rmse_fold_avg']
gb.nsmallest(3)

In [None]:
# run mf
p = 'QB'
data = team_opp_pos_avg[team_opp_pos_avg['position']==p].copy()
B, T, O, rmse = home_team_oppo_mfsgd(data,
                                     latent_features = best_param[p]['lf'],
                                     max_iter = best_param[p]['iter'],
                                     alpha = best_param[p]['alpha'],
                                     beta = best_param[p]['beta'],
                                     mu = best_param[p]['mu']
                                    )

fig, ax = plt.subplots(figsize=(10,3))
ax.plot(range(len(rmse)),rmse)
ax.set_ylim([0,rmse[0]])
plt.show()

In [None]:
data.head()

In [None]:
def score_mf(data, B, T, O):
    """
    Predict fantasy points for every row of `data` with a fitted model.

    data : DataFrame with a 'home' column and either integer
        'team_id' / 'opponent_id' columns or 'team' / 'opponent' labels
        (labels are encoded with the module-level `team_le`)
    B : array (2,), bias and home weight
    T, O : arrays (n_teams, latent_features), team and opponent factors

    Returns an array with one prediction per row:
        B[0] + B[1]*home + dot(T[team], O[opponent])
    """
    X = np.c_[np.ones(len(data)), data['home'].values]

    # both lookups come from `data` itself; the opponent used to be read from
    # the global `playoffGames`, whatever frame was being scored
    if 'team_id' in data.columns:
        team_ids = data['team_id'].values
    else:
        team_ids = team_le.transform(data['team'])
    if 'opponent_id' in data.columns:
        oppo_ids = data['opponent_id'].values
    else:
        oppo_ids = team_le.transform(data['opponent'])

    team_embed = T[team_ids,:]
    oppo_embed = O[oppo_ids,:]

    # calc y_hat
    y_hat = np.dot(X, B) + np.einsum('ij,ji->i', team_embed, oppo_embed.T)
    return y_hat

In [None]:
def score_new_game(team, oppo, home, pos, B, T, O):
    """
    Predict fantasy points for a single hypothetical game.

    team, oppo : team labels known to the module-level `team_le`
    home : 1 if `team` plays at home, else 0
    pos : position label (stored on the row, not used by the prediction)
    B, T, O : fitted model weights, see score_mf

    Returns the one-element prediction array from score_mf.
    """
    # LabelEncoder.transform expects a 1-d sequence, so the scalars are wrapped;
    # the label columns are included because score_mf can read either form
    df = pd.DataFrame(data={'team':team,
                            'opponent':oppo,
                            'team_id':team_le.transform([team]),
                            'opponent_id':team_le.transform([oppo]),
                            'home':home,
                            'position':pos
                           }, index=[0])
    return score_mf(df, B, T, O)

In [None]:
# # add team ids
# team_opp_pos_avg['team_id'] = team_le.transform(team_opp_pos_avg['team'])
# team_opp_pos_avg['opponent_id'] = team_le.transform(team_opp_pos_avg['opponent'])

#### matrix scores

In [None]:
tm_off_avg = pd.DataFrame(index=teams)
tm_def_avg = pd.DataFrame(index=teams)
for p in offense_positions:
    print p, best_param[p]
    data = team_opp_pos_avg[team_opp_pos_avg['position']==p].copy()
    B, T, O, rmse = home_team_oppo_mfsgd(data,
                                     latent_features = best_param[p]['lf'],
                                     max_iter = best_param[p]['iter'],
                                     alpha = best_param[p]['alpha'],
                                     beta = best_param[p]['beta'],
                                     mu = best_param[p]['mu']
                                    )
    tm_off_avg[p] = np.mean(np.dot(T,O.T), axis=1)
    tm_def_avg[p] = np.mean(np.dot(T,O.T), axis=0)

In [None]:
# sum over the position columns only, so re-running the cell does not add a
# previously computed TOTAL into the new one
tm_off_avg['TOTAL'] = tm_off_avg[offense_positions].sum(axis=1)
tm_def_avg['TOTAL'] = tm_def_avg[offense_positions].sum(axis=1)

In [None]:
# sorted total offense, higher is better
tm_off_avg.sort_values(by=['TOTAL'], ascending=False)

In [None]:
# sorted total defense, lower = better
tm_def_avg.sort_values(by=['TOTAL'])

In [None]:
data = team_opp_pos_avg[team_opp_pos_avg['position']=='QB'].copy()

In [None]:
B, T, O, rmse = home_team_oppo_mfsgd(data,
                                 latent_features = 3,
                                 max_iter = 46,
                                 alpha = 0.0001,
                                 beta = 0.005,
                                 mu = 0.9
                                )

# score future games

#### pull games

In [None]:
import sportsref
from pyquery import PyQuery as pq

In [None]:
season = 2016

In [None]:
url = sportsref.nfl.BASE_URL + '/years/{}/games.htm'.format(season)
doc = pq(sportsref.utils.getHTML(url))
table = doc('table#games')
# NOTE(review): the row window (labels 192 .. 192 + 16*3 - 1) is hard-coded;
# confirm it still selects the intended games if the schedule table changes.
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the parsed table
playoffGames = sportsref.utils.parseTable(table).loc[192: 192 + 16*3 - 1].copy()

# adding/fixing cols
playoffGames['season'] = season
playoffGames['week'] = playoffGames['week_num'].astype(int)
playoffGames['bsID'] = playoffGames['boxscore_word']
# NOTE(review): 'team' is filled from the loser column and flagged home=1 for
# every game -- presumably a placeholder for unplayed games; verify that the
# loser column really holds the home side for these rows.
playoffGames['team'] = playoffGames['loser']
playoffGames['opponent'] = playoffGames['winner']
playoffGames['home'] = 1

In [None]:
# copy df with cols needed
playoffGames = playoffGames[['season', 'week', 'bsID', 'team', 'opponent', 'home']].copy()

# for each game, add a mirrored row: team and opponent swapped, home flag flipped.
# Built in one step instead of appending row by row while iterating.
mirrored = playoffGames.rename(columns={'team': 'opponent', 'opponent': 'team'})
mirrored['home'] = 1 - mirrored['home'].astype(int)
playoffGames = pd.concat([playoffGames, mirrored[playoffGames.columns]])

# sort df
playoffGames = (playoffGames.sort_values(by=['season', 'week', 'bsID', 'home'])
                            .reset_index(drop=True))

In [None]:
playoffGames.head(2)

In [None]:
# add each position to each row
# join_positions = pd.DataFrame(offense_positions, columns=['position'])
# join_positions['key'] = 0
# playoffGames['key'] = 0

# playoffGames = pd.merge(playoffGames, join_positions, on=['key']).drop(['key'], axis=1)

#### score games

In [None]:
p = 'TE'

In [None]:
# fit one model per position on all games and score the upcoming games with it;
# the per-position forecasts are collected and concatenated once
position_forecasts = []
for p in offense_positions:
    data = team_opp_pos_avg[team_opp_pos_avg['position']==p].copy()
    B, T, O, rmse = home_team_oppo_mfsgd(data,
                                         latent_features = best_param[p]['lf'],
                                         max_iter = best_param[p]['iter'],
                                         alpha = best_param[p]['alpha'],
                                         beta = best_param[p]['beta'],
                                         mu = best_param[p]['mu']
                                        )
    bar = playoffGames.copy()
    bar['position'] = p
    bar['y_hat'] = score_mf(playoffGames, B, T, O)
    bar = bar.sort_values(by=['season', 'team', 'week'])
    position_forecasts.append(bar)
foo = pd.concat(position_forecasts, ignore_index=True)

In [None]:
foo

In [None]:
bar = playoffGames.copy()
bar['position'] = p
bar['y_hat'] = score_mf(playoffGames, B, T, O)

In [None]:
bar.sort_values(by=['season', 'team', 'week'], inplace=True)

In [None]:
bar.head()

In [None]:
# scratch frame for trying out the append pattern; deliberately NOT named `foo`,
# because re-initialising `foo` here emptied the forecasts right before they
# are written to csv a few cells further down
foo_scratch = pd.DataFrame()

In [None]:
foo.append(bar, ignore_index=True)

In [None]:
foo.to_csv(directory + '/playoff_game_position_forecasts.csv')

# testing

In [None]:
a = np.random.rand(50)

In [None]:
a.var()

In [None]:
from scipy.stats import norm

In [None]:
m, std = norm.fit(a)

In [None]:
print m, std

In [None]:
std**2

In [None]:
T.shape

In [None]:
T.var()