In [53]:
import os
import json
from pprint import pprint
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.optimizers import RMSprop

In [54]:
curr_dir = os.path.dirname('__file__')
folder = os.path.join(curr_dir, "season_data")

In [75]:
years = range(2000, 2016)
features = {'row': [], 'year': [], 'week': []}
for year in years:
    fn = 'output_%i.json' % year
    with open(os.path.join(folder, fn)) as f:
        data = json.load(f)
    for game in data:
        team = 'home'
        for team in ['home', 'away']:
            opponent = {'home': 'away', 'away': 'home'}[team]
            naming = {team: 'my_', opponent: 'opp_'}
            features['row'].append(game['row'])
            features['year'].append(year)
            features['week'].append(game['week'])
            for _team in ['home', 'away']:
                for stat in game[_team]:
                    stat_name = naming[_team] + stat
                    if stat_name not in features:
                        features[stat_name] = []
                    features[stat_name].append(game[_team][stat])

In [76]:
df = pd.DataFrame(features).sort_values(['year', 'row']).reset_index(drop=True)
df['margin_victory'] = df.my_pts - df.opp_pts
df['win'] = df.margin_victory >= 0

In [77]:
def get_epochs(df, n_prev, target, features=None):
    """
    Used to shuffle the dataset at each iteration.
    """
    
    if features is None:
        features = df.drop(target).columns.tolist()
    
    docX, docY = [], []
    indices = []
    for i in range(len(df)-n_prev):
        docX.append(df[features].iloc[i:i+n_prev].as_matrix())
        docY.append(df[target].iloc[i+n_prev])
        indices.append(df.index[i+n_prev])
    alsX = np.array(docX)
    alsY = np.array(docY)
#     if (ravel):
#         alsY = alsY.ravel()

    return alsX, alsY, indices

def reorder_array(arr, indices):
    zipped = zip(arr, indices)
    zipped_sort = sorted(zipped, key=lambda x: x[1])
    return np.array([elem[0] for elem in zipped_sort])

def nfl_epochs(df, target, features=None):
    team_dfs = [group for name, group in df.groupby('my_name')]
    if features is None:
        features = df.drop(target).columns.tolist()
    X = None
    y = None
    for team_df in team_dfs:
        x1, y1, indices = get_epochs(team_df, 32, target, features=features)
        if X is None or y is None:
            X = x1
            y = y1
        else:
            X = np.append(X, x1, axis=0)
            y = np.append(y, y1, axis=0)
        reindexer += indices
    X = reorder_array(X, reindexer)
    y = reorder_array(y, reindexer)
    return X, y

def preprocess_array(arr):
    pass
    

def train_test_split(X, y, test_size=0.1, recenter=False):  
    """
    This just splits data to training and testing parts
    """
    ntrn = int(round(len(X) * (1 - test_size)))
    X_train = X[0:ntrn]
    y_train = y[0:ntrn]    
    X_test = X[ntrn:]
    y_test = y[ntrn:]
    
    if (recenter):
        X_train -= X_train.mean(axis=(0,1))
        X_test = X_train.mean(axis=(0,1))
        y_train -= y_train.mean()
        y_test -= y_train.mean()
        
    return (X_train, y_train), (X_test, y_test)

In [58]:
def reorder_array(arr, indices):
    zipped = zip(arr, indices)
    zipped_sort = sorted(zipped, key=lambda x: x[1])
    return np.array([elem[0] for elem in zipped_sort])

In [78]:
team_dfs = [group for name, group in df.groupby('my_name')]
features = ['margin_victory']
# features = df.drop(['my_name', 'opp_name', 'row', 'week', 'year'], axis=1).columns.tolist()
target = 'win'
reindexer = []
X = None
y = None
for team_df in team_dfs:
    x1, y1, indices = get_epochs(team_df, 16, target, features=features)
    if X is None or y is None:
        X = x1
        y = y1
    else:
        X = np.append(X, x1, axis=0)
        y = np.append(y, y1, axis=0)
    reindexer += indices
X = reorder_array(X, reindexer)
y = reorder_array(y, reindexer)

In [79]:
(X_train, y_train), (X_test, y_test) = train_test_split(X, y, test_size=0.3)  # retrieve data

In [70]:
print X.shape

(8000, 16, 45)


In [85]:
hidden_neurons = 10
model = Sequential()
model.add(SimpleRNN(hidden_neurons, input_dim=X_train.shape[-1], activation='tanh'))
model.add(Dense(1, activation='tanh'))
model.compile(loss="mean_squared_error", optimizer=RMSprop())
model.fit(X_train, y_train, nb_epoch=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f982ad22dd0>

In [81]:
predicted = model.predict(X_test)

In [82]:
print predicted[40:50]

[[ 0.64862698]
 [ 0.59819096]
 [ 0.67499602]
 [ 0.41066673]
 [ 0.57848823]
 [ 0.45954126]
 [ 0.36903736]
 [ 0.44755375]
 [ 0.55796444]
 [ 0.47706291]]


In [89]:
predicted = model.predict(X_test)
print "Error on train set: %.2f" % ((model.predict(X_train) > 0.5 == y_train).mean())
print "Error on test set: %.2f" % ((model.predict(X_test) > 0.5 == y_test).mean())
print "Previous day error: %.2f" % ((.5 - y_test) ** 2).mean()

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [12]:
df


Unnamed: 0,my_first_downs,my_fourth_down_att,my_fourth_down_conv,my_fum,my_fum_lost,my_name,my_pass_att,my_pass_comp,my_pass_int,my_pass_td,...,opp_sacked,opp_sacked_yds,opp_third_down_att,opp_third_down_conv,opp_time_of_pos,opp_turnovers,row,week,year,margin_victory
0,23,1,0,0,0,WAS,36,25,0,0,...,6,59,11,2,1606,1,0,1,2000,3
1,12,1,1,1,1,CAR,26,17,0,1,...,0,0,12,5,1994,0,0,1,2000,-3
2,14,2,1,2,1,NWE,39,26,0,1,...,1,11,16,6,1895,1,1,1,2000,-5
3,17,0,0,2,1,TAM,24,12,0,1,...,6,26,17,4,1705,1,1,1,2000,5
4,22,0,0,2,1,ATL,31,16,0,2,...,1,6,9,2,1701,1,2,1,2000,8
5,23,1,0,3,0,SFO,36,23,1,3,...,0,0,11,4,1899,1,2,1,2000,-8
6,16,0,0,1,1,MIA,24,15,0,1,...,4,12,11,1,1300,6,3,1,2000,23
7,8,1,0,3,2,SEA,24,10,4,0,...,2,7,13,3,2300,1,3,1,2000,-23
8,13,0,0,2,0,OAK,35,20,0,1,...,2,18,18,8,1710,4,4,1,2000,3
9,17,1,0,2,1,SDG,42,19,3,0,...,4,10,16,3,1890,0,4,1,2000,-3
