In [1]:
import os
import json
from pprint import pprint
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM, SimpleRNN
from keras.optimizers import RMSprop

Using Theano backend.


In [2]:
# Base directory for the season files. Notebooks define no ``__file__``, so
# the kernel's working directory is used explicitly as the anchor.
curr_dir = os.getcwd()
folder = os.path.join(curr_dir, "season_data")

In [3]:
# Read one JSON file per season and flatten every game into column-oriented
# lists (one list per feature), ready for ``pd.DataFrame(features)``.
# Each file is expected to hold an iterable of game records exposing the keys
# 'row', 'week', 'home' and 'away', where 'home'/'away' map stat name -> value.
years = range(2000, 2016)
# Prefix added to each side's stats so home and away columns do not collide.
naming = {'home': 'home_', 'away': 'away_'}
features = {'row': [], 'year': [], 'week': []}
n_games = 0  # games flattened so far == required length of every column
for year in years:
    fn = 'output_%i.json' % year
    with open(os.path.join(folder, fn)) as f:
        data = json.load(f)
    for game in data:
        features['row'].append(game['row'])
        features['year'].append(year)
        features['week'].append(game['week'])
        for _team in ['home', 'away']:
            for stat in game[_team]:
                stat_name = naming[_team] + stat
                if stat_name not in features:
                    # First time this stat appears: back-fill earlier games so
                    # the new column lines up with the existing ones.
                    features[stat_name] = [None] * n_games
                features[stat_name].append(game[_team][stat])
        n_games += 1
        # Pad stats this game did not report; otherwise the column lists end
        # up with different lengths and rows no longer correspond.
        for column in features.values():
            if len(column) < n_games:
                column.append(None)

In [12]:
# One row per game, ordered by season and then by the 'row' field.
df = (
    pd.DataFrame(features)
    .sort_values(['year', 'row'])
    .reset_index(drop=True)
)
# Margin is measured from the home side; a margin of exactly 0 is labelled
# as a win because the comparison is ``>=``.
df['margin_victory'] = df['home_pts'] - df['away_pts']
df['win'] = df['margin_victory'] >= 0
# Unique column names once the home_/away_ markers are stripped out.
all_features = list({
    column.replace("away_", "").replace("home_", "") for column in df.columns
})

In [16]:
# Everything except the label and the bookkeeping / identifier fields.
learning_features = [
    feature
    for feature in all_features
    if feature not in ('win', 'year', 'row', 'week', 'name')
]
learning_features

[u'penalties_yds',
 'margin_victory',
 u'first_downs',
 u'penalties',
 u'pass_comp',
 u'pass_att',
 u'pts',
 u'rush_yds',
 u'pass_td',
 u'fourth_down_att',
 u'rush_att',
 u'pass_yds',
 u'sacked_yds',
 u'fourth_down_conv',
 u'pass_int',
 u'time_of_pos',
 u'rush_tds',
 u'sacked',
 u'turnovers',
 u'third_down_conv',
 u'third_down_att',
 u'fum_lost',
 u'fum']

In [None]:
def get_epochs(df, n_prev, target, features=None):
    """
    Build sliding-window samples for a sequence model.

    Every run of ``n_prev`` consecutive rows of ``df`` becomes one input
    sample, and the ``target`` value of the row immediately after that run
    becomes its label.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order in which they should be windowed.
    n_prev : int
        Number of consecutive rows per input sample.
    target : str
        Column holding the value to predict.
    features : list of str, optional
        Columns used as inputs. Defaults to every column except ``target``.

    Returns
    -------
    alsX : numpy.ndarray
        Input windows, shape ``(n_samples, n_prev, n_features)`` when
        ``features`` is a list of columns.
    alsY : numpy.ndarray
        Labels, shape ``(n_samples,)``.
    indices : list
        ``df.index`` label of the row each label was read from.
    """
    if features is None:
        # axis=1 removes the target *column*; the default axis would look for
        # a row labelled ``target`` instead.
        features = df.drop(target, axis=1).columns.tolist()

    # Select the input and label columns once rather than on every iteration.
    inputs = df[features]
    labels = df[target]

    docX, docY = [], []
    indices = []
    for i in range(len(df) - n_prev):
        # ``.values`` is used because ``as_matrix()`` no longer exists in
        # current pandas releases.
        docX.append(inputs.iloc[i:i + n_prev].values)
        docY.append(labels.iloc[i + n_prev])
        indices.append(df.index[i + n_prev])

    if not docX:
        # Too few rows for a single window: keep the trailing dimensions so
        # the empty batch can still be concatenated with non-empty ones.
        alsX = np.empty((0, n_prev) + inputs.shape[1:])
        return alsX, np.array(docY), indices

    alsX = np.array(docX)
    alsY = np.array(docY)

    return alsX, alsY, indices

def reorder_array(arr, indices):
    """
    Return the elements of ``arr`` as a NumPy array, ordered by ``indices``.

    ``arr[k]`` is paired with ``indices[k]``; the pairs are sorted ascending
    on the index value (equal keys keep their original relative order) and
    only the elements are returned.
    """
    keyed = list(zip(indices, arr))
    keyed.sort(key=lambda pair: pair[0])
    return np.array([element for _, element in keyed])

def preprocess_array(arr):
    """
    Placeholder for array preprocessing; not implemented.

    The argument is ignored and ``None`` is returned.
    """
    return None

def train_test_split(X, y, test_size=0.1, recenter=False):
    """
    Split samples into a leading training part and a trailing test part.

    No shuffling is performed: the first ``round(len(X) * (1 - test_size))``
    samples form the training set and the remainder the test set.

    Parameters
    ----------
    X : array-like
        Input samples, sliced along the first axis.
    y : array-like
        Labels aligned with ``X``.
    test_size : float
        Fraction of samples placed in the test set.
    recenter : bool
        When true, subtract the *training* means from both parts: the mean
        of ``X`` over its first two axes and the scalar mean of ``y``.
        ``X`` must then have at least two axes.

    Returns
    -------
    ((X_train, y_train), (X_test, y_test))
    """
    ntrn = int(round(len(X) * (1 - test_size)))
    X_train, X_test = X[:ntrn], X[ntrn:]
    y_train, y_test = y[:ntrn], y[ntrn:]

    if recenter:
        # Work on float arrays and build new results instead of using ``-=``:
        # the slices above are views, so in-place updates would modify the
        # caller's data and would fail for integer or boolean inputs.
        X_train = np.asarray(X_train, dtype=float)
        X_test = np.asarray(X_test, dtype=float)
        y_train = np.asarray(y_train, dtype=float)
        y_test = np.asarray(y_test, dtype=float)

        # Take the statistics from the training part before shifting anything
        # so that the test part is centred with the very same offsets.
        X_mean = X_train.mean(axis=(0, 1))
        y_mean = y_train.mean()

        X_train = X_train - X_mean
        X_test = X_test - X_mean
        y_train = y_train - y_mean
        y_test = y_test - y_mean

    return (X_train, y_train), (X_test, y_test)

In [None]:
def reorder_array(arr, indices):
    """
    Sort the elements of ``arr`` by their matching entry in ``indices``.

    Returns a NumPy array of the elements in ascending key order.
    """
    # NOTE: this cell repeats the ``reorder_array`` definition from the helper
    # cell earlier in the notebook; whichever runs last is the one in effect.
    ordered = []
    for element, _ in sorted(zip(arr, indices), key=lambda item: item[1]):
        ordered.append(element)
    return np.array(ordered)

In [None]:
# Build one windowed dataset per team, merge them, then put the samples back
# into the row order of ``df``.
# NOTE(review): the cells that assemble ``df`` only create ``home_``/``away_``
# prefixed columns; nothing visible in this notebook creates 'my_name'.
# Confirm where that column comes from before running this cell.
SEQUENCE_LENGTH = 16  # rows (games) per input window
team_dfs = [group for _, group in df.groupby('my_name')]
features = ['margin_victory']
target = 'win'
reindexer = []
X_parts, y_parts = [], []
for team_df in team_dfs:
    x1, y1, indices = get_epochs(team_df, SEQUENCE_LENGTH, target, features=features)
    if len(indices) == 0:
        # Not enough rows for a single window; an empty batch has no
        # compatible shape to concatenate with the others.
        continue
    X_parts.append(x1)
    y_parts.append(y1)
    reindexer += indices
# Concatenate once at the end instead of re-copying the arrays per team.
X = np.concatenate(X_parts, axis=0) if X_parts else np.array([])
y = np.concatenate(y_parts, axis=0) if y_parts else np.array([])
# ``reindexer`` holds the original ``df`` index of every label, so sorting on
# it interleaves the per-team samples back into ``df`` order.
X = reorder_array(X, reindexer)
y = reorder_array(y, reindexer)

In [None]:
# Ordered hold-out (no shuffling): roughly the first 70% of the samples are
# used for training and the remaining 30% for testing.
(X_train, y_train), (X_test, y_test) = train_test_split(X, y, test_size=0.3)

In [None]:
# Parenthesised form prints the same text on Python 2 and also parses on Python 3.
print(X.shape)

In [None]:
# One recurrent layer over each window, followed by a single output unit.
hidden_neurons = 10
model = Sequential()
model.add(SimpleRNN(hidden_neurons, input_dim=X_train.shape[-1], activation='tanh'))
# Sigmoid keeps the output inside (0, 1), the same range as the 0/1 'win'
# labels, so thresholding at 0.5 downstream is meaningful; a tanh output
# spans (-1, 1) and spends half of its range on values no label can take.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss="mean_squared_error", optimizer=RMSprop())
model.fit(X_train, y_train, nb_epoch=10)

In [None]:
# Score the held-out samples with the fitted model.
predicted = model.predict(X_test)

In [None]:
# Peek at a small slice of the raw scores; the parenthesised form prints the
# same text on Python 2 and also parses on Python 3.
print(predicted[40:50])

In [None]:
# Turn the network scores into boolean decisions at 0.5 and compare them with
# the labels.
# - The scores are flattened first: presumably predict() returns shape (n, 1),
#   which would broadcast against the 1-D labels into an (n, n) matrix.
# - The comparison is done in two explicit steps because ``a > 0.5 == b`` is a
#   chained comparison (``a > 0.5 and 0.5 == b``), which raises for arrays.
predicted = model.predict(X_test)
train_decisions = model.predict(X_train).ravel() > 0.5
test_decisions = predicted.ravel() > 0.5
# "Error" is the share of samples whose decision differs from the label.
train_error = (train_decisions != y_train).mean()
test_error = (test_decisions != y_test).mean()
print("Error on train set: %.2f" % train_error)
print("Error on test set: %.2f" % test_error)
# Reference value: mean squared error of always answering 0.5.
print("Previous day error: %.2f" % ((.5 - y_test) ** 2).mean())

In [None]:
# Preview the first rows rather than rendering the whole table.
df.head()
