In [29]:
# imports

import pickle
import math
import pandas as pd
import numpy as np
import os
import pathlib
import tqdm
from functools import reduce
from scipy.stats import uniform, randint
import sklearn
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import warnings
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Suppress the pandas Int64Index deprecation warning (matched on its message text)
# so it does not clutter the cell outputs below.
warnings.filterwarnings("ignore", message="pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.")

# import all_Data object ... if not loaded run clean.ipynb script in code/clean folder to create

proj_dir = '/Users/nickbachelder/Desktop/Kaggle/Linemen'
os.chdir( os.path.join(proj_dir, 'code/clean') )
%run clean_objects.ipynb
os.chdir( os.path.join(proj_dir, 'data') )
with open('all_data.pkl', 'rb') as file:
    all_data = pickle.load(file)
os.chdir( os.path.join(proj_dir, 'code/modeling') )
%run lstm_objects.ipynb
%run metric_objects.ipynb

In [24]:
# get labels and split data

week = 9
split_seed = 42   # fixed seed so the train/test split is the same on every run


def _drop_na_sequences(sequences, label):
    """Return the samples of `sequences` whose first element contains no nulls.

    Prints `label` as a heading, then the position of every sample that is
    dropped, so the output shows which samples were removed.

    sequences : indexable collection of samples; sample[0] must support
                .isnull().values.any()
    label     : heading printed before the dropped positions
    Returns a new list; the input collection is not modified.
    """
    print(label)
    kept = []
    for position, sample in enumerate(sequences):
        if sample[0].isnull().values.any():
            print(position)
        else:
            kept.append(sample)
    return kept


all_dat_labels = all_data.get_rush_sequences_labels(week = week, normalize = True)
training_dat, test_dat = train_test_split(all_dat_labels, test_size = 0.2, random_state = split_seed)

# Check for NAs (None) and drop the affected samples from both splits
training_dat = _drop_na_sequences(training_dat, 'Train NA')
test_dat = _drop_na_sequences(test_dat, 'Test NA')

print(f'Length train : {len(training_dat)}')
print(f'Length test : {len(test_dat)}')

Train NA
Test NA
Length train : 6816
Length test : 1704


In [25]:
# model building & training

# Run settings gathered in one place.
input_width = training_dat[0][0].shape[1]   # second dimension of the first training sample's first element
loader_batch_size = 50
adam_lr = 0.01

# Network, objective and optimiser.
model = SequenceModel(
    n_features = input_width,
    n_classes = 2,
    n_hidden = 15,
    n_layers = 1,
)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = adam_lr)

# Datasets and their batch loaders.
# NOTE(review): train uses sequence_length=50 while test uses sequence_length=30;
# confirm against RushDataset that this difference is intentional.
train_dataset = RushDataset(sequences = training_dat, sequence_length = 50)
train_loader = DataLoader(train_dataset, batch_size = loader_batch_size, shuffle = True)
test_dataset = RushDataset(sequences = test_dat, sequence_length = 30)
test_loader = DataLoader(test_dataset, batch_size = loader_batch_size, shuffle = True)

# Fit for 20 epochs; the returned model is kept as `lstm_model`.
lstm_model = train_rush_lstm(
    train_loader,
    test_loader,
    model,
    loss_function,
    optimizer,
    num_epochs = 20,
)

Epoch 0
---------
Overall Train loss: 0.6240993764278663 , Overall Train AUC: 0.6460766949329246
Train loss for period 1: 0.6700963051649775 , Train AUC for period 1: 0.5308091164550495
Train loss for period 2: 0.6721112279996385 , Train AUC for period 2: 0.5401201460952657
Train loss for period 3: 0.6646824500856608 , Train AUC for period 3: 0.5527461758160322
Train loss for period 4: 0.6543421192760885 , Train AUC for period 4: 0.5791657932710563
Train loss for period 5: 0.6342085056061292 , Train AUC for period 5: 0.6309110701455198
Train loss for period 6: 0.6133117205905219 , Train AUC for period 6: 0.6801471005337034
Train loss for period 7: 0.5947942044178065 , Train AUC for period 7: 0.7218219396056718
Train loss for period 8: 0.5828364117302164 , Train AUC for period 8: 0.7481868251896959
Train loss for period 9: 0.578265933659825 , Train AUC for period 9: 0.7570901984069449
Train loss for period 10: 0.5763448900985022 , Train AUC for period 10: 0.7616758553868602
Overall test

In [26]:
# Switch to the project's data folder and write the trained model's state dict there.
data_folder = os.path.join(proj_dir, 'data')
os.chdir(data_folder)
trained_weights = lstm_model.state_dict()
torch.save(trained_weights, "lstm_weights")