In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import sklearn
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Ignore warnings whose message matches the pandas Int64Index deprecation text.
warnings.filterwarnings("ignore", message="pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.")

# Project root as an absolute path.
# NOTE(review): this path is machine-specific; adjust it before running elsewhere.
proj_dir = '/Users/nickbachelder/Desktop/Kaggle/Linemen'

# Switch the working directory to <proj_dir>/code/clean before the %run below,
# presumably so the relative notebook name resolves from there (TODO confirm).
os.chdir( os.path.join(proj_dir, 'code/clean') )

# Execute the clean_objects.ipynb notebook through the IPython %run magic.
%run clean_objects.ipynb


class RushDataset(Dataset):
    """Torch dataset that serves fixed-length (frames, labels) tensor pairs.

    ``sequences`` is an indexable collection of ``(frame_table, label)`` pairs,
    where ``frame_table`` is a pandas DataFrame with one row per frame. Every
    item is brought to exactly ``sequence_length`` rows: longer tables are cut
    to their first ``sequence_length`` rows, shorter ones are extended by
    repeating their last row.
    """

    def __init__(self, sequences, sequence_length):
        self.sequences = sequences
        self.sequence_length = sequence_length

    def __len__(self):
        """Number of (frame_table, label) pairs available."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(data_tensor, label_tensor)`` for the pair at ``idx``.

        ``data_tensor`` holds the (truncated or padded) frame values and
        ``label_tensor`` is the integer label repeated once per frame, as int64.
        """
        frames, raw_label = self.sequences[idx]
        target_rows = self.sequence_length
        available_rows = frames.shape[0]

        if available_rows > target_rows:
            # Too long: keep only the leading frames.
            frames = frames.iloc[:target_rows]
        elif available_rows < target_rows:
            # Too short: pad by repeating the final frame (the original author
            # noted that zero padding did not work for this model).
            final_frame = frames.iloc[-1:].values
            filler = pd.DataFrame(
                np.repeat(final_frame, target_rows - available_rows, axis=0),
                columns=frames.columns,
            )
            frames = pd.concat([frames, filler])

        # One copy of the label per frame so the loss can be applied per time step.
        per_frame_labels = np.full(target_rows, int(raw_label))

        data_tensor = torch.Tensor(frames.values)
        label_tensor = torch.Tensor(per_frame_labels).long()
        return data_tensor, label_tensor

class SequenceModel(torch.nn.Module):
    """LSTM encoder followed by a linear classifier applied at every time step.

    The LSTM is built with ``batch_first=True``, so inputs are laid out as
    (batch, time, n_features) and ``forward`` yields (batch, time, n_classes)
    unnormalised class scores.
    """

    def __init__(self, n_features, n_classes, n_hidden, n_layers):
        super().__init__()
        self.n_features, self.n_classes = n_features, n_classes
        self.n_hidden, self.n_layers = n_hidden, n_layers
        # The attribute is spelled ``ltsm`` (sic); the spelling is kept because
        # it is part of the parameter names in the module's state_dict.
        self.ltsm = torch.nn.LSTM(
            input_size=n_features,
            hidden_size=n_hidden,
            num_layers=n_layers,
            batch_first=True,
        )
        self.classifier = torch.nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        """Return per-time-step class scores for ``x``."""
        hidden_per_step, _final_state = self.ltsm(x)
        return self.classifier(hidden_per_step)

    def get_layer_probabilities(self, x):
        """Return the softmax probability of class index 1 for every time step.

        The result is a flat Python list of floats with one entry per
        (batch element, time step), batch-major.
        """
        scores = self.forward(x)
        rows = x.size(0) * x.size(1)
        flat_scores = scores.view([rows, self.n_classes])
        class_probs = torch.nn.functional.softmax(flat_scores, dim=1)
        prob_rows = class_probs.detach().cpu().numpy().tolist()
        # Column 1 is the class reported to callers (named "sack" by the original author).
        return [row[1] for row in prob_rows]


def train_model(data_loader, model, loss_function, optimizer):
    """Run one training epoch and print overall and per-period loss and AUC.

    Each batch is pushed through ``model``, flattened to one row per frame,
    scored with ``loss_function`` and used for a single optimizer step. For
    reporting, frames are also grouped into consecutive windows of five frames
    ("periods", labelled "1", "2", ...) and a loss/AUC line is printed for each.

    Args:
        data_loader: iterable of ``(X, y)`` batches that supports ``len()`` and
            exposes ``dataset.sequence_length``. The slicing below treats the
            model output as (batch, frame, class) and ``y`` as (batch, frame).
        model: torch module exposing ``n_classes``; called as ``model(X)``.
        loss_function: callable ``loss_function(scores, targets)`` returning a
            scalar tensor.
        optimizer: torch optimizer used for the per-batch update.

    Returns:
        ``model`` after the epoch, or ``None`` (after printing a hint) when the
        sequence length is not a multiple of five. The length is validated
        before any batch is processed, so no weights are touched in that case.
    """
    frames_per_period = 5
    sequence_length = data_loader.dataset.sequence_length
    if sequence_length % frames_per_period != 0:
        print('Pick a sequence length divisable by 5')
        return None

    num_batches = len(data_loader)
    period_keys = [str(period) for period in range(1, sequence_length // frames_per_period + 1)]
    total_loss = 0
    model.train()

    scores = []
    labels = []
    period_losses = {key: [] for key in period_keys}
    period_labels = {key: [] for key in period_keys}
    period_scores = {key: [] for key in period_keys}

    for X, y in data_loader:
        batch_size = X.size(0)
        fc_output = model(X)
        fc_output_long = fc_output.reshape([sequence_length * batch_size, model.n_classes])
        y_long = y.reshape([sequence_length * batch_size])
        loss = loss_function(fc_output_long, y_long)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Everything below is reporting only; it reuses the pre-step forward pass.
        with torch.no_grad():
            detached_output = fc_output.detach()
            # AUC must rank frames by the score of the positive class (pos_label = 1),
            # so use the softmax probability of class index 1.
            positive_prob = torch.nn.functional.softmax(detached_output, dim=-1)[..., 1]
            scores.extend(positive_prob.reshape(-1).cpu().tolist())
            labels.extend(y_long.detach().cpu().tolist())

            for position, key in enumerate(period_keys):
                start = position * frames_per_period
                stop = start + frames_per_period
                rows = batch_size * frames_per_period
                period_output = detached_output[:, start:stop, :].reshape([rows, model.n_classes])
                period_ys = y[:, start:stop].reshape([rows])
                period_losses[key].append(loss_function(period_output, period_ys).item())
                period_labels[key].extend(period_ys.cpu().tolist())
                period_scores[key].extend(positive_prob[:, start:stop].reshape(-1).cpu().tolist())

    # Imported here because a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import auc as area_under_curve, roc_curve

    avg_loss = total_loss / num_batches
    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = roc_curve(y_true = labels, y_score = scores, pos_label = 1)
    auc = area_under_curve(fpr, tpr)
    print(f"Overall Train loss: {avg_loss} , Overall Train AUC: {auc}")

    for time_period in period_keys:
        avg_loss = sum(period_losses[time_period]) / num_batches
        fpr, tpr, _ = roc_curve(
            y_true = period_labels[time_period],
            y_score = period_scores[time_period],
            pos_label = 1,
        )
        auc = area_under_curve(fpr, tpr)
        print(f"Train loss for period {time_period}: {avg_loss} , Train AUC for period {time_period}: {auc}")

    return model

def test_model(data_loader, model, loss_function):
    
    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    i = 0
    time_frame_losses = {}
    time_frame_labels = {}
    time_frame_fc_outputs = {}
    with torch.no_grad():
        outputs = []
        labels = []
        for X, y in data_loader:
            i += 1
            fc_output = model(X)
            fc_output_long = fc_output.view([data_loader.dataset.sequence_length * X.size(0), model.n_classes])
            y_long = y.view([data_loader.dataset.sequence_length * X.size(0)])
            outputs.extend([x[0] for x in fc_output_long.detach().numpy().tolist()])
            labels.extend(y_long.numpy())
            total_loss += loss_function(fc_output_long, y_long).item()

            time_frame_start = 0
            if data_loader.dataset.sequence_length%5 != 0:
                print('Pick a sequence length divisable by 5')
                return None
            for time_frames in range(int(data_loader.dataset.sequence_length / 5)):
                time_frames += 1
                this_time_frame_fc_output = fc_output[:, time_frame_start : time_frames*5, :].reshape([X.size(0) * 5, 2])
                this_time_frame_ys = y[:, time_frame_start : time_frames*5].reshape([X.size(0) * 5])
                loss_this_time_frame = loss_function(this_time_frame_fc_output, this_time_frame_ys)
                if i == 1:
                    time_frame_losses.update({str(time_frames) : [loss_this_time_frame.item()]})
                    time_frame_labels.update({str(time_frames) : this_time_frame_ys.detach().cpu().numpy().tolist()})
                    time_frame_fc_outputs.update({str(time_frames) : this_time_frame_fc_output.detach().cpu().numpy().tolist()})
                else:
                    time_frame_losses[str(time_frames)].extend([loss_this_time_frame.item()])
                    time_frame_labels[str(time_frames)].extend(this_time_frame_ys.detach().cpu().numpy().tolist())
                    time_frame_fc_outputs[str(time_frames)].extend(this_time_frame_fc_output.detach().cpu().numpy().tolist())
                time_frame_start = time_frame_start + 5

    avg_loss = total_loss / num_batches
    tpr, fpr, thresholds = sklearn.metrics.roc_curve(y_true = labels, y_score = outputs, pos_label = 1)
    auc = sklearn.metrics.auc(fpr, tpr)
    print(f"Overall test loss: {avg_loss} , Overall test AUC: {auc}")

    for time_period in time_frame_losses.keys():
        losses = time_frame_losses[time_period]
        labels = time_frame_labels[time_period]
        outputs_fc = [x[0] for x in time_frame_fc_outputs[time_period]]
        
        avg_loss = sum(losses) / num_batches
        tpr, fpr, thresholds = sklearn.metrics.roc_curve(y_true = labels, y_score = outputs_fc, pos_label = 1)
        auc = sklearn.metrics.auc(fpr, tpr)
        print(f"Test loss for period {time_period}: {avg_loss} , Test AUC for period {time_period}: {auc}")

def train_rush_lstm(train_loader, test_loader, model, loss_function, optimizer, num_epochs = 10):
    """Alternate one training pass and one evaluation pass for ``num_epochs`` epochs.

    Every epoch prints a header, calls ``train_model`` on ``train_loader``
    (rebinding ``model`` to whatever it returns), calls ``test_model`` on
    ``test_loader`` and prints a blank separator line.

    Returns:
        The value returned by the last ``train_model`` call, or the ``model``
        argument unchanged when ``num_epochs`` is zero.
    """
    for epoch_number in range(num_epochs):
        header = f"Epoch {epoch_number}\n---------"
        print(header)

        model = train_model(train_loader, model, loss_function, optimizer=optimizer)
        test_model(test_loader, model, loss_function)

        print()

    return model

def predict_play(master_track, play_id, model, normalize = True, replace_player = None, sequence_length = 50):
    """Return the model's per-frame class-1 probabilities for one play.

    The play's week is looked up through ``master_track.search_track_weeks``,
    its rush sequences are fetched with ``master_track.get_rush_sequences_labels``,
    wrapped in a ``RushDataset`` and fed to ``model`` one sequence at a time.

    Args:
        master_track: object providing ``search_track_weeks`` and
            ``get_rush_sequences_labels`` (defined outside this block).
        play_id: identifier matched against the ``playId`` variable.
        model: module providing ``eval()`` and ``get_layer_probabilities(X)``.
        normalize: forwarded to ``get_rush_sequences_labels``.
        replace_player: forwarded to ``get_rush_sequences_labels``.
        sequence_length: number of frames each sequence is truncated or padded
            to by ``RushDataset``. Defaults to 50, the value that was
            previously hard-coded.

    Returns:
        A flat list with the values returned by ``model.get_layer_probabilities``
        for each sequence, concatenated in dataset order.
    """
    matching_rows = master_track.search_track_weeks(variables = ["playId"], variable_values = [play_id]).reset_index(drop = True)
    week = matching_rows.week.tolist()[0]
    # NOTE(review): the sequences are requested for `week + 1`; presumably the
    # lookup and the sequence getter use differently based week numbers — confirm.
    play_dat_label = master_track.get_rush_sequences_labels(week = week + 1, normalize = normalize, play_id = play_id, replace_player = replace_player)
    play_dataset = RushDataset(sequences = play_dat_label, sequence_length = sequence_length)
    # batch_size 1 and no shuffling keep the output in dataset order.
    play_loader = DataLoader(play_dataset, batch_size = 1, shuffle = False)

    outputs = []
    model.eval()
    with torch.no_grad():
        for X, _ in play_loader:
            outputs.extend(model.get_layer_probabilities(X))
    return outputs

In [None]:
# Move the working directory to <proj_dir>/code/modeling.
os.chdir( os.path.join(proj_dir, 'code/modeling') )