# 0. Imports

In [None]:
#!/usr/bin/env python
import os
from datetime import datetime

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm

from src.data import preprocess_data, embed_data

## 1. Data Preprocessing

In [None]:
df_train, df_eval = preprocess_data()
df_train, df_eval = embed_data(df_train, df_eval)

X = df_train.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values
y = df_train['EventType'].values

X_id = df_eval['ID'].values
X_eval = df_eval.drop(columns=['MatchID', 'PeriodID', 'ID']).values

## 2. Model Training

In [None]:
# For Kaggle submission save
def save_predictions_model(clf, params, accuracy):
    clf.fit(X, y)
    preds = clf.predict(X_eval)
    pred_df = pd.DataFrame({'ID': X_id, 'EventType': preds})
    # <clf name>/<params>/<filename>_predictions.csv
    params = params.replace(' ', '')
    today = datetime.today().strftime('%Y%m%d')
    file_folder = f"pred-{today}/{clf.__class__.__name__}-{accuracy}/{params}/"
    os.makedirs(file_folder, exist_ok=True)
    file_path = os.path.join(file_folder, f"{clf.__class__.__name__}_predictions.csv")
    pred_df.to_csv(file_path, index=False)
    print(f"Predictions saved to {file_path}")
    return file_folder


## LSTM

In [47]:
def create_sequences(data, labels, seq_length):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[i:i+seq_length]
        label = labels[i+seq_length]
        sequences.append(seq)
        targets.append(label)
    return torch.tensor(sequences, dtype=torch.float32), torch.tensor(targets, dtype=torch.long)

seq_length = 50  # Choose time step length
X_train, y_train = create_sequences(X, y, seq_length)

In [48]:
# hyperparameters
input_size = X_train.shape[-1]  # numberOfInputFeatures
# print(input_size)
hidden_size = 80  # hidden layer size
num_layers = 1  # Number of LSTM layers
num_classes = 2  # Number of categories
learning_rate = 0.001
num_epochs = 1000  # Number of iterations
batch_size = 16
stop_threshold = 0.001  # Threshold to stop training

# Convert data to Tensor
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

dataset = TensorDataset(X_train_tensor, y_train_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

  X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
  y_train_tensor = torch.tensor(y_train, dtype=torch.long)
  X_test_tensor = torch.tensor(X_valid, dtype=torch.float32)
  y_test_tensor = torch.tensor(y_valid, dtype=torch.long)


In [49]:
# Define LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        c0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out
        
LSTM_model = LSTMClassifier(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(LSTM_model.parameters(), lr=learning_rate)

In [None]:
# Training model
LSTM_model.train()
progress_bar = tqdm(range(num_epochs), desc="Training")  # Create a progress bar
for epoch in progress_bar:
    epoch_loss = 0.0
    for batch_idx, (X_batch, y_batch) in enumerate(dataloader):
        # X_batch = X_batch.view(X_batch.size(0), 1, -1)  # Add time step dimension
        X_batch, y_batch = X_batch.to(LSTM_model.fc.weight.device), y_batch.to(LSTM_model.fc.weight.device)
        # forward propagation
        outputs = LSTM_model(X_batch)
        loss = criterion(outputs, y_batch)
        epoch_loss += loss.item()
        
        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_loss = epoch_loss / len(dataloader)
    progress_bar.set_postfix({'Loss': avg_loss})
    
    # Determine whether the stopping condition is met
    if avg_loss < stop_threshold:
        print(f"Early stopping at epoch {epoch + 1}, Loss: {avg_loss:.4f}")
        break

Training:   0%|          | 0/1000 [00:00<?, ?it/s]

In [51]:

def test_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        # X_test = X_test.view(X_test.size(0), 1, -1)
        X_test, y_test = X_test.to(model.fc.weight.device), y_test.to(model.fc.weight.device)
        outputs = model(X_test)
        _, predicted = torch.max(outputs, 1)
        accuracy = (predicted == y_test).sum().item() / y_test.size(0)
    print(f'Test Accuracy: {accuracy * 100:.2f}%')
test_model(LSTM_model, X_train_tensor, y_train_tensor)

Test Accuracy: 88.16%
