In [None]:
%load_ext autoreload
%autoreload 2

## Training LSTM

In [117]:
import os
while 'notebooks' in os.getcwd():
    os.chdir("..")

import numpy as np
import pandas as pd 
from src.utils import train_test_split
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from src.preprocessing import TextDataset
import torch
from torch.utils.data import DataLoader, Dataset
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, LoggingHandler
import logging
from copy import deepcopy
from sklearn.decomposition import PCA
from huggingface_hub import notebook_login
from sklearn.ensemble import RandomForestClassifier

tqdm.pandas()

In [2]:
train_data, test_data = train_test_split()

100%|██████████| 16/16 [00:05<00:00,  2.78it/s]


In [9]:
train_data.keys()

dict_keys([0, 2, 4, 7, 8, 11, 13, 14, 18, 19])

In [63]:
lstm_train_df = train_data[4]\
    .sample(50000, replace=False)\
    .sort_values("Timestamp")
lstm_val_df = train_data[7]\
    .sample(50000)\
    .sort_values("Timestamp")

In [64]:
model = SentenceTransformer('peulsilva/sentence-transformer-trained-tweet', cache_folder='/Data')

# Encode texts
embeddings = model.encode(lstm_train_df['Tweet'].tolist(), show_progress_bar=True)
targets = lstm_train_df['EventType'].to_numpy()

embeddings_val = model.encode(lstm_val_df['Tweet'].tolist(), show_progress_bar=True)
targets_val = lstm_val_df['EventType'].to_numpy()

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

Batches:   0%|          | 0/1563 [00:00<?, ?it/s]

In [None]:
sequence_length = 
X_train = np.lib.stride_tricks.sliding_window_view(embeddings, (sequence_length, embeddings.shape[1]))[:, 0, :, :]
y_train = targets[sequence_length-1:]

positive_count = np.sum(y_train)  # Number of positive examples
negative_count = len(y_train) - positive_count  # Number of negative examples

class_weights = torch.tensor([1.0 / negative_count, 1.0 / positive_count], dtype=torch.float32)

X_val = np.lib.stride_tricks.sliding_window_view(embeddings_val, (sequence_length, embeddings_val.shape[1]))[:, 0, :, :]
y_val = targets_val[sequence_length-1:]

# Step 3: Define a custom Dataset
class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [126]:
train_dataset = SequenceDataset(X_train, y_train)
val_dataset = SequenceDataset(X_val, y_val)


In [127]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [128]:
class LSTMModel(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.lstm = torch.nn.LSTM(input_dim, hidden_dim, batch_first=True)
        # self.dropout = torch.nn.Dropout(0.5)  # Add dropout
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        # hidden = self.dropout(hidden[-1])
        output = torch.sigmoid(self.fc(hidden[-1]))   # Pass through fully connected layer
        return output

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",  # Handle class imbalance
    random_state=42,
    max_depth=2
)

X_train_rf = X_train.reshape(X_train.shape[0], -1)  # Flatten sequences
X_val_rf = X_val.reshape(X_val.shape[0], -1)

rf_model.fit(X_train_rf, y_train)

# Step 5: Evaluate the model
y_pred = rf_model.predict(X_val_rf)

In [131]:
accuracy_score(y_val, y_pred)

0.56614

In [130]:
confusion_matrix(y_val,y_pred)

array([[12639,  7882],
       [13811, 15668]])

In [116]:
input_dim = embeddings.shape[1]
hidden_dim = 1
output_dim = 1  # Single output for binary classification

lstm_model = LSTMModel(input_dim, hidden_dim, output_dim)

def init_weights(m):
    if isinstance(m, torch.nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.zeros_(m.bias)
    elif isinstance(m, torch.nn.LSTM):
        for name, param in m.named_parameters():
            if 'weight' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'bias' in name:
                torch.nn.init.zeros_(param.data)

lstm_model.apply(init_weights)

# Step 5: Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lstm_model.to(device)

class_weight_tensor = class_weights.to(device)
criterion = torch.nn.BCELoss(weight=class_weight_tensor[1])
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)

# Track best model
best_val_loss = float('inf')
# best_model_path = "best_lstm_model.pth"

# Training loop
epochs = 20
for epoch in range(epochs):
    # Training phase
    lstm_model.train()
    train_loss = 0
    y_pred_train = np.array([])
    y_true_train = np.array([])
    for X_batch, y_batch in tqdm(train_loader):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        outputs = lstm_model(X_batch).squeeze()  # Squeeze to match shape
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        preds = outputs >= 1/2

        y_pred_train = np.append(y_pred_train, preds.cpu().numpy())
        y_true_train = np.append(y_true_train, y_batch.cpu().numpy())


        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation phase
    lstm_model.eval()
    val_loss = 0

    y_pred_val = np.array([])
    y_true_val = np.array([])
    with torch.no_grad():
        for X_batch, y_batch in tqdm(val_loader):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = lstm_model(X_batch).squeeze()
            preds = outputs >= 1/2

            y_pred_val = np.append(y_pred_val, preds.cpu().numpy())
            y_true_val = np.append(y_true_val, y_batch.cpu().numpy())

        

    print(accuracy_score(y_true_val, y_pred_val), accuracy_score(y_true_train, y_pred_train))
    print(confusion_matrix(y_true_val, y_pred_val))

    

    # Save best model
    # if val_loss < best_val_loss:
    #     best_val_loss = val_loss
    #     # torch.save(lstm_model.state_dict(), best_model_path)
    #     print(f"Best model saved with Val Loss: {best_val_loss:.4f}")

100%|██████████| 6249/6249 [00:13<00:00, 454.82it/s]
100%|██████████| 6249/6249 [00:03<00:00, 1616.72it/s]


0.589686143505831 0.5918865395771239
[[    0 20512]
 [    0 29479]]


100%|██████████| 6249/6249 [00:13<00:00, 457.07it/s]
100%|██████████| 6249/6249 [00:03<00:00, 1610.14it/s]


0.589686143505831 0.5737632773899302
[[    0 20512]
 [    0 29479]]


100%|██████████| 6249/6249 [00:11<00:00, 545.31it/s]
100%|██████████| 6249/6249 [00:02<00:00, 2097.57it/s]


0.589686143505831 0.5737632773899302
[[    0 20512]
 [    0 29479]]


 12%|█▏        | 750/6249 [00:01<00:10, 530.14it/s]


KeyboardInterrupt: 

In [108]:
lstm_model(X_batch)

tensor([0.5567], device='cuda:0', grad_fn=<SigmoidBackward0>)

In [84]:
preds.cpu().numpy()

array([ True,  True,  True,  True,  True,  True,  True])

(7,)

In [78]:
accuracy_score(y_true_val, y_pred_val)

ValueError: unknown is not supported

In [71]:
sum(p.numel() for p in lstm_model.parameters())

3086

In [44]:
with torch.no_grad():
    out = lstm_model(X_batch.to(device))

In [45]:
out

tensor([[ 0.3972],
        [ 0.3969],
        [ 0.3950],
        [-0.1095],
        [ 0.0576],
        [ 0.3972],
        [ 0.3972],
        [ 0.4074]], device='cuda:0')