In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import pickle
import gensim.downloader as api
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import spacy

In [2]:
#!python -m spacy download en_core_web_lg
glove_model = spacy.load('en_core_web_lg')


In [3]:
train_data = pd.read_csv('data/train_data.csv', index_col=0)
test_data = pd.read_csv('data/test_data.csv', index_col=0)

In [4]:
max_text_len = 256
max_title_len = 32

# Truncate the token lists
train_data['text_tokens'] = train_data['text_tokens'].apply(lambda x: x[:max_text_len])
train_data['title_tokens'] = train_data['title_tokens'].apply(lambda x: x[:max_title_len])

test_data['text_tokens'] = test_data['text_tokens'].apply(lambda x: x[:max_text_len])
test_data['title_tokens'] = test_data['title_tokens'].apply(lambda x: x[:max_title_len])

In [5]:
embedding_dim = 300


In [6]:
def embed_tokens(df, title_max_len=32, text_max_len=256):
    title_vectors = []
    text_vectors = []
    labels = []

    for _, row in df.iterrows():
        # --- Title ---
        title_vec = []
        for token in row['title_tokens'][:title_max_len]:
            if token in glove_model.vocab and glove_model.vocab[token].has_vector:
                title_vec.append(glove_model.vocab[token].vector)
            else:
                title_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        while len(title_vec) < title_max_len:
            title_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        title_vectors.append(title_vec)

        # --- Text ---
        text_vec = []
        for token in row['text_tokens'][:text_max_len]:
            if token in glove_model.vocab and glove_model.vocab[token].has_vector:
                text_vec.append(glove_model.vocab[token].vector)
            else:
                text_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        while len(text_vec) < text_max_len:
            text_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        text_vectors.append(text_vec)

        labels.append(row['label'])

    return (
        np.array(title_vectors, dtype=np.float32),
        np.array(text_vectors, dtype=np.float32),
        np.array(labels, dtype=np.int64)
    )


In [7]:
train_title_vecs, train_text_vecs, train_labels = embed_tokens(train_data)
test_title_vecs, test_text_vecs, test_labels = embed_tokens(test_data)


In [8]:
class my_dataset(Dataset):
    def __init__(self, title_vecs, text_vecs, labels):
        self.title_vecs = torch.tensor(title_vecs, dtype=torch.float32)
        self.text_vecs = torch.tensor(text_vecs, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.title_vecs[idx], self.text_vecs[idx], self.labels[idx]


In [9]:
train_dataset = my_dataset(train_title_vecs, train_text_vecs, train_labels)
test_dataset = my_dataset(test_title_vecs, test_text_vecs, test_labels)


In [10]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [11]:

class DualLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim=300, hidden_dim=128, num_classes=2):
        super(DualLSTMClassifier, self).__init__()
        self.title_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.text_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, title_input, text_input):

        _, (title_hidden, _) = self.title_lstm(title_input)
        _, (text_hidden, _) = self.text_lstm(text_input)

        title_hidden = title_hidden[-1]
        text_hidden = text_hidden[-1]

        combined = torch.cat((title_hidden, text_hidden), dim=1) 
        combined = self.dropout(combined)
        output = self.fc(combined)  
        return output


In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [13]:
model = DualLSTMClassifier()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [15]:
num_epochs =20

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for title_batch, text_batch, labels in train_loader:
        title_batch = title_batch.to(device)
        text_batch = text_batch.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(title_batch, text_batch)  
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Accuracy
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    print(f'Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f} | Accuracy: {acc:.4f}')


Epoch 1/20 | Loss: 1039.3350 | Accuracy: 0.5827
Epoch 2/20 | Loss: 968.9186 | Accuracy: 0.6441
Epoch 3/20 | Loss: 851.6899 | Accuracy: 0.7108
Epoch 4/20 | Loss: 817.5990 | Accuracy: 0.7223
Epoch 5/20 | Loss: 775.4721 | Accuracy: 0.7430
Epoch 6/20 | Loss: 746.9771 | Accuracy: 0.7582
Epoch 7/20 | Loss: 698.1593 | Accuracy: 0.7807
Epoch 8/20 | Loss: 668.3558 | Accuracy: 0.7925
Epoch 9/20 | Loss: 647.8849 | Accuracy: 0.8000
Epoch 10/20 | Loss: 622.1323 | Accuracy: 0.8113
Epoch 11/20 | Loss: 600.1943 | Accuracy: 0.8178
Epoch 12/20 | Loss: 636.3847 | Accuracy: 0.8057
Epoch 13/20 | Loss: 555.6847 | Accuracy: 0.8335
Epoch 14/20 | Loss: 436.3181 | Accuracy: 0.8683
Epoch 15/20 | Loss: 413.3083 | Accuracy: 0.8777
Epoch 16/20 | Loss: 393.6644 | Accuracy: 0.8858
Epoch 17/20 | Loss: 359.5535 | Accuracy: 0.8958
Epoch 18/20 | Loss: 331.4814 | Accuracy: 0.9071
Epoch 19/20 | Loss: 305.7290 | Accuracy: 0.9145
Epoch 20/20 | Loss: 289.0188 | Accuracy: 0.9216


In [17]:
all_preds = []

with torch.no_grad():
    for title_batch, text_batch, _ in test_loader:
        title_batch = title_batch.to(device)
        text_batch = text_batch.to(device)

        outputs = model(title_batch, text_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())


In [19]:
df_results = pd.DataFrame({
    'label': test_labels,
    'pred_glove': all_preds
})

In [21]:
df_results.to_csv('test_with_glove.csv')

In [22]:
torch.save(model.state_dict(), 'models/lstm_glove.pt')
