In [2]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
import pickle
import gensim.downloader as api
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

In [3]:
w2v_model = api.load("word2vec-google-news-300")


In [4]:
train_data = pd.read_csv('data/train_data.csv', index_col=0)
test_data = pd.read_csv('data/test_data.csv', index_col=0)

In [5]:
max_text_len = 256
max_title_len = 32

# Truncate the token lists
train_data['text_tokens'] = train_data['text_tokens'].apply(lambda x: x[:max_text_len])
train_data['title_tokens'] = train_data['title_tokens'].apply(lambda x: x[:max_title_len])

test_data['text_tokens'] = test_data['text_tokens'].apply(lambda x: x[:max_text_len])
test_data['title_tokens'] = test_data['title_tokens'].apply(lambda x: x[:max_title_len])

In [6]:
embedding_dim = w2v_model.vector_size  
embedding_dim

300

In [7]:
def embed_tokens(df,title_max_len=32, text_max_len=256):
    title_vectors = []
    text_vectors = []
    labels = []

    for _, row in df.iterrows():
        title_vec = []
        for token in row['title_tokens'][:title_max_len]:
            if token in w2v_model:
                title_vec.append(w2v_model[token])
            else:
                title_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        while len(title_vec) < title_max_len:
            title_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        title_vectors.append(title_vec)

        text_vec = []
        for token in row['text_tokens'][:text_max_len]:
            if token in w2v_model:
                text_vec.append(w2v_model[token])
            else:
                text_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        while len(text_vec) < text_max_len:
            text_vec.append(np.zeros(embedding_dim, dtype=np.float32))
        text_vectors.append(text_vec)

        # --- Label ---
        labels.append(row['label'])

    return (
        np.array(title_vectors, dtype=np.float32),
        np.array(text_vectors, dtype=np.float32),
        np.array(labels, dtype=np.int64)
    )

In [8]:
train_title_vecs, train_text_vecs, train_labels = embed_tokens(train_data)
test_title_vecs, test_text_vecs, test_labels = embed_tokens(test_data)


In [10]:
class my_dataset(Dataset):
    def __init__(self, title_vecs, text_vecs, labels):
        self.title_vecs = torch.tensor(title_vecs, dtype=torch.float32)
        self.text_vecs = torch.tensor(text_vecs, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.title_vecs[idx], self.text_vecs[idx], self.labels[idx]


In [11]:
train_dataset = my_dataset(train_title_vecs, train_text_vecs, train_labels)
test_dataset = my_dataset(test_title_vecs, test_text_vecs, test_labels)


In [12]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [13]:

class DualLSTMClassifier(nn.Module):
    def __init__(self, embedding_dim=300, hidden_dim=128, num_classes=2):
        super(DualLSTMClassifier, self).__init__()
        self.title_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.text_lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, title_input, text_input):

        _, (title_hidden, _) = self.title_lstm(title_input)
        _, (text_hidden, _) = self.text_lstm(text_input)

        title_hidden = title_hidden[-1]
        text_hidden = text_hidden[-1]

        combined = torch.cat((title_hidden, text_hidden), dim=1) 
        combined = self.dropout(combined)
        output = self.fc(combined)  
        return output


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [15]:
model = DualLSTMClassifier()
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [32]:
num_epochs =25

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0

    for title_batch, text_batch, labels in train_loader:
        title_batch = title_batch.to(device)
        text_batch = text_batch.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(title_batch, text_batch)  
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Accuracy
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    acc = correct / total
    print(f'Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f} | Accuracy: {acc:.4f}')


Epoch 1/25 | Loss: 1049.8041 | Accuracy: 0.5701
Epoch 2/25 | Loss: 1029.5378 | Accuracy: 0.5934
Epoch 3/25 | Loss: 1020.6975 | Accuracy: 0.5999
Epoch 4/25 | Loss: 992.2839 | Accuracy: 0.6135
Epoch 5/25 | Loss: 944.5045 | Accuracy: 0.6512
Epoch 6/25 | Loss: 884.6549 | Accuracy: 0.6860
Epoch 7/25 | Loss: 843.0553 | Accuracy: 0.7088
Epoch 8/25 | Loss: 810.0390 | Accuracy: 0.7267
Epoch 9/25 | Loss: 784.3625 | Accuracy: 0.7408
Epoch 10/25 | Loss: 763.1504 | Accuracy: 0.7523
Epoch 11/25 | Loss: 736.0814 | Accuracy: 0.7649
Epoch 12/25 | Loss: 713.5872 | Accuracy: 0.7724
Epoch 13/25 | Loss: 694.5362 | Accuracy: 0.7789
Epoch 14/25 | Loss: 669.9274 | Accuracy: 0.7917
Epoch 15/25 | Loss: 657.3259 | Accuracy: 0.7955
Epoch 16/25 | Loss: 636.5948 | Accuracy: 0.8046
Epoch 17/25 | Loss: 616.4923 | Accuracy: 0.8120
Epoch 18/25 | Loss: 462.8455 | Accuracy: 0.8615
Epoch 19/25 | Loss: 393.5492 | Accuracy: 0.8860
Epoch 20/25 | Loss: 361.8332 | Accuracy: 0.8956
Epoch 21/25 | Loss: 331.5704 | Accuracy: 0.906

In [34]:
torch.save(model.state_dict(), 'models/lstm_word2vec.pt')


In [37]:
model.eval()
all_preds = []

with torch.no_grad():
    for title_batch, text_batch, _ in test_loader:
        title_batch = title_batch.to(device)
        text_batch = text_batch.to(device)

        outputs = model(title_batch, text_batch)
        preds = torch.argmax(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())

In [45]:
df_results = pd.DataFrame({
    'label': test_labels,
    'pred_w2v': all_preds
})

In [46]:
# df_results.to_csv('test_with_w2v.csv')