In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from collections import Counter
import re

In [30]:
%pip install torchtext

Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())


def build_vocab(train_texts):
    vocab = Counter()
    for text in train_texts:
        tokens = tokenize(text)
        vocab.update(tokens)

    # word-to-index dictionary with special <PAD> token as 0
    word2idx = {word: idx+1 for idx, (word, _) in enumerate(vocab.most_common())}
    word2idx['<PAD>'] = 0
    return word2idx


In [48]:
# TextDataset Class
class TweetDataset(Dataset):
    def __init__(self, texts, labels, vocab):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.tokenizer = tokenize

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tokens = self.tokenizer(self.texts[idx])
        indices = [self.vocab[token] if token in self.vocab else 0 for token in tokens]
        return torch.tensor(indices, dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [49]:
# collate function for padding sequences in DataLoader
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded_texts = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, torch.tensor(labels)


In [50]:
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.hidden_dim = hidden_dim  # Store hidden_dim in the class instance
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        hidden = torch.cat((lstm_out[:, -1, :self.hidden_dim], lstm_out[:, 0, self.hidden_dim:]), dim=1)
        output = self.fc(self.dropout(hidden))
        return output


In [59]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim  # Store hidden_dim in the class instance
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)  # Non-bidirectional LSTM
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden = lstm_out[:, -1, :]  # Take the output from the last time step
        output = self.fc(self.dropout(hidden))
        return output

In [63]:
# train and evaluate model
def train_and_evaluate_model(train_loader, test_loader, model, optimizer, criterion, epochs=20):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for texts, labels in train_loader:
            optimizer.zero_grad()
            predictions = model(texts)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader)}')

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, labels in test_loader:
            predictions = model(texts).argmax(1)
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(f"Accuracy: {accuracy_score(all_labels, all_preds)}")
    print(classification_report(all_labels, all_preds))


In [60]:
def main():
    # datasets
    olid_train = pd.read_csv("/content/drive/MyDrive/datasets/olid-train-small.csv")
    olid_test = pd.read_csv("/content/drive/MyDrive/datasets/olid-test.csv")
    hasoc_train = pd.read_csv("/content/drive/MyDrive/datasets/hasoc-train.csv")


    le = LabelEncoder()
    olid_train['labels'] = le.fit_transform(olid_train['labels'])
    olid_test['labels'] = le.transform(olid_test['labels'])
    hasoc_train['labels'] = le.transform(hasoc_train['labels'])

    # vocabulary from the OLID training set
    vocab = build_vocab(olid_train['text'].values)
    vocab_size = len(vocab) + 1  # Adding 1 for padding token

    train_dataset_olid = TweetDataset(olid_train['text'].values, olid_train['labels'].values, vocab)
    test_dataset_olid = TweetDataset(olid_test['text'].values, olid_test['labels'].values, vocab)
    train_dataset_hasoc = TweetDataset(hasoc_train['text'].values, hasoc_train['labels'].values, vocab)

    train_loader_olid = DataLoader(train_dataset_olid, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader_olid = DataLoader(test_dataset_olid, batch_size=32, shuffle=False, collate_fn=collate_fn)
    train_loader_hasoc = DataLoader(train_dataset_hasoc, batch_size=32, shuffle=True, collate_fn=collate_fn)


    embedding_dim = 100
    hidden_dim = 128
    output_dim = 2

    # BiLSTM Model
    bilstm_model = BiLSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
    optimizer_bilstm = optim.Adam(bilstm_model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    print("Training BiLSTM model on OLIDv1 small dataset and evaluating on OLIDv1 test set:")
    train_and_evaluate_model(train_loader_olid, test_loader_olid, bilstm_model, optimizer_bilstm, criterion)

    # LSTM Model
    lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
    optimizer_lstm = optim.Adam(lstm_model.parameters(), lr=0.001)

    print("Training LSTM model on OLIDv1 small dataset and evaluating on OLIDv1 test set:")
    train_and_evaluate_model(train_loader_olid, test_loader_olid, lstm_model, optimizer_lstm, criterion)

    print("Training BiLSTM model on HASOC dataset and evaluating on OLIDv1 test set:")
    train_and_evaluate_model(train_loader_hasoc, test_loader_olid, bilstm_model, optimizer_bilstm, criterion)

    print("Training LSTM model on HASOC dataset and evaluating on OLIDv1 test set:")
    train_and_evaluate_model(train_loader_hasoc, test_loader_olid, lstm_model, optimizer_lstm, criterion)


In [64]:
if __name__ == "__main__":
    main()

Training BiLSTM model on OLIDv1 small dataset and evaluating on OLIDv1 test set:
Epoch 1, Loss: 0.6653532105716852
Epoch 2, Loss: 0.6119593300454603
Epoch 3, Loss: 0.5248462912814865
Epoch 4, Loss: 0.43427793942188303
Epoch 5, Loss: 0.32324328701027105
Epoch 6, Loss: 0.22842486587459923
Epoch 7, Loss: 0.15070984824026218
Epoch 8, Loss: 0.09470858477320665
Epoch 9, Loss: 0.053557778404557395
Epoch 10, Loss: 0.04272946701868554
Epoch 11, Loss: 0.027798506840643642
Epoch 12, Loss: 0.02507715368079775
Epoch 13, Loss: 0.03345857605165064
Epoch 14, Loss: 0.02712121066831999
Epoch 15, Loss: 0.016828030867558075
Epoch 16, Loss: 0.027366643499575374
Epoch 17, Loss: 0.02054120414086256
Epoch 18, Loss: 0.011223362209321486
Epoch 19, Loss: 0.005362710950812173
Epoch 20, Loss: 0.0048589571401986735
Accuracy: 0.708139534883721
              precision    recall  f1-score   support

           0       0.82      0.76      0.79       620
           1       0.48      0.58      0.53       240

    accurac