In [1]:

import pandas as pd
import string
from collections import Counter
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [126]:
data = pd.read_csv("/home/mohe/Downloads/xenophobic_data (1).csv")
data["tweet"] = data["tweet"].astype(str)

delete_fraction = 0.9

label_0_df = data[data['label'] == 0]

num_rows_to_delete = int(delete_fraction * len(label_0_df))
rows_to_delete = label_0_df.sample(n=num_rows_to_delete, random_state=42)

data = data.drop(rows_to_delete.index).reset_index()
len(data[data["label"] == 0])

1979

In [130]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

class TextVectorizer:

  def __init__(self, sequence_length, vocab_size, target=False):
    self.target = target
    self.sequence_length = sequence_length
    self.vocab_size = vocab_size
    self.vocab_counter = Counter()
    self.stoi = {"[pad]": 0, "[start]": 1, "[end]": 2, "[UNK]": 3}
    self.itos = {0: "[pad]", 1: "[start]", 2: "[end]", 3: "[UNK]"}

  def standardize(self, text):
    text = text.lower()
    return "".join(char for char in text
                  if char not in strip_chars)

  def tokenize(self, text):
    text = self.standardize(text)
    return text.split()

  def adapt(self, dataset):
    for text in tqdm(dataset):
      tokens = self.tokenize(text)
      for token in tokens:
        self.vocab_counter[token] += 1

    for token, _ in self.vocab_counter.most_common(self.vocab_size):
      indx = len(self.stoi)
      self.stoi[token] = indx
      self.itos[indx] = token

  def encode(self, text):
    text = self.standardize(text)
    tokens = self.tokenize(text)
    if self.target:
      result = ([self.stoi["[start]"]] + [self.stoi.get(token, 3) for token in tokens]
            + [self.stoi["[end]"]])
    else:
      result = [self.stoi.get(token, 3) for token in tokens]

    if len(result) <= self.sequence_length:
        pad_size = self.sequence_length - len(result)
        result += [self.stoi.get("[pad]")] * (pad_size)
    else:
      #truncate!
      result = result[:self.sequence_length]

    return result

  def decode(self, int_sequence):
    return " ".join(self.itos.get(i, "[UNK]") for i in int_sequence)


In [152]:
vocab_size = 9000
sequence_length = 50

vectorizer = TextVectorizer(sequence_length, vocab_size, target=True)

In [153]:
vectorizer.adapt(data["tweet"])

  0%|          | 0/3864 [00:00<?, ?it/s]

100%|██████████| 3864/3864 [00:00<00:00, 59168.95it/s]


In [154]:
class TextDataset(Dataset):
    def __init__(self, data, vectorizer):
        self.data = data
        self.vectorizer = vectorizer

    def __getitem__(self, idx):
      text = self.data.iloc[idx]["tweet"]
      label = self.data.iloc[idx]["label"]
      text = self.vectorizer.encode(text)
      return (torch.tensor(text).long(),
              torch.tensor(label).long())



    def __len__(self):
        return len(self.data)

In [155]:
import random

data = data.sample(frac=1).reset_index(drop=True)
num_val_samples = int(0.15 * len(data))
num_train_samples = len(data) - 2 * num_val_samples

train_pairs = data[:num_train_samples]
val_pairs = data[num_train_samples:]


In [156]:
train_pairs["tweet"].size, val_pairs["tweet"].size

(2706, 1158)

In [157]:
train_ds = TextDataset(train_pairs, vectorizer)
val_ds = TextDataset(val_pairs, vectorizer)

In [230]:
batch_size = 64
train_dl = DataLoader(train_ds, batch_size=batch_size)
val_dl = DataLoader(val_ds, batch_size=batch_size)

In [241]:
for text, label in train_dl:
    print(text)
    print(label)
    break


tensor([[   1,  512, 2994,  ...,    0,    0,    0],
        [   1, 4894,    6,  ...,    0,    0,    0],
        [   1,  868,   54,  ...,    0,    0,    0],
        ...,
        [   1,   36,  163,  ...,    0,    0,    0],
        [   1,   40,   17,  ...,    0,    0,    0],
        [   1, 3182,   24,  ...,    0,    0,    0]])
tensor([1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
        0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0])


In [197]:
a = 0
for text, label in train_dl:
  if a < torch.max(text):
    a = torch.max(text)

print(a)

tensor(8715)


In [242]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, dropout=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim *2, output_dim)

    def forward(self, x):
        x = self.embedding(x)  
        lstm_out, _= self.lstm(x) 
        x = lstm_out[:, -1, :]  
        x = self.dropout(x)
        x = self.fc(x)  
        return x

In [243]:
LR = 2e-2
NUM_EPOCHS = 4
embedding_dim = 100  
hidden_dim = 256  
output_dim = 1  
num_layers = 2

model = BiLSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device('cpu')

model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = torch.nn.BCEWithLogitsLoss()

model.train()
for epoch in range(NUM_EPOCHS):
    
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for text, label in tqdm(train_dl):
        optimizer.zero_grad()
        input_ids = text.to(device)
        labels = label.float().unsqueeze(1).to(device)

        outputs = model(input_ids)

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        predicted = torch.round(torch.sigmoid(outputs))
        correct_predictions += (predicted == labels).sum().item()
        total_predictions += labels.size(0)

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_dl)
    epoch_accuracy = correct_predictions / total_predictions

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - Loss: {epoch_loss:.4f} - Train: {epoch_accuracy:.4f}")
    
    # validation
    model.eval()
    val_running_loss = 0.0
    val_correct_predictions = 0
    val_total_predictions = 0

    with torch.no_grad():
        for text, label in (val_dl):
            input_ids = text.to(device)
            labels = label.float().unsqueeze(1).to(device)

            outputs = model(input_ids)

            loss = criterion(outputs, labels)

            predicted = torch.round(torch.sigmoid(outputs))
            val_correct_predictions += (predicted == labels).sum().item()
            val_total_predictions += labels.size(0)

            val_running_loss += loss.item()

    val_epoch_loss = val_running_loss / len(val_dl)
    val_epoch_accuracy = val_correct_predictions / val_total_predictions

    print(f"Loss: {val_epoch_loss:.4f} - Validation: {val_epoch_accuracy:.4f}")
    
    model.train()


  0%|          | 0/43 [00:00<?, ?it/s]

100%|██████████| 43/43 [00:04<00:00,  9.73it/s]


Epoch 1/4 - Loss: 0.7969 - Train: 0.5000
Loss: 0.6739 - Validation: 0.5285


100%|██████████| 43/43 [00:04<00:00,  9.84it/s]


Epoch 2/4 - Loss: 0.6691 - Train: 0.5924
Loss: 0.6080 - Validation: 0.6511


100%|██████████| 43/43 [00:04<00:00,  9.83it/s]


Epoch 3/4 - Loss: 0.6222 - Train: 0.6608
Loss: 0.3673 - Validation: 0.8679


100%|██████████| 43/43 [00:05<00:00,  7.74it/s]


Epoch 4/4 - Loss: 0.2924 - Train: 0.9047
Loss: 0.2297 - Validation: 0.9240
