In [1]:
import pandas as pd
from datasets import Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load data
train = Dataset.load_from_disk("data/train")
val = Dataset.load_from_disk("data/val")
dev = Dataset.load_from_disk("data/dev")

In [3]:
# only want the text and label columns
train_val_df = pd.concat([pd.DataFrame(train), pd.DataFrame(val)])[['text', 'pcl']]
test_df = pd.DataFrame(dev)[['text', 'pcl']]

# Bag of Words Model with Logistic Regression

In [55]:
# transform data into bag of words representation
vectorizer = CountVectorizer(token_pattern="[^\W\d_]+")
train_X = vectorizer.fit_transform(train_val_df.text.values)
test_X = vectorizer.transform(test_df.text.values)

In [51]:
# run a logistic regression
logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_val_df.pcl.values)

In [52]:
def f1_score(pred_y, true_y):
    tp = sum((true_y == 1) & (pred_y == 1))
    fp = sum((true_y == 0) & (pred_y == 1))
    fn = sum((true_y == 1) & (pred_y == 0))
    return tp / (tp + 0.5 * (fp + fn))

In [53]:
# calculate F1 score on 
pred_y = logistic_reg.predict(test_X)
f1_score(pred_y, test_df.pcl.values)

0.2198581560283688

# Neural Models

In [4]:
def text_iterator(tokenizer, series):
    for text in series.values:
        yield tokenizer(text)

def word2int(series, vocab, tokenizer):
    return series.apply(lambda x: vocab(tokenizer(x)))

def add_padding(ls):
    padded_tensors = pad_sequence(ls, batch_first=True)
    return padded_tensors.long()


In [6]:
# generate vocabulary of the training data
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(text_iterator(tokenizer, train_val_df.text), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [7]:
# convert input text of training and testing data to integers from vocabulary
train_tokens = word2int(train_val_df['text'], vocab, tokenizer)
test_tokens = word2int(test_df['text'], vocab, tokenizer)

In [8]:
# pad the tokens such that the input tensor is the same size
all_tokens = [torch.Tensor(t) for t in train_tokens] + [torch.Tensor(t) for t in test_tokens]
all_padded_tokens = add_padding(all_tokens)
train_X = all_padded_tokens[:len(train_tokens)]
test_X = all_padded_tokens[len(train_tokens):]

In [9]:
# convert pcl labels to tensor
train_y = torch.LongTensor(train_val_df['pcl'].values)
test_y = torch.LongTensor(test_df['pcl'].values)

In [10]:
def torch_f1_score(pred_y, true_y):
    tp = torch.sum((true_y == 1) & (pred_y == 1)).item()
    fp = torch.sum((true_y == 0) & (pred_y == 1)).item()
    fn = torch.sum((true_y == 1) & (pred_y == 0)).item()
    return tp / (tp + 0.5 * (fp + fn))

In [11]:
def train_neural_model(model, train_X, train_y, n_epochs, loss_fn, optimizer, batch_size):

    n_batches = 1 + len(train_X) // batch_size
    for epoch in range(n_epochs):
        
        model.train()
        for i in range(n_batches):

            batch_X = train_X[i*batch_size:(i+1)*batch_size]
            batch_y = train_y[i*batch_size:(i+1)*batch_size]
            optimizer.zero_grad()
            
            pred_y = model(batch_X)
            loss = loss_fn(pred_y.view(-1), batch_y.float()) 
            loss.backward()
            optimizer.step()
        
        model.eval()
        print(f"Epoch {epoch}: F1={torch_f1_score(model.predict(train_X), train_y)}")
        print(f"Epoch {epoch}: loss={loss}")

## FC Model

In [19]:
class NeuralModel(nn.Module):
    def __init__(self, vocab_size, num_tokens, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim*num_tokens, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out_layer = nn.Linear(hidden_dim, 1)

        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    def forward(self, x):
        x = self.embedding_layer(x).view(x.shape[0], -1)
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        return self.out_layer(x)
    
    def predict(self, x):
        probs = nn.functional.sigmoid(self.forward(x))
        return torch.where(probs >= 0.5, 1, 0)

In [24]:
# train for feedforward neural network with embeddings
torch.manual_seed(100)
np.random.seed(100)
random.seed(100)
neural_model = NeuralModel(len(vocab), train_X.shape[1], 64, 64)
optimizer = torch.optim.AdamW(neural_model.parameters(), lr=0.001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [25]:
train_neural_model(neural_model, train_X, train_y, 10, loss_fn, optimizer, batch_size=64)

Epoch 0: F1=0.0
Epoch 0: loss=0.38029417395591736
Epoch 1: F1=0.06487964485773287
Epoch 1: loss=0.2724159359931946
Epoch 2: F1=0.08850255104597897
Epoch 2: loss=0.0307866632938385
Epoch 3: F1=0.09679327161970665
Epoch 3: loss=0.022862521931529045
Epoch 4: F1=0.09730545311456208
Epoch 4: loss=0.04323570430278778
Epoch 5: F1=0.09480597014925374
Epoch 5: loss=0.0005230935057625175
Epoch 6: F1=0.09486563407005251
Epoch 6: loss=0.0003881649754475802
Epoch 7: F1=0.09486563407005251
Epoch 7: loss=0.00018992534023709595
Epoch 8: F1=0.09486563407005251
Epoch 8: loss=0.00014004850527271628
Epoch 9: F1=0.09486563407005251
Epoch 9: loss=0.00010671108611859381


In [26]:
neural_model.eval()
pred_y = neural_model.predict(test_X)
torch_f1_score(pred_y, test_y)

0.06649304091759335

## RNN Model

In [12]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, num_tokens, embedding_dim, hidden_dim, n_layers):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(2*hidden_dim*num_tokens, 1)

        self.prev_hid_states = None
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

    def forward(self, x):
        # initialise hidden state for RNN
        if self.prev_hid_states is None:
            self.prev_hid_states = torch.zeros(2*self.n_layers, x.shape[0], self.hidden_dim)

        x = self.embedding_layer(x)
        x, final_hid_state = self.rnn(x, self.prev_hid_states[:,-x.shape[0]:])
        
        self.prev_histories = final_hid_state.detach()
        x = x.reshape(x.shape[0], -1)
        return self.out_layer(x)
    
    def predict(self, x):
        self.prev_hid_states = None
        probs = nn.functional.sigmoid(self.forward(x))
        return torch.where(probs >= 0.5, 1, 0)

In [13]:
# train for RNN model
torch.manual_seed(100)
np.random.seed(100)
random.seed(100)
rnn_model = RNNModel(len(vocab), train_X.shape[1], 64, 64, 1)
optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [14]:
train_neural_model(rnn_model, train_X, train_y, 20, loss_fn, optimizer, batch_size=64)

Epoch 0: F1=0.04069811181481318
Epoch 0: loss=0.42772576212882996
Epoch 1: F1=0.06721203737410508
Epoch 1: loss=0.2608868479728699
Epoch 2: F1=0.07767745658325473
Epoch 2: loss=0.17888250946998596
Epoch 3: F1=0.0881618859878505
Epoch 3: loss=0.0926768034696579
Epoch 4: F1=0.09353543704578575
Epoch 4: loss=0.048933934420347214
Epoch 5: F1=0.09365790550117481
Epoch 5: loss=0.026958640664815903
Epoch 6: F1=0.09438620937060428
Epoch 6: loss=0.014017196372151375
Epoch 7: F1=0.0946864165929495
Epoch 7: loss=0.0077195740304887295
Epoch 8: F1=0.09480597014925374
Epoch 8: loss=0.005810909438878298
Epoch 9: F1=0.09480597014925374
Epoch 9: loss=0.005253401584923267
Epoch 10: F1=0.09480597014925374
Epoch 10: loss=0.0038581143599003553
Epoch 11: F1=0.09480597014925374
Epoch 11: loss=0.001733487704768777
Epoch 12: F1=0.09480597014925374
Epoch 12: loss=0.0066747358068823814
Epoch 13: F1=0.09480597014925374
Epoch 13: loss=0.0030285895336419344
Epoch 14: F1=0.09480597014925374
Epoch 14: loss=0.00165363

In [15]:
rnn_model.eval()
pred_y = rnn_model.predict(test_X)
torch_f1_score(pred_y, test_y)

0.07372266529457165