In [45]:
import pandas as pd
from datasets import Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

In [132]:
# load data
train = Dataset.load_from_disk("unaugmented_data/train")
val = Dataset.load_from_disk("unaugmented_data/val")
dev = Dataset.load_from_disk("unaugmented_data/dev")

In [133]:
# only want the text and label columns
train_val_df = pd.concat([pd.DataFrame(train), pd.DataFrame(val)])[['text', 'pcl']]
test_df = pd.DataFrame(dev)[['text', 'pcl']]

# Bag of Words Model with Logistic Regression

In [134]:
# transform data into bag of words representation
vectorizer = CountVectorizer(token_pattern="[^\W\d_]+")
train_X = vectorizer.fit_transform(train_val_df.text.values)
test_X = vectorizer.transform(test_df.text.values)

In [135]:
# run a logistic regression
np.random.seed(100)
logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_val_df.pcl.values)

In [136]:
def f1_score(pred_y, true_y):
    tp = sum((true_y == 1) & (pred_y == 1))
    fp = sum((true_y == 0) & (pred_y == 1))
    fn = sum((true_y == 1) & (pred_y == 0))
    return tp / (tp + 0.5 * (fp + fn))

In [137]:
def recall(pred_y, true_y):
    tp = sum((true_y == 1) & (pred_y == 1))
    fn = sum((true_y == 1) & (pred_y == 0))
    return tp  / (tp + fn)


def precision(pred_y, true_y):
    tp = sum((true_y == 1) & (pred_y == 1))
    fp = sum((true_y == 0) & (pred_y == 1))
    return tp / (tp + fp)

In [138]:
def accuracy(pred_y, true_y):
    return sum(pred_y==true_y)/len(true_y)

In [139]:
# calculate F1 score and accuracy
pred_y = logistic_reg.predict(test_X)
print(f'Accuracy of BoW Logistic Regression is {accuracy(pred_y, test_df.pcl)}')
print(f'Precision of BoW Logistic Regression is {precision(pred_y, test_df.pcl)}')
print(f'Recall of BoW Logistic Regression is {precision(pred_y, test_df.pcl)}')
print(f'F1 Score of BoW Logistic Regression is {f1_score(pred_y, test_df.pcl.values)}')

Accuracy of BoW Logistic Regression is 0.8948877209746775
Precision of BoW Logistic Regression is 0.37349397590361444
Recall of BoW Logistic Regression is 0.37349397590361444
F1 Score of BoW Logistic Regression is 0.2198581560283688


## Examples of Misclassified Text

In [97]:
bow_misclassified = test_df[test_df.pcl != pred_y]

In [109]:
bow_example1 = bow_misclassified.loc[16]
print(f'Correct PCL = {bow_example1.pcl}, PCL Score = {pd.DataFrame(dev).labels.loc[16]}')
bow_example1.text

Correct PCL = 1, PCL Score = 3.0


"Sheepherding in America has always been an immigrant 's job , too dirty , too cold and too lonely for anyone with options ."

# Neural Models

In [145]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


In [146]:
def text_iterator(tokenizer, series):
    for text in series.values:
        yield tokenizer(text)

def word2int(series, vocab, tokenizer):
    return series.apply(lambda x: vocab(tokenizer(x)))

def add_padding(ls):
    padded_tensors = pad_sequence(ls, batch_first=True)
    return padded_tensors.long()


In [147]:
# generate vocabulary of the training data
tokenizer = get_tokenizer("basic_english")
vocab = build_vocab_from_iterator(text_iterator(tokenizer, train_val_df.text), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [148]:
# convert input text of training and testing data to integers from vocabulary
train_tokens = word2int(train_val_df['text'], vocab, tokenizer)
test_tokens = word2int(test_df['text'], vocab, tokenizer)

In [149]:
# pad the tokens such that the input tensor is the same size
all_tokens = [torch.Tensor(t) for t in train_tokens] + [torch.Tensor(t) for t in test_tokens]
all_padded_tokens = add_padding(all_tokens)
train_X = all_padded_tokens[:len(train_tokens)]
test_X = all_padded_tokens[len(train_tokens):]

In [150]:
# convert pcl labels to tensor
train_y = torch.LongTensor(train_val_df['pcl'].values)
test_y = torch.LongTensor(test_df['pcl'].values)

In [151]:
def torch_f1_score(pred_y, true_y):
    tp = torch.sum((true_y == 1) & (pred_y == 1)).item()
    fp = torch.sum((true_y == 0) & (pred_y == 1)).item()
    fn = torch.sum((true_y == 1) & (pred_y == 0)).item()
    return tp / (tp + 0.5 * (fp + fn))

def torch_recall(pred_y, true_y):
    tp = torch.sum((true_y == 1) & (pred_y == 1)).item()
    fn = torch.sum((true_y == 1) & (pred_y == 0)).item()
    return tp / (tp + fn)

def torch_precision(pred_y, true_y):
    tp = torch.sum((true_y == 1) & (pred_y == 1)).item()
    fp = torch.sum((true_y == 0) & (pred_y == 1)).item()
    return tp / (tp + fp)

def torch_accuracy(pred_y, true_y):
    return torch.sum(pred_y == true_y) / len(true_y)

In [63]:
def train_neural_model(model, train_X, train_y, n_epochs, loss_fn, optimizer, batch_size):

    n_batches = 1 + len(train_X) // batch_size
    for epoch in range(n_epochs):
        
        model.train()
        for i in range(n_batches):

            batch_X = train_X[i*batch_size:(i+1)*batch_size].to(DEVICE)
            batch_y = train_y[i*batch_size:(i+1)*batch_size].to(DEVICE)
            optimizer.zero_grad()
            torch.cuda.empty_cache()
            pred_y = model(batch_X)
            loss = loss_fn(pred_y.view(-1), batch_y.float()) 
            loss.backward()
            optimizer.step()
            torch.cuda.empty_cache()

        model.eval()
        print(f"Epoch {epoch}: loss={loss}")

## FC Model

In [24]:
class NeuralModel(nn.Module):
    def __init__(self, vocab_size, num_tokens, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc1 = nn.Linear(embedding_dim*num_tokens, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, hidden_dim)
        self.out_layer = nn.Linear(hidden_dim, 1)

        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

    def forward(self, x):
        x = self.embedding_layer(x).view(x.shape[0], -1)
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        x = nn.functional.relu(self.fc3(x))
        x = nn.functional.relu(self.fc4(x))
        return self.out_layer(x)
    
    def predict(self, x):
        probs = nn.functional.sigmoid(self.forward(x))
        return torch.where(probs >= 0.5, 1, 0)

In [25]:
# train for feedforward neural network with embeddings
torch.manual_seed(100)
np.random.seed(100)
random.seed(100)
neural_model = NeuralModel(len(vocab), train_X.shape[1], 128, 128).to(DEVICE)
optimizer = torch.optim.AdamW(neural_model.parameters(), lr=0.0001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [26]:
train_neural_model(neural_model, train_X.to(DEVICE), train_y.to(DEVICE), 30, loss_fn, optimizer, batch_size=64)

Epoch 0: F1=0.0
Epoch 0: loss=0.4651470184326172
Epoch 1: F1=0.00023850558528114147
Epoch 1: loss=0.2583751678466797
Epoch 2: F1=0.08733484750162232
Epoch 2: loss=0.07993243634700775
Epoch 3: F1=0.09260686259424526
Epoch 3: loss=0.009527729824185371
Epoch 4: F1=0.09390190813257299
Epoch 4: loss=0.0023256358690559864
Epoch 5: F1=0.09462652667263054
Epoch 5: loss=0.0023603145964443684
Epoch 6: F1=0.0947462310376285
Epoch 6: loss=0.0007915312307886779
Epoch 7: F1=0.09480597014925374
Epoch 7: loss=0.00037500273901969194
Epoch 8: F1=0.09480597014925374
Epoch 8: loss=0.0002443444973323494
Epoch 9: F1=0.09480597014925374
Epoch 9: loss=0.0001699640997685492
Epoch 10: F1=0.09480597014925374
Epoch 10: loss=0.0001245349703822285
Epoch 11: F1=0.09480597014925374
Epoch 11: loss=9.506844071438536e-05
Epoch 12: F1=0.09480597014925374
Epoch 12: loss=7.464035297743976e-05
Epoch 13: F1=0.09480597014925374
Epoch 13: loss=5.981963477097452e-05
Epoch 14: F1=0.09480597014925374
Epoch 14: loss=4.879498374066

In [27]:
neural_model.eval()
pred_y = neural_model.predict(test_X.to(DEVICE)).flatten()
print(f'Accuracy of Fully-Connected Neural Model is {torch_accuracy(pred_y.to(DEVICE), test_y.to(DEVICE))}')
print(f'F1 Score of Fully-Connected Neural Model is {torch_f1_score(pred_y.to(DEVICE), test_y.to(DEVICE))}')

Accuracy of Fully-Connected Neural Model is 0.8881987929344177
F1 Score of Fully-Connected Neural Model is 0.1


## RNN Model

In [64]:
class RNNModel(nn.Module):
    def __init__(self, vocab_size, num_tokens, embedding_dim, hidden_dim, n_layers):
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, n_layers, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(2*hidden_dim*num_tokens, 1)

        self.prev_hid_states = None
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

    def forward(self, x):
        # initialise hidden state for RNN
        if self.prev_hid_states is None:
            self.prev_hid_states = torch.zeros(2*self.n_layers, x.shape[0], self.hidden_dim).to(DEVICE).contiguous()
        torch.cuda.empty_cache()
        x = self.embedding_layer(x.to(DEVICE))
        x, final_hid_state = self.rnn(x, self.prev_hid_states[:,-x.shape[0]:].to(DEVICE).contiguous())
        torch.cuda.empty_cache()

        self.prev_histories = final_hid_state.detach()
        x = x.reshape(x.shape[0], -1)
        torch.cuda.empty_cache()
        return self.out_layer(x)
    
    def predict(self, x):
        self.prev_hid_states = None
        torch.cuda.empty_cache()
        probs = nn.functional.sigmoid(self.forward(x))
        return torch.where(probs >= 0.5, 1, 0)

In [65]:
# train for RNN model
torch.manual_seed(100)
np.random.seed(100)
random.seed(100)
rnn_model = RNNModel(len(vocab), train_X.shape[1], 64, 64, 5).to(DEVICE)
optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=0.0001)
loss_fn = torch.nn.BCEWithLogitsLoss()

In [66]:
train_neural_model(rnn_model, train_X, train_y, 30, loss_fn, optimizer, batch_size=8)
torch.cuda.empty_cache()

Epoch 0: loss=0.4061359167098999
Epoch 1: loss=0.30582019686698914
Epoch 2: loss=0.11218927800655365
Epoch 3: loss=0.057111956179142
Epoch 4: loss=0.03733159974217415
Epoch 5: loss=0.023701535537838936
Epoch 6: loss=0.018041936680674553
Epoch 7: loss=0.010944971814751625
Epoch 8: loss=0.009154314175248146
Epoch 9: loss=0.003790356684476137
Epoch 10: loss=0.003909831866621971
Epoch 11: loss=0.0019716673996299505
Epoch 12: loss=0.0020418993663042784
Epoch 13: loss=0.00120650720782578
Epoch 14: loss=0.0004596534126903862
Epoch 15: loss=0.000764906988479197
Epoch 16: loss=0.004254969768226147
Epoch 17: loss=0.0065689594484865665
Epoch 18: loss=0.0008361065993085504
Epoch 19: loss=0.0006287939031608403
Epoch 20: loss=0.0004637915117200464
Epoch 21: loss=0.00031181698432192206
Epoch 22: loss=0.00018825543520506471
Epoch 23: loss=0.00010579312220215797
Epoch 24: loss=5.7114451919915155e-05
Epoch 25: loss=3.010783257195726e-05
Epoch 26: loss=1.5377743693534285e-05
Epoch 27: loss=8.021014764381

In [152]:
rnn_model.eval()
completed_n = 0
batch_size = 8
all_pred = torch.empty(len(test_X))
while completed_n < len(test_X):
    batch_X = test_X[completed_n: completed_n+batch_size].to(DEVICE)
    all_pred[completed_n: completed_n+batch_size] = rnn_model.predict(batch_X).cpu().flatten()
    torch.cuda.empty_cache()
    completed_n += batch_size

print(f'Accuracy of Bidirectional RNN Neural Model is {torch_accuracy(all_pred, test_y)}')
print(f'Recall of Bidirectional RNN Neural Model is {torch_recall(all_pred, test_y)}')
print(f'Precision of Bidirectional RNN Neural Model is {torch_precision(all_pred, test_y)}')
print(f'F1 Score of Bidirectional RNN Neural Model is {torch_f1_score(all_pred, test_y)}')


Accuracy of Bidirectional RNN Neural Model is 0.8738652467727661
Recall of Bidirectional RNN Neural Model is 0.11557788944723618
Precision of Bidirectional RNN Neural Model is 0.2072072072072072
F1 Score of Bidirectional RNN Neural Model is 0.14838709677419354


## Examples of Misclassified Text

In [111]:
rnn_misclassified = test_df[(all_pred != test_y).detach().numpy()]
rnn_misclassified_0 = rnn_misclassified[rnn_misclassified.pcl == 0]

In [129]:
pd.DataFrame(dev).loc[459]

par_id                                                                8702
art_id                                                           @@4933018
keyword                                                           homeless
country_code                                                            sg
text                     On a single night in January this year , the n...
labels                                                                 0.0
pcl                                                                      0
label_category_vector                                [0, 0, 0, 0, 0, 0, 0]
input_ids                [101, 2006, 1037, 2309, 2305, 1999, 2254, 2023...
attention_mask           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
Name: 459, dtype: object

In [127]:
rnn_misclassified_0.loc[459].text

'On a single night in January this year , the number of homeless individuals in the US was 564,708 , according to The 2015 Annual Homeless Assessment Report released by The US Department of Housing and Urban Development last month .'