# Prediction of reactions to the posts
# Task 2
The notebook was made and tested on Google Colab

## Install dependencies

Installing all the dependencies using pip

In [1]:
!pip install gdown > /dev/null
!pip install pandas > /dev/null
!pip install numpy > /dev/null
!pip install torch torchtext > /dev/null

## Imports

Importing torch and other utility libraries

In [2]:
import gdown
import pandas as pd
import numpy as np

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence

## Download and prepare data

Download the dataset, then read it using Pandas.
Splitting the data into train, val, test.

In [4]:
# The given URL is transformed
# https://drive.google.com/file/d/10CvDP3AFOTYmoXhWXLRDm6n_XSZV6Yev/view?usp=sharing
# Download dataset
URL = "https://drive.google.com/uc?id=10CvDP3AFOTYmoXhWXLRDm6n_XSZV6Yev"
datafile = "dataset.csv"
gdown.download(URL, datafile, quiet=True)
df = pd.read_csv(datafile)

In [5]:
# Counting classes and reading the dataset

classes = pd.unique(df['Label'])
N_classes = len(classes)
df['Label'] = df['Label'].replace(classes, np.arange(N_classes, dtype=int))
text = df['FBPost'].to_numpy()
labels = torch.tensor(df['Label'].values)

In [6]:
#Look at the distribution of the labels
df.groupby(["Label"]).count()

Unnamed: 0_level_0,Unnamed: 0,FBPost
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,280,280
1,79,79
2,641,641


### Define some important functions

In [7]:
def shuffle_dataset(text, labels):
  p = np.random.permutation(text.shape[0])
  return text[p], labels[p]

In [8]:
def split_dataset(text, labels, train_split, val_split):
  N_data = text.shape[0]

  train_split = int(N_data * 0.7)
  val_split = int(N_data * (0.7+0.15))

  # Split
  train_sentences = text[0:train_split]
  train_labels = labels[0:train_split]
  val_sentences = text[train_split:val_split]
  val_labels = labels[train_split:val_split]
  test_sentences = text[val_split:]
  test_labels = labels[val_split:]

  return train_sentences, train_labels, val_sentences, val_labels, test_sentences, test_labels

In [9]:
def token_iterator(dataset, tokenizer):
    for sentence in dataset:
        yield tokenizer(str(text))

In [13]:
def build_vocab(sentences):
  tokenizer = get_tokenizer('basic_english')
  vocab = build_vocab_from_iterator(token_iterator(sentences, tokenizer), specials=["<UNK>"])
  vocab.set_default_index(vocab["<UNK>"])
  vocab.append_token("<PAD>")
  pad_index = vocab["<PAD>"]
  return tokenizer, vocab, pad_index

In [14]:
def truncate_or_pad(sentence, max_len, pad_index):
  if len(sentence) > max_len:
    return sentence[0:max_len]
  else:
    return F.pad(sentence, (0, max_len-len(sentence)),'constant', pad_index)

### Create datasets

In [17]:
#Shuffle and split dataset, then build a vocabulary from tokenized sentences
text, labels = shuffle_dataset(text, labels)
train_sentences, train_labels, val_sentences, val_labels, test_sentences, test_labels = split_dataset(text, labels, 0.7, 0.15)
tokenizer, vocab, pad_index = build_vocab(train_sentences)

In [18]:
MAX_LEN = 300
#Pad the sentences to max length (or truncate them if they are longer) and stack them into a tensor
train_vectorized = torch.stack([truncate_or_pad(torch.tensor(vocab.forward(tokenizer(sentence))), MAX_LEN, pad_index) for sentence in train_sentences])
val_vectorized = torch.stack([truncate_or_pad(torch.tensor(vocab.forward(tokenizer(sentence))), MAX_LEN, pad_index) for sentence in val_sentences])
test_vectorized = torch.stack([truncate_or_pad(torch.tensor(vocab.forward(tokenizer(sentence))), MAX_LEN, pad_index) for sentence in test_sentences])

In [19]:
#Create dataloaders for each dataset
BATCH_SIZE = 50

train = DataLoader(TensorDataset(train_vectorized, train_labels), batch_size=BATCH_SIZE, shuffle=True)
val = DataLoader(TensorDataset(val_vectorized, val_labels), batch_size=BATCH_SIZE, shuffle=True)
test = DataLoader(TensorDataset(test_vectorized, test_labels), batch_size=BATCH_SIZE, shuffle=True)

## Model

In [42]:
class SentenceClassifier(nn.Module):
  def __init__(self, embedding_dimension, hidden_dimension, vocabulary_size, number_of_classes, pad_index):
    super(SentenceClassifier, self).__init__()
    self.pad_index = pad_index

    self.embedding = nn.Embedding(vocabulary_size, embedding_dimension)
    self.lstm = nn.LSTM(embedding_dimension, hidden_dimension, batch_first=True)
    self.last = nn.Linear(hidden_dimension, number_of_classes)

  def forward(self, sentences):
    embeddings = self.embedding(sentences)
    lstm_out, _ = self.lstm(embeddings)

    lengths = (MAX_LEN - (sentences == self.pad_index).sum(dim=1)) -1
    last_values = self.last(lstm_out[range(0, sentences.size()[0]), lengths, :])
    class_scores = F.log_softmax(last_values, dim=1)

    return class_scores

In [43]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 64
VOCABULARY_SIZE = len(vocab)

model = SentenceClassifier(EMBEDDING_DIM,HIDDEN_DIM,VOCABULARY_SIZE, N_classes, pad_index)
loss_function = nn.NLLLoss()
val_loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [44]:
print(model)

SentenceClassifier(
  (embedding): Embedding(3003, 128)
  (lstm): LSTM(128, 64, batch_first=True)
  (last): Linear(in_features=64, out_features=3, bias=True)
)


In [45]:
class EarlyStopping:
  def __init__(self, patience=1, min_delta=0):
    self.patience = patience
    self.min_delta = min_delta
    self.counter = 0
    self.val_loss = np.inf

  def early_stop(self, val_loss):
    if val_loss < self.val_loss:
      self.val_loss = val_loss
      self.counter = 0
    elif val_loss > (self.val_loss + self.min_delta):
      self.counter += 1
    if self.counter >= self.patience:
      return True
    return False

In [46]:
EPOCHS = 100

early_stopper = EarlyStopping(10, 0.01)

for epoch in range(EPOCHS):
  running_loss = 0
  correct_predictions = 0
  total_step = len(train)
  for index, (sentences, labels) in enumerate(train):
    model.zero_grad()

    class_scores = model(sentences)

    loss = loss_function(class_scores, labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    correct_predictions +=(torch.argmax(class_scores, dim=1) == labels).sum()
    if index % 2 == 0:
      print(f'[EPOCH: {epoch + 1}/{EPOCHS}, STEP:{index+1}/{total_step}] loss: {running_loss/5:.3f}')
      running_loss = 0

    #Early stopping
    with torch.no_grad():
      val_loss = 0
      for sentences, labels in val:
        class_scores = model(sentences)
        val_loss += val_loss_function(class_scores, labels)
      if early_stopper.early_stop(val_loss):
        break
  print(f'[EPOCH: {epoch + 1}/{EPOCHS}] accuracy: {correct_predictions/len(train.dataset):.3f} val_loss: {val_loss:.3f}')
  print("---------------------------------------------")

[EPOCH: 1/100, STEP:1/14] loss: 0.219
[EPOCH: 1/100, STEP:3/14] loss: 0.438
[EPOCH: 1/100, STEP:5/14] loss: 0.442
[EPOCH: 1/100, STEP:7/14] loss: 0.442
[EPOCH: 1/100, STEP:9/14] loss: 0.445
[EPOCH: 1/100, STEP:11/14] loss: 0.442
[EPOCH: 1/100, STEP:13/14] loss: 0.445
[EPOCH: 1/100] accuracy: 0.324 val_loss: 3.296
---------------------------------------------
[EPOCH: 2/100, STEP:1/14] loss: 0.217
[EPOCH: 2/100, STEP:3/14] loss: 0.441
[EPOCH: 2/100, STEP:5/14] loss: 0.441
[EPOCH: 2/100, STEP:7/14] loss: 0.437
[EPOCH: 2/100, STEP:9/14] loss: 0.442
[EPOCH: 2/100, STEP:11/14] loss: 0.436
[EPOCH: 2/100, STEP:13/14] loss: 0.444
[EPOCH: 2/100] accuracy: 0.346 val_loss: 3.277
---------------------------------------------
[EPOCH: 3/100, STEP:1/14] loss: 0.216
[EPOCH: 3/100, STEP:3/14] loss: 0.436
[EPOCH: 3/100, STEP:5/14] loss: 0.438
[EPOCH: 3/100, STEP:7/14] loss: 0.431
[EPOCH: 3/100, STEP:9/14] loss: 0.442
[EPOCH: 3/100, STEP:11/14] loss: 0.441
[EPOCH: 3/100, STEP:13/14] loss: 0.439
[EPOCH: 3/

## Evaluate on test dataset

In [52]:
correct_predictions = 0
for sentences, labels in test:
  class_scores = model(sentences)
  correct_predictions +=(torch.argmax(class_scores, dim=1) == labels).sum()
accuracy = correct_predictions/(len(test.dataset))
print(f"Test accuracy: {accuracy:.2%}")

Test accuracy: 67.33%
