In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [None]:
import pandas as pd
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F
import torchtext
import nltk
import time
from datetime import timedelta
import numpy as np
from sklearn import metrics
from typing import Tuple
import os
import json
from typing import Dict, Tuple, Union, Optional
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel
from torch import nn, optim
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import ConcatDataset

# Pre-process the Data

In [None]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def get_formatted_data(df):
  one_hot_labels = pd.get_dummies(df.iloc[:, 1])
  data = df.to_numpy()
  X = data[:, 0]
  y = torch.tensor(one_hot_labels.values)
  
  dataset = MyDataset(X, y)
  return dataset

## Poem Data Preprocessing

In [None]:
poem_data = pd.read_excel('./PERC_mendelly.xlsx')
labels_to_drop = ["peace", "courage", "hate"]
indices = poem_data[poem_data["Emotion"].isin(labels_to_drop)].index
poem_data["Emotion"] = poem_data["Emotion"].replace("sad", "sadness")
poem_data.drop(indices, inplace=True)
poem_data

Unnamed: 0,Poem,Emotion
0,A Tree\nA tree beside the sandy river-beach \n...,sadness
1,"Sri Krishna\n\nO immense Light and thou, O spi...",love
3,Revelation\n\n\nSomeone leaping from the rocks...,sadness
4,The Silver Call\n\n\nThere is a godhead of unr...,joy
5,Surrender\n\nO THOU of whom I am the instrumen...,love
...,...,...
711,Daughter Taken By Mothers Lies\n\nHave you any...,sadness
712,Involuntary Acceptance\n\nEven though\nWe’re f...,sadness
713,Victim Of Poverty\n\nPoverty stricken youth ju...,sadness
714,Rain\n\nI sit and watch\nas the rain falls \nf...,sadness


In [None]:
# Split the dataset into training and testing sets
poem_train_data, poem_test_data = train_test_split(poem_data, test_size=0.9, random_state=42)

# Split the training set into testing and validation sets
poem_test_data, poem_val_data = train_test_split(poem_test_data, test_size=0.5, random_state=42)

In [None]:
poem_train_ds = get_formatted_data(poem_train_data)
poem_test_ds = get_formatted_data(poem_test_data)
poem_val_ds = get_formatted_data(poem_val_data)

## Twitter Message Data Pre processing

In [None]:
text_data_train = pd.read_table('./train.txt', sep=';', header=None)
text_data_train.to_csv("train_cleaned.txt")
print(text_data_train)

                                                       0        1
0                                i didnt feel humiliated  sadness
1      i can go from feeling so hopeless to so damned...  sadness
2       im grabbing a minute to post i feel greedy wrong    anger
3      i am ever feeling nostalgic about the fireplac...     love
4                                   i am feeling grouchy    anger
...                                                  ...      ...
15995  i just had a very brief time in the beanbag an...  sadness
15996  i am now turning and i feel pathetic that i am...  sadness
15997                     i feel strong and good overall      joy
15998  i feel like this was such a rude comment and i...    anger
15999  i know a lot but i feel so stupid because i ca...  sadness

[16000 rows x 2 columns]


In [None]:
text_data_test = pd.read_table('./test.txt', sep=';', header=None)
text_data_test.to_csv("test_cleaned.csv")
print(set(text_data_test[1]))

{'surprise', 'fear', 'love', 'anger', 'joy', 'sadness'}


In [None]:
text_data_val = pd.read_table('./val.txt', sep=';', header=None)
# one_hot_labels_val = pd.get_dummies(text_data_val[1])
# one_hot_labels_val = one_hot_labels_val.reindex(columns=one_hot_labels.columns, fill_value=0)
# text_data_val[1] = one_hot_labels_val.values
text_data_val.to_csv("val_cleaned.csv")
print(set(text_data_val[1]))

{'surprise', 'fear', 'love', 'anger', 'joy', 'sadness'}


In [None]:
text_train_ds = get_formatted_data(text_data_train)
text_test_ds = get_formatted_data(text_data_test)
text_val_ds = get_formatted_data(text_data_val)

## Reddit Data Pre-processing

In [None]:
dataset = load_dataset("go_emotions", "raw")
columns_to_remove = ["id", "author", "subreddit", "link_id", "parent_id", "created_utc", "rater_id", "example_very_unclear",
                    'admiration', 'amusement', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'gratitude', 'grief', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'neutral']

dataset_dict = dataset.remove_columns(columns_to_remove)

# Convert dataset to pandas DataFrame
df = pd.DataFrame(dataset_dict["train"])

# Count the number of 1s in each row
counts = df.iloc[:, 1:].sum(axis=1)

# Filter the rows where the count is less than or equal to 1
reddit_data = df[counts == 1]

print(reddit_data)

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.03k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.11k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/raw to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to /root/.cache/huggingface/datasets/go_emotions/raw/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

                                                     text  anger  fear  joy  \
0                                         That game hurt.      0     0    0   
3                                      Man I love reddit.      0     0    0   
37      I just came home, what the fuck is this lineup...      0     0    0   
43      By far the coolest thing I've seen on this thr...      0     0    1   
49                     Sending love and strength vibes <3      0     0    1   
...                                                   ...    ...   ...  ...   
211187  I just called the Capitol Police. They are not...      1     0    0   
211212    What a great photo and you two look so happy. 😍      0     0    1   
211219  Well, I'm glad you're out of all that now. How...      0     0    1   
211220                             Everyone likes [NAME].      0     0    0   
211223  The FDA has plenty to criticize. But like here...      1     0    0   

        love  sadness  surprise  
0          0     

In [None]:
# Split the dataset into training and testing sets
reddit_train_data, reddit_test_data = train_test_split(reddit_data, test_size=0.9, random_state=42)

# Split the training set into training and validation sets
reddit_test_data, reddit_val_data = train_test_split(reddit_test_data, test_size=0.5, random_state=42)

In [None]:
def get_format_df(filtered_df):
  X = filtered_df['text'].to_numpy()
  y = torch.tensor(filtered_df.iloc[:, 1:].values)

  return MyDataset(X, y)

In [None]:
reddit_train_ds = get_format_df(reddit_train_data)
reddit_test_ds = get_format_df(reddit_test_data)
reddit_val_ds = get_format_df(reddit_val_data)

## Get DataLoaders for Training and Testing

In [None]:
train_dl = DataLoader(text_train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(text_val_ds, batch_size=32, shuffle=True)

### Combine datasets to Train, Test and Val

---



In [None]:
def get_dl(ds1, ds2, ds3):
  combined_dataset = ConcatDataset([ds1, ds2, ds3])
  return DataLoader(combined_dataset, batch_size=32, shuffle=True)

combined_train_dl = get_dl(text_train_ds, poem_train_ds, reddit_train_ds)
combined_val_dl = get_dl(text_val_ds, poem_val_ds, reddit_val_ds)

### Create Test DataLoaders

In [None]:
poem_test_dl = DataLoader(poem_test_ds, batch_size=32, shuffle=True)
reddit_test_dl = DataLoader(reddit_test_ds, batch_size=32, shuffle=True)
text_test_dl = DataLoader(text_test_ds, batch_size=32, shuffle=True)

# Model 2 - Att-BiLSTM

## Trainer Model

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

class Trainer:
  """
  Training pipeline

  Parameters
  ----------
  num_epochs : int
      We should train the model for __ epochs

  start_epoch : int
      We should start training the model from __th epoch

  train_loader : DataLoader
      DataLoader for training data

  model : nn.Module
      Model

  model_name : str
      Name of the model

  loss_function : nn.Module
      Loss function (cross entropy)

  optimizer : optim.Optimizer
      Optimizer (Adam)

  lr_decay : float
      A factor in interval (0, 1) to multiply the learning rate with

  dataset_name : str
      Name of the dataset

  grad_clip : float, optional
      Gradient threshold in clip gradients

  print_freq : int
      Print training status every __ batches

  """
  def __init__(
      self,
      num_epochs: int,
      train_loader: DataLoader,
      val_loader: DataLoader,
      model: nn.Module,
      loss_function: nn.Module,
      optimizer,
      tokenizer,
      grad_clip = Optional[None],
      print_freq: int = 100,
  ) -> None:

      self.num_epochs = num_epochs
      self.train_loader = train_loader
      self.val_loader = val_loader

      self.model = model
      self.loss_function = loss_function
      self.optimizer = optimizer

      self.print_freq = print_freq
      self.grad_clip = grad_clip

      self.tokenizer = tokenizer
      self.len_epoch = len(self.train_loader)

  def train(self, epoch: int) -> None:
    """
    Train an epoch

    Parameters
    ----------
    epoch : int
        Current number of epoch
    """
    self.model.train()  # training mode enables dropout
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    # Train step
    for i, batch in enumerate(self.train_loader):

      sentences, labels = batch
      tokenized = self.tokenizer(sentences, padding=True, return_tensors='pt')
      sentences = tokenized['input_ids']
      words_per_sentence = torch.sum(tokenized['attention_mask'], dim=-1)
      sentences = sentences.to(device)  # (batch_size, word_limit)
      labels = torch.argmax(labels, dim=-1).to(device)  # (batch_size)

      scores = self.model(sentences, words_per_sentence)  # (batch_size, n_classes)

      # calc loss
      train_loss = self.loss_function(scores, labels)  # scalar

      # backward
      self.optimizer.zero_grad()
      train_loss.backward()

      # grad clip
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.grad_clip)

      # update weights
      self.optimizer.step()

      # find accuracy
      _, predictions = scores.max(dim = 1)  # (n_documents)
      correct_predictions = torch.eq(predictions, labels).sum().item()
      train_accuracy = correct_predictions / labels.size(0)
      train_losses.append(train_loss.item())
      train_accs.append(train_accuracy)
    
    # Validation step
    with torch.no_grad():
      for i, batch in enumerate(self.val_loader):

        sentences, labels = batch
        tokenized = self.tokenizer(sentences, padding=True, return_tensors='pt', max_length=512, truncation=True)
        sentences = tokenized['input_ids']
        words_per_sentence = torch.sum(tokenized['attention_mask'], dim=-1)
        sentences = sentences.to(device)  # (batch_size, word_limit)
        labels = torch.argmax(labels, dim=-1).to(device)  # (batch_size)

        scores = self.model(sentences, words_per_sentence)  # (batch_size, n_classes)

        # calc loss
        val_loss = self.loss_function(scores, labels)  # scalar

        # find accuracy
        _, predictions = scores.max(dim = 1)  # (n_documents)
        correct_predictions = torch.eq(predictions, labels).sum().item()
        val_accuracy = correct_predictions / labels.size(0)
  
        val_losses.append(val_loss.item())
        val_accs.append(val_accuracy)
    
    # keep track of metrics

    # print training status
    print(f'\n Epoch: {epoch}: Train Loss {np.mean(train_losses)}, Train Accuracy {np.mean(train_accs)} \n Val Loss {np.mean(val_losses)}, Val Accuracy {np.mean(val_accs)}')

  def run_train(self):

    # epochs
    for epoch in tqdm(range(self.num_epochs)):
      # trian an epoch
      self.train(epoch=epoch)

cuda


In [None]:
class Attention(nn.Module):
    """
    Attention network
    Parameters
    ----------
    rnn_size : int
        Size of Bi-LSTM
    """
    def __init__(self, rnn_size: int) -> None:
        super(Attention, self).__init__()
        self.w = nn.Linear(rnn_size, 1)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, H: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Parameters
        ----------
        H : torch.Tensor (batch_size, word_pad_len, hidden_size)
            Output of Bi-LSTM
        Returns
        -------
        r : torch.Tensor (batch_size, rnn_size)
            Sentence representation
        alpha : torch.Tensor (batch_size, word_pad_len)
            Attention weights
        """
        # eq.9: M = tanh(H)
        M = self.tanh(H)  # (batch_size, word_pad_len, rnn_size)

        # eq.10: α = softmax(w^T M)
        alpha = self.w(M).squeeze(2)  # (batch_size, word_pad_len)
        alpha = self.softmax(alpha)  # (batch_size, word_pad_len)

        # eq.11: r = H
        r = H * alpha.unsqueeze(2)  # (batch_size, word_pad_len, rnn_size)
        r = r.sum(dim = 1)  # (batch_size, rnn_size)

        return r, alpha

In [None]:
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence

def get_pretrained_embeddings():

    model = BertModel.from_pretrained("bert-base-uncased")
    embedding_matrix = model.embeddings.word_embeddings.weight
    return embedding_matrix

class AttBiLSTM(nn.Module):
    """
    Implementation of Attention-based bidirectional LSTM proposed in paper [1].
    Parameters
    ----------
    n_classes : int
        Number of classes
    vocab_size : int
        Number of words in the vocabulary
    embeddings : torch.Tensor
        Word embedding weights
    emb_size : int
        Size of word embeddings
    fine_tune : bool
        Allow fine-tuning of embedding layer? (only makes sense when using
        pre-trained embeddings)
    rnn_size : int
        Size of Bi-LSTM
    rnn_layers : int
        Number of layers in Bi-LSTM
    dropout : float
        Dropout
    References
    ----------
    1. "`Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification. \
        <https://www.aclweb.org/anthology/P16-2034.pdf>`_" Peng Zhou, et al. ACL 2016.
    """
    def __init__(
        self,
        n_classes: int,
        vocab_size: int,
        emb_size: int,
        rnn_size: int,
        rnn_layers: int,
        dropout: float
    ) -> None:
        super(AttBiLSTM, self).__init__()

        self.rnn_size = rnn_size

        # embedding layer
        self.embeddings = nn.Embedding(vocab_size, emb_size)
        self.embeddings.weight = get_pretrained_embeddings()
        self.embeddings.weight.requires_grad = False
        
        # bidirectional LSTM
        self.BiLSTM = nn.LSTM(
            emb_size, rnn_size,
            num_layers = rnn_layers,
            bidirectional = True,
            dropout = (0 if rnn_layers == 1 else dropout),
            batch_first = True
        )

        self.attention = Attention(rnn_size)
        self.fc = nn.Linear(rnn_size, n_classes)
        # self.fc2 = nn.Linear(emv, n_classes)

        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text: torch.Tensor, words_per_sentence: torch.Tensor) -> torch.Tensor:
        """
        Parameters
        ----------
        text : torch.Tensor (batch_size, word_pad_len)
            Input data
        words_per_sentence : torch.Tensor (batch_size)
            Sentence lengths
        Returns
        -------
        scores : torch.Tensor (batch_size, n_classes)
            Class scores
        """
        # word embedding, apply dropout
        embeddings = self.dropout(self.embeddings(text)) # (batch_size, word_pad_len, emb_size)

        # pack sequences (remove word-pads, SENTENCES -> WORDS)
        packed_words = pack_padded_sequence(
            embeddings,
            lengths = words_per_sentence.tolist(),
            batch_first = True,
            enforce_sorted = False
        )  # a PackedSequence object, where 'data' is the flattened words (n_words, emb_size)

        # run through bidirectional LSTM (PyTorch automatically applies it on the PackedSequence)
        rnn_out, _ = self.BiLSTM(packed_words)  # a PackedSequence object, where 'data' is the output of the LSTM (n_words, 2 * rnn_size)

        # unpack sequences (re-pad with 0s, WORDS -> SENTENCES)
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first = True)  # (batch_size, word_pad_len, 2 * word_rnn_size)

        # eq.8: h_i = [\overrightarrow{h}_i ⨁ \overleftarrow{h}_i ]
        # H = {h_1, h_2, ..., h_T}
        H = rnn_out[ :, :, : self.rnn_size] + rnn_out[ :, :, self.rnn_size : ] # (batch_size, word_pad_len, rnn_size)

        # attention module
        r, alphas = self.attention(H)  # (batch_size, rnn_size), (batch_size, word_pad_len)

        # eq.12: h* = tanh(r)
        h = self.tanh(r)  # (batch_size, rnn_size)

        scores = self.fc(self.dropout(h))  # (batch_size, n_classes)

        return scores #, alphas

## Model Test

In [None]:
from sklearn.metrics import confusion_matrix

def test(model: nn.Module, test_loader: DataLoader, tokenizer) -> None:

    # n_classes = model.n_classes  # add this line if the number of classes is not already defined in the model
    correct_per_label = [0] * n_classes
    total_per_label = [0] * n_classes
    total_per_prediction = [0] * n_classes
    lst_labels = []
    lst_preds = []
 
    # evaluate in batches
    with torch.no_grad():
        for i, batch in enumerate(test_loader):

            sentences, labels = batch
            tokenized = tokenizer(sentences, padding=True, return_tensors='pt', max_length=512, truncation=True)
            sentences = tokenized['input_ids']
            words_per_sentence = torch.sum(tokenized['attention_mask'], dim=-1)
            sentences = sentences.to(device)  # (batch_size, word_limit)
            labels = torch.argmax(labels, dim=-1).to(device)  # (batch_size)
            lst_labels.extend(labels.cpu())
            scores = model(sentences, words_per_sentence)  # (batch_size, n_classes)

            # accuracy per label
            _, predictions = scores.max(dim=1)  # (n_documents)
            lst_preds.extend(predictions.cpu())
            for j in range(n_classes):
                correct_predictions = torch.logical_and(torch.eq(predictions, j), torch.eq(labels, j)).sum().item()
                correct_per_label[j] += correct_predictions
                total_per_label[j] += torch.eq(labels, j).sum().item()
                total_per_prediction[j] += torch.eq(predictions, j).sum().item()

    # print accuracy per label
    for j in range(n_classes):
        if total_per_label[j] == 0:
            accuracy = 0.0
        else:
            accuracy = correct_per_label[j] / total_per_label[j]
        print('\n * LABEL %d ACCURACY - %.1f percent\n' % (j, accuracy * 100))

        if total_per_prediction[j] == 0:
            true_pos = 0.0
        else:
            true_pos = correct_per_label[j] / total_per_prediction[j]
        print('\n * LABEL %d TRUE POSITIVE - %.1f percent\n' % (j, true_pos * 100))

    # final test accuracy
    overall_accuracy = sum(correct_per_label) / sum(total_per_label) if sum(total_per_label) != 0 else 0.0
    print('\n * TEST ACCURACY - %.1f percent\n' % (overall_accuracy * 100))

    # print confusion matrix
    print(confusion_matrix(lst_labels, lst_preds))

In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def get_formatted_data(df):
  one_hot_labels = pd.get_dummies(df.iloc[:, 1])
  data = df.to_numpy()
  X = data[:, 0]
  y = one_hot_labels.values
  
  dataset = MyDataset(X, y)
  return dataset

# Train Model

In [None]:
torch.cuda.manual_seed(20)
torch.manual_seed(20)

<torch._C.Generator at 0x7fd61d16c250>

In [None]:
# load data

EMB_SIZE = 768
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
n_classes = len(set(text_data_train[1]))
model = AttBiLSTM(n_classes = n_classes, vocab_size = vocab_size, emb_size = EMB_SIZE,
                  rnn_size = 64, rnn_layers = 5, dropout = 0.2)

optimizer = optim.Adam(
    params = filter(lambda p: p.requires_grad, model.parameters()),
    lr = 0.001
)

# loss functions
loss_function = nn.CrossEntropyLoss()

# move to device
model = model.to(device)
loss_function = loss_function.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Only Twitter Message Dataset

In [None]:
trainer = Trainer(
    num_epochs = 20,
    train_loader = train_dl,
    val_loader = val_dl,
    model = model,
    loss_function = loss_function,
    optimizer = optimizer,
    grad_clip = 1000,
    tokenizer = tokenizer
)

trainer.run_train()


  0%|          | 0/20 [00:00<?, ?it/s][A
  5%|▌         | 1/20 [00:15<05:02, 15.92s/it][A


 Epoch: 0: Train Loss 1.5919674158096313, Train Accuracy 0.323625 
 Val Loss 1.587259298279172, Val Accuracy 0.3506944444444444



 10%|█         | 2/20 [00:29<04:26, 14.78s/it][A


 Epoch: 1: Train Loss 1.583609697341919, Train Accuracy 0.3261875 
 Val Loss 1.5876899749513655, Val Accuracy 0.3179563492063492



 15%|█▌        | 3/20 [00:43<04:05, 14.46s/it][A


 Epoch: 2: Train Loss 1.577588897228241, Train Accuracy 0.3396875 
 Val Loss 1.5894137715536452, Val Accuracy 0.3551587301587302



 20%|██        | 4/20 [00:58<03:48, 14.28s/it][A


 Epoch: 3: Train Loss 1.3889501181840898, Train Accuracy 0.460375 
 Val Loss 1.187188222294762, Val Accuracy 0.5669642857142857



 25%|██▌       | 5/20 [01:13<03:39, 14.61s/it][A


 Epoch: 4: Train Loss 1.0149324651956557, Train Accuracy 0.6214375 
 Val Loss 0.8075690988510374, Val Accuracy 0.7123015873015873



 30%|███       | 6/20 [01:36<04:07, 17.68s/it][A


 Epoch: 5: Train Loss 0.6874038452506065, Train Accuracy 0.7440625 
 Val Loss 0.6014097233613332, Val Accuracy 0.7961309523809523



 35%|███▌      | 7/20 [01:50<03:33, 16.39s/it][A


 Epoch: 6: Train Loss 0.48851619657874107, Train Accuracy 0.8259375 
 Val Loss 0.42739233658427284, Val Accuracy 0.8640873015873016



 40%|████      | 8/20 [02:04<03:06, 15.57s/it][A


 Epoch: 7: Train Loss 0.3305070045813918, Train Accuracy 0.8831875 
 Val Loss 0.29536976996395325, Val Accuracy 0.9052579365079365



 45%|████▌     | 9/20 [02:18<02:45, 15.03s/it][A


 Epoch: 8: Train Loss 0.24208424570411444, Train Accuracy 0.9106875 
 Val Loss 0.26948656727160725, Val Accuracy 0.9117063492063492



 50%|█████     | 10/20 [02:32<02:26, 14.66s/it][A


 Epoch: 9: Train Loss 0.20633186473697424, Train Accuracy 0.9203125 
 Val Loss 0.23542544600509463, Val Accuracy 0.9156746031746031



 55%|█████▌    | 11/20 [02:45<02:09, 14.42s/it][A


 Epoch: 10: Train Loss 0.18676915449649095, Train Accuracy 0.92375 
 Val Loss 0.2267511072494681, Val Accuracy 0.9201388888888888



 60%|██████    | 12/20 [03:00<01:55, 14.49s/it][A


 Epoch: 11: Train Loss 0.16774342481140048, Train Accuracy 0.927625 
 Val Loss 0.17340196071872635, Val Accuracy 0.9290674603174603



 65%|██████▌   | 13/20 [03:14<01:39, 14.28s/it][A


 Epoch: 12: Train Loss 0.1599393803011626, Train Accuracy 0.9305625 
 Val Loss 0.21186431479595957, Val Accuracy 0.9201388888888888



 70%|███████   | 14/20 [03:28<01:24, 14.10s/it][A


 Epoch: 13: Train Loss 0.14744138076156377, Train Accuracy 0.92925 
 Val Loss 0.15501874238843955, Val Accuracy 0.9295634920634921



 75%|███████▌  | 15/20 [03:41<01:10, 14.02s/it][A


 Epoch: 14: Train Loss 0.14041246384941042, Train Accuracy 0.9338125 
 Val Loss 0.151425009024226, Val Accuracy 0.9315476190476191



 80%|████████  | 16/20 [03:55<00:55, 13.94s/it][A


 Epoch: 15: Train Loss 0.13235500236041844, Train Accuracy 0.9351875 
 Val Loss 0.1428955234661107, Val Accuracy 0.9295634920634921



 85%|████████▌ | 17/20 [04:09<00:41, 13.87s/it][A


 Epoch: 16: Train Loss 0.1336211936119944, Train Accuracy 0.9351875 
 Val Loss 0.13412173807857528, Val Accuracy 0.9350198412698413



 90%|█████████ | 18/20 [04:23<00:27, 13.86s/it][A


 Epoch: 17: Train Loss 0.12742608074611053, Train Accuracy 0.9354375 
 Val Loss 0.15185686845391516, Val Accuracy 0.9320436507936508



 95%|█████████▌| 19/20 [04:37<00:13, 13.84s/it][A


 Epoch: 18: Train Loss 0.12378526560449973, Train Accuracy 0.93625 
 Val Loss 0.1501798000009287, Val Accuracy 0.9330357142857143



100%|██████████| 20/20 [04:50<00:00, 14.54s/it]


 Epoch: 19: Train Loss 0.12185443728743121, Train Accuracy 0.9371875 
 Val Loss 0.1515373903371039, Val Accuracy 0.9270833333333334





#### Results

In [None]:
test(trainer.model, text_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 95.3 percent


 * LABEL 0 TRUE POSITIVE - 91.0 percent


 * LABEL 1 ACCURACY - 88.4 percent


 * LABEL 1 TRUE POSITIVE - 92.1 percent


 * LABEL 2 ACCURACY - 96.4 percent


 * LABEL 2 TRUE POSITIVE - 92.9 percent


 * LABEL 3 ACCURACY - 76.7 percent


 * LABEL 3 TRUE POSITIVE - 86.5 percent


 * LABEL 4 ACCURACY - 96.4 percent


 * LABEL 4 TRUE POSITIVE - 96.2 percent


 * LABEL 5 ACCURACY - 63.6 percent


 * LABEL 5 TRUE POSITIVE - 79.2 percent


 * TEST ACCURACY - 92.7 percent

[[262   2   3   0   8   0]
 [  8 198   0   0   9   9]
 [  5   0 670  17   1   2]
 [  0   0  35 122   2   0]
 [ 13   1   5   2 560   0]
 [  0  14   8   0   2  42]]


In [None]:
test(trainer.model, poem_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 66.7 percent


 * LABEL 0 TRUE POSITIVE - 9.1 percent


 * LABEL 1 ACCURACY - 20.0 percent


 * LABEL 1 TRUE POSITIVE - 33.3 percent


 * LABEL 2 ACCURACY - 20.8 percent


 * LABEL 2 TRUE POSITIVE - 27.5 percent


 * LABEL 3 ACCURACY - 1.5 percent


 * LABEL 3 TRUE POSITIVE - 25.0 percent


 * LABEL 4 ACCURACY - 6.2 percent


 * LABEL 4 TRUE POSITIVE - 33.3 percent


 * LABEL 5 ACCURACY - 0.0 percent


 * LABEL 5 TRUE POSITIVE - 0.0 percent


 * TEST ACCURACY - 14.5 percent

[[16  0  2  0  5  1]
 [ 5  2  0  1  2  0]
 [40  1 11  0  1  0]
 [43  1 19  1  1  0]
 [64  2  7  2  5  0]
 [ 7  0  1  0  1  0]]


In [None]:
test(trainer.model, reddit_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 93.9 percent


 * LABEL 0 TRUE POSITIVE - 21.5 percent


 * LABEL 1 ACCURACY - 0.7 percent


 * LABEL 1 TRUE POSITIVE - 30.0 percent


 * LABEL 2 ACCURACY - 11.8 percent


 * LABEL 2 TRUE POSITIVE - 30.1 percent


 * LABEL 3 ACCURACY - 0.2 percent


 * LABEL 3 TRUE POSITIVE - 38.1 percent


 * LABEL 4 ACCURACY - 4.8 percent


 * LABEL 4 TRUE POSITIVE - 61.9 percent


 * LABEL 5 ACCURACY - 0.6 percent


 * LABEL 5 TRUE POSITIVE - 53.8 percent


 * TEST ACCURACY - 22.8 percent

[[3229    5  174    1   29    1]
 [1189    9   71    2   22    0]
 [2891    5  389    4   14    6]
 [3086    2  274    8   11    2]
 [2474    3  197    6  135    3]
 [2115    6  186    0    7   14]]


### With Combined Dataset

In [None]:
combined_trainer = Trainer(
    num_epochs = 20,
    train_loader = combined_train_dl,
    val_loader = combined_val_dl,
    model = model,
    loss_function = loss_function,
    optimizer = optimizer,
    grad_clip = 1000,
    tokenizer = tokenizer
)

combined_trainer.run_train()

  0%|          | 0/20 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors
  5%|▌         | 1/20 [00:27<08:36, 27.17s/it]


 Epoch: 0: Train Loss 0.3700651291914803, Train Accuracy 0.8506095412585442 
 Val Loss 1.0763200612700716, Val Accuracy 0.6268297497570458


 10%|█         | 2/20 [00:54<08:12, 27.35s/it]


 Epoch: 1: Train Loss 0.31475773226693154, Train Accuracy 0.8739077584384469 
 Val Loss 0.9838639323403235, Val Accuracy 0.6651026482021379


 15%|█▌        | 3/20 [01:21<07:43, 27.29s/it]


 Epoch: 2: Train Loss 0.2928300481561324, Train Accuracy 0.8827293707279262 
 Val Loss 0.9605836147556499, Val Accuracy 0.677318695335277


 20%|██        | 4/20 [01:49<07:15, 27.23s/it]


 Epoch: 3: Train Loss 0.28470252453978967, Train Accuracy 0.8861338172080896 
 Val Loss 0.9228257235317003, Val Accuracy 0.6955782312925171


 25%|██▌       | 5/20 [02:16<06:48, 27.21s/it]


 Epoch: 4: Train Loss 0.2672533703246418, Train Accuracy 0.8905512296525968 
 Val Loss 0.9256665627060293, Val Accuracy 0.685465257531584


 30%|███       | 6/20 [02:43<06:20, 27.16s/it]


 Epoch: 5: Train Loss 0.2610178423480914, Train Accuracy 0.8960917130575717 
 Val Loss 0.8986719984079705, Val Accuracy 0.6987973760932944


 35%|███▌      | 7/20 [03:10<05:52, 27.14s/it]


 Epoch: 6: Train Loss 0.25418998812124544, Train Accuracy 0.898430343175252 
 Val Loss 0.8923596400184696, Val Accuracy 0.7017735665694849


 40%|████      | 8/20 [03:37<05:26, 27.18s/it]


 Epoch: 7: Train Loss 0.2481603351009578, Train Accuracy 0.9000202593192869 
 Val Loss 0.9102180008174611, Val Accuracy 0.7018115281827016


 45%|████▌     | 9/20 [04:04<04:58, 27.13s/it]


 Epoch: 8: Train Loss 0.24222156037146306, Train Accuracy 0.9030305299133253 
 Val Loss 0.8977476173923129, Val Accuracy 0.7112639698736637


 50%|█████     | 10/20 [04:31<04:31, 27.17s/it]


 Epoch: 9: Train Loss 0.239581907472937, Train Accuracy 0.9039818370798394 
 Val Loss 0.8860087593903347, Val Accuracy 0.7089255344995141


 55%|█████▌    | 11/20 [04:59<04:05, 27.31s/it]


 Epoch: 10: Train Loss 0.2331949343808249, Train Accuracy 0.9057038792192235 
 Val Loss 0.9084975660911628, Val Accuracy 0.7051521501457726


 60%|██████    | 12/20 [05:27<03:39, 27.41s/it]


 Epoch: 11: Train Loss 0.22598695961948684, Train Accuracy 0.9066661968853499 
 Val Loss 0.9103203085916383, Val Accuracy 0.7076348396501458


 65%|██████▌   | 13/20 [05:55<03:13, 27.63s/it]


 Epoch: 12: Train Loss 0.22264145160730886, Train Accuracy 0.9094408427876823 
 Val Loss 0.9434572735003063, Val Accuracy 0.689291788143829


 70%|███████   | 14/20 [06:23<02:46, 27.76s/it]


 Epoch: 13: Train Loss 0.21197486439692625, Train Accuracy 0.9139397329293213 
 Val Loss 0.9396066890180517, Val Accuracy 0.7028213070942663


 75%|███████▌  | 15/20 [06:50<02:18, 27.62s/it]


 Epoch: 14: Train Loss 0.20817148714436512, Train Accuracy 0.9147302868014939 
 Val Loss 0.9087349395362698, Val Accuracy 0.7028592687074829


 80%|████████  | 16/20 [07:18<01:50, 27.55s/it]


 Epoch: 15: Train Loss 0.2044473324571789, Train Accuracy 0.9160669614544429 
 Val Loss 0.9238696481500354, Val Accuracy 0.707794278425656


 85%|████████▌ | 17/20 [07:45<01:22, 27.47s/it]


 Epoch: 16: Train Loss 0.19841293440271332, Train Accuracy 0.9190243816503417 
 Val Loss 0.8995311629204523, Val Accuracy 0.708879980563654


 90%|█████████ | 18/20 [08:12<00:54, 27.43s/it]


 Epoch: 17: Train Loss 0.19145877683622767, Train Accuracy 0.9224376365301953 
 Val Loss 1.0161884798585963, Val Accuracy 0.6972865038872692


 95%|█████████▌| 19/20 [08:41<00:27, 27.82s/it]


 Epoch: 18: Train Loss 0.18824591569080223, Train Accuracy 0.9239372665774083 
 Val Loss 0.9631437413445135, Val Accuracy 0.6997691933916423


100%|██████████| 20/20 [09:09<00:00, 27.47s/it]


 Epoch: 19: Train Loss 0.18251542070340154, Train Accuracy 0.9265203297864844 
 Val Loss 0.9906221586628025, Val Accuracy 0.7090014577259475





#### Results

In [None]:
test(combined_trainer.model, text_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 91.6 percent


 * LABEL 0 TRUE POSITIVE - 91.3 percent


 * LABEL 1 ACCURACY - 85.7 percent


 * LABEL 1 TRUE POSITIVE - 93.2 percent


 * LABEL 2 ACCURACY - 97.3 percent


 * LABEL 2 TRUE POSITIVE - 93.1 percent


 * LABEL 3 ACCURACY - 73.6 percent


 * LABEL 3 TRUE POSITIVE - 91.4 percent


 * LABEL 4 ACCURACY - 96.2 percent


 * LABEL 4 TRUE POSITIVE - 96.5 percent


 * LABEL 5 ACCURACY - 89.4 percent


 * LABEL 5 TRUE POSITIVE - 69.4 percent


 * TEST ACCURACY - 92.8 percent

[[252   6   4   1  11   1]
 [  8 192   1   0   5  18]
 [  1   2 676   8   2   6]
 [  1   0  40 117   0   1]
 [ 14   4   3   1 559   0]
 [  0   2   2   1   2  59]]


In [None]:
test(combined_trainer.model, poem_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 16.7 percent


 * LABEL 0 TRUE POSITIVE - 26.7 percent


 * LABEL 1 ACCURACY - 20.0 percent


 * LABEL 1 TRUE POSITIVE - 28.6 percent


 * LABEL 2 ACCURACY - 47.2 percent


 * LABEL 2 TRUE POSITIVE - 37.9 percent


 * LABEL 3 ACCURACY - 60.0 percent


 * LABEL 3 TRUE POSITIVE - 42.9 percent


 * LABEL 4 ACCURACY - 32.5 percent


 * LABEL 4 TRUE POSITIVE - 46.4 percent


 * LABEL 5 ACCURACY - 44.4 percent


 * LABEL 5 TRUE POSITIVE - 66.7 percent


 * TEST ACCURACY - 41.5 percent

[[ 4  0  1  7 12  0]
 [ 1  2  0  2  5  0]
 [ 5  1 25 14  7  1]
 [ 0  1 19 39  5  1]
 [ 5  3 18 28 26  0]
 [ 0  0  3  1  1  4]]


In [None]:
test(combined_trainer.model, reddit_test_dl, tokenizer)


 * LABEL 0 ACCURACY - 65.3 percent


 * LABEL 0 TRUE POSITIVE - 70.0 percent


 * LABEL 1 ACCURACY - 59.6 percent


 * LABEL 1 TRUE POSITIVE - 58.5 percent


 * LABEL 2 ACCURACY - 66.5 percent


 * LABEL 2 TRUE POSITIVE - 63.8 percent


 * LABEL 3 ACCURACY - 84.5 percent


 * LABEL 3 TRUE POSITIVE - 77.0 percent


 * LABEL 4 ACCURACY - 63.0 percent


 * LABEL 4 TRUE POSITIVE - 65.5 percent


 * LABEL 5 ACCURACY - 58.7 percent


 * LABEL 5 TRUE POSITIVE - 63.1 percent


 * TEST ACCURACY - 67.7 percent

[[2247  227  247  123  382  213]
 [ 133  771   88   45  156  100]
 [ 154   45 2201  438  173  298]
 [  65   16  299 2859   83   61]
 [ 348  166  259  142 1776  127]
 [ 261   94  357  106  143 1367]]
