# Data loading & Imports

In [None]:
# Clone the original TweetEval repository (cardiffnlp/tweeteval) into ./tweeteval;
# the emotion dataset files are read from this checkout in the next cell
!git clone https://github.com/cardiffnlp/tweeteval.git

Cloning into 'tweeteval'...
remote: Enumerating objects: 370, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 370 (delta 13), reused 3 (delta 1), pack-reused 354[K
Receiving objects: 100% (370/370), 8.49 MiB | 11.19 MiB/s, done.
Resolving deltas: 100% (122/122), done.


In [None]:
import os
# folder of the emotion task inside the cloned repository
path = '/content/tweeteval/datasets/emotion'

# maps every file name to the list of lines read from that file
file_contents = {}

for folder, _subfolders, filenames in os.walk(path):
    # show which files were found in this folder
    print(filenames)
    for filename in filenames:
        with open(os.path.join(folder, filename), 'r', encoding='utf-8') as handle:
            # one list entry per line; the trailing newline of each line is kept
            file_contents[filename] = handle.readlines()

['val_text.txt', 'mapping.txt', 'test_text.txt', 'val_labels.txt', 'train_labels.txt', 'train_text.txt', 'test_labels.txt']


In [None]:
# unpack the loaded files into one variable per original file name
test_text, test_labels = file_contents['test_text.txt'], file_contents['test_labels.txt']
train_text, train_labels = file_contents['train_text.txt'], file_contents['train_labels.txt']
val_text, val_labels = file_contents['val_text.txt'], file_contents['val_labels.txt']
mapping = file_contents['mapping.txt']

# report how many text lines and labels each split contains
print(f"Test set: {len(test_text)} lines of text and {len(test_labels)} labels")
print(f"Train set: {len(train_text)} lines of text and {len(train_labels)} labels")
print(f"Validation set: {len(val_text)} lines of text and {len(val_labels)} labels")

Test set: 1421 lines of text and 1421 labels
Train set: 3257 lines of text and 3257 labels
Validation set: 374 lines of text and 374 labels


In [None]:
import random
# print two random samples from the test set text
# random.choice always picks an existing element; random.randint(0, len(test_text))
# also includes len(test_text) itself, which is one past the last valid index
print(random.choice(test_text))
print(random.choice(test_text))

1 Samuel 18:15\nAnd when #Saul saw that he had great #success, he stood in #fearful #awe of him. 

I'm still bitter about the fact that I didn't get the Php 10/liter promo. 



# Task 1: Create Datasets

In [None]:
import pandas as pd

# build one dataframe per split: raw text lines plus integer labels
# (int() accepts the trailing newline of each label line, so no explicit strip is needed)
test_df = pd.DataFrame({'text': test_text, 'label': list(map(int, test_labels))})
train_df = pd.DataFrame({'text': train_text, 'label': list(map(int, train_labels))})
val_df = pd.DataFrame({'text': val_text, 'label': list(map(int, val_labels))})

# show the first rows of the test dataframe
test_df.head(5)

Unnamed: 0,text,label
0,#Deppression is real. Partners w/ #depressed p...,3
1,@user Interesting choice of words... Are you c...,0
2,My visit to hospital for care triggered #traum...,3
3,@user Welcome to #MPSVT! We are delighted to h...,1
4,What makes you feel #joyful? \n,1


In [None]:
# build the label mapping (sentiment name -> integer id) used for data filtering
# every mapping line is split on the tab character: field 0 is the id, field 1 the name
split_lines = (line.strip().split('\t') for line in mapping)
sentiments_dict = {fields[1]: int(fields[0]) for fields in split_lines}

# Print the resulting dictionary
print("Sentiments Dictionary:")
print(sentiments_dict)

Sentiments Dictionary:
{'anger': 0, 'joy': 1, 'optimism': 2, 'sadness': 3}


### Preprocessing

In [None]:
# install the emoji library before running, if necessary
# (it is imported by the preprocessing cell that follows)
!pip install emoji

Collecting emoji
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.8.0


In [None]:
import re
import html
import emoji

import spacy
nlp = spacy.load('en_core_web_sm')

# By peeking at the data, we've identified the need for specific preprocessing
# steps, so we'll define a function to do that.
# First we'll define functions for the sub-steps

# convert emojis to text
def remove_emojis(text):
    """Return `text` passed through `emoji.demojize`, with a space added after every ':'.

    Despite the name, nothing is deleted here: the string is handed to
    `emoji.demojize` and then *every* colon in the result (not only those
    produced by demojize) is replaced by ': '.
    """
    demojized = emoji.demojize(text)
    return demojized.replace(":", ": ")

# chat abbreviations and their expansions
chat_words = {
    'fyi'  : 'for your information',
    'lol'  : 'laugh out loud',
    'afk'  : 'away from keyboard',
    'w/'   : 'with',
    'brb'  : 'be right back',
    'asap' : 'as soon as possible',
    'lmk'  : 'let me know',
    'nmu'  : 'not much you',
    'hrs'  : 'hours',
    'Fri'  : 'Friday'
}
def chat_words_conv(text):
    """Expand chat abbreviations in `text`.

    The text is split on whitespace and each piece that is exactly a key of
    `chat_words` (case-sensitive, no punctuation stripping) is replaced by its
    expansion. Pieces are re-joined with single spaces, so runs of whitespace
    and leading/trailing whitespace are collapsed as a side effect.
    """
    return ' '.join(chat_words.get(piece, piece) for piece in text.split())

def handle_special_cases(text):
    """Clean dataset-specific artefacts from a tweet.

    Args:
        text (str): Tweet text.

    Returns:
        str: The text where
            * every literal backslash-n sequence (two characters, as found in
              the raw tweets) is replaced by a space, so the words on either
              side are not glued together;
            * a space is inserted before any '#' that directly follows a
              non-whitespace character (a '#' at the very start is left alone);
            * runs of whitespace are collapsed to one space and leading/trailing
              whitespace is removed, so no whitespace-only tokens are created
              later on.
    """
    # literal "\n" separates words in the raw data -> turn it into a separator
    new_text = text.replace("\\n", " ")
    # add space before hashtags that are attached to the previous character
    new_text = re.sub(r'(?<=\S)#', ' #', new_text)
    # normalise whitespace introduced or left over by the steps above
    return ' '.join(new_text.split())

# lowercasing
def lowercasing(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

# handle HTML characters
def handle_html(text):
    """Return `text` with HTML character references (e.g. '&amp;') unescaped."""
    unescaped = html.unescape(text)
    return unescaped

# tokenization & lemmatization
# along with digit, punctuation, stop word and whitespace removal
def tokenize_lemmatize(text):
    """Tokenize `text` with the loaded spaCy pipeline and return the kept lemmas.

    Args:
        text (str): Text to process.

    Returns:
        list[str]: `lemma_` of every token that is not a digit, not
        punctuation, not a stop word and not whitespace.
    """
    doc = nlp(text)
    # whitespace-only tokens (e.g. from a leading space or doubled spaces) are
    # neither digits, punctuation nor stop words, so they must be excluded
    # explicitly or they end up as vocabulary entries
    return [
        t.lemma_
        for t in doc
        if not (t.is_digit or t.is_punct or t.is_stop or t.is_space)
    ]

def text_preprocessing(text_df):
    """Run the full preprocessing pipeline on the 'text' column of a dataframe.

    The input frame is copied, so the caller's dataframe is left untouched.
    After the last step each entry of the 'text' column is a list of lemmas
    instead of a string.
    """
    # sub-steps in the order they are applied; the order matters, e.g. chat
    # word expansion runs before lowercasing
    pipeline = (
        remove_emojis,
        handle_html,
        chat_words_conv,
        handle_special_cases,
        lowercasing,
        tokenize_lemmatize,
    )
    new_text_df = text_df.copy()
    for step in pipeline:
        new_text_df['text'] = new_text_df['text'].apply(step)
    return new_text_df

In [None]:
# Apply preprocessing to all data (might take about 1min to run!)
# NOTE: the *_df names are rebound to the preprocessed frames, so re-running
# this cell would feed the token lists back into the string-based steps;
# re-create the dataframes first if the cell has to be executed again
test_df = text_preprocessing(test_df)
train_df = text_preprocessing(train_df)
val_df = text_preprocessing(val_df)

# Preview the preprocessed data
# randrange excludes len(test_df); randint(0, len(test_df)) could return
# len(test_df) itself and make iloc raise IndexError
print(test_df.iloc[random.randrange(len(test_df))]['text'])

['remember', 'kid', 'flip', 'shirt', 'pouch', 'snack', 'time', 'snack']


### Encode data

In [None]:
## Build vocabulary
# combine all text from all splits
# (the vocabulary therefore also covers tokens seen only in validation/test)
all_text = pd.concat([test_df['text'], train_df['text'], val_df['text']])

# retrieve maximum number of tokens in sentence
max_len = max(len(x) for x in all_text)

# get unique tokens
token_set = set(token for row in all_text for token in row)

# assign unique id to each token
# iterate in sorted order: the iteration order of a set of strings is not
# stable across interpreter sessions, which would give every run different ids
word2idx = {token: idx+2 for idx, token in enumerate(sorted(token_set))}
# ids 0 and 1 are reserved for padding and unknown tokens
word2idx.update({'<pad>': 0, '<unk>': 1})

In [None]:
import numpy as np

def encode(text_df, word2idx, max_len):
    """Convert tokenized rows into a fixed-width array of token ids.

    Args:
        text_df (pd.DataFrame): Frame whose 'text' column holds lists of tokens.
        word2idx (dict): Token -> id mapping; the ids stored under '<pad>' and
            '<unk>' are used for padding and for tokens missing from the mapping.
        max_len (int): Width of every encoded row.

    Returns:
        np.ndarray: One row of ids per input row. Rows shorter than `max_len`
        are right-padded with the '<pad>' id, longer rows are truncated.

    The token lists stored in `text_df` are not modified.
    """
    pad_id = word2idx.get('<pad>')
    unk_id = word2idx.get('<unk>')
    input_ids = []
    for row in text_df['text']:
        # build a new list instead of extending `row`, which would pad the
        # caller's data in place; slicing also caps over-long rows
        ids = [word2idx.get(token, unk_id) for token in row[:max_len]]
        ids.extend([pad_id] * (max_len - len(ids)))
        input_ids.append(ids)
    return np.array(input_ids)

### Build datasets

In [None]:
def build_dataset(sentiments):
    """Build encoded test/train/validation sets restricted to two sentiments.

    Args:
        sentiments: Sequence whose first two entries are sentiment names
            (keys of `sentiments_dict`).

    Returns:
        tuple: (test_set, test_label, train_set, train_label, val_set,
        val_label). The `*_set` arrays come from `encode`; the `*_label`
        arrays hold the original label ids of the selected rows.

    Reads the module-level `test_df`, `train_df`, `val_df`, `word2idx` and
    `max_len`, and prints the size of every split.
    """
    first_name, second_name = sentiments[0], sentiments[1]
    wanted_ids = [sentiments_dict[first_name], sentiments_dict[second_name]]

    def _encode_split(split_df):
        # keep only the rows labelled with one of the two requested sentiments
        subset = split_df[split_df["label"].isin(wanted_ids)]
        return encode(subset, word2idx, max_len), np.array(subset['label'])

    test_set, test_label = _encode_split(test_df)
    train_set, train_label = _encode_split(train_df)
    val_set, val_label = _encode_split(val_df)

    # print data size
    print(f"Dataset: {sentiments[0]} and {sentiments[1]}")
    print("------------------------")
    print(f"Test set size: {len(test_set)}")
    print(f"Train set size: {len(train_set)}")
    print(f"Validation set size: {len(val_set)}")
    print("\n")

    return test_set, test_label, train_set, train_label, val_set, val_label

In [None]:
## Dataset 1: anger and joy
# per `sentiments_dict` these are labels 0 (anger) and 1 (joy), so the label
# arrays can be used directly as class indices for a two-class model
test_set, test_lbl, train_set, train_lbl, val_set, val_lbl = build_dataset(['anger', 'joy'])

Dataset: anger and joy
------------------------
Test set size: 916
Train set size: 2108
Validation set size: 257




In [None]:
# peek at the first three encoded test rows (token ids followed by padding)
test_set[0:3]

array([[1845,  597, 2414, 5373, 4103, 4856, 5077, 4417, 5337, 1895, 5145,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [9718, 8602, 3386, 5723, 6719, 3386, 6835,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    

# Task 2: CNN Classifier

In [None]:
from tqdm import tqdm
import time
import torch.utils.data as data
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")

if not gpu_available:
    print('No GPU available, using the CPU instead.')
else:
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
Device name: Tesla T4


### CNN Class

In [None]:
class CNN(nn.Module):
    """1-D convolutional text classifier.

    Token ids are embedded, passed through several parallel `nn.Conv1d`
    layers with different kernel sizes, max-pooled over the sequence
    dimension, concatenated, and fed through dropout and a linear layer.
    """

    def __init__(self,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=(3, 4, 5),
                 num_filters=(100, 100, 100),
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN class.
        Args:
            vocab_size (int): Number of rows of the embedding table. When
                omitted, `len(word2idx)` is looked up at construction time
                (not at class-definition time, so a vocabulary rebuilt after
                this class was defined is still honoured).
            embed_dim (int): Dimension of word vectors. Default: 300
            filter_sizes (Sequence[int]): Kernel size of each convolution.
                Default: (3, 4, 5)
            num_filters (Sequence[int]): Number of output channels of each
                convolution; must have the same length as `filter_sizes`.
                Default: (100, 100, 100)
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5
        Raises:
            ValueError: If `filter_sizes` and `num_filters` differ in length.
        """

        super(CNN, self).__init__()

        if vocab_size is None:
            # late lookup of the notebook-level vocabulary
            vocab_size = len(word2idx)

        if len(filter_sizes) != len(num_filters):
            # otherwise the linear layer below would be sized for features
            # that the convolutions never produce
            raise ValueError(
                "filter_sizes and num_filters must have the same length, got "
                f"{len(filter_sizes)} and {len(num_filters)}"
            )

        # Embedding layer (id 0 is the padding row; vectors are renormalised
        # to a maximum norm of 5.0 on lookup)
        self.embed_dim = embed_dim

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=self.embed_dim,
                                      padding_idx=0,
                                      max_norm=5.0)
        # Conv Network: one Conv1d per (kernel size, number of filters) pair
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=n_out,
                      kernel_size=k_size)
            for k_size, n_out in zip(filter_sizes, num_filters)
        ])
        # Fully-connected layer and Dropout
        # int(sum(...)) keeps in_features a plain Python int
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (batch_size, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
                       for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

### DataLoader

In [None]:
# wrap the encoded arrays and their labels as tensor datasets
test_data = TensorDataset(torch.from_numpy(test_set), torch.from_numpy(test_lbl))
train_data = TensorDataset(torch.from_numpy(train_set), torch.from_numpy(train_lbl))
val_data = TensorDataset(torch.from_numpy(val_set), torch.from_numpy(val_lbl))

batch_size = 8
# training batches are drawn in random order
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# evaluation keeps the original order; pass batch_size explicitly, because the
# DataLoader default of 1 would run one forward pass per sample
val_dataloader = DataLoader(val_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

### Instantiate CNN model

In [None]:
# build the CNN with the chosen hyperparameter configuration and move it to
# `device` (GPU/CPU); nn.Module.to returns the module itself
model = CNN(
    embed_dim=500,
    filter_sizes=[3, 5, 7],
    num_filters=[120, 120, 120],
    num_classes=2,
    dropout=0.5,
).to(device)

# Adadelta optimizer over all parameters of `model`
optimizer = optim.Adadelta(model.parameters(), lr=0.01, rho=0.95)

### Training loop

In [None]:
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

# Start training loop
print("Start training...\n")
print(f"{'Epoch':^7} | {'Train Loss':^12}")
print("-"*60)

for epoch_i in range(50):
    total_loss = 0
    # Put the model into the training mode
    model.train()
    for batch in train_dataloader:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)
        # Zero out any previously calculated gradients
        model.zero_grad()
        # Perform a forward pass. This will return logits.
        logits = model(b_input_ids)
        # Compute loss and accumulate the loss values
        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()
        # Perform a backward pass to calculate gradients
        loss.backward()
        # Update parameters
        optimizer.step()
    # Average of the per-batch losses, computed once per epoch after the last
    # batch; max(..., 1) keeps the value defined for an empty dataloader
    avg_train_loss = total_loss / max(len(train_dataloader), 1)
    print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f}")


Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.669133  
   2    |   0.616094  
   3    |   0.592888  
   4    |   0.571710  
   5    |   0.547343  
   6    |   0.525237  
   7    |   0.503240  
   8    |   0.477858  
   9    |   0.452352  
  10    |   0.430146  
  11    |   0.405969  
  12    |   0.381853  
  13    |   0.353032  
  14    |   0.329927  
  15    |   0.308135  
  16    |   0.281708  
  17    |   0.260262  
  18    |   0.240150  
  19    |   0.223705  
  20    |   0.202177  
  21    |   0.186298  
  22    |   0.169693  
  23    |   0.156417  
  24    |   0.143009  
  25    |   0.128293  
  26    |   0.122298  
  27    |   0.111662  
  28    |   0.100782  
  29    |   0.091895  
  30    |   0.085990  
  31    |   0.081570  
  32    |   0.071683  
  33    |   0.065988  
  34    |   0.061903  
  35    |   0.058536  
  36    |   0.052742  
  37    |   0.048783  
  38    |   0.047020  
  39    |   0.041995  


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# define the evaluation routine once to avoid code duplication
def evaluate_model(model, eval_set_dataloader):
    """Print accuracy and macro-averaged F1 of `model` on a dataloader.

    Args:
        model: Module returning per-class logits for a batch of input ids.
        eval_set_dataloader: Iterable of (input_ids, labels) batches.

    The model is switched to evaluation mode and gradients are disabled
    while predicting. Nothing is returned; the metrics are only printed.
    """
    model.eval()
    gold_labels, predicted_labels = [], []

    with torch.no_grad():
        for batch_input_ids, batch_labels in eval_set_dataloader:
            # move the batch to the same device as the model
            batch_input_ids = batch_input_ids.to(device)
            batch_labels = batch_labels.to(device)
            # the predicted class is the index of the largest logit
            batch_logits = model(batch_input_ids)
            gold_labels.extend(batch_labels.cpu().numpy())
            predicted_labels.extend(batch_logits.argmax(dim=1).cpu().numpy())

    # metrics over all collected samples of the given dataloader
    eval_accuracy = accuracy_score(gold_labels, predicted_labels)
    eval_f1 = f1_score(gold_labels, predicted_labels, average='macro')

    print(f"Accuracy: {eval_accuracy:.4f}")
    print(f"F1 Score: {eval_f1:.4f}")

### Model evaluation

In [None]:
# report accuracy / macro F1 of the first model on the validation split ...
print("Evaluate model on the validation set:")
print("-------------------------------------")
evaluate_model(model, val_dataloader)
print("\n")

# ... and on the held-out test split
print("Evaluate model on the test set:")
print("-------------------------------------")
evaluate_model(model, test_dataloader)

Evaluate model on the validation set:
-------------------------------------
Accuracy: 0.7588
F1 Score: 0.7242


Evaluate model on the test set:
-------------------------------------
Accuracy: 0.7762
F1 Score: 0.7479


# Task 3: Re-training best model

### Build dataset

In [None]:
## Dataset 2: anger and sadness
test_set_2, test_lbl_2, train_set_2, train_lbl_2, val_set_2, val_lbl_2 = build_dataset(['anger', 'sadness'])

# fix labels: the two-class model expects class indices 0 and 1, so map
# sadness -> 1 and everything else (here: anger) -> 0
# the sadness id is read from the label mapping instead of being hard-coded
sadness_id = sentiments_dict['sadness']
test_lbl_2 = np.where(test_lbl_2 == sadness_id, 1, 0)
train_lbl_2 = np.where(train_lbl_2 == sadness_id, 1, 0)
val_lbl_2 = np.where(val_lbl_2 == sadness_id, 1, 0)

Dataset: anger and sadness
------------------------
Test set size: 940
Train set size: 2255
Validation set size: 249




### Dataloader

In [None]:
# wrap the encoded arrays and their remapped labels as tensor datasets
test_data_2 = TensorDataset(torch.from_numpy(test_set_2), torch.from_numpy(test_lbl_2))
train_data_2 = TensorDataset(torch.from_numpy(train_set_2), torch.from_numpy(train_lbl_2))
val_data_2 = TensorDataset(torch.from_numpy(val_set_2), torch.from_numpy(val_lbl_2))

batch_size = 8
# training batches are drawn in random order
train_sampler_2 = RandomSampler(train_data_2)
train_dataloader_2 = DataLoader(train_data_2, sampler=train_sampler_2, batch_size=batch_size)
# evaluation keeps the original order; pass batch_size explicitly, because the
# DataLoader default of 1 would run one forward pass per sample
val_dataloader_2 = DataLoader(val_data_2, batch_size=batch_size)
test_dataloader_2 = DataLoader(test_data_2, batch_size=batch_size)

### Instantiating the model

In [None]:
# choose best configuration from the previous dataset
model_2 = CNN(embed_dim=500,
            filter_sizes=[3, 5, 7],
            num_filters=[120, 120, 120],
            num_classes=2,
            dropout=0.5)

# Send model to `device` (GPU/CPU)
model_2.to(device)

# Instantiate Adadelta optimizer
# it must be built over the parameters of `model_2`: an optimizer created
# from `model.parameters()` would never update the new model during training
optimizer = optim.Adadelta(model_2.parameters(), lr=0.01, rho=0.95)

### Training loop

In [None]:
# Specify loss function
loss_fn = nn.CrossEntropyLoss()

# Start training loop
print("Start training...\n")
print(f"{'Epoch':^7} | {'Train Loss':^12}")
print("-"*60)

for epoch_i in range(50):
    total_loss = 0
    # Put the model into the training mode
    model_2.train()
    for batch in train_dataloader_2:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)
        # Zero out any previously calculated gradients
        model_2.zero_grad()
        # Perform a forward pass. This will return logits.
        logits = model_2(b_input_ids)
        # Compute loss and accumulate the loss values
        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()
        # Perform a backward pass to calculate gradients
        loss.backward()
        # Update parameters
        optimizer.step()
    # Average of the per-batch losses, computed once per epoch after the last
    # batch; max(..., 1) keeps the value defined for an empty dataloader
    avg_train_loss = total_loss / max(len(train_dataloader_2), 1)
    print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f}")

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.673050  
   2    |   0.635470  
   3    |   0.606128  
   4    |   0.585084  
   5    |   0.562489  
   6    |   0.537555  
   7    |   0.512713  
   8    |   0.482599  
   9    |   0.456489  
  10    |   0.430594  
  11    |   0.402979  
  12    |   0.374019  
  13    |   0.345472  
  14    |   0.324439  
  15    |   0.298599  
  16    |   0.275204  
  17    |   0.256216  
  18    |   0.232106  
  19    |   0.219697  
  20    |   0.202086  
  21    |   0.183350  
  22    |   0.172613  
  23    |   0.160839  
  24    |   0.145036  
  25    |   0.134568  
  26    |   0.124854  
  27    |   0.115235  
  28    |   0.109627  
  29    |   0.100743  
  30    |   0.092569  
  31    |   0.082842  
  32    |   0.082088  
  33    |   0.075648  
  34    |   0.069962  
  35    |   0.064864  
  36    |   0.061423  
  37    |   0.058191  
  38    |   0.053983  
  39    |   0.051865  


### Evaluate on test set

In [None]:
# report accuracy / macro F1 of the re-trained model on the anger-vs-sadness test split
print("Evaluate model on the test set:")
print("-------------------------------")
evaluate_model(model_2, test_dataloader_2)

Evaluate model on the test set:
-------------------------------
Accuracy: 0.8032
F1 Score: 0.7881
