# Exercise 3 - Emotion Recognition with a CNN
## 1. Load the datasets


In [160]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [161]:
!pip install spacy



In [162]:
!pip install emoji



In [163]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
import csv
import string
import re
import emoji
import nltk
import spacy

torch.manual_seed(1)

<torch._C.Generator at 0x7bc488134fb0>

In [164]:
# Load the datasets

# Clone the GitHub repository
!git clone https://github.com/cardiffnlp/tweeteval

# Access the repository directory
%cd tweeteval/datasets/emotion

# Read the content of the text files
with open('train_text.txt', 'r') as file1:
    train_text = file1.read().splitlines()

with open('train_labels.txt', 'r') as file2:
    train_labels = file2.read().splitlines()

with open('mapping.txt', 'r') as file3:
    mapping = file3.read().splitlines()

with open('test_text.txt', 'r') as file4:
    test_text = file4.read().splitlines()

with open('test_labels.txt', 'r') as file5:
    test_labels = file5.read().splitlines()

Cloning into 'tweeteval'...
remote: Enumerating objects: 370, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 370 (delta 13), reused 3 (delta 1), pack-reused 354[K
Receiving objects: 100% (370/370), 8.49 MiB | 19.24 MiB/s, done.
Resolving deltas: 100% (122/122), done.
/content/tweeteval/datasets/emotion/tweeteval/datasets/emotion/tweeteval/datasets/emotion/tweeteval/datasets/emotion/tweeteval/datasets/emotion/tweeteval/datasets/emotion


## 2. Preprocess the data

In [165]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [166]:
# preprocessing the text

# Removing stop words
from nltk.corpus import stopwords
stopwords_english = stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords_english:
            continue
        else:
            new_text.append(word)

    return ' '.join(new_text)

def preprocess_text(tweet):
    # Turn emoji into text
    tweet = emoji.demojize(tweet)
    tweet = re.sub(r'_', ' ', tweet)
    # Remove special characters
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # Lowercasing
    tweet = tweet.lower()
    # Remove stop words
    tweet = remove_stopwords(tweet)
    return tweet

In [167]:
# Clean the Train Text
cleaned_train_text = []
for tweet in train_text:
    cleaned_train_text.append(preprocess_text(tweet))

# Clean the Test Text
cleaned_test_text = []
for tweet in test_text:
    cleaned_test_text.append(preprocess_text(tweet))

# Convert labels to int
train_labels_int = [int(x) for x in train_labels]
test_labels_int = [int(x) for x in test_labels]

## 3. Filter the datasets with emotions

In [168]:
# Create data frames

# Combine train text and label into one dataframe
train_df = pd.DataFrame({'Text': cleaned_train_text, 'Label': train_labels_int})
# Write train_df to csv with tab as separator
train_df.to_csv('train_df.csv', index=False, sep='\t')

# Combine test text and label into one dataframe
test_df = pd.DataFrame({'Text': cleaned_test_text, 'Label': test_labels_int})
# Write test_df to csv with tab as separator
test_df.to_csv('test_df.csv', index=False, sep='\t')

# Split the labels and emotions in the list
mapping_list = [item.split('\t') for item in mapping]
# Create a dataframe for mapping
mapping_df = pd.DataFrame(mapping_list, columns=['Label', 'Emotion'])

In [169]:
# Filter the dataset with desired emotions
def filter_dataset(original_df, desired_emotion):
    desired_labels_df = mapping_df[mapping_df['Emotion'].isin(desired_emotion)]
    desired_labels = desired_labels_df['Label'].tolist()
    desired_labels_int = [int(x) for x in desired_labels]

    return original_df[original_df['Label'].isin(desired_labels_int)]

In [170]:
test_df.head()

Unnamed: 0,Text,Label
0,deppression real partners w depressed people t...,3
1,user interesting choice words confirming gover...,0
2,visit hospital care triggered trauma accident ...,3
3,user welcome mpsvt delighted grateful mpsvt re...,1
4,makes feel joyful,1


In [171]:
train_df.head()

Unnamed: 0,Text,Label
0,worry payment problem may never joyce meyer mo...,2
1,roommate okay cant spell autocorrect terrible ...,0
2,thats cute atsu probably shy photos cherry hel...,1
3,rooneys fucking untouchable isnt fucking dread...,0
4,pretty depressing u hit pan ur favourite highl...,3


In [172]:
# We select dataset 1 with emotion 'anger' and 'joy'
# Dataset 2 with 'anger' and 'sadness'

train_df1 = filter_dataset(train_df, ['anger','joy'])
train_df2 = filter_dataset(train_df, ['anger','sadness'])
test_df1 = filter_dataset(test_df, ['anger','joy'])
test_df2 = filter_dataset(test_df, ['anger','sadness'])

In [173]:
train_df1.head()

Unnamed: 0,Text,Label
1,roommate okay cant spell autocorrect terrible ...,0
2,thats cute atsu probably shy photos cherry hel...,1
3,rooneys fucking untouchable isnt fucking dread...,0
5,user pussy weak heard stfu bitch got threaten ...,0
7,tiller breezy collab album rapping singing pro...,1


## 4. Encode the text

In [174]:
# Create a blank spaCy language model for English
nlp = spacy.blank('en')

# Assign unique id to each token
def tokenize(texts):
    max_len = 0
    tokenized_texts = []
    word2idx = {}

    # Add <pad> and <unk> tokens to the vocabulary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    # Building our vocab from the corpus starting from index 2
    idx = 2
    for sent in texts:
      tokenized_sent = nlp(sent)
      # Add `tokenized_sent` to `tokenized_texts`
      tokenized_texts.append(tokenized_sent)
      # Add new token to `word2idx`
      for token in tokenized_sent:
        # string any token objects are different things, be careful.
        if token.text not in word2idx:
          word2idx[token.text] = idx
          idx += 1

          # Update `max_len`
      max_len = max(max_len, len(tokenized_sent))

    return tokenized_texts, word2idx, max_len

In [175]:
# Encode the texts to ids
def encode(tokenized_texts, word2idx, max_len):
    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to max_len
        tokenized_padded_sent = list(tokenized_sent) + ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(str(token)) for token in tokenized_padded_sent]
        input_ids.append(input_id)

    return np.array(input_ids)

In [176]:
# Encode dataset 1 (anger & joy)
tokenized_texts_train1, word2idx_train1, max_len_train1 = tokenize(train_df1['Text'].tolist())
input_ids_train1 = encode(tokenized_texts_train1, word2idx_train1, max_len_train1)

tokenized_texts_test1, word2idx_test1, max_len_test1 = tokenize(test_df1['Text'].tolist())
input_ids_test1 = encode(tokenized_texts_test1, word2idx_test1, max_len_test1)

In [177]:
# Convert data type to torch.Tensor
train_inputs_1 = torch.from_numpy(input_ids_train1)
labels_train1 = torch.tensor(train_df1['Label'].values)

test_inputs_1 = torch.from_numpy(input_ids_test1)
labels_test1 = torch.tensor(test_df1['Label'].values)

## 5. Data Loaders & Batches

In [178]:
from torch.utils.data import TensorDataset, DataLoader

# Create DataLoader for training data
dataset_train1 = TensorDataset(train_inputs_1, labels_train1)
dataloader_train1 = DataLoader(dataset_train1, batch_size=64, shuffle=True)

## 6. Set the CNN classifier

In [179]:
class CNN(nn.Module):
    def __init__(self,
                 vocab_size=len(word2idx_train1),
                 embed_dim=500,
                 filter_sizes=[3, 4, 5],
                 num_filters=[50, 50, 50],
                 num_classes=2,
                 dropout=0.5):
        super(CNN, self).__init__()
        # Embedding layer
        self.embed_dim = embed_dim
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """
        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)
            Here is (64, 26) for dataset_1

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
            Should be (62, 2) for dataset_1
        """
        # Get embeddings from `input_ids`. Output shape: (batch_size, max_len, embed_dim)
        # (64, 26, 300)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len) -> (64, 300, 26)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        # (64, 100, 3)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling. Output shape: (b, num_filters[i], 1)
        # (64, 100, 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters)) -> (64, 300)
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, n_classes) -> (64, 2)
        logits = self.fc(self.dropout(x_fc))

        return logits

In [180]:
# Use GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [182]:
# Training the model

def train_model(model, train_loader, optimizer, loss_fn, device):
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12}")
    print("-"*60)

    for epoch_i in range(12):
        total_loss = 0
        # Put the model into the training mode
        model.train()
        for step, batch in enumerate(train_loader):
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)

            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_loader)
        print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f}")

    return avg_train_loss

## 7. Find the optimal model architecture with different combinations of hyperparameters

In [183]:
from sklearn.metrics import f1_score, accuracy_score

def evaluate_model(model, test_loader, device):
    model.eval()

    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for batch in test_loader:
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            
            logits = model(b_input_ids)

            # Move to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            
            all_predictions.append(logits)
            all_true_labels.append(label_ids)

    # Flatten the predictions and true values
    flat_predictions = np.concatenate(all_predictions, axis=0)
    flat_true_labels = np.concatenate(all_true_labels, axis=0)

    predictions = np.argmax(flat_predictions, axis=1)

    # Calculate performance metrics
    accuracy = accuracy_score(flat_true_labels, predictions)
    f1 = f1_score(flat_true_labels, predictions, average='macro')

    return accuracy, f1

In [184]:
#Use random search
import itertools
import random

hyperparameter_space = {
    'lr': [0.01, 0.001],
    'batch_size': [64, 128],
    'optimizer': [torch.optim.SGD, torch.optim.Adam]
}
# Generate all possible combinations
all_combinations = list(itertools.product(*hyperparameter_space.values()))
random.shuffle(all_combinations)

In [185]:
def sample_hyperparameters(space):
    return {param: random.choice(values) for param, values in space.items()}

#Set iteration number, let the repetition will not occur
num_iterations = min(8, len(all_combinations))
best_accuracy = 0
best_params = {}


for iteration in range(num_iterations):
    combination = all_combinations[iteration]
    params = dict(zip(hyperparameter_space.keys(), combination))
    model = CNN(vocab_size=len(word2idx_train1), num_classes=2)

    model = model.to(device) # using GPU
    

    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
    loss_fn = torch.nn.CrossEntropyLoss()
    train_model(model, dataloader_train1, optimizer, loss_fn, device)


    accuracy, f1_macro = evaluate_model(model, dataloader_train1, device)
    print(f"Iteration {iteration+1} - Hyperparameters: {params}, Accuracy: {accuracy}, F1 Macro: {f1_macro}")


    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params
        best_f1_score = f1_macro
print("Best Parameters:", best_params)
print(f"Best Accuracy: {best_accuracy}, Best F1 Macro: {best_f1_score}")

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.593084  
   2    |   0.358401  
   3    |   0.140306  
   4    |   0.044545  
   5    |   0.020962  
   6    |   0.015769  
   7    |   0.008692  
   8    |   0.006897  
   9    |   0.003832  
  10    |   0.003136  
  11    |   0.002573  
  12    |   0.002504  
Iteration 1 - Hyperparameters: {'lr': 0.001, 'batch_size': 128, 'optimizer': <class 'torch.optim.sgd.SGD'>}, Accuracy: 0.9995256166982922, F1 Macro: 0.9994681354337106
Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.477013  
   2    |   0.079494  
   3    |   0.066852  
   4    |   0.028246  
   5    |   0.025305  
   6    |   0.011082  
   7    |   0.008370  
   8    |   0.052701  
   9    |   0.077548  
  10    |   0.028133  
  11    |   0.032058  
  12    |   0.191354  
Iteration 2 - Hyperparameters: {'lr': 0.01, 'batch_size': 128, 'optimizer'

In [186]:
# Save the optimal model of dataset 1 and the optimal hyperparameters
model_path = "cnn_for_emotion"
torch.save(model.state_dict(), model_path)

## 8. Train another model for dataset 2

In [187]:
# Encode Dataset 2 (anger & sadness)
tokenized_texts_train2, word2idx_train2, max_len_train2 = tokenize(train_df2['Text'].tolist())
input_ids_train2 = encode(tokenized_texts_train2, word2idx_train2, max_len_train2)

tokenized_texts_test2, word2idx_test2, max_len_test2 = tokenize(test_df2['Text'].tolist())
input_ids_test2 = encode(tokenized_texts_test2, word2idx_test2, max_len_test2)

# Convert data type to torch.Tensor
train_df2['Label'] = train_df2['Label'].map({0: 0, 3: 1})
test_df2['Label'] = test_df2['Label'].map({0: 0, 3: 1})

train_inputs_2 = torch.from_numpy(input_ids_train2)
labels_train2 = torch.tensor(train_df2['Label'].values)

test_inputs_2 = torch.from_numpy(input_ids_test2)
labels_test2 = torch.tensor(test_df2['Label'].values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df2['Label'] = train_df2['Label'].map({0: 0, 3: 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df2['Label'] = test_df2['Label'].map({0: 0, 3: 1})


In [188]:
# Create DataLoader for dataset 1 training data
dataset_train1 = TensorDataset(train_inputs_1, labels_train1)
dataloader_train1 = DataLoader(dataset_train1,
                               batch_size=best_params['batch_size'], shuffle=True)

# Create DataLoader for dataset 1 test data
dataset_test1 = TensorDataset(test_inputs_1, labels_test1)
dataloader_test1 = DataLoader(dataset_test1,
                              batch_size=best_params['batch_size'], shuffle=False)

# Create DataLoader for dataset 2 training data
dataset_train2 = TensorDataset(train_inputs_2, labels_train2)
dataloader_train2 = DataLoader(dataset_train2,
                               batch_size=best_params['batch_size'], shuffle=True)

# Create DataLoader for dataset 2 test data
dataset_test2 = TensorDataset(test_inputs_2, labels_test2)
dataloader_test2 = DataLoader(dataset_test2,
                              batch_size=best_params['batch_size'], shuffle=False)

In [189]:
# Use optimal hyperparameters
best_model_1 = CNN(vocab_size=len(word2idx_train1), num_classes=2)
best_model_1 = best_model_1.to(device)

# Use optimal hyperparameters to train dataset 1
best_optimizer_1 = best_params['optimizer'](best_model_1.parameters(), lr=best_params['lr'])
train_model(best_model_1, dataloader_train1, best_optimizer_1, loss_fn, device)

# Evaluate on the test set using the best model
accuracy_1, f1_macro_1 = evaluate_model(best_model_1, dataloader_test1, device)

print("Dataset 1 Test Accuracy:", accuracy_1)
print("Dataset 1 Test F1-Macro:", f1_macro_1)

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.605476  
   2    |   0.370633  
   3    |   0.144021  
   4    |   0.052138  
   5    |   0.024000  
   6    |   0.012893  
   7    |   0.007619  
   8    |   0.005348  
   9    |   0.006060  
  10    |   0.003487  
  11    |   0.006683  
  12    |   0.002024  
Dataset 1 Test Accuracy: 0.5807860262008734
Dataset 1 Test F1-Macro: 0.5249655619479783


In [190]:
best_model_2 = CNN(vocab_size=len(word2idx_train2), num_classes=2)
best_model_2 = best_model_2.to(device)

# Use optimal hyperparameters to train dataset 2
best_optimizer_2 = best_params['optimizer'](best_model_2.parameters(), lr=best_params['lr'])
train_model(best_model_2, dataloader_train2, best_optimizer_2, loss_fn, device)

# Evaluate on the test set using the best model
accuracy_2, f1_macro_2 = evaluate_model(best_model_2, dataloader_test2, device)

print("Dataset 2 Test Accuracy:", accuracy_2)
print("Dataset 2 Test F1-Macro:", f1_macro_2)

Start training...

 Epoch  |  Train Loss 
------------------------------------------------------------
   1    |   0.626333  
   2    |   0.381359  
   3    |   0.161623  
   4    |   0.070541  
   5    |   0.044059  
   6    |   0.032307  
   7    |   0.028278  
   8    |   0.028845  
   9    |   0.018266  
  10    |   0.023640  
  11    |   0.021702  
  12    |   0.020218  
Dataset 2 Test Accuracy: 0.4968085106382979
Dataset 2 Test F1-Macro: 0.4774334509819132
