 # Emotion Recognition with a CNN

## Load Data

Pick two different emotion classes for your model to predict (e.g., anger and joy). Load/filter your
dataset to include only the related class data. Create another dataset and change only one of the
classes (e.g., anger and sadness) this time.

In [1]:
import pandas as pd

In [2]:
test_text = pd.read_csv('test_text.txt', header=None, names=['text'], sep='\r\n', engine='python')
test_labels = pd.read_csv('test_labels.txt', header=None, names=['label'], sep='\r\n', engine='python')
test_data = pd.concat([test_text, test_labels], axis=1)

print("Preview of testing data: ")
test_data[:5]

Preview of testing data: 


Unnamed: 0,text,label
0,#Deppression is real. Partners w/ #depressed p...,3
1,@user Interesting choice of words... Are you c...,0
2,My visit to hospital for care triggered #traum...,3
3,@user Welcome to #MPSVT! We are delighted to h...,1
4,What makes you feel #joyful?,1


In [3]:
train_text = pd.read_csv('train_text.txt', header=None, names=['text'], sep='\r\n', engine='python')
train_labels = pd.read_csv('train_labels.txt', header=None, names=['label'], sep='\r\n', engine='python')
train_data = pd.concat([train_text, train_labels], axis=1)

print("Preview of training data: ")
train_data[:5]

Preview of training data: 


Unnamed: 0,text,label
0,“Worry is a down payment on a problem you may ...,2
1,My roommate: it's okay that we can't spell bec...,0
2,No but that's so cute. Atsu was probably shy a...,1
3,Rooneys fucking untouchable isn't he? Been fuc...,0
4,it's pretty depressing when u hit pan on ur fa...,3


In [4]:
val_text = pd.read_csv('val_text.txt', header=None, names=['text'], sep='\r\n', engine='python')
val_labels = pd.read_csv('val_labels.txt', header=None, names=['label'], sep='\r\n', engine='python')
val_data = pd.concat([val_text, val_labels],  axis=1)

print("Preview of validation data: ")
val_data[:5]

Preview of validation data: 


Unnamed: 0,text,label
0,"@user @user Oh, hidden revenge and anger...I r...",0
1,if not then #teamchristine bc all tana has don...,0
2,Hey @user #Fields in #skibbereen give your onl...,0
3,Why have #Emmerdale had to rob #robron of havi...,0
4,@user I would like to hear a podcast of you go...,0


In [5]:
with open(f'mapping.txt') as f:
    mapping = f.read().replace('\t', ' ').split('\n')
mapping

['0 anger', '1 joy', '2 optimism', '3 sadness']

In [6]:
# with open(f'test_text.txt') as f:
#     test_text = f.read().split('\n')
# with open(f'test_labels.txt') as f:
#     test_labels = f.read().split('\n')
# with open(f'train_text.txt') as f:
#     train_text = f.read().split('\n')
# with open(f'train_labels.txt') as f:
#     train_labels = f.read().split('\n')
# with open(f'val_text.txt') as f:
#     val_text = f.read().split('\n')
# with open(f'val_labels.txt') as f:
#     val_labels = f.read().split('\n')
# with open(f'mapping.txt') as f:
#     mapping = f.read().replace('\t', ' ').split('\n')

First two classes: optimism, sadness (2, 3)


Second two classes: anger, sadness (0, 3)

In [7]:
def filter_data_by_classes(dataset : pd.DataFrame, two_classes=[2,3]):
    filtered_data = dataset[dataset['label'].isin(two_classes)]
    return filtered_data

train_23 = filter_data_by_classes(train_data)
test_23 = filter_data_by_classes(test_data)
val_23 = filter_data_by_classes(val_data)

train_03 = filter_data_by_classes(train_data, [0, 3])
test_03 = filter_data_by_classes(test_data, [0, 3])
val_03 = filter_data_by_classes(val_data, [0, 3])

In [8]:
print("Preview of training data filtered: ")
train_23[:3], train_03[:3]

Preview of training data filtered: 


(                                                text  label
 0  “Worry is a down payment on a problem you may ...      2
 4  it's pretty depressing when u hit pan on ur fa...      3
 6  Making that yearly transition from excited and...      3,
                                                 text  label
 1  My roommate: it's okay that we can't spell bec...      0
 3  Rooneys fucking untouchable isn't he? Been fuc...      0
 4  it's pretty depressing when u hit pan on ur fa...      3)

## Data Preprocessing

1. Special Characters Cleaning
2. Character Casing
3. Stop Word Removal

In [9]:
import nltk

In [10]:
input = "It's a text to test pre-processing functions. "

### Special Characters Cleaning

In [11]:
# import library: Regular Expression
import re

"""
Clean the data by removing special characters (punctuation)
"""
def sp_chara_cleaning(text):
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return clean_text

In [12]:
# Special characters like ' . - are removed.
input = sp_chara_cleaning(input)
input

'It s a text to test pre processing functions  '

### Character Casing

In [13]:
"""
Lowercase all words.
"""
def character_casing(text):
    lower_text = text.lower()
    return lower_text

In [14]:
# All cases become lowercases.
input = character_casing(input)
input

'it s a text to test pre processing functions  '

### Stop Word Removal

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
from nltk.corpus import stopwords

"""
Here we remove words that in English stop words list.
"""
def stop_word_removal(text):
    words = text.split()
    stop_words = stopwords.words("english")
    clean_words = [w for w in words if w not in stop_words]
    clean_text = " ".join(clean_words)
    return clean_text



In [17]:
# Stop words like 'it', 's', 'a', 'this' are removed.
input = stop_word_removal(input)
print(input)

text test pre processing functions


### Apply Preprocessing Functions
Skip this if you already have saved data after pre-processing.

In [18]:
data_list = [train_23, test_23, val_23, train_03, test_03, val_03]
funcs = [sp_chara_cleaning, character_casing, stop_word_removal]
preview = train_23["text"][:1].copy()
for func in funcs:
    print("Function in process: ", func.__name__)
    for data in data_list:
        data["text"] = data["text"].apply(func, )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["text"] = data["text"].apply(func, )


Function in process:  sp_chara_cleaning
Function in process:  character_casing
Function in process:  stop_word_removal


In [19]:
print("Sample before pre-processing: \n", preview[0])
print("Sample after pre-processing: \n", train_23["text"][0])

Sample before pre-processing: 
 “Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry
Sample after pre-processing: 
 worry payment problem may never joyce meyer motivation leadership worry


### Convert Labels to Classes Space

In [20]:
for data in data_list:
  data["label"] = data["label"].map({0:0, 1:1, 2:0, 3:1})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label"] = data["label"].map({0:0, 1:1, 2:0, 3:1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label"] = data["label"].map({0:0, 1:1, 2:0, 3:1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label"] = data["label"].map({0:0, 1:1, 2:0, 3:1})
A value is trying to be set on a copy

In [21]:
train_23[:5]

Unnamed: 0,text,label
0,worry payment problem may never joyce meyer mo...,0
4,pretty depressing u hit pan ur favourite highl...,1
6,making yearly transition excited hopeful colle...,1
11,newyork several baloch amp indian activists ho...,1
17,saved ordering risk life panic stayed calm res...,0


### Create Dataset for Training

#### Tokenize Sentences

In [22]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Col

In [23]:
from transformers import AutoTokenizer, AutoConfig

# Load bert-base-uncased, a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
embedding_dim = config.hidden_size
embedding_dim

768

In [25]:
vocab_size = tokenizer.vocab_size
vocab_size

30522

In [26]:
input_ids_list = []
# data_list = [train_23, test_23, val_23, train_03, test_03, val_03]
for data in data_list:
    input_ids_list.append(tokenizer(list(data["text"]), padding=True, truncation=True, return_tensors="pt"))

In [27]:
import torch
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# device = torch.device("mps")
device

'cuda:0'

In [29]:

from torch.utils.data import TensorDataset, DataLoader

def createDataLoader(X, y, batch_size=64):
  torch.manual_seed(1)
  data_set = TensorDataset(X.to(device), y.to(device))
  data_loader = DataLoader(data_set, batch_size=batch_size)
  return data_loader


In [30]:
data_loaders = []
for i, input_ids in enumerate(input_ids_list):
  data_loaders.append(createDataLoader(input_ids.input_ids, torch.tensor(list(data_list[i]["label"]))))

## Training

### Model

In [31]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import time
import logging

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmotionCNN(nn.Module):
    def __init__(self,
                 vocab_size = vocab_size,
                 embedding_dim = embedding_dim,
                 num_filters = [50, 100, 150],
                 filter_sizes = [3, 4, 5],
                 num_classes = 2,
                 dropout = 0.2,
                 stride = 1,
                 pool_func = 'max'):
        super(EmotionCNN, self).__init__()

        assert len(num_filters) == len(filter_sizes)

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, max_norm=5.0)

        # Convolutional layers
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters[i],
                kernel_size=filter_sizes[i],
                stride=stride,
                ) for i in range(len(filter_sizes))
        ])

        # Fully connected network
        self.fc = nn.Linear(sum(num_filters), num_classes)

        # Dropout
        self.dropout = nn.Dropout(dropout)

        self.pool_func = pool_func

    def forward(self, input):
        # input: [batch_size, seq_length]

        # Calculate embeddings
        # embedded: [batch_size, seq_length, embedding_dim]
        embedded = self.embedding(input)

        # Switch the last two dimensions to match input for convs
        # permuted: [batch_size, embedding_dim, seq_length]
        permuted = embedded.permute(0, 2, 1)

        # Apply convolution and ReLU activation
        # conved: [batch_size, ...]
        conved = [F.relu(conv(permuted)) for conv in self.convs]

        # Max pooling or average pooling over the time dimension
        # pooled: [batch_size, num_filters]
        if self.pool_func == 'max':
            pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        else:
            pooled = [F.avg_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        # Concatenate pooled features
        # cat: [batch_size, sum(num_filters)]
        cat = torch.cat(pooled, dim=1)

        # Apply dropout
        # dropped: [batch_size, sum(num_filters)]
        dropped = self.dropout(cat)

        # Apply all full connected layers
        # output: [batch_size, num_classes]
        output = self.fc(dropped)


        # for data in [input, embedded, permuted, cat, dropped, output]:
        #     print(data.shape)

        return output


### GridSearchCV

In [33]:
%pip install skorch

Collecting skorch
  Downloading skorch-0.15.0-py3-none-any.whl (239 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/239.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m122.9/239.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.3/239.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: skorch
Successfully installed skorch-0.15.0


In [74]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from skorch.callbacks import EarlyStopping

from skorch import NeuralNetClassifier


EmoCNN = NeuralNetClassifier(
    module=EmotionCNN(
            vocab_size=vocab_size,
            embedding_dim=embedding_dim),
    callbacks = [EarlyStopping()],
    max_epochs=200,
    criterion=nn.CrossEntropyLoss(),
    verbose = 0,
    device='cuda',
)


# Here are the parameters we want to find best.
# These include the 5 requirements in Exercise 1 document.
param_grid = {  # optimizer
                'optimizer': [torch.optim.SGD, torch.optim.Adam],

                # learning rate
                'optimizer__lr': [0.1, 0.01],

                # layer sizes
                'module__dropout': [0.2, 0.5],

                # number of filters
                'module__num_filters': [[50, 100, 150], [150, 150, 150]],

                # stride
                'module__stride': [1, 2],

                # kernel size
                'module__filter_sizes': [[3, 4, 5], [2, 3, 4]],

                # pooling
                'module__pool_func': ['avg', 'max'],

                # early stopping
                'callbacks__EarlyStopping__patience': [10, 20],

    }

In [75]:
gs_CNN = GridSearchCV(EmoCNN, param_grid, cv=3, verbose=1)

In [76]:
X_gs, y_gs = input_ids.input_ids[:600], torch.tensor(list(data_list[i]["label"]))[:600]
gs_CNN.fit(X_gs, y_gs)

Fitting 3 folds for each of 256 candidates, totalling 768 fits


In [77]:
CNN_df = pd.DataFrame.from_dict(gs_CNN.cv_results_)
CNN_df.sort_values(by=["rank_test_score"]).iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_callbacks__EarlyStopping__patience,param_module__dropout,param_module__filter_sizes,param_module__num_filters,param_module__pool_func,param_module__stride,param_optimizer,param_optimizer__lr,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
211,1.471128,0.085637,0.005902,0.000218,20,0.5,"[3, 4, 5]","[150, 150, 150]",avg,1,<class 'torch.optim.adam.Adam'>,0.01,"{'callbacks__EarlyStopping__patience': 20, 'mo...",0.710843,0.686747,0.73494,0.710843,0.019675,1
163,1.487656,0.101992,0.006982,0.002318,20,0.2,"[2, 3, 4]","[50, 100, 150]",avg,1,<class 'torch.optim.adam.Adam'>,0.01,"{'callbacks__EarlyStopping__patience': 20, 'mo...",0.686747,0.722892,0.698795,0.702811,0.015027,2
67,0.809939,0.066146,0.005046,0.000376,10,0.5,"[3, 4, 5]","[50, 100, 150]",avg,1,<class 'torch.optim.adam.Adam'>,0.01,"{'callbacks__EarlyStopping__patience': 10, 'mo...",0.638554,0.698795,0.771084,0.702811,0.05418,2
195,1.28599,0.115854,0.005309,0.000192,20,0.5,"[3, 4, 5]","[50, 100, 150]",avg,1,<class 'torch.optim.adam.Adam'>,0.01,"{'callbacks__EarlyStopping__patience': 20, 'mo...",0.722892,0.650602,0.73494,0.702811,0.037243,2
226,1.195224,0.135348,0.004467,0.000153,20,0.5,"[2, 3, 4]","[50, 100, 150]",avg,1,<class 'torch.optim.adam.Adam'>,0.1,"{'callbacks__EarlyStopping__patience': 20, 'mo...",0.686747,0.686747,0.722892,0.698795,0.017039,5


In [78]:
CNN_df.sort_values(by=["rank_test_score"]).iloc[0]['params']

{'callbacks__EarlyStopping__patience': 20,
 'module__dropout': 0.5,
 'module__filter_sizes': [3, 4, 5],
 'module__num_filters': [150, 150, 150],
 'module__pool_func': 'avg',
 'module__stride': 1,
 'optimizer': torch.optim.adam.Adam,
 'optimizer__lr': 0.01}

In [108]:
import torch.optim as optim

# Instantiate CNN model
model = EmotionCNN(
            dropout=0.5,
            filter_sizes = [3, 4, 5],
            num_filters = [150, 150, 150],
            pool_func = 'avg',
            stride = 1
            )

# Send model to `device` (GPU/CPU)
model.to(device)

# Instantiate Adadelta optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Specify loss function
criterion = nn.CrossEntropyLoss()
logging.basicConfig(filename='EmotionCNN.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [109]:

# Start training loop
print("Start training...\n")
print("-"*60, "\n")

epochs = 200
lowest_loss = float('inf')
max_patience = 5
# data order: train_23, test_23, val_23, train_03, test_03, val_0
for epoch in range(epochs):
  total_loss = 0
  start_time = time.time()
  correct = 0
  total = 0
  # Put the model into the training mode
  model.train()
  for batch_num, (b_input_ids, b_labels) in enumerate(data_loaders[0]):

    # Torch accumulates gradients. Before passing in a
    # new instance, zero out the gradients from the old instance
    model.zero_grad()

    # Perform a forward pass. This will return logits.
    logits = model(b_input_ids)

    # Compute loss and accumulate the loss values
    loss = criterion(logits, b_labels)

    total_loss += loss.item()

    # Perform a backward pass to calculate gradients
    loss.backward()

    # Update parameters
    optimizer.step()

    _, predicted = logits.max(dim=1)
    total += len(b_labels)
    correct += predicted.eq(b_labels).sum().item()

    # Calculate the average loss over the entire training data
  avg_train_loss = total_loss / len(data_loaders[0])
  train_accuracy = correct / total


  # Validation
  model.eval()
  val_loss = 0.0
  correct = 0
  total = 0

  with torch.no_grad():
      for batch_num, (b_input_ids, b_labels) in enumerate(data_loaders[2]):

          val_logits = model(b_input_ids)

          # Compute the validation loss
          val_loss += criterion(val_logits, b_labels).item()

          # Calculate validation accuracy
          _, predicted = val_logits.max(dim=1)
          total += len(b_labels)
          correct += predicted.eq(b_labels).sum().item()

  avg_val_loss = val_loss / len(data_loaders[2])
  val_accuracy = correct / total

  info = f"Epoch: {epoch + 1} / {epochs} Time: {time.time() - start_time:.2f}s \
\nTrain Loss: {avg_train_loss:.4f} Train Acc: {train_accuracy:.4f} \
\nVal Loss: {avg_val_loss:.4f} Val Acc: {val_accuracy:.4f}\n"
  print(info)
  print("-"*60, "\n")
  logging.info(info)

  if val_loss < lowest_loss:
      lowest_loss = val_loss
      patience = 0  # Reset patience counter
  else:
      patience += 1  # Increment patience counter

  if patience >= max_patience:
      print(f'Early stopping after {epoch} epochs.')
      break

torch.save(model.state_dict(), f'EmotionCNN_{epochs}.pth')


Start training...

------------------------------------------------------------ 

Epoch: 1 / 200 Time: 0.38s 
Train Loss: 0.5717 Train Acc: 0.7206 
Val Loss: 0.5300 Val Acc: 0.7607

------------------------------------------------------------ 

Epoch: 2 / 200 Time: 0.32s 
Train Loss: 0.5010 Train Acc: 0.7511 
Val Loss: 0.4876 Val Acc: 0.7607

------------------------------------------------------------ 

Epoch: 3 / 200 Time: 0.32s 
Train Loss: 0.3105 Train Acc: 0.8790 
Val Loss: 0.4456 Val Acc: 0.8120

------------------------------------------------------------ 

Epoch: 4 / 200 Time: 0.32s 
Train Loss: 0.1189 Train Acc: 0.9600 
Val Loss: 0.4887 Val Acc: 0.7949

------------------------------------------------------------ 

Epoch: 5 / 200 Time: 0.32s 
Train Loss: 0.0575 Train Acc: 0.9782 
Val Loss: 0.6018 Val Acc: 0.7863

------------------------------------------------------------ 

Epoch: 6 / 200 Time: 0.34s 
Train Loss: 0.0248 Train Acc: 0.9948 
Val Loss: 0.6337 Val Acc: 0.8034

---

In [110]:
model.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for batch_num, (b_input_ids, b_labels) in enumerate(data_loaders[1]):

        test_logits = model(b_input_ids)

        # Compute the validation loss
        test_loss += criterion(test_logits, b_labels).item()

        # Calculate validation accuracy
        _, predicted = test_logits.max(dim=1)
        total += len(b_labels)
        correct += predicted.eq(b_labels).sum().item()

avg_test_loss = test_loss / len(data_loaders[1])
test_accuracy = correct / total
info = f"Test Loss: {avg_test_loss:.4f} Test Acc: {test_accuracy:.4f}"
print(info)
logging.info(info)


Test Loss: 0.6034 Test Acc: 0.8119
