**Exercise**

Desing a FFN Neural network that learn to classify the emotional label of a given input sentence. We assum that, each word in the input sentence have a weight affect to the label. To this end, we need to learn these hidden weights.

**tips**: *design a Embedding layer (random init) to convert word to vector. Then `sum` all word embedding vectors, then transform document vector to the label values (images)*

![model_arc](../../img/dl_tutorial-Trang-2.drawio.png)

In [1]:
import torch
import numpy as np 
 
# =====================

from torchtext.vocab import vocab
from collections import Counter, OrderedDict
from datasets import load_dataset
import re

def split_tokens(sentence):                             
    return [w for w in re.split(r" +",  re.sub(r"[^a-z@# ]", "", sentence.lower()))]   

dataset = load_dataset('emotion')
train_data = dataset['train']
all_words = []
all_labels = []
for sample in train_data:
    all_words+= split_tokens(sample['text']) 
    all_labels.append(sample['label'])

# build vocab - using vocab object of torchtext 
counter = Counter(all_words)
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: x[1], reverse=True)
my_vocab = vocab(OrderedDict(sorted_by_freq_tuples), specials=['<pad>','<unk>'])
my_vocab.set_default_index(my_vocab['<unk>'])

# count label 
num_labels = len(set(all_labels))
from torch.utils.data import DataLoader
import torch

def convert_sentence_to_ids(sentence, vocab):    
    word_ids = None
    # - split sentence to tokens using `split_tokens` defined above
    # (tips: split sentence to list of words, then feed to the vocab to get list of id) 
    word_ids = vocab(split_tokens(sentence))

    # ===================================
    return word_ids


def get_max_sentence_length_in_batch(batch_input_ids): 
    # - find and return the MAXIMUM length (number of word) of each sample (sentence) in a batch.
    
    max_sentence_length = max([len(e) for e in batch_input_ids])
    return max_sentence_length


def add_padding(batch_input_ids, padding_id):
    max_sample_len_in_batch = get_max_sentence_length_in_batch(batch_input_ids=batch_input_ids)

    # - batch data contains many sentence having difference number of words. To train a deep learning model
    #   we need to convert it to tensor which have the same length for all sentences. 
    # - We need to add padding into each sentence (sample) in a batch. 
    # - for example: a batch contains [[1,2,3,4],[6,7,8],[9]] ==(after padding 0)==> [[1,2,3,4],[6,7,8,0],[9,0,0,0]]
    # (tips: each sample, calculate the number of padding tokens need to add to get max_sample_len_in_batch) 
    padded_word_ids = []
    for i, word_ids in enumerate(batch_input_ids):
        padded_word_ids.append(word_ids + [padding_id] * (max_sample_len_in_batch - len(word_ids)))
    return padded_word_ids


class BatchPreprocessor(object):
    def __init__(self, vocab):
        self.vocab = vocab 

    def __call__(self, batch):
        inputs = []
        masks = []

        # covert text to number 
        for sample in batch:
            word_ids = convert_sentence_to_ids(sample['text'], self.vocab)
            inputs.append(word_ids)
        
        # padding to create a tensor input - make all sentence having the same length 
        padding_id = self.vocab["<pad>"]
        padded_batch = add_padding(batch_input_ids=inputs, padding_id=padding_id)

        # label processing 
        labels = []
        for sample in batch:
            label = sample['label']
            labels.append(int(label))

        # make a tensor 
        inputs = torch.LongTensor(padded_batch)

        # make mask flag tensor
        masks = inputs == padding_id

        return (inputs, torch.FloatTensor(labels), torch.BoolTensor(masks)) 


No config specified, defaulting to: emotion/split
Found cached dataset emotion (/home/phuongnm/.cache/huggingface/datasets/emotion/split/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:

batch_size = 60

# dataset_example should support operator index_selection for create the data_loader object
test_loader = DataLoader(dataset['test'], batch_size=batch_size, collate_fn=BatchPreprocessor(my_vocab), shuffle=True)
train_loader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=BatchPreprocessor(my_vocab), shuffle=True)
valid_loader = DataLoader(dataset['validation'], batch_size=batch_size, collate_fn=BatchPreprocessor(my_vocab), shuffle=True)
for e in test_loader:
    print('First epoch data:')
    print('input data\n', e[0])
    print('label data\n',e[1])
    print('padding mask data\n',e[2])
    break  

First epoch data:
input data
 tensor([[   2,   70,   17,  ...,    0,    0,    0],
        [   2,    3,   14,  ...,    0,    0,    0],
        [   2,   24,    8,  ...,    0,    0,    0],
        ...,
        [   2,    3,  723,  ...,    0,    0,    0],
        [   2, 1293,    6,  ...,    0,    0,    0],
        [   2,    3,  110,  ...,    0,    0,    0]])
label data
 tensor([0., 1., 1., 3., 2., 0., 1., 0., 0., 1., 1., 1., 1., 3., 3., 4., 0., 4.,
        1., 0., 4., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 3., 0., 2.,
        1., 4., 1., 0., 1., 4., 1., 0., 0., 1., 1., 2., 2., 4., 0., 0., 4., 2.,
        3., 4., 0., 1., 2., 2.])
padding mask data
 tensor([[False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        ...,
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ...,  

In [3]:
print('train size', len(train_loader))
print('test size',  len(test_loader))
len(my_vocab)

train size 267
test size 34


15214

In [4]:
from torch import nn



d_model = 200
word_embedding = nn.Embedding(num_embeddings=len(my_vocab), embedding_dim=d_model, padding_idx=my_vocab['<pad>'])
word_embedding.cuda()

# ===================================
# REQUIREMENT:
# - construct a Linear (dense connection) layer to transform a document vector (embedding size) to 1 
# - NOTE: then move this layer to CUDA device for computation 
# ===================================
# - PUSH YOUR CODE IN HERE, can not modify any code in outside this range.  

output_layer = None
# ===================================

# loss also is supported by a library 
loss_computation = torch.nn.MSELoss()
optimizer = torch.optim.Adam(list(word_embedding.parameters()) + list(output_layer.parameters()), lr = 1e-3)    # using Adam optimizer instead of SGD



In [5]:
def forward_function(w_vectors, output_module):
    # ===================================
    # REQUIREMENT: compute emotion prediction values given all word embedding vectors 
    # we assum that, each word in the input sentence have a weight effect to the label
    # and we need to learn these hidden weights.
    #
    # - Compute the document vector based on the input word embedding vector based on sum operator. 
    #   e.g. doc1 = sum([w1, w2, ... wn]) = w1 + w2 + ... + wn 
    #   NOTE: check function `torch.sum` (https://pytorch.org/docs/stable/generated/torch.sum.html)
    #
    # - forward the document vector to `output_module` layer to get emotion values. 
    # ===================================
    # - PUSH YOUR CODE IN HERE, can not modify any code in outside this range. 
 
    label_vectors = None
    # ===================================
    return label_vectors

In [6]:
def eval(data_loader):
    count_true = 0
    count_total = 0
    for batch in data_loader:

        x, y_gold, masked = batch

        x = x.cuda()
        y_gold = y_gold.cuda()
        
        # ============= ###### IMPORTANT ######## ===============
        # Forward pass: Compute predicted y by passing x to the model
        w_vectors = word_embedding(x) #  batch size x sequence length x hidden size 
        label_vectors = forward_function(w_vectors=w_vectors, output_module=output_layer)
        
        predictions = torch.ceil(label_vectors.squeeze())
        # ============= ######################### ===============

        count_true += torch.sum((predictions==y_gold) == True)
        count_total += x.shape[0]

    return count_true / count_total
print('Acc model BEFORE train = ', eval(test_loader))

Acc model BEFORE train =  tensor(0.1310, device='cuda:0')


In [7]:


MAX_EPOCHS=15
for epoch in range(MAX_EPOCHS):
    avg_loss = 0.0
    for batch in train_loader:

        x, y_gold, masked = batch

        x = x.cuda()
        y_gold = y_gold.cuda() 
        

        # ============= ###### IMPORTANT ######## ===============
        # Forward pass: Compute predicted y by passing x to the model
        w_vectors = word_embedding(x) #  batch size x sequence length x hidden size  

        label_vectors = forward_function(w_vectors=w_vectors, output_module=output_layer)

        # Compute and loss = average ((out_put - pred) ^ 2)
        loss = loss_computation(label_vectors.squeeze(), y_gold) 
        # ============= ######################### ===============

        # perform a backward pass (backpropagation) => to compute the gradient values in Tensor weights
        loss.backward()
        avg_loss += loss.item()

        # USE LIBRARY: 'model.parameters()' in stead of 'model.get_parameter()' is implemented by library, also return list of parameters: "weight" and "bias" 
        # Optimizer step(), this update gradient values to weights.
        optimizer.step() # instead of `param.add_(-lr * param.grad)` => update weight values
        optimizer.zero_grad() # instead of `param.grad.fill_(0)` => remove all the old gradient values in all Tensor weight
    
    avg_loss = avg_loss / len(train_loader)
    if avg_loss < 0.0001:
        print(loss)
        break
    print('epoch/batch: ', epoch, ' avg loss: ', avg_loss, "Acc=", eval(valid_loader))
    # break 


epoch/batch:  0  avg loss:  3.7099706077397094 Acc= tensor(0.1860, device='cuda:0')
epoch/batch:  1  avg loss:  1.8406190037280878 Acc= tensor(0.2255, device='cuda:0')
epoch/batch:  2  avg loss:  1.2563680033112286 Acc= tensor(0.2955, device='cuda:0')
epoch/batch:  3  avg loss:  0.8811012373658155 Acc= tensor(0.3145, device='cuda:0')
epoch/batch:  4  avg loss:  0.6801902278978726 Acc= tensor(0.3490, device='cuda:0')
epoch/batch:  5  avg loss:  0.5625927380185002 Acc= tensor(0.3410, device='cuda:0')
epoch/batch:  6  avg loss:  0.48991260730595176 Acc= tensor(0.3585, device='cuda:0')
epoch/batch:  7  avg loss:  0.4360239225045572 Acc= tensor(0.3315, device='cuda:0')
epoch/batch:  8  avg loss:  0.39910079287679007 Acc= tensor(0.3885, device='cuda:0')
epoch/batch:  9  avg loss:  0.37134649389692015 Acc= tensor(0.3775, device='cuda:0')
epoch/batch:  10  avg loss:  0.35205166415775313 Acc= tensor(0.3755, device='cuda:0')
epoch/batch:  11  avg loss:  0.3304659425933263 Acc= tensor(0.3355, dev

In [8]:
print('Acc model AFTER train = ', eval(test_loader))


Acc model AFTER train =  tensor(0.3335, device='cuda:0')
