# Redes Neuronales Recurrentes

En esta notebook veremos cómo construir una *red neuronal recurrente* para el problema de clasificación de texto con el conjunto de datos **IMDB reviews**.

In [1]:
import numpy as np
import pandas as pd

import csv
import gzip

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from gensim import corpora
from gensim.parsing import preprocessing
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn import metrics
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm, trange

## Parte 1: Preprocesamiento del texto

Primero obtenemos el dataset como se explica en la notebook `5_CNNs.ipynb`.

In [2]:
# DataSet
class IMDBReviewsDataset(Dataset):
    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, item):
        if torch.is_tensor(item):
            item = item.to_list()

        item = {
            'data': self.dataset.loc[item, 'review'],
            'target': self.dataset.loc[item, 'sentiment']
        }

        if self.transform:
            item = self.transform(item)

        return item

# Pre-Procesamiento
class RawDataProcessor:
    def __init__(self, dataset, ignore_header=True, filters=None, vocab_size=50000):
        if filters:
            self.filters = filters
        else:
            self.filters = [
                lambda s: s.lower(),
                preprocessing.strip_tags,
                preprocessing.strip_punctuation,
                preprocessing.strip_multiple_whitespaces,
                preprocessing.strip_numeric,
                preprocessing.remove_stopwords,
                preprocessing.strip_short,
            ]

        # Create dictionary based on all the reviews (with corresponding preprocessing)
        self.dictionary = corpora.Dictionary(
            dataset['review'].map(self._preprocess_string).tolist()
        )
        # Filter the dictionary and compactify it (make the indices continous)
        self.dictionary.filter_extremes(no_below=2, no_above=1, keep_n=vocab_size)
        self.dictionary.compactify()
        # Add a couple of special tokens
        self.dictionary.patch_with_special_tokens({'[PAD]': 0, '[UNK]': 1})
        self.idx_to_target = sorted(dataset['sentiment'].unique())
        self.target_to_idx = {t: i for i, t in enumerate(self.idx_to_target)}

    def _preprocess_string(self, string):
        return preprocessing.preprocess_string(string, filters=self.filters)

    def _sentence_to_indices(self, sentence):
        return self.dictionary.doc2idx(sentence, unknown_word_index=1)

    def encode_data(self, data):
        return self._sentence_to_indices(self._preprocess_string(data))

    def encode_target(self, target):
        return self.target_to_idx[target]

    def __call__(self, item):
        if isinstance(item['data'], str):
            # String
            data = self.encode_data(item['data'])
        else:
            # Iterable
            data = [self.encode_data(d) for d in item['data']]

        if isinstance(item['target'], str):
            # String
            target = self.encode_target(item['target'])
        else:
            # Iterable
            target = [self.encode_target(t) for t in item['target']]

        return {'data': data, 'target': target, 'sentence': item['data']}

In [3]:
dataset = pd.read_csv('data/imdb_reviews.csv.gz')

preprocess = RawDataProcessor(dataset)

train_indices, test_indices = train_test_split(dataset.index, test_size=0.2, random_state=123)

train_dataset = IMDBReviewsDataset(dataset.loc[train_indices].reset_index(drop=True), transform=preprocess)
test_dataset = IMDBReviewsDataset(dataset.loc[test_indices].reset_index(drop=True), transform=preprocess)

In [4]:
print('Sample element...')

element = train_dataset[1]

print(element['sentence'])
print('=' * 10)
print(element['data'])
print('=' * 10)
print(f'#Indexes: {len(element["data"])}')
print(f'Sentiment: {element["target"]}')

Sample element...
I came to NEW PORT SOUTH expecting a surrogate movie about the Columbine school massacre similar to Gus Van Sant's ELEPHANT and certainly the synopsis in the TV guide stating that a student sociopath rebels against the system did give me that impression but this is a very boring movie where little happens so consider yourself warned <br /><br />The story is about Maddox , a Chicago high school student who decides to strike back at what he perceives to be an authoritarian regime . The major problem is that the character is underwritten and the actor who plays him Blake Shields is unable to embellish any script deficiencies . You have the gut feeling that Maddox should have the evil charisma of Hitler , Saddam or Bin Laden but he never comes across as anything more than a petulant truculent teenager and it's impossible to believe he could rally any disciples . The subtext of you overthrow one manipulative authoritarian regime only to replace it with another manipulative

In [5]:
# Padding
class PadSequences:
    def __init__(self, pad_value=0, max_length=100):
        self.pad_value = pad_value
        self.max_length = max_length

    def __call__(self, items):
        data, target = list(zip(*[(item['data'], item['target']) for item in items]))
        seq_lengths = [len(d) for d in data]

        max_length = self.max_length
        seq_lengths = [min(self.max_length, l) for l in seq_lengths]

        data = [d[:l] + [self.pad_value] * (max_length - l) for d, l in zip(data, seq_lengths)]

        return {'data': torch.LongTensor(data), 'target': torch.FloatTensor(target)}

In [6]:
MAX_SEQUENCE_LEN = 100
pad_sequences = PadSequences(max_length=MAX_SEQUENCE_LEN)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=pad_sequences, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=pad_sequences, drop_last=False)

In [7]:
batch = iter(train_loader).next()

batch['data'].shape

torch.Size([128, 100])

**¿A qué corresponde cada dimensión?**

- *128* es el tamaño del lote.
- *100* es el tamaño de las reseñas.

## Parte 2: Esqueleto de la red neuronal

Lo siguiente que debemos pensar es cómo será la arquitectura de nuestra red para resolver la tarea deseada. En esta sección crearemos el modelo sequencial con *PyTorch* que representará nuestra red.

Para poder implementar el modelo debemos responder las siguientes preguntas:
- ¿Es una red one-to-one, one-to-many, many-to-one, o many-to-many?

**Respuesta**
Tenemos una sentencia de palabras y queremos una clasificación binaria, *many-to-one*.

- ¿Cuál es el formato de entrada y de salida de la red?

**Respuesta**
Para la entrada, tenemos *128 x 100* (*batch_size x sequence_length*).
Para la salida, tenemos *128* (*batch_size*).

- ¿Cuál es el tamaño de las matrices (tensores) de entrada y de salida (con respecto a la capa de embedding)?

**Respuesta**
Para la entrada, tenemos *128 x 100*.
Para la salida, tenemos *128 x 100 x e* donde *e* será el tamaño del embedding.

- Luego de que la entrada pasa por la capa recurrente, ¿Qué tamaño tiene el tensor?

**Respuesta**
El tensor tendrá el tamaño *128 x 100 x h* donde *h* será el tamaño de la capa recurrente.

- ¿Cómo se conecta la salida de la capa recurrente con la capa densa que realiza la clasificación?

**Respuesta**
La capa densa tomará un vector de tamaño *100 x h* por cada uno de los elementos del lote.

- ¿Cuál es el loss apropiado para este problema?

**Repuesta**
Es un problema de clasificación binaria, *log loss*.

Primero importamos los módulos que necesitaremos para implementar nuestra red...

- `torch`: Acceso a todo el framework.
- `torch.nn`: Acceso a capas implementadas y a la clase *Module* para instanciar nuestra red.

In [8]:
import torch
import torch.nn as nn

# Check if we have a GPU available
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

In [9]:
class IMDBLSTM(nn.Module):
    def __init__(self,
                 pretrained_embeddings_path, # Dirección a los Embeddings
                 dictionary,                 # Diccionario de Vocabulario
                 embedding_size,             # Tamaño de Embeddings
                 hidden_layer=32,            # Cantidad de Neuronas Recurrentes
                 num_layers=1,               # Cantidad de Capas Recurrentes
                 dropout=0.0,                # Regularización con Dropout
                 bias=True,                  # Término Bias
                 bidirectional=False,        # ¿Ambos sentidos?
                 freeze_embedings=True):     # Entrenamiento de Embeddings

        super(IMDBLSTM, self).__init__()
        # Binary Classification
        output_size = 1
        # Create the Embeddings layer and add pre-trained weights
        embeddings_matrix = torch.randn(len(dictionary), embedding_size)
        embeddings_matrix[0] = torch.zeros(embedding_size)
        with gzip.open(pretrained_embeddings_path, 'rt') as fh:
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in dictionary.token2id:
                    wordID = dictionary.token2id[word]
                    embeddings_matrix[wordID] = torch.FloatTensor([float(n) for n in vector.split()])
        # Embedding Layer
        self.embedding_config = {'freeze': freeze_embedings, 'padding_idx': 0}
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix, **self.embedding_config)
        # Set our LSTM parameters
        self.lstm_config = {'input_size': embedding_size,
                            'hidden_size': hidden_layer,
                            'num_layers': num_layers,
                            'bias': bias,
                            'batch_first': True,
                            'dropout': dropout,
                            'bidirectional': bidirectional}
        # Set our Fully Connected Layer parameters
        self.linear_config = {'in_features': hidden_layer,
                              'out_features': output_size,
                              'bias': bias}
        # Instanciate
        self.lstm = nn.LSTM(**self.lstm_config)
        self.classification_layer = nn.Linear(**self.linear_config)
        self.activation = nn.Sigmoid()

    def forward(self, inputs):
        embedding = self.embeddings(inputs)
        lstm_out, _ = self.lstm(embedding)
        # Take last state of lstm, which is a representation of the entire text
        lstm_out = lstm_out[:, -1, :].squeeze()
        predictions = self.activation(self.classification_layer(lstm_out))
        return predictions

In [10]:
model = IMDBLSTM('data/glove.6B.50d.txt.gz',
                 preprocess.dictionary,
                 embedding_size=50,
                 hidden_layer=64,
                 num_layers=1,
                 dropout=0)

print(model)

IMDBLSTM(
  (embeddings): Embedding(50002, 50, padding_idx=0)
  (lstm): LSTM(50, 64, batch_first=True)
  (classification_layer): Linear(in_features=64, out_features=1, bias=True)
  (activation): Sigmoid()
)


## Parte 3: Entrenamiento de la red

En esta sección entrenaremos nuestra red. Primero configuraremos los hiperparámetros de la red. En este momento determinamos lo siguiente:
-  *epochs*
-  *learning_rate*
-  *loss*
-  *optimizer*

También definiremos los parámetros para `DataLoader`, clase que implementa un manejador del dataset que nos dividirá los datos en batches (y los distribuirá entre distintos nodos de cómputo, en caso de contar con multi GPU).

In [11]:
# Don't forget to send the model to GPU if there is one available!
model.to(device)

IMDBLSTM(
  (embeddings): Embedding(50002, 50, padding_idx=0)
  (lstm): LSTM(50, 64, batch_first=True)
  (classification_layer): Linear(in_features=64, out_features=1, bias=True)
  (activation): Sigmoid()
)

In [12]:
import torch.optim as optim

EPOCHS = 5

learning_rate = 0.001

loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), learning_rate)

In [13]:
history = {'train_loss': [], 'test_loss': [], 'test_avp': []}

for epoch in trange(EPOCHS):
    # Training
    model.train()
    running_loss = []
    print(f'EPOCH: {epoch}')
    for idx, batch in enumerate(tqdm(train_loader)):
        # We need to send everything to the device!
        data = batch['data'].to(device)
        target = batch['target'].to(device)
        # Continue the training...
        optimizer.zero_grad()
        output = model(data)
        loss_value = loss_function(output.squeeze(), target)
        loss_value.backward()
        optimizer.step()
        running_loss.append(loss_value.item())
    train_loss = sum(running_loss) / len(running_loss)
    print(f'\t Final train_loss: {train_loss}')
    history['train_loss'].append(train_loss)
    # Evaluation
    model.eval()
    running_loss = []
    targets = []
    predictions = []
    for batch in tqdm(test_loader):
        # We need to send everything to the device!
        data = batch['data'].to(device)
        target = batch['target'].to(device)
        # Continue the evaluation...
        output = model(data)
        loss_value = loss_function(output.squeeze(), target)
        running_loss.append(loss_value.item())
        targets.extend(target.cpu().numpy())
        # Round up model output to get the predictions.
        # What would happen if you change the activation to tanh?
        predictions.extend(output.cpu().squeeze().round().detach().numpy())
    test_loss = sum(running_loss) / len(running_loss)
    avp = metrics.average_precision_score(targets, predictions)
    print(f'\t Final test_loss: {test_loss}')
    print(f'\t Final test_avp: {avp}')
    history['test_loss'].append(test_loss)
    history['test_avp'].append(avp)

  0%|          | 0/5 [00:00<?, ?it/s]

EPOCH: 0


  0%|          | 0/313 [00:00<?, ?it/s]

	 Final train_loss: 0.6832791159328181


  0%|          | 0/79 [00:00<?, ?it/s]

	 Final test_loss: 0.6922922534278676
	 Final test_avp: 0.5104447720702991
EPOCH: 1


  0%|          | 0/313 [00:00<?, ?it/s]

	 Final train_loss: 0.6850689103047307


  0%|          | 0/79 [00:00<?, ?it/s]

	 Final test_loss: 0.6808551889431628
	 Final test_avp: 0.5468931686914957
EPOCH: 2


  0%|          | 0/313 [00:00<?, ?it/s]

	 Final train_loss: 0.6696904770101602


  0%|          | 0/79 [00:00<?, ?it/s]

	 Final test_loss: 0.670406257804436
	 Final test_avp: 0.5843666962682463
EPOCH: 3


  0%|          | 0/313 [00:00<?, ?it/s]

	 Final train_loss: 0.6879336980585092


  0%|          | 0/79 [00:00<?, ?it/s]

	 Final test_loss: 0.6838873700250553
	 Final test_avp: 0.5273906505979606
EPOCH: 4


  0%|          | 0/313 [00:00<?, ?it/s]

	 Final train_loss: 0.65697637171791


  0%|          | 0/79 [00:00<?, ?it/s]

	 Final test_loss: 0.688163996497287
	 Final test_avp: 0.5289320312581892


## Extras

Una vez que hemos implementado la arquitectura básica de la red, podemos comenzar a experimentar con distintas modificaciones para lograr mejores resultados. Algunas tareas posibles son:

- Agregar más capas recurrentes (Definición del Modelo).
- Probar otros largos de secuencias máximas (Definición del Padding).
- Agregar capas de regularización y/o dropout (Definición del Modelo).
- Entrenar los embeddings junto con la red (Definición del Modelo).

In [14]:
MAX_SEQUENCE_LEN = 200
pad_sequences = PadSequences(max_length=MAX_SEQUENCE_LEN)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=pad_sequences, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, collate_fn=pad_sequences, drop_last=False)

In [15]:
model = IMDBLSTM('data/glove.6B.50d.txt.gz',
                 preprocess.dictionary,
                 embedding_size=50,
                 hidden_layer=128,
                 num_layers=3,
                 dropout=0.5,
                 freeze_embedings=True)

print(model)

IMDBLSTM(
  (embeddings): Embedding(50002, 50, padding_idx=0)
  (lstm): LSTM(50, 128, num_layers=3, batch_first=True, dropout=0.5)
  (classification_layer): Linear(in_features=128, out_features=1, bias=True)
  (activation): Sigmoid()
)


In [16]:
# Don't forget to send the model to GPU if there is one available!
model.to(device)

EPOCHS = 10

learning_rate = 0.0001

loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), learning_rate)

In [17]:
history = {'train_loss': [], 'test_loss': [], 'test_avp': []}

for epoch in trange(EPOCHS):
    # Training
    model.train()
    running_loss = []
    print(f'EPOCH: {epoch}')
    for idx, batch in enumerate(tqdm(train_loader)):
        # We need to send everything to the device!
        data = batch['data'].to(device)
        target = batch['target'].to(device)
        # Continue the training...
        optimizer.zero_grad()
        output = model(data)
        loss_value = loss_function(output.squeeze(), target)
        loss_value.backward()
        optimizer.step()
        running_loss.append(loss_value.item())
    train_loss = sum(running_loss) / len(running_loss)
    print(f'\t Final train_loss: {train_loss}')
    history['train_loss'].append(train_loss)
    # Evaluation
    model.eval()
    running_loss = []
    targets = []
    predictions = []
    for batch in tqdm(test_loader):
        # We need to send everything to the device!
        data = batch['data'].to(device)
        target = batch['target'].to(device)
        # Continue the evaluation...
        output = model(data)
        loss_value = loss_function(output.squeeze(), target)
        running_loss.append(loss_value.item())
        targets.extend(target.cpu().numpy())
        # Round up model output to get the predictions.
        # What would happen if you change the activation to tanh?
        predictions.extend(output.cpu().squeeze().round().detach().numpy())
    test_loss = sum(running_loss) / len(running_loss)
    avp = metrics.average_precision_score(targets, predictions)
    print(f'\t Final test_loss: {test_loss}')
    print(f'\t Final test_avp: {avp}')
    history['test_loss'].append(test_loss)
    history['test_avp'].append(avp)

  0%|          | 0/10 [00:00<?, ?it/s]

EPOCH: 0


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6931263402009465


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6923185855150222
	 Final test_avp: 0.5021
EPOCH: 1


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6893414388037031


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6851669684052467
	 Final test_avp: 0.5246217198040765
EPOCH: 2


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6756794285622372


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6577675268054008
	 Final test_avp: 0.5506730479047511
EPOCH: 3


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6397815384682576


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.636734525859356
	 Final test_avp: 0.613642716697254
EPOCH: 4


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6440917264883685


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6501246258616448
	 Final test_avp: 0.5953804561470135
EPOCH: 5


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6437187426409144


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6966372415423393
	 Final test_avp: 0.5327212854634389
EPOCH: 6


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6744093811436064


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6626439318060875
	 Final test_avp: 0.572972680862825
EPOCH: 7


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6536324005218068


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6635235100984573
	 Final test_avp: 0.5722203011632037
EPOCH: 8


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6586916473261111


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6945038571953773
	 Final test_avp: 0.5732897516513782
EPOCH: 9


  0%|          | 0/157 [00:00<?, ?it/s]

	 Final train_loss: 0.6572739090889123


  0%|          | 0/40 [00:00<?, ?it/s]

	 Final test_loss: 0.6718494534492493
	 Final test_avp: 0.5596479961937639
