# Construyendo una red convolucional con PyTorch

## Librerías

In [1]:
import numpy as np
import pandas as pd

import gzip
import mlflow
import tempfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as transforms

from gensim import corpora
from gensim.parsing import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, average_precision_score

from tqdm.notebook import tqdm, trange

## Red convolucional para imágenes

### Datos del CIFAR10

Utilizamos los mismos datos que se usaron en el [Notebook 1](1_Basic_MLP.ipynb).

In [2]:
# Human-readable class names; passed as target_names to the classification
# report in the evaluation cell.
CIFAR_CLASSES = ('plane',
                 'car',
                 'bird',
                 'cat',
                 'deer', 
                 'dog',
                 'frog',
                 'horse',
                 'ship',
                 'truck')

# Mini-batch size shared by the train and test loaders.
BATCH_SIZE = 128

# Convert each image to a tensor, then normalize the three channels using
# 0.5 as mean and 0.5 as standard deviation for every channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Train and test splits of CIFAR10, rooted at ./data (download=True requests
# a download; the same transform is applied to both splits).
trainset = torchvision.datasets.CIFAR10(root='data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='data', train=False, download=True, transform=transform)

# Only the training loader shuffles; both loaders use two worker processes.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

Files already downloaded and verified
Files already downloaded and verified


### Red convolucional

- La red convolucional se obtiene apilando capas [`torch.nn.Conv2d`](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html).
    - En particular, este tipo de capas acepta matrices (a diferencia de la lineal que sólo acepta vectores). En las capas se definen los canales de entrada y los de salida, además del tamaño del *kernel* (i.e. ventana).
- También son comunes las capas [`torch.nn.MaxPool2d`](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) que realizan una operación de *max pooling*, en 2 dimensiones.
- La red se completa con algunas capas lineales para poder llevarla a las 10 dimensiones de salida que vienen a representar las clases.

In [3]:
class CNN(nn.Module):
    """Small convolutional image classifier.

    Two convolution + ReLU + max-pooling stages are followed by three fully
    connected layers; the last one produces 10 raw (un-normalized) scores per
    image.  ``fc1`` expects ``16 * 5 * 5`` features per image, which is what
    the convolutional stack yields for 3x32x32 inputs
    (32 -> 28 -> 14 -> 10 -> 5 along each spatial side).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        # The same pooling layer is reused after both convolutions.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for a batch of images.

        Raises:
            RuntimeError: if the flattened feature size does not match what
                ``fc1`` expects (i.e. the images have an unsupported size).
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension.  Unlike
        # ``x.view(-1, 16 * 5 * 5)``, this never lets the batch dimension
        # absorb a feature-size mismatch: wrongly sized inputs fail loudly at
        # ``fc1`` instead of silently producing a different number of rows.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Instantiate the network and print its layer summary.
model = CNN()
print(model)

CNN(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


### Entrenamiento

La red se entrena igual que en el caso del *perceptrón multicapa* (**MLP**), solo que esta vez no requiere reacomodar la matriz de entrada.

In [4]:
# Cross-entropy loss computed on the class scores returned by the model.
loss_function = nn.CrossEntropyLoss()
# Adam over all of the model's parameters, with a small weight decay.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)

# Number of passes over the training loader.
EPOCHS = 3

In [5]:
# Put the model in training mode before optimizing.
model.train()

# NOTE(review): iters_per_epoch is not referenced again in this cell --
# confirm whether anything else still needs it.
iters_per_epoch = len(trainloader)
for epoch in trange(EPOCHS): # Loop over the dataset multiple times...
    # One progress bar per epoch; its description is overwritten with the
    # loss of the most recent mini-batch.
    pbar = tqdm(trainloader, desc='Train Loss: NaN')
    for inputs, labels in pbar:
        # Mini-Batch: reset gradients, forward pass, loss, backward pass,
        # parameter update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        # Train Loss Log
        pbar.set_description(f'Train Loss: {loss.item():.3f}')

  0%|          | 0/3 [00:00<?, ?it/s]

Train Loss: NaN:   0%|          | 0/391 [00:00<?, ?it/s]

Train Loss: NaN:   0%|          | 0/391 [00:00<?, ?it/s]

Train Loss: NaN:   0%|          | 0/391 [00:00<?, ?it/s]

### Evaluación

Una vez más, la evaluación es similar al caso del *perceptrón multicapa*.

In [6]:
# Evaluate the CNN on the test loader.  The training cell left the model in
# training mode, so switch to evaluation mode explicitly before predicting.
model.eval()

y_true = []
y_pred = []
with torch.no_grad():  # no gradients are needed for inference
    for inputs, labels in tqdm(testloader):
        outputs = model(inputs)
        # Predicted class = index of the highest score.  Using argmax avoids
        # assigning to `_`, which IPython reserves for the last cell output.
        predicted = outputs.argmax(dim=1)
        y_true.extend(labels.numpy())
        y_pred.extend(predicted.numpy())

print(classification_report(y_true, y_pred, target_names=CIFAR_CLASSES))

  0%|          | 0/79 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       plane       0.59      0.51      0.54      1000
         car       0.59      0.73      0.65      1000
        bird       0.48      0.38      0.42      1000
         cat       0.38      0.28      0.32      1000
        deer       0.53      0.36      0.43      1000
         dog       0.40      0.56      0.47      1000
        frog       0.56      0.65      0.60      1000
       horse       0.61      0.56      0.59      1000
        ship       0.62      0.65      0.63      1000
       truck       0.52      0.62      0.57      1000

    accuracy                           0.53     10000
   macro avg       0.53      0.53      0.52     10000
weighted avg       0.53      0.53      0.52     10000



## CNNs para Texto

### Datos IMDB

Similar al caso de **CNN** para imágenes, vamos a volver sobre el conjunto de datos que ya utilizamos anteriormente: el de reseñas **IMDB**. Esta vez lo usaremos para comparar la red convolucional contra el modelo del *perceptrón multicapa* que utilizaba la media de los embeddings.

In [7]:
class IMDBReviewsDataset(Dataset):
    """Dataset view over a DataFrame of reviews.

    Each sample is a dict ``{'data': ..., 'target': ...}`` read from the
    ``review`` and ``sentiment`` columns using label-based ``.loc`` indexing.

    Args:
        dataset: DataFrame with ``review`` and ``sentiment`` columns.
        transform: Optional callable applied to the sample dict; its return
            value is what ``__getitem__`` returns.
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return self.dataset.shape[0]

    def __getitem__(self, item):
        """Return the (optionally transformed) sample(s) selected by ``item``.

        ``item`` may be an index label, a list of labels, or a tensor holding
        one or several labels.
        """
        if torch.is_tensor(item):
            # Tensors expose ``tolist()``; the previous ``to_list()`` call
            # raised AttributeError for every tensor index.
            item = item.tolist()

        sample = {
            'data': self.dataset.loc[item, 'review'],
            'target': self.dataset.loc[item, 'sentiment']
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

### Preprocesamiento

Aplicamos el mismo tipo de preprocesamiento.

In [8]:
class RawDataProcessor:
    """Encode raw reviews as token indices and sentiments as class indices.

    A gensim dictionary is built from the preprocessed ``review`` column of
    ``dataset`` and the distinct ``sentiment`` values are numbered in sorted
    order.  Instances are callable on ``{'data': ..., 'target': ...}`` items,
    which makes them usable as a dataset transform.

    Args:
        dataset: DataFrame with ``review`` and ``sentiment`` columns.
        ignore_header: Accepted but not used by this class.
        filters: Optional list of string filters for gensim's
            ``preprocess_string``; a default pipeline is used when falsy.
        vocab_size: Forwarded as ``keep_n`` to ``Dictionary.filter_extremes``.
    """

    def __init__(self, dataset, ignore_header=True, filters=None, vocab_size=50000):
        # Use the caller's filters when provided, otherwise the default pipeline.
        self.filters = filters if filters else [
            lambda s: s.lower(),
            preprocessing.strip_tags,
            preprocessing.strip_punctuation,
            preprocessing.strip_multiple_whitespaces,
            preprocessing.strip_numeric,
            preprocessing.remove_stopwords,
            preprocessing.strip_short,
        ]

        # Build the dictionary from every review, preprocessed with the filters above.
        tokenized_reviews = dataset['review'].map(self._preprocess_string).tolist()
        self.dictionary = corpora.Dictionary(tokenized_reviews)
        # Prune the vocabulary, then make the remaining ids contiguous.
        self.dictionary.filter_extremes(no_below=2, no_above=1, keep_n=vocab_size)
        self.dictionary.compactify()
        # Reserve ids 0 and 1 for the padding and unknown-word tokens.
        self.dictionary.patch_with_special_tokens({'[PAD]': 0, '[UNK]': 1})

        # Class index <-> sentiment label lookups.
        self.idx_to_target = sorted(dataset['sentiment'].unique())
        self.target_to_idx = {
            label: position for position, label in enumerate(self.idx_to_target)
        }

    def _preprocess_string(self, string):
        """Apply the configured filters to ``string`` via gensim's ``preprocess_string``."""
        return preprocessing.preprocess_string(string, filters=self.filters)

    def _sentence_to_indices(self, sentence):
        """Map tokens to dictionary ids; unknown tokens become index 1 (``[UNK]``)."""
        return self.dictionary.doc2idx(sentence, unknown_word_index=1)

    def encode_data(self, data):
        """Preprocess one raw text and return its list of token indices."""
        tokens = self._preprocess_string(data)
        return self._sentence_to_indices(tokens)

    def encode_target(self, target):
        """Return the class index of a sentiment label."""
        return self.target_to_idx[target]

    def __call__(self, item):
        """Encode an item holding either a single example or iterables of examples."""
        raw_data, raw_target = item['data'], item['target']

        # A str is one example; anything else is treated as an iterable of them.
        if isinstance(raw_data, str):
            encoded_data = self.encode_data(raw_data)
        else:
            encoded_data = [self.encode_data(text) for text in raw_data]

        if isinstance(raw_target, str):
            encoded_target = self.encode_target(raw_target)
        else:
            encoded_target = [self.encode_target(label) for label in raw_target]

        return {'data': encoded_data, 'target': encoded_target}

### Carga de datos

In [9]:
dataset = pd.read_csv('data/imdb_reviews.csv.gz')

preprocess = RawDataProcessor(dataset)

train_indices, test_indices = train_test_split(dataset.index, test_size=0.2, random_state=123)

train_dataset = IMDBReviewsDataset(dataset.loc[train_indices].reset_index(drop=True), transform=preprocess)
test_dataset = IMDBReviewsDataset(dataset.loc[test_indices].reset_index(drop=True), transform=preprocess)

### Padding de secuencias

Dado que en este caso utilizaremos las secuencias completas sobre las que aplicaremos las convoluciones, necesitamos rellenar (*padding*) dichas secuencias de manera que, dentro de un mismo *batch* de datos, todas tengan la misma longitud.

In [10]:
class PadSequences:
    """Collate callable that pads (and optionally truncates) index sequences.

    Args:
        pad_value: Value appended to sequences shorter than the batch length.
        max_length: When set (truthy), every sequence is cut and/or padded to
            exactly this length.
        min_length: Lower bound for the batch length when ``max_length`` is
            not set.
    """

    def __init__(self, pad_value=0, max_length=None, min_length=1):
        assert max_length is None or min_length <= max_length
        self.pad_value = pad_value
        self.max_length = max_length
        self.min_length = min_length

    def __call__(self, items):
        """Turn a list of ``{'data': list, 'target': number}`` items into batch tensors."""
        sequences, labels = zip(*((item['data'], item['target']) for item in items))
        kept_lengths = [len(seq) for seq in sequences]

        if self.max_length:
            # Fixed-size batches: truncate anything longer than max_length.
            batch_length = self.max_length
            kept_lengths = [min(batch_length, n) for n in kept_lengths]
        else:
            # Dynamic size: longest sequence in the batch, but at least min_length.
            batch_length = max(self.min_length, max(kept_lengths))

        padded = []
        for seq, n in zip(sequences, kept_lengths):
            padded.append(seq[:n] + [self.pad_value] * (batch_length - n))

        return {'data': torch.LongTensor(padded), 'target': torch.FloatTensor(labels)}

### DataLoaders

Una vez creada nuestra función para hacer *padding* de secuencias, definiremos los `DataLoaders`. Una cuestión importante: las redes convolucionales sobre texto esperan que todas las secuencias sean al menos del tamaño del filtro más grande (caso contrario ocurrirá un error por no poder realizar la convolución sobre un espacio más chico que el tamaño del filtro). Es por eso que utilizamos el parámetro `min_length` esta vez.

In [11]:
# Number of output channels of each text convolution.
FILTERS_COUNT = 100

# Kernel sizes of the text convolutions (the classifier builds one Conv1d per entry).
FILTERS_LENGTH = [2, 3, 4]
# Pad every batch to at least the largest kernel size.
pad_sequences = PadSequences(min_length=max(FILTERS_LENGTH))

# pad_sequences assembles (and pads) each batch; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=pad_sequences, drop_last=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=pad_sequences, drop_last=False)

### Red convolucional sobre texto

Por último, tenemos la red convolucional sobre texto. Si bien arranca muy similar al caso del clasificador del *perceptrón multicapa*, vemos que en este caso hacemos uso de [`torch.nn.Conv1d`](https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html) dado que sólo nos desplazamos por una dimensión (i.e. la secuencia). En particular, como utilizamos *max pooling* global, no hacemos uso del módulo `torch.nn` para calcularlo, simplemente utilizamos el método `max()` del tensor.

In [12]:
class IMDBReviewsClassifier(nn.Module):
    """Convolutional binary classifier over padded sequences of token indices.

    Token indices are embedded, passed through one ``Conv1d`` per filter
    length with global max pooling over the sequence, concatenated, and fed
    to a hidden linear layer and a single sigmoid output unit.

    Args:
        pretrained_embeddings_path: Path to a gzip-compressed text file with
            one ``word v1 v2 ... vN`` entry per line.
        dictionary: Object supporting ``len()`` and exposing ``token2id``
            (token -> row index in the embedding matrix).
        vector_size: Dimensionality of the embedding vectors.
        freeze_embedings: Forwarded as ``freeze`` to
            ``nn.Embedding.from_pretrained``.
        filters_count: Output channels of each convolution.  ``None`` (the
            default) uses the module-level ``FILTERS_COUNT``.
        filters_length: Iterable of kernel sizes, one convolution per entry.
            ``None`` (the default) uses the module-level ``FILTERS_LENGTH``.
        fc_size: Width of the hidden fully connected layer.
    """

    def __init__(self, pretrained_embeddings_path, dictionary, vector_size, freeze_embedings,
                 filters_count=None, filters_length=None, fc_size=128):
        super().__init__()
        # Fall back to the notebook-level configuration only when the caller
        # does not specify the architecture explicitly.
        if filters_count is None:
            filters_count = FILTERS_COUNT
        if filters_length is None:
            filters_length = FILTERS_LENGTH
        filters_length = list(filters_length)

        # Rows start random; row 0 (the padding index) is zeroed and rows of
        # tokens found in the pretrained file are overwritten below.
        embeddings_matrix = torch.randn(len(dictionary), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        with gzip.open(pretrained_embeddings_path, 'rt') as fh:
            for line in fh:
                word, vector = line.strip().split(None, 1)
                word_id = dictionary.token2id.get(word)
                if word_id is not None:
                    # We know the embedding!
                    embeddings_matrix[word_id] = torch.FloatTensor([float(n) for n in vector.split()])
        # Embedding Layer
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix, freeze=freeze_embedings, padding_idx=0)
        # Convolutional Layers (one per filter length)
        self.convs = nn.ModuleList(
            [nn.Conv1d(vector_size, filters_count, length) for length in filters_length]
        )
        # Fully Connected Layers
        self.fc = nn.Linear(filters_count * len(filters_length), fc_size)
        self.output = nn.Linear(fc_size, 1)
        self.vector_size = vector_size

    @staticmethod
    def conv_global_max_pool(x, conv):
        """Apply ``conv`` to ``x``, take the max over the sequence axis, then ReLU."""
        # conv(x) is (batch, channels, positions); the max runs over positions.
        return F.relu(conv(x).max(dim=2)[0])

    def forward(self, x):
        """Return a ``(batch, 1)`` tensor of sigmoid outputs for a batch of index sequences."""
        x = self.embeddings(x).transpose(1, 2) # Conv1d takes (batch, channel, seq_len)
        x = [self.conv_global_max_pool(x, conv) for conv in self.convs]
        x = torch.cat(x, dim=1)
        x = F.relu(self.fc(x))
        x = torch.sigmoid(self.output(x))
        return x

### Experimento

El experimento de **MLFlow** es prácticamente igual, salvo que cambiamos algunos de los parámetros a guardar.

In [13]:
mlflow.set_experiment('a_CNN_experiment')

# Train the text CNN for three epochs, logging hyper-parameters, per-epoch
# train/test metrics and the final test predictions to MLflow.
with mlflow.start_run():
    # HyperParameters Logs
    mlflow.log_param('model_name', 'CNN')
    mlflow.log_param('freeze_embedding', True)
    mlflow.log_params({
        'filters_count': FILTERS_COUNT,
        'filters_length': FILTERS_LENGTH,
        'fc_size': 128
    })
    # Model Definition
    model = IMDBReviewsClassifier('data/glove.6B.50d.txt.gz', preprocess.dictionary, 50, True)
    loss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.00001)
    # Training
    for epoch in trange(3):
        model.train()
        running_loss = []
        for batch in tqdm(train_loader):
            # Mini-Batch Training
            optimizer.zero_grad()
            output = model(batch['data'])
            # Targets are reshaped to (batch, 1) to match the model output.
            loss_value = loss(output, batch['target'].view(-1, 1))
            loss_value.backward()
            optimizer.step()
            running_loss.append(loss_value.item())
        # Train Loss for each epoch
        mlflow.log_metric('train_loss', sum(running_loss) / len(running_loss), epoch)
        # Evaluation
        model.eval()
        targets = []
        predictions = []
        running_loss = []
        with torch.no_grad():  # inference only: do not build autograd graphs
            for batch in tqdm(test_loader):
                # Mini-Batch Evaluation
                output = model(batch['data'])
                loss_value = loss(output, batch['target'].view(-1, 1))
                running_loss.append(loss_value.item())
                targets.extend(batch['target'].numpy())
                # reshape(-1) always yields a 1-d tensor; squeeze() would
                # return a 0-d tensor for a batch holding a single sample,
                # which cannot be used with list.extend.
                predictions.extend(output.reshape(-1).numpy())
        # Test Loss for each epoch
        mlflow.log_metric('test_loss', sum(running_loss) / len(running_loss), epoch)
        mlflow.log_metric('test_avp', average_precision_score(targets, predictions), epoch)

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Make the final predictions independent of whatever mode the loop
        # above left the model in.
        model.eval()
        targets = []
        predictions = []
        with torch.no_grad():
            for batch in tqdm(test_loader):
                output = model(batch['data'])
                # Let's save the final predictions
                targets.extend(batch['target'].numpy())
                predictions.extend(output.reshape(-1).numpy())
        # Predictions DataFrame
        results = {'prediction': predictions, 'target': targets}
        pd.DataFrame(results).to_csv(f'{tmpdirname}/predictions.csv.gz', index=False)
        # Predictions Log
        mlflow.log_artifact(f'{tmpdirname}/predictions.csv.gz')

INFO: 'a_CNN_experiment' does not exist. Creating a new experiment


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/313 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]

  0%|          | 0/79 [00:00<?, ?it/s]