# Pytorch Example

## Imports

In [1]:
import gc
import re

import pandas as pd
import numpy as np
import sklearn.model_selection
import gensim
import tqdm
import nltk

# Torch Stuff
import torch
import torchvision

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

import torch.optim as optim

import matplotlib.pyplot as plt

## Pytorch Dataset Utility Class

In [2]:
class IMDBDataset(torch.utils.data.Dataset):

    def __init__(self, dataset, word_vector_path, vector_dimension):
        super(torch.utils.data.Dataset, self).__init__()

        self.__vector_dimension = vector_dimension

        self.__dataset = self.__prepare_dataset(dataset)
        self.__word_vector = self.__read_word_vector(word_vector_path)

        self.__length_of_instances = len(self.__dataset['REVIEW'].iloc[0])
        

    def __prepare_dataset(self, dataset):
        dataset_copy = dataset.copy(deep=True)
        return dataset_copy
    

    def __read_word_vector(self, path):
        word2vec_model = word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(path)

        word2vec_model.wv["<pad>"] = np.array(
            [0.01]*self.__vector_dimension, dtype=np.float)
        word2vec_model.wv["<end>"] = np.array(
            [0.02]*self.__vector_dimension, dtype=np.float)

        return word2vec_model.wv

    def __len__(self):
        return len(self.__dataset)
    

    def __getitem__(self, idx):
        instance = self.__dataset.iloc[idx]

        review = instance[0]
        score = instance[1]

        embedded_review = self.__embed_review(review)

        return (embedded_review, score)

    def __embed_review(self, review):
        return np.array(list(map(self.__get_embedding, review)))
    

    def __get_embedding(self, word):
        try:
            return self.__word_vector[word]
        except KeyError:
            return np.array([0]*self.__vector_dimension, dtype=np.float)
        

    def get_length_of_reviews(self):
        return self.__length_of_instances

## Utilitary Functions

In [3]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def pre_processing(review):
    review = review.lower()
    review = cleanhtml(review)
    return tokenizer.tokenize(review)

## Load Dataset

### Loading from Processed data

In [4]:
usable_dataset = pd.read_csv("../data/processed/acllib_data.csv")
usable_dataset = usable_dataset.sample(frac=1).reset_index(drop=True)

## Pre processing Data

In [5]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-z]+")

usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(pre_processing)
usable_dataset["SCORE"] = usable_dataset["SCORE"].map(lambda x: 0 if x <5 else 1)
usable_dataset.head()

### Fill sentences

usable_dataset["REVIEW"].map(len).describe()

count    50000.000000
mean       234.139260
std        173.495615
min          6.000000
25%        128.000000
50%        176.000000
75%        284.000000
max       2487.000000
Name: REVIEW, dtype: float64

In [6]:
max_sentence_length = usable_dataset["REVIEW"].map(len).describe()['75%'] + 1
max_sentence_length

def fill_sentence(sentence):
    tokens_to_fill = int(max_sentence_length - len(sentence))
    
    sentence.append('<END>')
    sentence.extend(['<PAD>']*tokens_to_fill)
    
    return sentence

In [7]:
usable_dataset["REVIEW"] = usable_dataset["REVIEW"].map(fill_sentence)

## Train Test Validation Split

In [8]:
train_dataframe, test_dataframe = sklearn.model_selection.train_test_split(usable_dataset,
                                             test_size=0.33,
                                             shuffle=True)

### Applying Dataset Function

In [9]:
train_data = IMDBDataset(train_dataframe, "../data/processed/glove.6b.300d.txt", 300)



## Network Architecture Definition

In [10]:
class Conv1DArchitecture(nn.Module):
    def __init__(self, number_of_tokens, embedding_size):
        super(Conv1DArchitecture, self).__init__()
        
        self.number_of_tokens = number_of_tokens
        self.embedding_size = embedding_size

        number_of_kernels = 100
        

        self.convolution_window_2 = nn.Conv2d(
            in_channels=1,
            out_channels=number_of_kernels,
            kernel_size=(2, embedding_size),
        )
        
        self.convolution_window_3 = nn.Conv2d(
            in_channels=1,
            out_channels=number_of_kernels,
            kernel_size=(3, embedding_size),
        )
        
        self.convolution_window_4 = nn.Conv2d(
            in_channels=1,
            out_channels=number_of_kernels,
            kernel_size=(4, embedding_size),
        )
        
        self.max_pooling_window_2 = torch.nn.MaxPool1d(kernel_size=number_of_tokens-1)
        self.max_pooling_window_3 = torch.nn.MaxPool1d(kernel_size=number_of_tokens-2)
        self.max_pooling_window_4 = torch.nn.MaxPool1d(kernel_size=number_of_tokens-3)
        
        self.dense_1 = torch.nn.Linear(300, 150)
        self.dense_2 = torch.nn.Linear(150, 75)
        self.dense_3 = torch.nn.Linear(75, 1)

    def forward(self, x):
        
        x = x.view(-1,1,x.shape[1], self.embedding_size)

        x_window2 = F.relu(self.convolution_window_2(x).squeeze(-1))
        x_window3 = F.relu(self.convolution_window_3(x).squeeze(-1))
        x_window4 = F.relu(self.convolution_window_4(x).squeeze(-1))
        
        
        x_window2 = self.max_pooling_window_2(x_window2).squeeze(-1)
        x_window3 = self.max_pooling_window_3(x_window3).squeeze(-1)
        x_window4 = self.max_pooling_window_4(x_window4).squeeze(-1)
        
        y = torch.cat((x_window2, x_window3, x_window4), 1)
        
        y = F.dropout(F.relu(self.dense_1(y)),0.5)
        y = F.dropout(F.relu(self.dense_2(y)),0.5)
        y = F.relu(self.dense_3(y))

        return y.squeeze(-1)

## Instanciating Network

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = Conv1DArchitecture(max_sentence_length, embedding_size=300)
net = net.to(device)
print(net.parameters)

criterion = nn.MSELoss()
optimizer = optim.RMSprop(net.parameters(), lr=0.0001)

<bound method Module.parameters of Conv1DArchitecture(
  (convolution_window_2): Conv2d(1, 100, kernel_size=(2, 300), stride=(1, 1))
  (convolution_window_3): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
  (convolution_window_4): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
  (max_pooling_window_2): MaxPool1d(kernel_size=284.0, stride=284.0, padding=0, dilation=1, ceil_mode=False)
  (max_pooling_window_3): MaxPool1d(kernel_size=283.0, stride=283.0, padding=0, dilation=1, ceil_mode=False)
  (max_pooling_window_4): MaxPool1d(kernel_size=282.0, stride=282.0, padding=0, dilation=1, ceil_mode=False)
  (dense_1): Linear(in_features=300, out_features=150, bias=True)
  (dense_2): Linear(in_features=150, out_features=75, bias=True)
  (dense_3): Linear(in_features=75, out_features=1, bias=True)
)>


## Train

In [13]:
for epoch in range(5):
    print("Epoch: ", epoch)
    acc_loss = []
    for entry, labels in torch.utils.data.DataLoader(train_data, num_workers=4, batch_size=32):
        optimizer.zero_grad()

        entry = entry.type(torch.FloatTensor)
        labels = labels.type(torch.FloatTensor)
        entry = entry.to(device)
        labels = labels.to(device)

        outputs = net(entry)


        loss = criterion(outputs, labels)
        acc_loss.append(np.float(loss))

        loss.backward()
        optimizer.step()
    
    
    print("Error on test: %.4f" % np.mean(acc_loss))

Epoch:  0


RuntimeError: Traceback (most recent call last):
  File "/home/miranda/Documents/academico/mestrado/monitoria/workshop/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 106, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/miranda/Documents/academico/mestrado/monitoria/workshop/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 187, in default_collate
    return [default_collate(samples) for samples in transposed]
  File "/home/miranda/Documents/academico/mestrado/monitoria/workshop/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 187, in <listcomp>
    return [default_collate(samples) for samples in transposed]
  File "/home/miranda/Documents/academico/mestrado/monitoria/workshop/venv/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 173, in default_collate
    return torch.stack([torch.from_numpy(b) for b in batch], 0)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 286 and 359 in dimension 1 at /pytorch/aten/src/TH/generic/THTensorMath.cpp:3616


## Test

In [None]:
del train_data
test_data = IMDBDataset(test_dataframe, "../data/processed/glove.6b.300d.txt", 300)

In [None]:
with torch.no_grad():
    t_outputs = torch.tensor([], dtype=torch.float32)
    t_labels = torch.tensor([], dtype=torch.float32)

    for entry_test, labels_test in torch.utils.data.DataLoader(test_data, num_workers=4, batch_size=100000):

        entry_test = entry_test.float()
        labels_test = labels_test.float()
        entry_test = entry_test.to(device)
        labels_test = labels_test.to(device)
        
        t_labels = torch.cat((t_labels, labels_test),0)
        t_outputs = torch.cat((t_outputs, net(entry_test)),0)

    validation_error = F.mse_loss(t_outputs, t_labels)
    print("Error on test: %.4f" % validation_error)