In [73]:
import torch
import sklearn.datasets
import numpy as np
import pandas as pd
import torch.nn as nn
from typing import Iterator, List, Callable, Tuple
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [74]:
from urllib.request import urlretrieve
urlretrieve('https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv', 'IMDB_Dataset.csv')

('IMDB_Dataset.csv', <http.client.HTTPMessage at 0x7f84df7df090>)

In [75]:
data = pd.read_csv('IMDB_Dataset.csv')
data_reviews, data_labels = data.review, data.sentiment.tolist()

In [76]:
train_data, validation_data, train_labels, validation_labels = train_test_split(data_reviews, data_labels, test_size=0.1, random_state=42)
train_data, test_data, train_labels, test_labels = train_test_split(data_reviews, data_labels, test_size=0.1, random_state=42)

In [77]:
def transform_to_words(data):
  reviews = []
  for review in data:
    review_cleaned = nltk.tokenize.word_tokenize(review)
    reviews.append(review_cleaned)

  return reviews

train_reviews = transform_to_words(train_data)
validation_reviews = transform_to_words(validation_data)
test_reviews = transform_to_words(test_data)

In [78]:
def get_vocab(data):
  words = set([word for review in data for word in review])
  return words

train_vocab = get_vocab(train_reviews)
val_vocab = get_vocab(validation_reviews)
test_vocab = get_vocab(test_data)

In [79]:
import operator
def word_freq(data, min_aparitions):
    
    all_words = [words.lower() for sentences in data for words in sentences]
    sorted_vocab = sorted(dict(Counter(all_words)).items(), key=operator.itemgetter(1))
    final_vocab = [k for k,v in sorted_vocab if v > min_aparitions]

    return final_vocab

train_vocab = word_freq(train_reviews, min_aparitions = 10)
val_vocab = word_freq(validation_reviews, min_aparitions = 10)
test_vocab = word_freq(test_reviews, min_aparitions = 10)

In [80]:
print(len(train_vocab))

27223


In [81]:
train_word_indices = dict((c, i + 2) for i, c in enumerate(train_vocab))
train_indices_word = dict((i + 2, c) for i, c in enumerate(train_vocab))

train_indices_word[0] = 'UNK'
train_word_indices['UNK'] = 0

train_indices_word[1] = 'PAD'
train_word_indices['PAD'] = 1


val_word_indices = dict((c, i + 2) for i, c in enumerate(val_vocab))
val_indices_word = dict((i + 2, c) for i, c in enumerate(val_vocab))

val_indices_word[0] = 'UNK'
val_word_indices['UNK'] = 0

val_indices_word[1] = 'PAD'
val_word_indices['PAD'] = 1

test_word_indices = dict((c, i + 2) for i, c in enumerate(test_vocab))
test_indices_word = dict((i + 2, c) for i, c in enumerate(test_vocab))

test_indices_word[0] = 'UNK'
test_word_indices['UNK'] = 0

test_indices_word[1] = 'PAD'
test_word_indices['PAD'] = 1

In [82]:
def vectorize_sentences(data, char_indices, one_hot = False):
    vectorized = []
    for sentences in data:

        # transformam fiecare review in reprezentarea lui sub forma de indici ale caracterelor continute
        sentences_of_indices = [char_indices[w] if w in char_indices.keys() else char_indices['UNK'] for w in sentences]

        # pentru fiecare indice putem face reprezentarea one-hot corespunzatoare
        # sau putem sa nu facem asta si sa adaugam un embedding layer in model care face această transformare
        if one_hot:
            sentences_of_indices = np.eye(len(char_indices))[sentences_of_indices]

        vectorized.append(sentences_of_indices)

    return vectorized

In [83]:
def pad(samples, max_length):
    
    return torch.tensor([
        sample[:max_length] + [1] * max(0, max_length - len(sample))
        for sample in samples
    ])

In [84]:
train_reviews_vectorized = vectorize_sentences(train_reviews, train_word_indices)
val_reviews_vectorized = vectorize_sentences(validation_reviews, val_word_indices)
test_reviews_vectorized = vectorize_sentences(test_reviews, test_word_indices)

train_reviews_vectorized = pad(train_reviews_vectorized, max_length = 512)
val_reviews_vectorized = pad(val_reviews_vectorized, max_length = 512)
test_reviews_vectorized = pad(test_reviews_vectorized, max_length = 512)

In [85]:
print(train_reviews_vectorized.shape)

torch.Size([45000, 512])


In [86]:
class ReviewDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
            
    def __getitem__(self, k):
        """Returneaza al k-lea exemplu din dataset"""
        return self.data[k], self.labels[k]
    
    def __len__(self):
        """Returneaza dimensiunea datasetului"""
        return len(self.data)

In [87]:
train_dataset = ReviewDataset(data=train_reviews_vectorized, labels=train_labels)
validation_dataset = ReviewDataset(data=val_reviews_vectorized, labels=validation_labels)
test_dataset = ReviewDataset(data=test_reviews_vectorized, labels=test_labels)

In [88]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=True)

In [89]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        # Definim un embedding layer cu un vocabular de dimensiune 72
        # și ca output un embedding de dimensiune 20
        # padding_idx este indexul din vocabular al paddingului (1, în cazul nostru)
        
        self.embedding = torch.nn.Embedding(27225, 100, padding_idx=1)

        self.dropout = torch.nn.Dropout(0.4)
        
        conv1 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=100, out_channels=128, kernel_size=3, padding=1),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
        )
        
        conv2 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, padding=2),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2),
        )

        conv3 = torch.nn.Sequential(
            torch.nn.Conv1d(in_channels=128, out_channels=128, kernel_size=5, padding=2),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.MaxPool1d(kernel_size=2)
        )
        
        global_average = torch.nn.AvgPool1d(kernel_size=64, stride=64)

        self.convolutions = torch.nn.Sequential(
            conv1, conv2, conv3, global_average
        )
        
        self.flatten = torch.nn.Flatten()
        self.output = torch.nn.Linear(in_features=128, out_features=2)

        
    def forward(self, input):
        # trecem inputul prin layerul de embedding
        embeddings = self.embedding(input)
        embeddings = self.dropout(embeddings)
        
        # permutăm inputul astfel încât prima dimensiune este numărul de channels
        embeddings = embeddings.permute(0, 2, 1)
        
        # trecem inputul prin secvența de layere
        out = self.convolutions(embeddings)
        out = self.output(self.flatten(out))
        return out

In [90]:
def train_epoch(model, train_dataloader, loss_crt, optimizer):
    """
    model: Model object 
    train_dataloader: DataLoader over the training dataset
    loss_crt: loss function object
    optimizer: Optimizer object

    The function returns: 
     - the epoch training loss, which is an average over the individual batch
       losses
     - the predictions made by the model
     - the labels 
    """
    model.train()
    epoch_loss = 0.0
    num_batches = len(train_dataloader)
    predictions = []
    labels = []
    for idx, batch in tqdm(enumerate(train_dataloader)):
        batch_data, batch_labels = batch

        output = model(batch_data)
        batch_predictions = torch.argmax(output, dim=1)
        
        predictions += batch_predictions.tolist()
        labels += batch_labels.squeeze().tolist()

        loss = loss_crt(output, batch_labels)
        loss_scalar = loss.item()

        loss.backward()

        optimizer.step()
        model.zero_grad()

        epoch_loss += loss_scalar

    epoch_loss = epoch_loss/num_batches

    return epoch_loss, predictions, labels

def eval_epoch(model, val_dataloader, loss_crt):
    """
    model: Model object 
    val_dataloader: DataLoader over the validation dataset
    loss_crt: loss function object

    The function returns: 
     - the epoch validation loss, which is an average over the individual batch
       losses
     - the predictions made by the model
     - the labels 
    """
    model.eval()
    epoch_loss = 0.0
    num_batches = len(val_dataloader)
    predictions = []
    labels = []
    with torch.no_grad():
        for idx, batch in tqdm(enumerate(val_dataloader)):
            batch_data, batch_labels = batch
            
            output = model(batch_data)
            batch_predictions = torch.argmax(output, dim=1)
        
            predictions += batch_predictions.tolist()
            labels += batch_labels.squeeze().tolist()

            loss = loss_crt(output, batch_labels)
            loss_scalar = loss.item()

            epoch_loss += loss_scalar

    epoch_loss = epoch_loss/num_batches

    return epoch_loss, predictions, labels

In [91]:
def compute_accuracy(predictions: List[int], labels:List[int]) -> float:
    """
    Compute accuracy given the predictions of a binary classifier and the 
    ground truth label.
    predictions: list of model predictions (0 or 1)
    labels: list of ground truth labels (0 or 1)
    """
    num_correct = len([(p,l) for (p,l) in zip(predictions,labels) if p==l])
    epoch_accuracy = num_correct/len(labels)
    
    return epoch_accuracy

In [92]:
model = Model()

loss_criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for epoch_idx in range(10):
    train_epoch_loss, train_predictions, train_labels = train_epoch(
        model, 
        train_dataloader, 
        loss_criterion, 
        optimizer
    )
    val_epoch_loss, val_predictions, val_labels = eval_epoch(
        model,
        validation_dataloader,
        loss_criterion,
    )
    train_acc = compute_accuracy(train_predictions, train_labels)
    val_acc = compute_accuracy(val_predictions, val_labels)
    train_losses.append(train_epoch_loss)
    val_losses.append(val_epoch_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print("epoch %d, train loss=%f, train acc=%f, val loss=%f, val acc=%f" % (
        epoch_idx, 
        train_epoch_loss,
        train_acc,
        val_epoch_loss,
        val_acc
    ))

704it [08:54,  1.32it/s]
79it [00:14,  5.38it/s]


epoch 0, train loss=0.490831, train acc=0.760667, val loss=0.814999, val acc=0.498400


704it [08:53,  1.32it/s]
79it [00:14,  5.51it/s]


epoch 1, train loss=0.339225, train acc=0.853533, val loss=0.983205, val acc=0.479600


704it [08:44,  1.34it/s]
79it [00:14,  5.31it/s]


epoch 2, train loss=0.286281, train acc=0.882889, val loss=0.919215, val acc=0.496600


704it [08:51,  1.33it/s]
79it [00:14,  5.39it/s]


epoch 3, train loss=0.256295, train acc=0.896156, val loss=0.946038, val acc=0.500200


704it [08:44,  1.34it/s]
79it [00:14,  5.32it/s]


epoch 4, train loss=0.233078, train acc=0.906800, val loss=1.014972, val acc=0.505200


704it [08:49,  1.33it/s]
79it [00:14,  5.29it/s]


epoch 5, train loss=0.212461, train acc=0.915822, val loss=1.077705, val acc=0.502200


704it [08:46,  1.34it/s]
79it [00:14,  5.31it/s]


epoch 6, train loss=0.197603, train acc=0.922378, val loss=1.187248, val acc=0.501400


704it [08:41,  1.35it/s]
79it [00:14,  5.31it/s]


epoch 7, train loss=0.187115, train acc=0.927022, val loss=1.451706, val acc=0.490600


704it [08:43,  1.35it/s]
79it [00:14,  5.53it/s]


epoch 8, train loss=0.174274, train acc=0.931667, val loss=2.235968, val acc=0.498600


704it [08:48,  1.33it/s]
79it [00:14,  5.29it/s]

epoch 9, train loss=0.157846, train acc=0.937956, val loss=1.284705, val acc=0.494800





In [93]:
test_losses = []
test_accuracies = []

test_loss, test_predictions, test_labels = eval_epoch(model, test_dataloader, loss_criterion)
test_acc = compute_accuracy(test_predictions, test_labels)
test_losses.append(test_loss)
test_accuracies.append(test_acc)

print("test loss=%f, test acc=%f" % (test_loss, val_acc))

79it [00:14,  5.49it/s]

test loss=1.277900, test acc=0.494800



