In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls
%cd drive/My\ Drive/Colab\ Notebooks

drive  sample_data
/content/drive/My Drive/Colab Notebooks


In [3]:
import os

if os.path.exists("data/train.csv") and os.path.exists("data/test.csv"):
  print("No need to download data - already present!")
else:
  os.mkdir("data") 
  print("Created a data folder. Please place the files train.csv and test.csv in there!")
  raise Exception("Could not find train and test files. Please make sure to place them within the data folder (My Drive/Colab Notebooks/data).")

No need to download data - already present!


In [4]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
    raise Exception("Please use the GPU runtime! (Runtime -> Change runtime type -> GPU) Then you need to rerun the cells above!")

Running on the GPU


In [5]:
import os
import io
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils


class MovieSentimentDataset(Dataset):
    """Movie sentiment dataset."""

    def __init__(self, data: pd.DataFrame, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with sentiments.
        """
        self.movie_sentiments = data

    def __len__(self):
        return len(self.movie_sentiments)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        review = self.movie_sentiments.iloc[idx, 0]
        #review  = np.array([review])

        sentiment = self.movie_sentiments.iloc[idx, 1]
        #sentiment  = np.array([sentiment])

        sample = {'review': review, 'sentiment': sentiment}

        return sample


class MovieSentimentDatasetBuilder:
    def __init__(self, data: pd.DataFrame):
        self.data = data
        self.train_validation_split = None

    @staticmethod
    def from_csv(csv_file: str):
        return MovieSentimentDatasetBuilder(data=pd.read_csv(csv_file))
    
    def with_train_validation_split(self, splits: (float, float)=[.8, .2]):
        self.train_validation_split = splits
        return self
    
    def build(self):
        if self.train_validation_split is None:
            return MovieSentimentDataset(data=self.data)
        else:
            msk = np.random.rand(len(self.data)) < self.train_validation_split[0]
            train = self.data[msk]
            validation = self.data[~msk]
            return (MovieSentimentDataset(data=train), MovieSentimentDataset(data=validation))


In [6]:
import pandas as pd

SYMBOLS_TO_REMOVE = [".", "\"", "(", ")", ",", "?", "!", "'", ";", "{", "}", "-", "*", "=", ":", "\x91", "\x97", "<br />", "/", "<", ">"]

class Preprocessor:
    @staticmethod
    def remove_symbols(review_texts: pd.Series) -> pd.Series:
        def preprocess_text(text: str):
            for symbol in SYMBOLS_TO_REMOVE:
                text = text.replace(symbol, " ")
            text = " ".join([w for w in text.split() if w])
            return text.lower()

        return review_texts.str.lower().apply(preprocess_text)
    
    @staticmethod
    def remove_long_sequences(df: pd.DataFrame, max_len: int) -> pd.Series:
        seq_lengths = df["review"].apply(lambda text: len(text.split()))
        return df[seq_lengths <= max_len]

In [7]:
import torch
import numpy as np
import pandas as pd
from torch import Tensor
from sklearn import preprocessing
from typing import List, Set

class SequenceTokenizer:

    def __init__(self):
        self.vocab = dict({"EOS": 0})
        self.padding_size = -1

    def fit(self, reviews: pd.Series):
        for review in reviews:
            words_in_review = review.split()
            if len(words_in_review) > self.padding_size:
                self.padding_size = len(words_in_review)
            for word in words_in_review:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
        self.vocab_size = len(self.vocab)
        print("Build a vocabulary of size {}".format(self.vocab_size))

    def __transform_single_review(self, review: str):
        tokenized = np.array([self.vocab[word] for word in review.split() if word in self.vocab])
        if len(tokenized) < self.padding_size:
            tokenized = np.append(np.array([0 for _ in range(self.padding_size - len(tokenized))]), tokenized)
        return torch.from_numpy(tokenized).long()

    def transform(self, reviews: pd.Series):
        return reviews.apply(self.__transform_single_review)

    def fit_transform(self, reviews: pd.Series) -> pd.Series:
        self.fit(reviews)
        return self.transform(reviews)


In [8]:
import torch
from torch import nn


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size: int, padding_size: int, embedding_size: int, hidden_size: int):
        super(LSTMClassifier, self).__init__()
        self.vocab_size = vocab_size
        self.padding_size = padding_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embbedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size, padding_idx=0)
        self.rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, dropout=1)

        self.out = nn.Sequential(
            nn.Linear(hidden_size, 32), nn.ReLU(),
            nn.Linear(32, 4), nn.ReLU(),
            nn.Linear(4, 1), nn.Sigmoid()
        )

    def forward(self, reviews):
        reviews_embedded = self.embbedding(reviews)
        reviews_embedded_permuted = reviews_embedded.permute(1, 0, 2)
        lstm_output, lstm_hidden = self.rnn(reviews_embedded_permuted)
        return self.out(lstm_hidden[0].reshape(-1, self.hidden_size))

In [9]:
import torch
import math

class EarlyStopping():
    def __init__(self, patience: int=3, checkpoint_path: str='best_model.pt', verbose: bool=True):
        super().__init__()
        self.patience = patience
        self.checkpoint_path = checkpoint_path
        self.verbose = verbose
        self.best_validation_loss = math.inf
        self.counter = 0

    def track(self, epoch: int, model, validation_loss: int):
        if validation_loss < self.best_validation_loss:
            if self.verbose:
                print('Validation loss decreased from {:.4f} to {:.4f} in epoch {}.  Creating model checkpoint ...\n'.format(self.best_validation_loss, validation_loss, epoch))
            self.best_validation_loss = validation_loss
            self.save_model(model)
            self.counter = 0
            return False
        else:
            self.counter += 1
            if self.counter > self.patience:
                return True
    
    def save_model(self, model):
        torch.save(model.state_dict(), self.checkpoint_path)

    def get_best_version(self, model):
        if self.best_validation_loss is math.inf:
            raise Exception('Cannot bet best model. No model stored yet.')
        model.load_state_dict(torch.load(self.checkpoint_path))


In [10]:
import torch
import sklearn
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

if __name__ == '__main__':
    if torch.cuda.is_available():
        device = torch.device("cuda:0")  # you can continue going on here, like cuda:1 cuda:2....etc. 
        enable_sampling = False
        print("Running on the GPU")
    else:
        device = torch.device("cpu")
        enable_sampling = True
        print("Running on the CPU")

    dataset_train, dataset_validation = MovieSentimentDatasetBuilder\
        .from_csv(csv_file='data/train.csv')\
        .with_train_validation_split(splits=[.8, .2])\
        .build()
    dataset_test = MovieSentimentDatasetBuilder\
        .from_csv(csv_file='data/test.csv')\
        .build()
    
    # Restrict the number of reviews if running on the CPU
    if enable_sampling:
        dataset_train.movie_sentiments = dataset_train.movie_sentiments.sample(100)
        dataset_validation.movie_sentiments = dataset_validation.movie_sentiments.sample(20)

    # Preprocess reviews
    def execute_preprocessing_pipeline(dataset: MovieSentimentDataset, tokenizer=None):
        reviews = dataset.movie_sentiments["review"]
        dataset.movie_sentiments["review"] = Preprocessor.remove_symbols(dataset.movie_sentiments["review"])
        dataset.movie_sentiments = Preprocessor.remove_long_sequences(dataset.movie_sentiments, max_len=1000)

        if tokenizer is None:
            tokenizer = SequenceTokenizer()
            tokenizer.fit(dataset.movie_sentiments["review"])
        dataset.movie_sentiments["review"] = tokenizer.transform(dataset.movie_sentiments["review"])
        return tokenizer
    
    tokenizer = execute_preprocessing_pipeline(dataset_train)
    vocab_size, padding_size = tokenizer.vocab_size, tokenizer.padding_size
    execute_preprocessing_pipeline(dataset_validation, tokenizer=tokenizer)
    execute_preprocessing_pipeline(dataset_test, tokenizer=tokenizer)

    # Create DataLoader
    dataloader_train = DataLoader(dataset_train, batch_size=256, shuffle=True, num_workers=1)
    dataloader_validation = DataLoader(dataset_validation, batch_size=256, shuffle=False, num_workers=1)
    dataloader_test = DataLoader(dataset_test, batch_size=256, shuffle=False, num_workers=1)

    # Set up a bag of words model and training
    embedding_size = 200
    model = LSTMClassifier(vocab_size=vocab_size, padding_size=padding_size, embedding_size=embedding_size, hidden_size=128).to(device)
    loss = nn.BCELoss()
    num_epochs = 50
    lr = 1e-2
    trainer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-3)
    early_stopping = EarlyStopping(patience=5)

    for epoch in range(num_epochs):
        train_loss_epoch, train_acc, n = 0.0, 0, 0
        model.train()
        for i_batch, sample_batched in enumerate(dataloader_train):
            y = sample_batched["sentiment"].type(torch.FloatTensor).to(device).reshape(-1, 1)
            y_hat = model(sample_batched["review"].to(device).reshape(-1, padding_size))
            
            l = loss(y_hat, y)
            train_loss_epoch += l.item()
            trainer.zero_grad()
            l.backward()
            trainer.step()
            train_acc += (y == (y_hat > .5).type(torch.FloatTensor).to(device)).sum().item()
            n += len(y)
        
        train_loss_epoch /= n
        train_acc /= n
        
        validation_loss, validation_acc, n = 0, 0, 0
        with torch.no_grad():
            for i_batch, sample_batched in enumerate(dataloader_validation):
                y = sample_batched["sentiment"].type(torch.FloatTensor).to(device).reshape(-1, 1)
                y_hat = model(sample_batched["review"].to(device).reshape(-1, padding_size))
                l = loss(y_hat, y)
                validation_loss += l.item()
                validation_acc += (y == (y_hat > .5).type(torch.FloatTensor).to(device)).sum().item()
                n += len(y)
            
        validation_loss /= n
        validation_acc /= n
        print("Epoch: {}, Train Loss: {}, Train acc: {}, Validation Loss: {}, Validation acc: {}".format(epoch+1, train_loss_epoch, train_acc, validation_loss, validation_acc))
        
        perform_early_stop = early_stopping.track(epoch=epoch, model=model, validation_loss=validation_loss)
        if perform_early_stop:
            print("Stopping early as no improvement was reached for {} epochs".format(early_stopping.patience))
            early_stopping.get_best_version(model)
            break

    test_loss, test_acc, n = 0, 0, 0
    with torch.no_grad():
        for i_batch, sample_batched in enumerate(dataloader_test):
            y = sample_batched["sentiment"].type(torch.FloatTensor).to(device).reshape(-1, 1)
            y_hat = model(sample_batched["review"].to(device).reshape(-1, padding_size))
            l = loss(y_hat, y)
            test_loss += l.item()
            test_acc += (y == (y_hat > .5).type(torch.FloatTensor).to(device)).sum().item()
            n += len(y)
        
    test_loss /= n
    test_acc /= n
    print("Test Loss: {}, Test acc: {}".format(test_loss, test_acc))



Running on the GPU
Build a vocabulary of size 68917


  "num_layers={}".format(dropout, num_layers))


Epoch: 1, Train Loss: 0.0025771741794092355, Train acc: 0.5996989463120923, Validation Loss: 0.002494532976090679, Validation acc: 0.6514875531268974
Validation loss decreased from inf to 0.0025 in epoch 0.  Creating model checkpoint ...

Epoch: 2, Train Loss: 0.0023456868459038323, Train acc: 0.6695935775213246, Validation Loss: 0.0022833621217229305, Validation acc: 0.7136207245496863
Validation loss decreased from 0.0025 to 0.0023 in epoch 1.  Creating model checkpoint ...

Epoch: 3, Train Loss: 0.0022884824290325817, Train acc: 0.6870546914199699, Validation Loss: 0.0024627982617100104, Validation acc: 0.6486541185994738
Epoch: 4, Train Loss: 0.002341984090527517, Train acc: 0.6808329152032112, Validation Loss: 0.0022522088089734285, Validation acc: 0.7253592390204412
Validation loss decreased from 0.0023 to 0.0023 in epoch 3.  Creating model checkpoint ...

Epoch: 5, Train Loss: 0.0021015565253596057, Train acc: 0.7391871550426493, Validation Loss: 0.0017787437343906231, Validatio