# Task definition
Implement an LSTM Sentiment Tagger for the IMDb reviews dataset.

1. (5pt) Fill in the missing code below
    * 1pt implement vectorization
    * 2pt implement \_\_init\_\_ and forward methods of models
    * 2pt implement collate function
2. (4pt) Implement the training loop, choose a proper loss function, and use ClearML for max points.
    * 2pts is a baseline for well-written, working code
    * 2pts if ClearML is used properly
3. (3pt) Train the models (find proper hyperparams). Make sure you are not overfitting or underfitting. Visualize the training of your best model (plot training and test loss/accuracy over time). Your model should reach at least 87% accuracy. For max points it should exceed 89%.
    * 1pt for accuracy above 89%
    * 1pt for accuracy above 87%
    * 1pt for visualizations

Remarks:
* Use embeddings of size 50
* Use 0.5 threshold when computing accuracy.
* Use supplied dataset for training and evaluation.
* You do not have to use validation set.
* You should monitor overfitting during training.
* For max points use ClearML to store and manage logs from your experiments.
* We encourage you to use the PyTorch Lightning library (additional point for using it; however, the total must not exceed 12).

[Clear ML documentation](https://clear.ml/docs/latest/docs/)

[Clear ML notebook exercise from bootcamp](https://colab.research.google.com/drive/1wtLb4gg8beLS7smcyJlOZppn6_rQvSxL?usp=sharing)

In [None]:
!pip install clearml

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torchtext
from clearml import Task

import torch
from torch import nn
from torch import optim

from torch.utils.data import Dataset, DataLoader



In [None]:

# Endpoints of the ClearML community server used for this notebook.
web_server = 'https://app.community.clear.ml'
api_server = 'https://api.community.clear.ml'
files_server = 'https://files.community.clear.ml'
# Credentials are left blank here and are meant to be filled in before running.
# NOTE(review): the trailing "#@param" markers look like Colab form fields - confirm.
# Avoid committing real keys to version control.
access_key = ' '#@param {type:"string"}
secret_key = ''#@param {type:"string"}

# Hand the endpoints and credentials to ClearML so that tasks created later
# in the notebook can reach the server.
Task.set_credentials(web_host=web_server,
                     api_host=api_server,
                     files_host=files_server,
                     key=access_key,
                     secret=secret_key)

In [None]:
# Download the dataset archive from Google Drive, unpack it and load the CSV.
!pip install gdown
!gdown https://drive.google.com/uc?id=1hK-3iiRPlbePb99Fe-34LJNZ5yB-nduq
!tar -xvzf imdb_dataset.gz
# Later cells read the "tokenized", "sentiment" and "split" columns of this frame.
data = pd.read_csv("imdb_dataset.csv")

Downloading...
From: https://drive.google.com/uc?id=1hK-3iiRPlbePb99Fe-34LJNZ5yB-nduq
To: /content/imdb_dataset.gz
100% 77.0M/77.0M [00:01<00:00, 56.2MB/s]
imdb_dataset.csv


In [None]:
from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
# Token index reserved for padding; NaiveVectorizer numbers real tokens from 1.
PADDING_VALUE = 0


class NaiveVectorizer:
    """Map whitespace-separated tokens to integer indices learned from a corpus."""

    def __init__(self, tokenized_data, **kwargs):
        """Build the token -> index lookup stored in ``self.words_dict``.

        Args:
            tokenized_data: iterable of strings; each one is split on
                whitespace to obtain its tokens.
            **kwargs: accepted for interface compatibility and ignored.

        Index 0 is reserved for padding, so token indices start at 1.
        """
        vocabulary = set()
        for text in tokenized_data:
            # In-place update avoids rebuilding the whole set per document.
            vocabulary.update(text.split())
        # Sorting makes the numbering reproducible: plain set iteration order
        # for strings can differ between interpreter runs, which would silently
        # invalidate any embedding trained against a previous numbering.
        self.words_dict = {
            token: index for index, token in enumerate(sorted(vocabulary), start=1)
        }

    def vectorize(self, tokenized_seq):
        """Convert a sequence of tokens into a tensor of vocabulary indices.

        Tokens that are not in the vocabulary (``self.words_dict``) are omitted.

        Args:
            tokenized_seq: iterable of token strings.

        Returns:
            torch tensor of type long with one entry per known token
            (shape ``(0,)`` when no token is known).
        """
        lookup = self.words_dict
        indices = [lookup[token] for token in tokenized_seq if token in lookup]
        return torch.tensor(indices, dtype=torch.long)


class ImdbDataset(Dataset):
    """Dataset view over one split of the reviews frame.

    Each item is ``(preprocess_fn(tokens), label)`` where ``tokens`` is the
    whitespace-split content of the "tokenized" column and ``label`` is the
    value of the "sentiment" column for that row.
    """

    SPLIT_TYPES = ["train", "test", "unsup"]

    def __init__(self, data, preprocess_fn, split="train"):
        """Keep the rows of ``data`` whose "split" column equals ``split``.

        Raises:
            AttributeError: if ``split`` is not one of ``SPLIT_TYPES``.
        """
        super().__init__()
        if split not in self.SPLIT_TYPES:
            raise AttributeError(f"No such split type: {split}")

        self.split = split
        # Positional column indices, used with ``iloc`` in ``__getitem__``.
        self.label = self._first_position(data.columns, "sentiment")
        self.data_col = self._first_position(data.columns, "tokenized")
        self.data = data[data["split"] == self.split]
        self.preprocess_fn = preprocess_fn

    @staticmethod
    def _first_position(columns, name):
        """Return the position of the first column called ``name``."""
        matches = [position for position, column in enumerate(columns) if column == name]
        return matches[0]

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the preprocessed token sequence and the label of row ``idx``."""
        tokens = self.data.iloc[idx, self.data_col].split()
        features = self.preprocess_fn(tokens)
        target = self.data.iloc[idx, self.label]
        return (features, target)


# Build the vocabulary from the "tokenized" column of the training rows only,
# so tokens that occur solely in other splits are unknown to the vectorizer.
naive_vectorizer = NaiveVectorizer(data.loc[data["split"] == "train", "tokenized"])

def get_datasets():
    """Return ``(train_dataset, test_dataset)`` built from the global ``data`` frame.

    Both datasets turn tokens into indices with ``naive_vectorizer.vectorize``.
    """
    vectorize = naive_vectorizer.vectorize
    train_split = ImdbDataset(data, vectorize, split="train")
    test_split = ImdbDataset(data, vectorize, split="test")
    return train_split, test_split


def custom_collate_fn(pairs):
    """Collate ``(sequence, label)`` pairs into one padded batch for a DataLoader.

    Args:
        pairs: list of tuples ``(sequence, label)`` where ``sequence`` is a
            1-D tensor of token indices.

    Returns:
        Tuple ``(padded, lengths, labels)``:
            padded: tensor of shape ``(longest_sequence_in_batch, batch)``;
                shorter sequences are filled with 0, the index reserved for
                padding.
            lengths: tensor with the original length of every sequence.
            labels: tensor with the label of every sequence.
    """
    sequences = [pair[0] for pair in pairs]
    lengths = torch.tensor([len(sequence) for sequence in sequences])
    labels = torch.tensor([pair[1] for pair in pairs])
    # Pad only up to the longest sequence of this batch. Padding to a fixed
    # width (previously forced to at least 3000 through a dummy sequence) only
    # adds embedding lookups and memory, because the model packs the batch by
    # ``lengths`` and never reads the padded positions.
    padded = pad_sequence(sequences, padding_value=0)
    return padded, lengths, labels



In [None]:
#naive_vectorizer.words_dict.keys()



In [None]:
"""Implement LSTMSentimentTagger. 
The model should use an LSTM module.
Use torch.nn.utils.rnn.pack_padded_sequence to optimize processing of sequences.
When computing vocab_size of the embedding layer, remember that the padding symbol counts towards the vocab.
Use the sigmoid activation function.
"""
from torch.nn.utils.rnn import pack_padded_sequence


class LSTMSentimentTagger(nn.Module):
    """Embedding -> LSTM -> two linear layers -> sigmoid.

    The final LSTM hidden state is passed through a 36-unit hidden layer with
    dropout and ReLU, then projected to ``classes`` sigmoid outputs.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, classes, drop):
        """Create the layers.

        Args:
            embedding_dim: size of each token embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of rows in the embedding table.
            classes: number of output units.
            drop: dropout probability applied after the first linear layer.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc1 = nn.Linear(hidden_dim, 36)
        self.fc = nn.Linear(36, classes)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(p=drop)

    def forward(self, sentence, lengths):
        """Return sigmoid scores for a padded batch.

        Args:
            sentence: padded tensor of token indices, time-major as expected
                by ``pack_padded_sequence`` with its default ``batch_first``.
            lengths: original length of every sequence in the batch.
        """
        packed_embeddings = pack_padded_sequence(
            self.word_embedding(sentence), lengths, enforce_sorted=False
        )
        # Only the final hidden state is used; per-step outputs are discarded.
        _, (final_hidden, _) = self.lstm(packed_embeddings)
        features = final_hidden.view(-1, self.hidden_dim)
        hidden = self.relu(self.dropout(self.fc1(features)))
        return self.sigmoid(self.fc(hidden))

# Training loop and visualizations


In [None]:
def calculate_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    A prediction counts as class 1 when it is >= 0.5 and as class 0 otherwise.

    Args:
        preds: predicted probabilities (tensor or sequence of numbers, any shape).
        labels: 0/1 targets with the same number of elements as ``preds``.

    Returns:
        Accuracy as a Python float in [0, 1]; 0.0 when the input is empty.

    Raises:
        ValueError: if ``preds`` and ``labels`` hold a different number of
            elements.
    """
    # Flatten so that (batch, 1) outputs and (batch,) labels compare directly.
    preds = torch.as_tensor(preds).detach().reshape(-1)
    labels = torch.as_tensor(labels).detach().reshape(-1)
    total = preds.numel()
    if total != labels.numel():
        raise ValueError(
            f"preds and labels differ in size: {total} vs {labels.numel()}"
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    predicted = (preds >= 0.5).to(labels.dtype)
    hits = (predicted == labels).sum().item()
    return hits / total



In [None]:
def calculate_test_acc(test_dataset, batch_size, lstm_tag):
    """Compute the accuracy of ``lstm_tag`` over ``test_dataset``.

    A prediction counts as class 1 when the model output is >= 0.5.

    Args:
        test_dataset: dataset yielding ``(sequence, label)`` pairs.
        batch_size: number of samples per evaluation batch.
        lstm_tag: model called as ``lstm_tag(inputs, lengths)``.

    Returns:
        Fraction of correctly classified samples as a float; 0.0 for an
        empty dataset.
    """
    total = len(test_dataset)
    if total == 0:
        return 0.0
    # Sample order does not affect accuracy, so no shuffling is needed.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                                 collate_fn=custom_collate_fn, shuffle=False)
    # Dropout must be switched off while evaluating; remember the current mode
    # so a surrounding training loop continues in the mode it was in.
    was_training = lstm_tag.training
    lstm_tag.eval()
    correct = 0
    try:
        with torch.no_grad():  # no gradients needed, saves memory
            for inputs, lengths, labels in test_dataloader:
                # Flatten instead of reshaping to (batch_size, 1) so the last,
                # possibly smaller, batch is handled as well.
                outputs = lstm_tag(inputs, lengths).reshape(-1)
                predicted = (outputs >= 0.5).long()
                correct += int((predicted == labels.reshape(-1).long()).sum())
    finally:
        lstm_tag.train(was_training)
    return correct / total
        


# ClearML

In [None]:
def train_new_model(batch_size, lr, epochs, train_dataset, test_dataset, task, drop):
    """Train a fresh LSTMSentimentTagger and report its progress to ClearML.

    Args:
        batch_size: number of samples per batch.
        lr: learning rate for the Adam optimizer.
        epochs: number of passes over ``train_dataset``.
        train_dataset: dataset used for optimization.
        test_dataset: dataset evaluated after every epoch.
        task: ClearML task whose logger receives the scalars.
        drop: dropout probability passed to the model.

    Returns:
        The trained model.
    """
    # Report through the logger of the task passed in, not a notebook global.
    logger = task.get_logger()
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  collate_fn=custom_collate_fn, shuffle=True)
    lstm_tag = LSTMSentimentTagger(embedding_dim=50, hidden_dim=9,
                                   vocab_size=len(naive_vectorizer.words_dict) + 1,
                                   classes=1, drop=drop)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(lstm_tag.parameters(), lr=lr)
    report_step = 0  # x-axis of the training curves; one point per 100 batches
    for epoch in range(epochs):
        # Make sure dropout is active even if evaluation switched it off.
        lstm_tag.train()
        for batch_index, (inputs, lengths, labels) in enumerate(train_dataloader):
            optimizer.zero_grad()
            # -1 keeps this valid for a last batch smaller than batch_size.
            labels = labels.reshape(-1, 1).float()
            outputs = lstm_tag(inputs, lengths)
            loss = criterion(outputs, labels)
            if batch_index % 100 == 0:
                # Log plain floats rather than graph-attached tensors.
                logger.report_scalar(title='Loss', series='Train',
                                     iteration=report_step, value=loss.item())
                acc = calculate_accuracy(outputs.detach(), labels)
                logger.report_scalar(title='Accuracy', series='Train',
                                     iteration=report_step, value=acc)
                report_step += 1
            loss.backward()
            optimizer.step()
        test_acc = calculate_test_acc(test_dataset, batch_size, lstm_tag)
        print("EPOCH", epoch, " TEST ACC:", test_acc)
        logger.report_scalar(title='Test accuracy', series='Test',
                             iteration=epoch, value=test_acc)

    return lstm_tag




In [None]:
# Single run with one fixed configuration. train_new_model needs a ClearML
# task and a dropout value, and it returns only the trained model.
train_dataset, test_dataset = get_datasets()
task = Task.create(project_name='clearml_lstm', task_name='lstm_single_run')
task.mark_started()
logger = task.get_logger()
net = train_new_model(25, 0.01, 10, train_dataset, test_dataset, task, 0.25)
task.mark_completed()
task.close()

In [None]:
from itertools import product

# Hyperparameter grid: every combination is trained as its own ClearML task.
lrs = [0.01, 0.005, 0.001]
batch_sizes = [50, 100, 25]
num_epochs = [10]
dropouts = [0.25, 0.45]
train_dataset, test_dataset = get_datasets()

# product() iterates with the last list varying fastest, i.e. in the same
# order as nested loops over epochs, batch sizes, learning rates, dropouts.
for epochs, batch_size, lr, drop in product(num_epochs, batch_sizes, lrs, dropouts):
    name_exp = f'lstm_batchsize_{batch_size}_lr_{lr}_dropout_{drop}'
    task = Task.create(project_name='clearml_lstm', task_name=name_exp)
    task.mark_started()
    logger = task.get_logger()
    # Record every hyperparameter that distinguishes this run.
    task.set_parameters({'lr': lr, 'batch_size': batch_size,
                         'dropout': drop, 'epochs': epochs})
    net = train_new_model(batch_size, lr, epochs, train_dataset, test_dataset, task, drop)
    task.mark_completed()
    task.close()
            
            


EPOCH 0  TEST ACC: 0.8376
EPOCH 1  TEST ACC: 0.85904
EPOCH 2  TEST ACC: 0.85888
EPOCH 3  TEST ACC: 0.85876
EPOCH 4  TEST ACC: 0.849
EPOCH 5  TEST ACC: 0.85016
EPOCH 6  TEST ACC: 0.85004
EPOCH 7  TEST ACC: 0.84452
EPOCH 8  TEST ACC: 0.83384
EPOCH 9  TEST ACC: 0.83652
2022-01-16 00:38:32,409 - clearml.Task - INFO - Waiting to finish uploads
2022-01-16 00:38:32,520 - clearml.Task - INFO - Finished uploading
EPOCH 0  TEST ACC: 0.80992
EPOCH 1  TEST ACC: 0.82516
EPOCH 2  TEST ACC: 0.82204
EPOCH 3  TEST ACC: 0.83228
EPOCH 4  TEST ACC: 0.83076
EPOCH 5  TEST ACC: 0.8276
EPOCH 6  TEST ACC: 0.83276
EPOCH 7  TEST ACC: 0.82532
EPOCH 8  TEST ACC: 0.8262
EPOCH 9  TEST ACC: 0.83136
EPOCH 0  TEST ACC: 0.80592
EPOCH 1  TEST ACC: 0.85736


In [None]:
import torch.optim as optim
# model for hidden_dim =3 after 5 epochs aroud scores almost 80%, unfortunately it seems to be overfitting
lr = 0.01
epochs = 10
batch_size = 25
train_dataset, test_dataset = get_datasets()
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=custom_collate_fn,shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=custom_collate_fn,shuffle=True)
criterion = nn.BCELoss() #nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_tag.parameters(), lr=lr)

In [None]:
lstm_tag = LSTMSentimentTagger(embedding_dim=50, hidden_dim=3, vocab_size=len(naive_vectorizer.words_dict) + 1, classes=1, drop=0.5)

In [None]:
#lstm_tag = LSTMSentimentTagger(embedding_dim=50, hidden_dim=2, vocab_size=len(naive_vectorizer.words_dict) + 1, classes=1)
#task = 
# Histories collected during training: loss and train accuracy every 200
# batches, test accuracy once per epoch.
losses = []
accs = []
test_accs = []
for i in range(epochs):
    # Make sure dropout is active even if evaluation switched it off.
    lstm_tag.train()
    for n, batch in enumerate(train_dataloader):
        inputs, lengths, labels = batch
        # -1 keeps this valid for a last batch smaller than batch_size.
        labels = labels.reshape(-1, 1).float()
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = lstm_tag(inputs, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if n % 100 == 0:
            print("Batch ", n, 'loss :', float(loss))
            if n % 200 == 0:
                # Store a plain float so the autograd graph can be freed.
                losses.append(float(loss))
                acc = calculate_accuracy(outputs.detach(), labels)
                print( "EPOCH", i+1, "Batch", n, 'training accuracy :', float(acc))
                accs.append(acc)
    test_acc = calculate_test_acc(test_dataset, batch_size, lstm_tag)
    # Append this epoch's accuracy (not the history list itself).
    test_accs.append(test_acc)
    print("EPOCH", i, " TEST ACC:", test_acc)
    

In [None]:
# Re-evaluate the trained baseline model on the test split.
test_acc = calculate_test_acc(test_dataset, batch_size, lstm_tag)

In [None]:
# Display the accuracy computed in the previous cell.
test_acc

0.77568