# Task definition
Implement LSTM Sentiment Tagger for imdb reviews dataset.

1. (5pt) Fill missing code below
    * 1pt implement vectorization
    * 2pt implement \_\_init\_\_ and forward methods of models
    * 2pt implement collate function
2. (4pt) Implement the training loop, choose a proper loss function, and use ClearML for max points.
    * 2pts is a baseline for well-written, working code
    * 2pts if ClearML is used properly
3. (3pt) Train the models (find proper hyperparameters). Make sure you are neither overfitting nor underfitting. Visualize the training of your best model (plot training and test loss/accuracy over time). Your model should reach at least 87% accuracy. For max points it should exceed 89%.
    * 1pt for accuracy above 89%
    * 1pt for accuracy above 87%
    * 1pt for visualizations

Remarks:
* Use embeddings of size 50
* Use 0.5 threshold when computing accuracy.
* Use supplied dataset for training and evaluation.
* You do not have to use validation set.
* You should monitor overfitting during training.
* For max points use ClearML to store and manage logs from your experiments.
* We encourage you to use the PyTorch Lightning library (an additional point is awarded for using it; however, the total must not exceed 12).

[Clear ML documentation](https://clear.ml/docs/latest/docs/)

[Clear ML notebook exercise from bootcamp](https://colab.research.google.com/drive/1wtLb4gg8beLS7smcyJlOZppn6_rQvSxL?usp=sharing)

In [4]:
!pip install clearml plotly-express nbformat kaleido

import os
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torchtext
from clearml import Task, Model

import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import Trainer

import pickle

import plotly.express as px
import nbformat



In [5]:

# ClearML community-server endpoints used for experiment tracking.
web_server = 'https://app.community.clear.ml'
api_server = 'https://api.community.clear.ml'
files_server = 'https://files.community.clear.ml'
# API credentials are intentionally left blank here; fill them in locally
# (the trailing "#@param" markers render them as Colab form fields).
# NOTE(review): avoid committing real keys to version control.
access_key = ''#@param {type:"string"}
secret_key = ''#@param {type:"string"}

# Register the endpoints and credentials with ClearML before any Task is created.
Task.set_credentials(web_host=web_server,
                     api_host=api_server,
                     files_host=files_server,
                     key=access_key,
                     secret=secret_key)

In [6]:
# One-time dataset download/extraction (kept commented out once the CSV exists locally):
# !pip install gdown
# !gdown https://drive.google.com/uc?id=1hK-3iiRPlbePb99Fe-34LJNZ5yB-nduq
# !tar -xvzf imdb_dataset.gz
# Load the whole dataset; the code below reads its "tokenized", "sentiment"
# and "split" columns.
data = pd.read_csv("imdb_dataset.csv")

In [7]:
# Integer id reserved for padding positions in vectorized sequences.
PADDING_VALUE = 0
# Unique sentinel used as the vocabulary key for the padding id, so that it can
# never collide with a real (string) token.
PADDING_OBJECT = object()

def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into a single flat list."""
    return [item for inner in list_of_lists for item in inner]

class NaiveVectorizer:
    """Maps tokens to integer ids using a vocabulary built from a text corpus.

    Attributes:
        wv: dict mapping each known token to its integer id. The padding
            sentinel ``PADDING_OBJECT`` is mapped to ``PADDING_VALUE``.
        vocab_size: number of entries in ``wv`` (padding entry included).
    """

    def __init__(self, tokenized_data, **kwargs):
        """Build the token -> id lookup table.

        Args:
            tokenized_data: iterable of strings; each string is split on
                whitespace to obtain its tokens.
            **kwargs: accepted for interface compatibility and ignored.
        """
        self.wv = {PADDING_OBJECT: PADDING_VALUE}

        vocabulary = {token for seq in tokenized_data for token in seq.split()}
        # Iteration order of a set of strings changes between interpreter runs
        # (hash randomisation); sorting makes the token -> id mapping
        # reproducible, so a trained embedding matrix stays valid when the
        # vectorizer is rebuilt from the same data.
        for index, token in enumerate(sorted(vocabulary), start=PADDING_VALUE + 1):
            self.wv[token] = index

        self.vocab_size = len(self.wv)

    def vectorize(self, tokenized_seq):
        """Convert a sequence of tokens into a tensor of vocabulary ids.

        Tokens that are not present in the vocabulary (``self.wv``) are omitted.

        Args:
            tokenized_seq: iterable of tokens.

        Returns:
            torch tensor of dtype long holding one id per known token; its
            length can be shorter than the input when tokens are skipped.
        """
        indices = [self.wv[token] for token in tokenized_seq if token in self.wv]
        return torch.tensor(indices, dtype=torch.long)

class ImdbDataset(Dataset):
    """Dataset view over one split of the reviews DataFrame.

    Rows are selected by comparing the "split" column with the requested split.
    Each item is a ``(preprocessed_tokens, label)`` pair built from the
    "tokenized" and "sentiment" columns.
    """

    SPLIT_TYPES = ["train", "test", "unsup"]

    def __init__(self, data, preprocess_fn, split="train"):
        """Select the rows of ``data`` belonging to ``split``.

        Args:
            data: DataFrame with "sentiment", "tokenized" and "split" columns.
            preprocess_fn: callable applied to the list of tokens of a review.
            split: one of ``SPLIT_TYPES``.

        Raises:
            AttributeError: if ``split`` is not one of ``SPLIT_TYPES``.
        """
        super().__init__()
        if split not in self.SPLIT_TYPES:
            raise AttributeError(f"No such split type: {split}")

        self.split = split
        # Positional column indices, used later with ``iloc``.
        self.label = self._column_position(data, "sentiment")
        self.data_col = self._column_position(data, "tokenized")
        in_split = data["split"] == self.split
        self.data = data[in_split]
        self.preprocess_fn = preprocess_fn

    @staticmethod
    def _column_position(frame, name):
        """Return the position of the first column of ``frame`` called ``name``."""
        matches = [pos for pos, column in enumerate(frame.columns) if column == name]
        return matches[0]

    def __len__(self):
        """Number of rows in the selected split."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(preprocess_fn(tokens), label)`` for the row at position ``idx``."""
        tokens = self.data.iloc[idx, self.data_col].split()
        seq = self.preprocess_fn(tokens)
        label = self.data.iloc[idx, self.label]
        return (seq, label)

# Alternative: rebuild the vocabulary from the training split instead of loading it.
# naive_vectorizer = NaiveVectorizer(data.loc[data["split"] == "train", "tokenized"])
# Load a previously pickled vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load 'vectorizer.p'
# from a trusted source.
with open('vectorizer.p', 'rb') as f:
    naive_vectorizer = pickle.load(f)

def get_datasets():
    """Create the train and test datasets from the module-level ``data`` frame.

    Both datasets vectorize their reviews with ``naive_vectorizer.vectorize``.

    Returns:
        Tuple ``(train_dataset, test_dataset)``.
    """
    vectorize = naive_vectorizer.vectorize
    return (
        ImdbDataset(data, vectorize),
        ImdbDataset(data, vectorize, split="test"),
    )


def custom_collate_fn(pairs):
    """Collate ``(sequence, label)`` pairs into one padded batch for a DataLoader.

    Args:
        pairs: list of tuples ``(sequence, label)`` where ``sequence`` is a 1-D
            tensor of token ids.

    Returns:
        Tuple ``(sequences, lengths, labels)``:
            * sequences: tensor of shape (batch, max_len); shorter sequences
              are right-padded with 0, the id reserved for padding.
            * lengths: long tensor of shape (batch,) with the original lengths.
            * labels: tensor of shape (batch, 1).
    """
    sequences = [seq for seq, _ in pairs]
    labels = [label for _, label in pairs]
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)

    # batch_first=True yields (batch, max_len) directly. The previous
    # ``pad_sequence(...).squeeze().T`` dropped a dimension whenever the batch
    # held a single sample or every sequence had length 1.
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)

    return padded, lengths, torch.tensor(labels).reshape([-1, 1])

In [8]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

def get_number_of_correct_predictions(preds, y):
    """Count how many predictions match the targets.

    Args:
        preds: raw model outputs (logits); a sigmoid is applied here.
        y: targets compared element-wise against the rounded probabilities.

    Returns:
        Scalar float tensor holding the number of matching elements
        (a count, not a ratio).
    """
    probabilities = torch.sigmoid(preds)
    # Rounding to the nearest integer turns probabilities into 0/1 decisions.
    hard_predictions = probabilities.round()
    matches = hard_predictions == y
    return matches.float().sum()

# Module-level datasets; LSTMSentimentTagger.train_dataloader / val_dataloader read
# these globals directly.
train_set, test_set = get_datasets()

In [9]:
"""Implement LSTMSentimentTagger. 
The model should use a LSTM module.
Use torch.nn.utils.rnn.pack_padded_sequence to optimize processing of sequences.
When computing vocab_size of the embedding layer, remember that the padding symbol counts towards the vocab.
Use sigmoid activation function.
"""
class LSTMSentimentTagger(pl.LightningModule):
    """LSTM sentiment classifier implemented as a PyTorch Lightning module.

    Token ids are embedded, fed to a stacked LSTM as a packed sequence, and the
    final hidden state of the top LSTM layer is projected to ``classes`` raw
    logits. ``forward`` does not apply a sigmoid: the loss
    (``binary_cross_entropy_with_logits``) and the accuracy helper apply it.
    """

    def __init__(self, vocab_size, classes, embedding_dim=50, hidden_dim=2, layers_count=5, lr=1e-3, batch_size=64):
        """Store the hyper-parameters and build the layers.

        Args:
            vocab_size: number of rows of the embedding table (padding included).
            classes: number of output logits per sequence.
            embedding_dim: size of each token embedding.
            hidden_dim: LSTM hidden state size.
            layers_count: number of stacked LSTM layers.
            lr: learning rate for the Adam optimizer.
            batch_size: batch size used by both dataloaders.
        """
        super().__init__()
        self.save_hyperparameters()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.classes = classes
        self.layers_count = layers_count
        self.lr = lr
        self.batch_size = batch_size
        # Not updated anywhere in this class; kept in case external code reads them.
        self.validation_correct = 0
        self.validation_total = 0

        self._init_modules()

    def _init_modules(self):
        """Create the embedding, LSTM and output projection layers."""
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.layers_count,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=self.hidden_dim, out_features=self.classes)

    def forward(self, sentences, lengths):
        """Compute one row of logits per sequence.

        Args:
            sentences: padded token ids, shape (batch, max_len).
            lengths: original (unpadded) length of every sequence, shape (batch,).

        Returns:
            Tensor of raw logits with shape (batch, classes).
        """
        # pack_padded_sequence requires the lengths on the CPU.
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            self.embedding(sentences), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)
        # For packed input, h_n[-1] is the top layer's hidden state at each
        # sequence's last real timestep. It always has shape (batch, hidden_dim),
        # so the batch axis survives even for a batch of one sample (the former
        # gather + squeeze() dropped it).
        return self.fc(h_n[-1])

    def _shared_step(self, batch):
        """Run the model on a batch and return (loss, correct_count, accuracy)."""
        seqs, lengths, labels = batch
        logits = self(seqs, lengths)
        # BCE-with-logits needs floating-point targets matching the logits dtype.
        labels = labels.to(logits.dtype)
        loss = F.binary_cross_entropy_with_logits(logits, labels)
        correct = get_number_of_correct_predictions(logits, labels)
        accuracy = correct / labels.numel()
        return loss, correct, accuracy

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss, correct count and accuracy."""
        loss, correct, accuracy = self._shared_step(batch)
        self.log('train_loss', loss, on_epoch=True, on_step=True)
        self.log('train_correct', correct, on_epoch=True, on_step=True)
        self.log('train_acc', accuracy, on_epoch=True, on_step=False)
        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss, correct count and accuracy."""
        loss, correct, accuracy = self._shared_step(batch)
        self.log('val_loss', loss, on_epoch=True, on_step=True)
        self.log('val_correct', correct, on_epoch=True, on_step=True)
        self.log('val_acc', accuracy, on_epoch=True, on_step=False)
        return {'loss': loss}

    def configure_optimizers(self):
        """Use Adam over all parameters with the configured learning rate."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def train_dataloader(self):
        """Shuffled loader over the module-level ``train_set``."""
        # num_workers = 8 because my CPU has 8 threads
        return DataLoader(train_set, self.batch_size, collate_fn=custom_collate_fn, shuffle=True, num_workers=8)

    def val_dataloader(self):
        """Loader over the module-level ``test_set`` (used for validation)."""
        return DataLoader(test_set, self.batch_size, collate_fn=custom_collate_fn, num_workers=8)


# Training loop and visualizations


In [7]:
from itertools import product

# Base hyper-parameters; the sweep below overwrites 'hidden_dim',
# 'layers_count' and 'lr' before every run.
config = {
    'lr': .0001,
    'hidden_dim': 200,
    'layers_count': 6,
    'embedding_dim': 50,
    'batch_size': 64,
    'n_epochs': 15,
}

# Grid search: one ClearML task and one freshly initialised model per combination.
# product() iterates in the same order as the equivalent nested loops.
for HIDDEN_DIM, LAYERS_COUNT, LR in product([200, 100, 75], [6, 5, 4], [.0001, .0003, .00003]):
    config['hidden_dim'] = HIDDEN_DIM
    config['layers_count'] = LAYERS_COUNT
    config['lr'] = LR

    # A descriptive name keeps the runs distinguishable in the ClearML UI, and
    # reuse_last_task_id=False stops a new run from overwriting an earlier task.
    task = Task.init(
        project_name='lightning2',
        task_name=f'lstm hidden_dim={HIDDEN_DIM} layers_count={LAYERS_COUNT} lr={LR}',
        reuse_last_task_id=False,
    )
    try:
        task.connect(config)

        trainer = Trainer(gpus=1 if torch.cuda.is_available() else 0, max_epochs=config['n_epochs'])
        model = LSTMSentimentTagger(vocab_size=naive_vectorizer.vocab_size, classes=1,
                                    embedding_dim=config['embedding_dim'],
                                    hidden_dim=config['hidden_dim'],
                                    layers_count=config['layers_count'],
                                    lr=config['lr'],
                                    batch_size=config['batch_size'])
        trainer.fit(model)
        task.mark_completed()
    finally:
        # Always release the task so that the next Task.init starts a clean one,
        # even when training raised.
        task.close()



ClearML Task: overwriting (reusing) task id=17ebdc2df698445da7995cd5bd3723b9
2022-01-09 02:05:45,009 - clearml.Task - INFO - No repository found, storing script code instead
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/17ebdc2df698445da7995cd5bd3723b9/output/log


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.8 M 
2 | fc        | Linear    | 201   
----------------------------------------
6.2 M     Trainable params
0         Non-trainable params
6.2 M     Total params
24.645    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:47<00:00,  4.68it/s, loss=0.246, v_num=68]
ClearML Task: created new task id=a57664e9bfd4448aa652dfc581294632
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/a57664e9bfd4448aa652dfc581294632/output/log
2022-01-09 02:48:24,162 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.8 M 
2 | fc        | Linear    | 201   
----------------------------------------
6.2 M     Trainable params
0         Non-trainable params
6.2 M     Total params
24.645    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:47<00:00,  4.67it/s, loss=0.692, v_num=69]
ClearML Task: created new task id=15fa246119a84950bf3f6c2b5a0b8911
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/15fa246119a84950bf3f6c2b5a0b8911/output/log
2022-01-09 03:30:33,610 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.8 M 
2 | fc        | Linear    | 201   
----------------------------------------
6.2 M     Trainable params
0         Non-trainable params
6.2 M     Total params
24.645    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:47<00:00,  4.67it/s, loss=0.359, v_num=70]
ClearML Task: created new task id=c7f65b8fd064472cb2e1b961746f6f9c
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/c7f65b8fd064472cb2e1b961746f6f9c/output/log
2022-01-09 04:12:39,276 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.5 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.8 M     Trainable params
0         Non-trainable params
5.8 M     Total params
23.358    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:22<00:00,  5.47it/s, loss=0.256, v_num=71]
ClearML Task: created new task id=bb4b3a00bf9e4ff6916502bf53b9ec25
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/bb4b3a00bf9e4ff6916502bf53b9ec25/output/log
2022-01-09 04:48:33,383 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.5 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.8 M     Trainable params
0         Non-trainable params
5.8 M     Total params
23.358    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:23<00:00,  5.47it/s, loss=0.312, v_num=72]
ClearML Task: created new task id=f6e122608bb6457cb7e741559165663f
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/f6e122608bb6457cb7e741559165663f/output/log
2022-01-09 05:24:28,513 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.5 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.8 M     Trainable params
0         Non-trainable params
5.8 M     Total params
23.358    Total estimated model params size (MB)


Epoch 8:  83%|████████▎ | 652/782 [02:04<00:24,  5.25it/s, loss=0.459, v_num=73]

Retrying (Retry(total=237, connect=237, read=240, redirect=240, status=240)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fd8d69bf340>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /v2.13/events.add_batch


Epoch 14: 100%|██████████| 782/782 [02:23<00:00,  5.46it/s, loss=0.416, v_num=73]
ClearML Task: created new task id=ee8f7b6367a74a4698af33ce27dee6b3
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/ee8f7b6367a74a4698af33ce27dee6b3/output/log
2022-01-09 06:00:23,633 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.2 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.5 M     Trainable params
0         Non-trainable params
5.5 M     Total params
22.072    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:03<00:00,  6.35it/s, loss=0.187, v_num=74]
ClearML Task: created new task id=6b01a4a830264179b10c378bbee5660c
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/6b01a4a830264179b10c378bbee5660c/output/log
2022-01-09 06:31:20,313 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.2 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.5 M     Trainable params
0         Non-trainable params
5.5 M     Total params
22.072    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:02<00:00,  6.36it/s, loss=0.163, v_num=75]
ClearML Task: created new task id=20408828b72a4a11849e5cda4affe196
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/20408828b72a4a11849e5cda4affe196/output/log
2022-01-09 07:02:20,580 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 1.2 M 
2 | fc        | Linear    | 201   
----------------------------------------
5.5 M     Trainable params
0         Non-trainable params
5.5 M     Total params
22.072    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [02:03<00:00,  6.35it/s, loss=0.306, v_num=76]
ClearML Task: created new task id=0078856f809a439ea2a9e00d578cbd7d
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/0078856f809a439ea2a9e00d578cbd7d/output/log
2022-01-09 07:33:19,154 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 464 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.8 M     Trainable params
0         Non-trainable params
4.8 M     Total params
19.265    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:48<00:00,  7.23it/s, loss=0.317, v_num=77]
ClearML Task: created new task id=eeac41b7074f4dde90004fc2fdb4d40f
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/eeac41b7074f4dde90004fc2fdb4d40f/output/log
2022-01-09 08:00:43,506 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 464 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.8 M     Trainable params
0         Non-trainable params
4.8 M     Total params
19.265    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:49<00:00,  7.17it/s, loss=0.245, v_num=78]
ClearML Task: created new task id=4d48c50a4e534d14a51f0a2cc6575c55
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/4d48c50a4e534d14a51f0a2cc6575c55/output/log
2022-01-09 08:28:04,527 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 464 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.8 M     Trainable params
0         Non-trainable params
4.8 M     Total params
19.265    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:49<00:00,  7.12it/s, loss=0.554, v_num=79]
ClearML Task: created new task id=d16450f6cab342f386eddfacc77b9d26
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/d16450f6cab342f386eddfacc77b9d26/output/log
2022-01-09 08:55:30,961 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 384 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.942    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:43<00:00,  7.59it/s, loss=0.261, v_num=80]
ClearML Task: created new task id=2184bf82035c4cd79c442e1a1603e997
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/2184bf82035c4cd79c442e1a1603e997/output/log
2022-01-09 09:21:17,482 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 384 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.942    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:33<00:00,  8.33it/s, loss=0.131, v_num=81] 
ClearML Task: created new task id=095ebb60e20a446bb8e917192c14d127
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/095ebb60e20a446bb8e917192c14d127/output/log
2022-01-09 09:45:08,386 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 384 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.942    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:32<00:00,  8.44it/s, loss=0.38, v_num=82] 
ClearML Task: created new task id=e8a247d4782f4a20bdba4216ffe34e41
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/e8a247d4782f4a20bdba4216ffe34e41/output/log
2022-01-09 10:08:41,525 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 303 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.619    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:28<00:00,  8.88it/s, loss=0.256, v_num=83]
ClearML Task: created new task id=4dc00e12927d4ea7b66d4954f345f404
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/4dc00e12927d4ea7b66d4954f345f404/output/log
2022-01-09 10:30:24,061 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 303 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.619    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:21<00:00,  9.59it/s, loss=0.125, v_num=84] 
ClearML Task: created new task id=7c9c0a08d54d4575813b8b6c9b731728
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/7c9c0a08d54d4575813b8b6c9b731728/output/log
2022-01-09 10:50:53,694 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 303 K 
2 | fc        | Linear    | 101   
----------------------------------------
4.7 M     Trainable params
0         Non-trainable params
4.7 M     Total params
18.619    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:20<00:00,  9.75it/s, loss=0.393, v_num=85]
ClearML Task: created new task id=07c53b64dd1742129b1c23a8716c8ead
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/07c53b64dd1742129b1c23a8716c8ead/output/log
2022-01-09 11:11:16,668 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 266 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.470    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:56<00:00,  6.69it/s, loss=0.276, v_num=86]
ClearML Task: created new task id=12a3982eea6f43808d550b55fbed48eb
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/12a3982eea6f43808d550b55fbed48eb/output/log
2022-01-09 11:40:37,649 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 266 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.470    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:56<00:00,  6.72it/s, loss=0.14, v_num=87]  
ClearML Task: created new task id=f50bad4444a841a29af1962f45eb58a3
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/f50bad4444a841a29af1962f45eb58a3/output/log
2022-01-09 12:09:58,634 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 266 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.470    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:56<00:00,  6.71it/s, loss=0.452, v_num=88]
ClearML Task: created new task id=2287c5bbd3f94b4fa0d7998ef84e60b2
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/2287c5bbd3f94b4fa0d7998ef84e60b2/output/log
2022-01-09 12:39:18,591 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 220 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.288    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:32<00:00,  8.48it/s, loss=0.596, v_num=89]
ClearML Task: created new task id=ab9574c1877b41d780e13158deaa7f25
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/ab9574c1877b41d780e13158deaa7f25/output/log
2022-01-09 13:02:39,437 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 220 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.288    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:31<00:00,  8.51it/s, loss=0.134, v_num=90] 
2022-01-09 13:25:53,810 - clearml.Task - INFO - Waiting to finish uploads
2022-01-09 13:25:54,167 - clearml.Task - INFO - Finished uploading
ClearML Task: created new task id=1b305a21f6dd4db2b9c124d3c8e3e682
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/1b305a21f6dd4db2b9c124d3c8e3e682/output/log
2022-01-09 13:26:01,910 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 220 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.6 M     Trainable params
0         Non-trainable params
4.6 M     Total params
18.288    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:31<00:00,  8.50it/s, loss=0.489, v_num=91]
ClearML Task: created new task id=27a2a672ae7e4e93805683a11adf157e
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/27a2a672ae7e4e93805683a11adf157e/output/log
2022-01-09 13:50:14,673 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 174 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total params
18.106    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:17<00:00, 10.12it/s, loss=0.247, v_num=92]
ClearML Task: created new task id=b9c679bc2a354348994ce77710475ec4
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/b9c679bc2a354348994ce77710475ec4/output/log
2022-01-09 14:09:46,861 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 174 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total params
18.106    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:17<00:00, 10.08it/s, loss=0.117, v_num=93] 
ClearML Task: created new task id=1425eeb729474e2bb2dabd62eb5b6f22
ClearML results page: https://app.community.clear.ml/projects/b62c3994002846a68db49133adfa829f/experiments/1425eeb729474e2bb2dabd62eb5b6f22/output/log
2022-01-09 14:29:22,317 - clearml.Task - INFO - No repository found, storing script code instead


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | embedding | Embedding | 4.4 M 
1 | lstm      | LSTM      | 174 K 
2 | fc        | Linear    | 76    
----------------------------------------
4.5 M     Trainable params
0         Non-trainable params
4.5 M     Total params
18.106    Total estimated model params size (MB)


Epoch 14: 100%|██████████| 782/782 [01:18<00:00,  9.98it/s, loss=0.412, v_num=94]
2022-01-09 14:49:02,719 - clearml.Task - INFO - Waiting to finish uploads
2022-01-09 14:49:03,440 - clearml.Task - INFO - Finished uploading


In [None]:
# Save the vectorizer

# import pickle

# with open('vectorizer.p', 'wb') as f:
#     pickle.dump(naive_vectorizer, f)

In [41]:
# ClearML ids and local checkpoint of the best model (on val/test set) out of
# all training sessions.
BEST_TASK_ID = '4dc00e12927d4ea7b66d4954f345f404'
BEST_MODEL_ID = '79b4c50e2cd444b79b18e25a56efc6bb'
BEST_CHECKPOINT_PATH = './model.ckpt'

task = Task.get_task(BEST_TASK_ID)
clearml_model = Model(BEST_MODEL_ID)
# The weights are read from the local checkpoint file. The alternative that
# was tried is loading through the ClearML model URL:
#   LSTMSentimentTagger.load_from_checkpoint(clearml_model.url)
model = LSTMSentimentTagger.load_from_checkpoint(BEST_CHECKPOINT_PATH)

In [11]:
# Notebook display: echo the loaded module so its layer layout is shown below.
model

LSTMSentimentTagger(
  (embedding): Embedding(87028, 50)
  (lstm): LSTM(50, 100, num_layers=4, batch_first=True)
  (fc): Linear(in_features=100, out_features=1, bias=True)
)

In [12]:
# The Trainer is not used by the manual loop below; it is kept because later
# cells may rely on the name (creating it also prints the accelerator summary).
trainer = Trainer(gpus=1 if torch.cuda.is_available() else 0)

test_data_loader = DataLoader(test_set, model.batch_size, collate_fn=custom_collate_fn, num_workers=8)

# Measure accuracy of the loaded model over the whole test set.
model.eval()
correct_predictions = 0
all_predictions = 0
# Inference only: without autograd no computation graph is recorded for the
# forward passes, which saves memory and time.
with torch.no_grad():
    for batch in test_data_loader:
        seqs, lengths, labels = batch
        logits = model(seqs, lengths)
        # Accumulate as a Python int so the final ratio is computed in double
        # precision rather than in float32 tensor arithmetic.
        # NOTE(review): assumes get_number_of_correct_predictions returns a
        # scalar count (plain number or 0-d tensor) -- confirm.
        correct_predictions += int(get_number_of_correct_predictions(logits, labels))
        all_predictions += labels.shape[0]
        # One dot per processed batch as a lightweight progress indicator.
        print('.', end='')
print()
print(f"Model accuracy on test set: {(correct_predictions / all_predictions) * 100}%")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


.......................................................................................................................................................................................................................................................................................................................................................................................................
Model accuracy on test set: 87.37999725341797%


In [36]:
# Fetch the scalar series reported to ClearML by the selected task; they are
# indexed below as scalars[title][series] with 'x' and 'y' value lists.
scalars = task.get_reported_scalars()

def save_plot(scalars, key, out_dir='./images'):
    """Render one reported scalar series as a line chart and save it as PNG.

    Args:
        scalars: Nested mapping of reported scalars; the series is read from
            ``scalars[key][key]`` and must provide ``'x'`` and ``'y'`` entries.
        key: Name of the metric. Used as the lookup key, the plot title and
            the file name (``<key>.png``).
        out_dir: Directory the image is written to. Created if missing.

    Raises:
        KeyError: If ``key`` (or its ``'x'``/``'y'`` entries) is absent.
    """
    # write_image does not create parent directories, so make sure the target
    # directory exists before rendering.
    os.makedirs(out_dir, exist_ok=True)

    series = scalars[key][key]
    fig = px.line(x=series['x'], y=series['y'], title=key)
    fig.write_image(os.path.join(out_dir, f'{key}.png'))

# Export every tracked metric: validation curves first, then training curves,
# each at both per-epoch and per-step granularity.
for metric_key in (
    'val_correct_epoch',
    'val_correct_step',
    'val_loss_epoch',
    'val_loss_step',
    'train_correct_epoch',
    'train_correct_step',
    'train_loss_epoch',
    'train_loss_step',
):
    save_plot(scalars, metric_key)