### Assignment 2: Text Classification

In [1]:
# %load_ext autoreload
# %autoreload 2
import torch
import gc
import pickle

from random import seed
from torch.utils.data import DataLoader
from lstm_data_preprocessing import TextDataset
from lstm_model import RecurrentClassifier
from lstm_trainer import Trainer

In [2]:
# Seed every random number generator this notebook relies on.
# `random.seed` alone does not touch PyTorch's generator, which drives
# weight initialisation, dropout and DataLoader shuffling, so torch is
# seeded with the same value to make runs repeatable.
SEED = 4
seed(SEED)
torch.manual_seed(SEED)

Toxicity, such as insults, threats and hate speech, in online conversations is a real threat to the productive sharing of opinions. To mitigate this problem, an automatic comment-filtering system may be applied.
In this assignment you are provided with data collected by the [Jigsaw](https://jigsaw.google.com/) company from Wikipedia’s talk page edits. Each comment was labeled with a toxicity rating from 0 to 5. Here are some examples of the least toxic comments.

The data is then split into training and validation parts.

In [3]:
def _load_text_dataset(pickle_path):
    """Read a pickled collection of records and flatten it into a TextDataset.

    Each record is indexed by the keys 'summarized_ids' and
    'sentences_embeddings'; the values under each key are concatenated
    across all records, in record order, and passed to ``TextDataset`` as
    (labels, embeddings).

    Args:
        pickle_path: Path of the pickle file to read.

    Returns:
        A ``TextDataset`` built from the flattened labels and embeddings.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at files you produced yourself or otherwise trust.
    with open(pickle_path, 'rb') as f:
        texts = pickle.load(f)

    labels = [label for item in texts for label in item['summarized_ids']]
    embeddings = [emb for item in texts for emb in item['sentences_embeddings']]
    # `texts` is released when this function returns, so the raw records do
    # not stay alive next to the dataset built from them.
    return TextDataset(labels, embeddings)


train_dataset = _load_text_dataset('data/train_cls.pkl')
# Reclaim the raw training records before the validation file is read.
gc.collect()

val_dataset = _load_text_dataset('data/val_cls.pkl')
gc.collect()

88

In [4]:
# Build one 2-D matrix whose i-th row is the i-th sentence embedding of the
# training set.
sentence_embeddings = list(train_dataset.sentences_embeddings)

for emb in sentence_embeddings:
    # isinstance rather than an exact type comparison, so Tensor subclasses
    # are accepted as well.
    assert isinstance(emb, torch.Tensor), f'type of emb: {type(emb)}'

# torch.stack copies every row in a single call instead of one Python-level
# assignment per sentence. The cast keeps the dtype a torch.zeros buffer would
# have had; it is a no-op when the embeddings already use the default dtype.
emb_matrix = torch.stack(sentence_embeddings).to(torch.get_default_dtype())
print(f'emb_matrix.shape: {emb_matrix.size()}')

emb_matrix.shape: torch.Size([1985671, 768])


Then pretrained embeddings are obtained with Gensim, which will automatically download them for you. [Here](https://github.com/RaRe-Technologies/gensim-data#models) you can see other pretrained embeddings.

Now let's define the hyperparameters for our baseline model. It will be a 2-layer bidirectional LSTM.

In [6]:
# Model hyperparameters; the whole mapping is handed to RecurrentClassifier.
config = dict(
    freeze=True,
    cell_type="LSTM",
    cell_dropout=0.5,
    num_layers=2,
    hidden_size=300,
    out_activation="tanh",
    bidirectional=True,
    out_dropout=0.5,
    out_sizes=[300],
)

# Training settings; "batch_size" is also read when the DataLoaders are built.
trainer_config = dict(
    lr=3e-4,
    n_epochs=30,
    weight_decay=1e-6,
    batch_size=256,
    # Use the GPU when one is visible to torch, otherwise fall back to the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# The classifier receives the hyperparameters together with the embedding matrix.
clf_model = RecurrentClassifier(config, emb_matrix)

In [7]:
# Report which device the training configuration resolved to.
print(trainer_config["device"])

cpu


In [8]:
# Batch iterators: the training loader shuffles, the validation loader keeps
# the dataset order. Each one uses its own dataset's collate_fn.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=trainer_config["batch_size"],
    shuffle=True,
    num_workers=2,
    collate_fn=train_dataset.collate_fn,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=trainer_config["batch_size"],
    shuffle=False,
    num_workers=2,
    collate_fn=val_dataset.collate_fn,
)
print(train_dataloader, val_dataloader, sep="\n")

t = Trainer(trainer_config)
print(t)
# Left as the final expression so the notebook displays whatever fit returns.
t.fit(clf_model, train_dataloader, val_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x000001C53ECF8C10>
<torch.utils.data.dataloader.DataLoader object at 0x000001C53ECF8280>
<trainer.Trainer object at 0x000001C53ECF8B20>
epoch №1


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.27; Acc:0.926
epoch №2


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №3


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №4


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №5


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №6


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №7


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №8


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №9


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №10


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №11


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №12


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №13


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №14


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №15


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №16


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №17


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №18


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №19


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №20


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №21


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №22


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.266; Acc:0.926
epoch №23


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №24


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №25


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №26


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №27


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №28


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №29


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926
epoch №30


  0%|          | 0/7757 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

Loss=0.265; Acc:0.926


RecurrentClassifier(
  (cell): LSTM(768, 300, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (out_dropout): Dropout(p=0.5, inplace=False)
  (out_proj): Sequential(
    (0): Linear(in_features=1200, out_features=300, bias=True)
    (1): Linear(in_features=300, out_features=2, bias=True)
  )
)