# Assignment 5

Build CNN model for sentiment analysis (binary classification) of IMDB Reviews (https://www.kaggle.com/utathya/imdb-review-dataset). You can use data with label="unsup" for pretraining of embeddings. Here you are forbidden to use test dataset for pretraining of embeddings.
Your quality metric is accuracy score on test dataset. Look at "type" column for train/test split.
You can use pretrained embeddings from external sources.
You have to provide data for trials with different hyperparameter values.

You have to beat following baselines:
[3 points] acc = 0.75
[5 points] acc = 0.8
[8 points] acc = 0.9

[2 points] for using unsupervised data

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

idd = '1smuY3sJJ6wcL28i0QcBSlnEsimB5holu'
downloaded_ = drive.CreateFile({'id':idd}) 
downloaded_.GetContentFile('imdb_master.csv')

In [2]:
import pandas as pd 
import numpy as np
import spacy
from spacy.symbols import ORTH
import re
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, KeyedVectors

import torch
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator, Dataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.vocab import Vectors

SEED = 42
np.random.seed(SEED)

import nltk
nltk.download('stopwords')

spacy_en = spacy.load('en')
spacy_en.tokenizer.add_special_case("don't", [{ORTH: "do"}, {ORTH: "not"}])
spacy_en.tokenizer.add_special_case("didn't", [{ORTH: "did"}, {ORTH: "not"}]) #adding special case so that tokenizer("""don't""") != 'do'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#0. Preprocessing 

In [3]:
df = pd.read_csv('imdb_master.csv', sep=',', encoding= 'latin-1',  index_col=0)
df = df.drop(columns=['file'])
df.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [4]:
#for future embedding training
mask = df['type'] == 'train'
df_train_unsup = df[mask]
print(len(df_train_unsup))

75000


In [5]:
#Let's separate 'unsup' elements for now, but we will use them later
mask = df['label'] == 'unsup'
df_unsup = df[mask]
df = df[~mask]
len(df_unsup), len(df)

(50000, 50000)

In [6]:
#making sure that we don't have 'unsup' lables in test
mask = df_unsup['type'] == 'test'
len(df_unsup[mask])

0

In [7]:
#now we split our labled data
mask = df['type'] == 'train'
df_train = df[mask]
df_test = df[~mask]
len(df_train), len(df_test)

(25000, 25000)

In [0]:
df_train.to_csv("dataset_train.csv", index=False)
df_test.to_csv("dataset_test.csv", index=False)
df_unsup.to_csv("dataset_unsup.csv", index=False)

In [0]:
def tokenizer(text):
    return [tok.lemma_ for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]

In [10]:
#Using 'unsup'+'train' data to pretrain custom embeddings / no 'tets' data

tokenized_unsup = list(df_train_unsup['review'].apply(tokenizer))
w2v_model = Word2Vec(tokenized_unsup, size=100)
weights = torch.FloatTensor(w2v_model.wv.vectors)
w2v_model.wv.save_word2vec_format('pretrained_embeddings')
vectors = Vectors(name='pretrained_embeddings', cache='./')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
MAX_VOCAB_SIZE = 50000
classes={'neg': 0, 'pos': 1}


REVIEW = Field(sequential=True, include_lengths=False, batch_first=True, tokenize=tokenizer, pad_first=True, lower=True, eos_token='<eos>') 
LABEL = LabelField(dtype=torch.float, use_vocab=True, preprocessing=lambda x: classes[x])

train = TabularDataset('dataset_train.csv', 
                                format='csv', fields=[(None,None),('review', REVIEW),('label', LABEL)], 
                                skip_header=True)

test = TabularDataset('dataset_test.csv', 
                                format='csv', fields=[(None,None),('review', REVIEW),('label', LABEL)], 
                                skip_header=True)

dataset_unsup = TabularDataset('dataset_unsup.csv', 
                                format='csv', fields=[(None,None),('review', REVIEW), (None, None)], 
                                skip_header=True)

REVIEW.build_vocab(train, dataset_unsup, min_freq=2, vectors=vectors,
                   unk_init = torch.Tensor.normal_, max_size=MAX_VOCAB_SIZE) #we use 'unsup' data to build vocab/emb, but not test data
LABEL.build_vocab(train, dataset_unsup)
vocab = REVIEW.vocab

In [12]:
print('Vocab size:', len(REVIEW.vocab.itos))
REVIEW.vocab.itos[:10]

Vocab size: 50003


['<unk>', '<pad>', '<eos>', 'the', 'be', 'a', 'and', 'of', 'to', 'in']

In [0]:
#I tried to use train/test split with different proportions but the model is so overfiting that I decided to just train model on whole train dataset and test model after every epoch
#train, valid = train.split(0.95, stratified=True, random_state=np.random.seed(SEED))

In [13]:
print(train[0].review)
print(train[0].label)

['story', 'of', 'a', 'man', 'who', 'have', 'unnatural', 'feeling', 'for', 'a', 'pig', 'starts', 'out', 'with', 'a', 'open', 'scene', 'that', 'be', 'a', 'terrific', 'example', 'of', 'absurd', 'comedy', 'a', 'formal', 'orchestra', 'audience', 'be', 'turn', 'into', 'a', 'insane', 'violent', 'mob', 'by', 'the', 'crazy', 'chantings', 'of', '-pron-', 'singer', 'unfortunately', 'it', 'stay', 'absurd', 'the', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'make', 'it', 'just', 'too', 'off', 'putt', 'even', 'that', 'from', 'the', 'era', 'should', 'be', 'turn', 'off', 'the', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'to', 'a', '3', 'grader', 'on', 'a', 'technical', 'level', '-pron-', 'well', 'than', 'you', 'may', 'think', 'with', 'some', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'star', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'be', 'see', 'briefly']
0


# 1. MyModel

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels, padding_idx):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=padding_idx)
        self.embedding.weight.data.copy_(vocab.vectors)        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k) for k in kernels])
        #self.dropout = nn.Dropout(dropout)        
        self.fc = nn.Linear(hidden_size * len(kernels), 1)

        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.relu(z)
            z = F.max_pool1d(z, kernel_size=z.size(2)).squeeze(2)
            concatenated.append(z)
            
        x = torch.cat(concatenated, 1)
        #x = self.dropout(x)
        x = self.fc(x)
        return x

In [0]:
def create_model(batch_size, hidden_size, kernels):
    """
    Функция определяет модель по заданным гиперпараметрам и возвращает итераторы с заданным batch_size, а также оптимайзер и критерий
    """
    torch.cuda.empty_cache()    

    padding_idx = REVIEW.vocab.stoi[REVIEW.pad_token]
    
    model = MyModel(len(REVIEW.vocab.itos),
                    embed_size=100,
                    hidden_size=hidden_size,
                    kernels=kernels,
                    padding_idx = padding_idx
                )

    train_iterator, test_iterator = BucketIterator.splits(
        (train, test),
        batch_sizes=(batch_size, batch_size),
        shuffle=True,
        sort_key=lambda x: len(x.review),
    )

    optimizer = optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
    return model, train_iterator, test_iterator, optimizer, criterion

In [0]:
def accuracy_score(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    preds = (preds == y).float()
    accuracy = preds.sum() / len(preds)
    return accuracy

In [0]:
def test_model(model, test_iterator):
    test_acc = []

    with torch.no_grad():
        for item in test_iterator:
            x = item.review
            y = item.label
            preds = model(x).squeeze(1)
            test_acc.append(accuracy_score(preds, y))
    test_acc = np.mean(test_acc) 
    return print('Test accuracy: {}'.format(np.mean(test_acc)))

In [0]:
def train_cnn(model, train_iterator, test_iterator, criterion, device, n_epochs=20):
    
    history = []

    for epoch in range(n_epochs):
        train_loss = []
        train_acc = []
        model.train()

    
        for item in tqdm(train_iterator):
            x = item.review
            y = item.label
            optimizer.zero_grad()
            preds = model(x).squeeze(1)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.data.detach().item())
            train_acc.append(accuracy_score(preds, y))

        train_loss = np.mean(train_loss)
        train_acc = np.mean(train_acc)

        model.eval()
        test_model(model, test_iterator)


        print('Epoch: {}. Train loss: {:.3f}. Train accuracy: {:.3f}.'.format(
            epoch, train_loss, train_acc))        
        
        history.append({
            'epoch': epoch,
            'train_loss': train_loss,
            'train_acc': train_acc,
        })

        if epoch % 5 == 0:
            torch.save(model.state_dict(), '/content/model_test')

    return history

In [0]:
def clean_tqdm():
    for instance in list(tqdm._instances): 
        tqdm._decr_instances(instance)

In [0]:
def count_parameters(model):
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return print(f'The model has {params:,} trainable parameters')

## Hyperparams 

Попробуем руками посмотреть/подобрать гиперпараметры. Для этого создадим несколько моделей с разными наборами гиперпараметров и выберем.

Пойдем по жадному пути. Сначала выберем kernels, затем batch_size, затем hidden_state. 

In [24]:
#kernels = [2,3,4,5] - падает коллаб
clean_tqdm()
model, train_iterator, test_iterator, optimizer, criterion = create_model(64, 128, [2,3,4,5])
count_parameters(model)
history = train_cnn(model, train_iterator, test_iterator,
          criterion, device='cpu', n_epochs=2)

  0%|          | 0/391 [00:00<?, ?it/s]

The model has 5,180,525 trainable parameters


 48%|████▊     | 189/391 [02:30<02:30,  1.34it/s]

KeyboardInterrupt: ignored

In [25]:
#kernels = [2,3]
clean_tqdm()
model, train_iterator, test_iterator, optimizer, criterion = create_model(64, 128, [2,3])
count_parameters(model)
history = train_cnn(model, train_iterator, test_iterator,
          criterion,  device='cpu', n_epochs=2)


  0%|          | 0/391 [00:00<?, ?it/s]

The model has 5,064,813 trainable parameters


100%|██████████| 391/391 [02:19<00:00,  2.75it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8735293745994568
Epoch: 0. Train loss: 0.383. Train accuracy: 0.827.


100%|██████████| 391/391 [02:21<00:00,  2.81it/s]


Test accuracy: 0.8704763650894165
Epoch: 1. Train loss: 0.255. Train accuracy: 0.897.


In [26]:
#kernels = [2,3, 4]
clean_tqdm()
model, train_iterator,  test_iterator, optimizer,  criterion = create_model(64, 128, [2,3,4])
count_parameters(model)
history = train_cnn(model, train_iterator, test_iterator,
          criterion,  device='cpu', n_epochs=2)

  0%|          | 0/391 [00:00<?, ?it/s]

The model has 5,116,269 trainable parameters


100%|██████████| 391/391 [03:31<00:00,  1.75it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8779650926589966
Epoch: 0. Train loss: 0.374. Train accuracy: 0.830.


100%|██████████| 391/391 [03:37<00:00,  1.80it/s]


Test accuracy: 0.8923993110656738
Epoch: 1. Train loss: 0.238. Train accuracy: 0.905.


Возьмем kernels = [ 2,3,4] - коллаб не падает. метрики приличные

In [0]:
# now we check batch_size = 128 - падает коллаб
clean_tqdm()
model, train_iterator, test_iterator, optimizer,  criterion = create_model(128, 128, [2,3,4])
history = train_cnn(model, train_iterator, test_iterator,
          criterion, device='cpu', n_epochs=2)

In [27]:
#hidden_size = 64
clean_tqdm()
model, train_iterator, test_iterator, optimizer,  criterion = create_model(64, 64, [2,3,4])
history = train_cnn(model, train_iterator, test_iterator,
          criterion,  device='cpu', n_epochs=2)

100%|██████████| 391/391 [02:22<00:00,  2.46it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8799952268600464
Epoch: 0. Train loss: 0.392. Train accuracy: 0.820.


100%|██████████| 391/391 [02:20<00:00,  2.70it/s]


Test accuracy: 0.8656089901924133
Epoch: 1. Train loss: 0.253. Train accuracy: 0.897.


Берем hidden_size = 128

Итого, лучшая модель будет такая (я запустила все ячейки выше пару раз, потом уже пропускала этот этап, чтобы не тратить время, сразу брала данные параметры): 

*   batch_size: 64
*   hidden_size: 128
*   kernels: [2,3,4]





# 3. Training and evaluating our model

In [0]:
model , train_iterator,  test_iterator, optimizer,criterion = create_model(64, 128, [2,3,4])
model.embedding.weight.data.copy_(REVIEW.vocab.vectors);

In [22]:
count_parameters(model)
model

The model has 5,116,269 trainable parameters


MyModel(
  (embedding): Embedding(50003, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 128, kernel_size=(2,), stride=(1,))
    (1): Conv1d(100, 128, kernel_size=(3,), stride=(1,))
    (2): Conv1d(100, 128, kernel_size=(4,), stride=(1,))
  )
  (fc): Linear(in_features=384, out_features=1, bias=True)
)

In [23]:
#model.load_state_dict(torch.load('model_test (4)'))

<All keys matched successfully>

In [27]:
clean_tqdm()
history = train_cnn(model, train_iterator,test_iterator,
          criterion,  device='cpu', n_epochs=11)

100%|██████████| 391/391 [04:26<00:00,  1.37it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8792838454246521
Epoch: 0. Train loss: 0.391. Train accuracy: 0.819.


100%|██████████| 391/391 [04:23<00:00,  1.58it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8961237072944641
Epoch: 1. Train loss: 0.244. Train accuracy: 0.903.


100%|██████████| 391/391 [04:30<00:00,  1.02s/it]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8997601866722107
Epoch: 2. Train loss: 0.175. Train accuracy: 0.932.


100%|██████████| 391/391 [04:33<00:00,  1.11it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.8885709643363953
Epoch: 3. Train loss: 0.110. Train accuracy: 0.964.


100%|██████████| 391/391 [04:31<00:00,  1.17it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.891104519367218
Epoch: 4. Train loss: 0.058. Train accuracy: 0.988.


100%|██████████| 391/391 [04:33<00:00,  1.38it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.9000160098075867
Epoch: 5. Train loss: 0.025. Train accuracy: 0.998.


100%|██████████| 391/391 [04:33<00:00,  1.29it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.9013347029685974
Epoch: 6. Train loss: 0.011. Train accuracy: 1.000.


100%|██████████| 391/391 [04:32<00:00,  1.15it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.901702344417572
Epoch: 7. Train loss: 0.005. Train accuracy: 1.000.


100%|██████████| 391/391 [04:35<00:00,  1.36it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.90115886926651
Epoch: 8. Train loss: 0.003. Train accuracy: 1.000.


100%|██████████| 391/391 [04:34<00:00,  1.48it/s]
  0%|          | 0/391 [00:00<?, ?it/s]

Test accuracy: 0.9010789394378662
Epoch: 9. Train loss: 0.002. Train accuracy: 1.000.


100%|██████████| 391/391 [04:33<00:00,  1.38it/s]


Test accuracy: 0.9010229706764221
Epoch: 10. Train loss: 0.002. Train accuracy: 1.000.


In [32]:
test_accuracy = test_model(model, test_iterator)
test_accuracy

Test accuracy: 0.9010229706764221


#4. More 'unsup' data

Идея простая: Берем две модели (TextBlob и SentimentIntensityAnalyzer), смотрим что они предсказывают для unsup данных. Если предсказания совпадают, берем, если нет - выкидываем. Но оказалось, что вторая модель большинство текстов определяет как нейтральные. Поэтому родилась такая идея: взять предсказание модели, взять TextBlob и сравнить. Если совпадают - добавляем в обучение.

In [33]:
!pip install vaderSentiment

Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/86/9e/c53e1fc61aac5ee490a6ac5e21b1ac04e55a7c2aba647bb8411c9aadf24e/vaderSentiment-3.2.1-py2.py3-none-any.whl (125kB)
[K     |██▋                             | 10kB 25.5MB/s eta 0:00:01[K     |█████▏                          | 20kB 1.8MB/s eta 0:00:01[K     |███████▉                        | 30kB 2.6MB/s eta 0:00:01[K     |██████████▍                     | 40kB 1.7MB/s eta 0:00:01[K     |█████████████                   | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▋                | 61kB 2.5MB/s eta 0:00:01[K     |██████████████████▎             | 71kB 2.9MB/s eta 0:00:01[K     |████████████████████▉           | 81kB 3.3MB/s eta 0:00:01[K     |███████████████████████▍        | 92kB 3.7MB/s eta 0:00:01[K     |██████████████████████████      | 102kB 2.8MB/s eta 0:00:01[K     |████████████████████████████▋   | 112kB 2.8MB/s eta 0:00:01[K     |███████████████████████████████▎| 12

In [24]:
#from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
def SIA_fill(sentence):
    sentence = ' '.join(tokenizer(sentence))
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(sentence)
    neg = vs['neg']
    pos = vs['pos']
    score = 'pos' if score > 0 else 'neg'
    return score

In [0]:
def TextBlob_fill(sentence):
    blob = TextBlob(sentence)
    sentence = ' '.join(tokenizer(sentence))
    score = blob.sentences[0].sentiment.polarity
    score = 'pos' if score > 0 else 'neg'
    return score

In [0]:
def model_fill(model, unsup_iterator):
    model.eval()
    labels = []

    with torch.no_grad():
        for item in unsup_iterator:
            x = item.review
            preds = model(x).squeeze(1)
            preds = torch.round(torch.sigmoid(preds))
            labels.extend(preds.int().tolist())
    return labels

In [0]:
unsup_iterator = BucketIterator(
        dataset_unsup,
        batch_size=64,
        shuffle=False,
        sort_key=lambda x: len(x.review),
    )

In [0]:
labels_to_add = model_fill(model, unsup_iterator)
df_unsup['label_model'] = labels_to_add
df_unsup['label_textblob'] = df_unsup['review'].apply(TextBlob_fill) #TextBlob
#df_unsup['label2'] = df_unsup['review'].apply(SIA_fill) #SentimentIntensityAnalyzer

In [0]:
#Заполняем поле label лейблами, если они совпадают для модели и textblob
def compare_and_fill(version1, version2):
    if (version1 == 'neg' and  version2 == 0) or (version1 == 'pos' and  version2 == 1) :
        result = 'neg' if version1 is 'neg' else 'pos'
    else:
        result = 'different'
    return result

for i, row in df_unsup.iterrows():
    result = compare_and_fill(row.label_textblob, row.label_model)
    df_unsup.at[i, 'label'] = result
df_unsup.head()

Unnamed: 0,type,review,label,label_model,label_textblob
50000,train,"I admit, the great majority of films released ...",pos,1,pos
50001,train,"Take a low budget, inexperienced actors doubli...",neg,0,neg
50002,train,"Everybody has seen 'Back To The Future,' right...",different,0,pos
50003,train,Doris Day was an icon of beauty in singing and...,pos,1,pos
50004,train,"After a series of silly, fun-loving movies, 19...",different,1,neg


In [0]:
mask = df_unsup['label'] == 'different'
print(len(df_unsup[mask]))
print(len(df_unsup[~mask]))
df_unsup_to_work = df_unsup[~mask]

18539
31461


In [0]:
#31к лейблов совпадают, будем работ
mask = df_unsup_to_work['label'] == 'pos'
print(len(df_unsup_to_work[mask]))
print(len(df_unsup_to_work[~mask]))
df_unsup_to_work.head()

16515
14946


Unnamed: 0,type,review,label,label_model,label_textblob
50000,train,"I admit, the great majority of films released ...",pos,1,pos
50001,train,"Take a low budget, inexperienced actors doubli...",neg,0,neg
50003,train,Doris Day was an icon of beauty in singing and...,pos,1,pos
50005,train,"This isn't exactly a musical, but it almost se...",pos,1,pos
50007,train,In the 1950's there were many film boigraphies...,pos,1,pos


Баланс классов нормальный. Работаем.

In [0]:
df_unsup_to_work = df_unsup_to_work.drop(columns=['label_model', 'label_textblob'])
df_unsup_to_work.head()

Unnamed: 0,type,review,label
50000,train,"I admit, the great majority of films released ...",pos
50001,train,"Take a low budget, inexperienced actors doubli...",neg
50003,train,Doris Day was an icon of beauty in singing and...,pos
50005,train,"This isn't exactly a musical, but it almost se...",pos
50007,train,In the 1950's there were many film boigraphies...,pos


In [0]:
df_unsup_to_work.to_csv("unsup_labels.csv", index=False)

In [0]:
dataset_unsup = TabularDataset('unsup_labels.csv', 
                                format='csv', fields=[(None, None), ('review', REVIEW), ('label', LABEL)], 
                                skip_header=True)

In [0]:
#let's concatenate train and unsup data
ds_concat  = train + dataset_unsup
list_of_ex = [x for x in ds_concat]
new_ds = Dataset(list_of_ex, [('review', REVIEW), ('label', LABEL)])

In [0]:
unsup_iterator = BucketIterator(
        new_ds,
        batch_size=64,
        shuffle=True,
        sort_key=lambda x: len(x.review),
    )

In [33]:
clean_tqdm()
history = train_cnn(model, unsup_iterator,  test_iterator,
          criterion,  device='cpu', n_epochs=6)

 65%|██████▍   | 573/883 [05:13<02:57,  1.74it/s]

KeyboardInterrupt: ignored

In [0]:
#У меня падает коллаб. но 0.9 уже есть и unsup data уже используется

In [36]:
test_accuracy = test_model(model, test_iterator)
test_accuracy

Test accuracy: 0.9010229706764221
