Для работы с датасетом используйте файл imdb_fixed.csv

Файл можно скачать по ссылке: https://drive.google.com/file/d/1cnu9mqWoaK4QESxQLKC-MHzcA8Q8OOzf/view

Он слишком большой для выкладывания в папку.

In [2]:
import pandas as pd
import numpy as np
# from sklearn.externals import joblib
import nltk
nltk.download('stopwords')
# import gensim
import spacy

# from sklearn import metrics

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Iterator, RawField

SEED = 42
np.random.seed(SEED)

import spacy


spacy_en = spacy.load('en_core_web_sm')

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text) if tok.text.isalpha()]


classes={
    'neg':0,
    'unsup':1,
    'pos':2
}

TEXT = Field(include_lengths=False, batch_first=True,
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True,
             stop_words=nltk.corpus.stopwords.words('english'))
LABEL = LabelField(dtype=tt.int64, use_vocab=True, preprocessing=lambda x: classes[x])
TYP = Field()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
from google.colab import drive

In [4]:
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import os
os.chdir('gdrive/My Drive/Colab Notebooks')

In [0]:
dataset = TabularDataset(
    'imdb_fixed.csv', format='csv',
    fields=[
        (None, None),
        ('type', TYP),
        ('review', TEXT),
        ('label', LABEL),
        (None, None)
    ],
    skip_header=True)

In [7]:
from torchtext.data import Dataset

TEXT.build_vocab(dataset, min_freq=5)
print(len(TEXT.vocab.itos))
print(TEXT.vocab.itos[:10])
LABEL.build_vocab(dataset)
TYP.build_vocab(dataset)

train_ds = Dataset(dataset.examples, dataset.fields, filter_pred=lambda x: 'train' in x.type)
test = Dataset(dataset.examples, dataset.fields, filter_pred=lambda x: 'test' in x.type)
train_ds, valid = train_ds.split(0.7, stratified=True)

print(np.unique([x.label for x in train_ds.examples], return_counts=True))
print(np.unique([x.label for x in valid.examples], return_counts=True))
print(np.unique([x.label for x in test.examples], return_counts=True))

52275
['<unk>', '<pad>', '<eos>', 'movie', 'film', 'one', 'like', 'good', 'would', 'even']
(array([0, 1, 2]), array([ 8750, 35000,  8750]))
(array([0, 1, 2]), array([ 3750, 15000,  3750]))
(array([0, 2]), array([12500, 12500]))


In [0]:
device = tt.device('cuda' if tt.cuda.is_available() else 'cpu')

class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, kernels):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.convs = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, k, padding=5) for k in kernels])
        
        self.fc = nn.Linear(hidden_size * len(kernels), 3)
        
    def forward(self, x):
        
        x = self.embedding(x)
        x = x.transpose(1,2)
        
        concatenated = []
        for conv in self.convs:
            z = conv(x)
            z = F.avg_pool1d(z, kernel_size=z.size(2))
            z = z.squeeze(2)
            concatenated.append(z)
            
        x = tt.cat(concatenated, 1)
        x = self.fc(x)
        return x

tt.cuda.empty_cache()

batch_size = 32

model = MyModel(len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
                kernels=[2,3,4,5]
               )

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_ds, valid, test),
    batch_sizes=(batch_size, batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.review),
#     sort_within_batch=True,
    device=device,
)

model = model.to(device)
optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = tt.round(tt.sigmoid(preds)).t()

    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.review).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with tt.no_grad():
    
        for batch in iterator:

            predictions = model(batch.review).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [10]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        tt.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 10m 37s
	Train Loss: 0.852 | Train Acc: 1483.42%
	 Val. Loss: 0.982 |  Val. Acc: 1145.03%
Epoch: 02 | Epoch Time: 10m 38s
	Train Loss: 0.755 | Train Acc: 1342.29%
	 Val. Loss: 1.097 |  Val. Acc: 1172.21%
Epoch: 03 | Epoch Time: 10m 41s
	Train Loss: 0.677 | Train Acc: 1344.46%
	 Val. Loss: 1.214 |  Val. Acc: 1231.96%
Epoch: 04 | Epoch Time: 10m 36s
	Train Loss: 0.589 | Train Acc: 1375.64%
	 Val. Loss: 1.385 |  Val. Acc: 1276.09%
Epoch: 05 | Epoch Time: 10m 40s
	Train Loss: 0.512 | Train Acc: 1405.53%
	 Val. Loss: 1.631 |  Val. Acc: 1282.91%


In [13]:
model.load_state_dict(tt.load('tut4-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 1.175 | Test Acc: 1007.16%
