10. Во время обучения, ячейки декодера получают на вход ожидаемый вывод прошлого шага, для повышеной точности обучения, то есть используется внешняя информация в виде правильных выходных последовательностей.
Во время тестирования и во время вывода, внешней информации у нас нет и на вход декодеру подается результаты перевода прошлого шага, то есть модель целиком работает независимо.
Появляющаяся проблема: так как выходная последовательность сильно зависит от контекста, то велик шанс, что в длинных некачественный вывод где-то в середине приведет к сильным ошибкам в конце последовательности, из-за потери правильного контекста.

In [0]:
import pandas as pd
import numpy as np
import sklearn
import spacy

SEED=1337
np.random.seed(SEED)

In [4]:
!unzip "exam_data.zip"

Archive:  exam_data.zip
  inflating: test.csv                
  inflating: train.csv               


In [5]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train['target'] = (df_train['target'] == 5).astype(np.int)
df_test['target'] = (df_test['target'] == 5).astype(np.int)

df_train.shape

(48192, 3)

In [6]:
import nltk
import re
from nltk.stem import WordNetLemmatizer 
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
def prep_df(df):
    toks = []
    reviews = []
    for r in df['review']:
        tokens = nltk.word_tokenize(r)
        l = []
        for t in tokens:
            tnew = t.lower()
            tnew = re.sub(r'[^\w\s]','',tnew)
            if tnew not in stoplist:
                l.append(lemmatizer.lemmatize(tnew))
        toks.append(' '.join(l))

    df['tokens'] = toks


    df = df.drop(columns=['title'])
    df = df.drop(columns=['review'])


    label = []
    ls = []
    for t in df['target']:
        if int(t) == 5:
            label.append('pos')
        else:
            label.append('neg')
    df['label'] = label 
    
    df = df.drop(columns=['target'])
    return df

In [0]:
df_train = prep_df(df_train)

In [9]:
df_train.head()

Unnamed: 0,tokens,label
0,staff friendly breakfast nice extremely comf...,neg
1,excellent service approachable professional s...,neg
2,really top notch place spend day beginning end...,neg
3,little noisy false fire alarm midnight reaso...,neg
4,place many animal allergic pet although recei...,neg


In [0]:
df_test = prep_df(df_test)

In [11]:
df_test.head()

Unnamed: 0,tokens,label
0,old town stayed hotel mom visit renovation y...,neg
1,coming ocean park inn year usually book sever...,neg
2,perfect place quick get away queen room share...,neg
3,room best however good one night continuing t...,neg
4,sou le motif dune priode hivernale inacceptab...,neg


In [0]:
classes = ('pos', 'neg')

In [0]:
from torchtext import data

In [0]:
class DatasetFromDataFrame(data.Dataset):
    def __init__(self, path, text_field, label_field, col, gt, dfs, **kwargs):
        fields = [("text", text_field), ("label", label_field)]
        examples = []
        shape_ = dfs[path].values[:,1].shape[0]
        for i in tqdm(range(shape_), total=shape_, desc=f"Example:"):
            text = dfs[path][col].iloc[i]
            label = dfs[path][gt].iloc[i]
            examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

In [0]:
import torch
from tqdm import tqdm

In [0]:
TEXT = data.Field(tokenize='spacy', batch_first=True, lower=True)
LABEL = data.LabelField(dtype=torch.float)

In [23]:
dfs = {0: df_train, 1: df_test}
train_dataset = DatasetFromDataFrame(0, TEXT, LABEL, 'tokens', 'label', dfs)
test_dataset = DatasetFromDataFrame(1, TEXT, LABEL, 'tokens', 'label', dfs)

Example:: 100%|██████████| 48192/48192 [00:14<00:00, 3367.09it/s]
Example:: 100%|██████████| 5355/5355 [00:01<00:00, 3429.23it/s]


In [0]:
phrases = [token.split(' ') for token in list(df_train.tokens)]

In [0]:
EMBEDDING_DIM = 100
MAX_VOCAB = 49347
BATCH_SIZE = 100
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
from gensim.models import Word2Vec, KeyedVectors

In [27]:
w2v_model = Word2Vec(phrases, size=EMBEDDING_DIM)
weights = torch.FloatTensor(w2v_model.wv.vectors)
w2v_model.wv.save_word2vec_format('w2v_embeddings')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
for instance in list(tqdm._instances):
    tqdm._decr_instances(instance)

In [0]:
from torchtext.vocab import Vectors

In [30]:
vectors = Vectors(name='w2v_embeddings', cache='./')

TEXT.build_vocab(
    train_dataset,
    vectors=vectors,  # "glove.6B.100d"
    max_size=MAX_VOCAB,
    unk_init=torch.Tensor.normal_
)

LABEL.build_vocab(train_dataset)

  0%|          | 0/10824 [00:00<?, ?it/s]Skipping token b'10824' with 1-dimensional vector [b'100']; likely a header
 99%|█████████▉| 10693/10824 [00:00<00:00, 15038.97it/s]


In [0]:
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_dataset, test_dataset), batch_size=BATCH_SIZE, device=device
)

In [0]:
import torch.nn as nn


class NeuralNet(nn.Module):
    def __init__(self, vocab_size, pad_idx, num_filters, filter_sizes):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, EMBEDDING_DIM, padding_idx=pad_idx)

        self.convolutions = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=1,
                    out_channels=num_filters,
                    kernel_size=(filter_size, EMBEDDING_DIM)
                )
                for filter_size in filter_sizes
            ]
        )

        self.linear = nn.Linear(len(filter_sizes) * num_filters, 1)

    def forward(self, text):

        embeddings = self.embedding(text)


        embeddings = embeddings.unsqueeze(1)

        convolved = [F.relu(convolution(embeddings)).squeeze(3) for convolution in self.convolutions]

        pooled = [F.max_pool1d(convolved_, convolved_.shape[2]).squeeze(2) for convolved_ in convolved]

        cat = torch.cat(pooled, dim=1)

        result = self.linear(cat)

        return result

In [0]:
VOCAB_SIZE = len(TEXT.vocab)
NUM_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = NeuralNet(VOCAB_SIZE, PAD_IDX, NUM_FILTERS, FILTER_SIZES)

In [0]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings);

In [0]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

In [0]:
from sklearn.metrics import f1_score 

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        y_preds = model(batch.text).squeeze(1)
        loss = criterion(y_preds, batch.label)
        epoch_loss += loss.item()
        y_preds = torch.round(torch.sigmoid(y_preds))
        epoch_acc += f1_score(batch.label.cpu().detach().numpy(), y_preds.cpu().detach().numpy()).item()
        loss.backward()
        optimizer.step()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            y_preds = model(batch.text).squeeze(1)
            loss = criterion(y_preds, batch.label)
            epoch_loss += loss.item()
            y_preds = torch.round(torch.sigmoid(y_preds))
            epoch_acc += f1_score(batch.label.cpu().detach().numpy(), y_preds.cpu().detach().numpy()).item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def train_and_evaluate_model(model, num_epochs=1):
    best_valid_loss = float('inf')
    model = model.to(device)

    # train model
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        print(f'Epoch: {epoch + 1}')
        print(f'train loss: {train_loss:.3f}, f1: {train_acc*100:.2f}%')

    # evaluate model
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'test loss: {test_loss:.3f}, f1: {test_acc*100:.2f}%')

In [111]:
train_and_evaluate_model(model)

  average, "true nor predicted", 'F-score is', len(true_sum)


Epoch: 1
train loss: 0.004, f1: 0.00%


RuntimeError: ignored

In [112]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'test loss: {test_loss:.3f}, f1: {test_acc*100:.2f}%')

RuntimeError: ignored