# Augmentation API usage

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install --upgrade torch
!pip install --upgrade wandb
!pip install --upgrade catalyst
!pip install --upgrade torchtext
!wandb login c54b2fcb6b8ca2808f5be303a8a3b6e464f52cca

Requirement already up-to-date: torch in /usr/local/lib/python3.6/dist-packages (1.4.0)
Requirement already up-to-date: wandb in /usr/local/lib/python3.6/dist-packages (0.8.32)
Requirement already up-to-date: catalyst in /usr/local/lib/python3.6/dist-packages (20.4.1)
Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.5.0)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [3]:
import pandas as pd
import numpy as np
import os
from google.cloud import translate_v2 as translate
import six
import torch
from catalyst import dl
import torch.nn as nn
import pdb
from torch import cuda
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
import sys
import torchtext
from torchtext import data
from torchtext.data import BPTTIterator, BucketIterator, Iterator
from torchtext import datasets
import torch.optim as O
from tqdm import tqdm
import datetime
import time
import catalyst
import wandb
from torch.utils.data import Dataset, DataLoader, TensorDataset

DEVICE = 'cuda'


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject


numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject

alchemy not available, to install alchemy, run `pip install alchemy-catalyst`.


In [4]:
print("PyTorch Version:", torch.__version__)
print("Catalyst Version:", catalyst.__version__)
print("Wandb Version:", wandb.__version__)

PyTorch Version: 1.4.0
Catalyst Version: 20.04.1
Wandb Version: 0.8.32


In [0]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "drive/My Drive/text-augmentation/text-augmentation.json"

In [0]:
def make_translation(text, target='ru'):
    translate_client = translate.Client()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    result = translate_client.translate(
        text, target_language=target)

    return result['translatedText'], result['detectedSourceLanguage']

In [0]:
print(make_translation("yes, this example works fine"))

In [0]:
translate_client = translate.Client()
source_language = 'en'
target_language = 'ru'
def back_translation(text):
    global translate_client, target_language, source_lanaguage

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    forward = translate_client.translate(text, target_language=target_language)
    res = translate_client.translate(forward['translatedText'], target_language=source_language)
    return res['translatedText']

In [0]:
print(back_translation("Complicated sentence in order to transform its representation but not meaning"))

# Model description

In [0]:
class BiLstm(nn.Module):
    def __init__(self, vocab_size, h_size, n_layers, dropout, padding_idx):
        super(BiLstm, self).__init__()
        self.embedding = nn.Embedding(vocab_size, h_size, padding_idx=padding_idx, scale_grad_by_freq=True)
        self.lstm = nn.LSTM(h_size, h_size, n_layers, bidirectional=True, dropout=dropout)
        self.out = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(2 * 4 * n_layers * h_size, h_size * 3),
            nn.BatchNorm1d(h_size * 3),
            nn.PReLU(),
            nn.Dropout(dropout),
            nn.Linear(h_size * 3, h_size * 3),
            nn.BatchNorm1d(h_size * 3),
            nn.PReLU(),
            nn.Dropout(dropout),
            nn.Linear(h_size * 3, 3)
        )

        self.init_weights()

    def forward(self, batch):
        prem, hyp = batch
        prem, prem_len = prem
        hyp, hyp_len = hyp

        prem, hyp = self.embedding(prem), self.embedding(hyp)

        # optimize perfomance of RNN by omitting padding token
        prem = nn.utils.rnn.pack_padded_sequence(input=prem, lengths=prem_len, enforce_sorted=False)
        hyp = nn.utils.rnn.pack_padded_sequence(input=hyp, lengths=hyp_len, enforce_sorted=False)
        _, h_prem = self.lstm(prem)
        _, h_hyp = self.lstm(hyp)

        h_prem = torch.cat(h_prem, dim=-1)
        h_prem = h_prem.permute(1,0,2)
        h_prem = h_prem.contiguous().view(h_prem.size(0),-1)

        h_hyp = torch.cat(h_hyp, dim=-1)
        h_hyp = h_hyp.permute(1,0,2)
        h_hyp = h_hyp.contiguous().view(h_hyp.size(0),-1)

        h = torch.cat([h_prem, h_hyp],dim=-1)
        h = h.contiguous().view(h.size(0), -1)
        
        return self.out(h)
 
    def init_weights(self):
        d = self.embedding.weight.size(1)
        nn.init.uniform_(self.embedding.weight, -1.0 / np.sqrt(d), 1.0 / np.sqrt(d))

# Training with augmentation

In [0]:
TEXT = torchtext.data.Field(lower=True, tokenize='spacy', include_lengths=True)
LABEL = torchtext.data.LabelField(sequential=False, is_target=True)

train, valid, test = datasets.SNLI.splits(TEXT, LABEL)

In [17]:
print("Example properties:", dir(train.examples[0]))

Example properties: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'fromCSV', 'fromJSON', 'fromdict', 'fromlist', 'fromtree', 'hypothesis', 'label', 'premise']


In [0]:
used_indices = set()

In [19]:
import numpy as np
import pandas as pd

data = np.full((0, 3), "")

sent1 = "some sent 1"
sent2 = "some sent 2"
appendix = np.array([sent1, sent2, "neutral"])
data = np.vstack((data, appendix.reshape((1, -1))))
print(data)

data = np.full((0, 3), "")

[['some sent 1' 'some sent 2' 'neutral']]


# Augmentation implementation (don't run if you have csv)

In [0]:
def save_in_csv(data):
    save_augmented_objects = pd.DataFrame(data=data, columns=['premise', 'hypothesis', 'label'])
    save_augmented_objects.to_csv("drive/My Drive/text-augmentation/augmented.csv", index=False)
    return

In [0]:
# Unite two datasets: source and augmented
ALREADY_SAVED = 15205
AUGMENTED_RATIO = 0.1
count_augmented_objects = 0
bad_examples = 0
print("Remained to load:", AUGMENTED_RATIO * len(train.examples) - ALREADY_SAVED)
start = time.time()
while count_augmented_objects < AUGMENTED_RATIO * len(train.examples) - ALREADY_SAVED:
    if count_augmented_objects % 10 == 5:
        print(data.shape[0])
    if count_augmented_objects % 100 == 5:
        print("Successful augmented objects, bad examples:", count_augmented_objects, bad_examples)
        print("Done in %.2f" % (time.time() - start))
        start = time.time()
        print("5 random saved objects:")
        indices = np.random.choice(np.arange(data.shape[0]), size=5)
        print(data[indices, :])
        print("size of set:", len(used_indices))
        save_in_csv(data)

    rand_ind = int(np.random.random() * len(train.examples))
    while rand_ind in used_indices:
        rand_ind = int(np.random.random() * len(train.examples))
    used_indices.add(rand_ind)
    premise, hypothesis, label = train.examples[rand_ind].hypothesis, train.examples[rand_ind].premise, \
                                 train.examples[rand_ind].label
    joined_premise, joined_hypothesis = ' '.join(premise), ' '.join(hypothesis)
    back_premise, back_hypothesis = back_translation(joined_premise), back_translation(joined_hypothesis)
    if joined_premise != back_premise and joined_hypothesis != back_hypothesis:
        count_augmented_objects += 1
        new_example = torchtext.data.example.Example()
        new_example.premise = back_premise.split()
        new_example.hypothesis = back_hypothesis.split()
        new_example.label = label
        train.examples.append(new_example)
        appendix = np.array([back_premise, back_hypothesis, label]).reshape((1, -1))
        data = np.vstack((data, appendix))
    else:
        bad_examples += 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
25305
Successful augmented objects, bad examples: 25305 1694
Done in 46.43
5 random saved objects:
[['The boy first tries mountain biking.'
  'A young Caucasian man in shorts, a T-shirt, helmet and sunglasses rides a mountain bike down a heavily wooded hill.'
  'neutral']
 ['girl drinks her soda in the aquarium.'
  'girl drinks from straw, sitting near a glass of dolphinarium.'
  'neutral']
 ['baby sleeps on the street.'
  'a small child in a stocking and a bright jacket runs away from the bench.'
  'contradiction']
 ['children read calmly inside the building.'
  'Children play ball on the street in front of the building.'
  'contradiction']
 ['the ball is light.' 'young boys play ball on the beach.' 'neutral']]
size of set: 43257
25315
25325
25335
25345
25355
25355
25365
25375
25385
25395
25405
Successful augmented objects, bad examples: 25405 1705
Done in 46.26
5 random saved objects:
[['old people holding hands at the 

KeyboardInterrupt: ignored

In [0]:
save_augmented_objects = pd.DataFrame(data=data, columns=['premise', 'hypothesis', 'label'])
save_augmented_objects.to_csv("drive/My Drive/text-augmentation/augmented.csv", index=False)

# Load saved augmented data

In [20]:
df = pd.read_csv("drive/My Drive/text-augmentation/translate-augmented.csv")
print(df.sample(5))

                                                 premise  ...          label
11413  ['two', 'men', 'are', 'talking', 'to', 'each',...  ...        neutral
31444     ['surfer', 'sunbathes', 'on', 'the', 'beach.']  ...  contradiction
42145                           ['man', 'is', 'sitting']  ...     entailment
53434  ['A', 'man', 'jumps', 'into', 'a', 'dirty', 'p...  ...        neutral
1862                    ['nobody', 'holds', 'equipment']  ...  contradiction

[5 rows x 3 columns]


### Fulfil train object

In [21]:
for index, row in tqdm(df.iterrows()):
    new_example = torchtext.data.example.Example()
    new_example.premise = row.premise
    new_example.hypothesis = row.hypothesis
    new_example.label = row.label
    train.examples.append(new_example)

57153it [00:11, 4995.09it/s]


# Build vocabulary & train on augmented data

In [0]:
TEXT.build_vocab(train, valid, min_freq=5)
LABEL.build_vocab(train)

In [23]:
print("Final size of data:", len(train.examples))

Final size of data: 606520


In [0]:
# Simple wrapper to join torchtext and catalyst API

class IteratorWrapper(DataLoader):
    __initialized__ = False

    def __init__(self, iter: iter, augmented_ratio):
        self.batch_size = iter.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iter
        self.batch_sampler = iter
        self.__initialized__ = True

    def __iter__(self):
        return map(lambda batch: {
            'features': (batch.premise, batch.hypothesis),
            'targets': batch.label,
        }, self.batch_sampler.__iter__())

    def __len__(self):
        return len(self.batch_sampler)

In [0]:
# example of your hyperparameters 
# keep them as global vars or in the 'config' dict
n_layers = 4
h_size = 128
embed_dim = 512
num_epochs = 10
dropout = 0.1
batch_size = 512 # BatchNorm do not work properly with small batch sizes
augmented_ratio = 0.1
vocab_size = len(TEXT.vocab)

train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train, valid, test), batch_size=batch_size, device=DEVICE)
train_iter = IteratorWrapper(train_iter, augmented_ratio=augmented_ratio)
valid_iter = IteratorWrapper(valid_iter, augmented_ratio=augmented_ratio)
test_iter = IteratorWrapper(test_iter, augmented_ratio=augmented_ratio)
 
loaders = {'train': train_iter, 'valid': valid_iter}

model = BiLstm(
    vocab_size=vocab_size,
    h_size=h_size,
    n_layers=n_layers, 
    dropout=dropout,
    padding_idx=TEXT.vocab.stoi[TEXT.pad_token],
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, verbose=True, patience=2, factor=0.5)

In [0]:
logdir='wandb-log-dir-augmented'

In [27]:
runner = dl.SupervisedRunner(DEVICE)
runner.train(model=model, 
             loaders=loaders,
             num_epochs=num_epochs,
             logdir=logdir,
             criterion=nn.CrossEntropyLoss(),
             optimizer=optimizer, 
             scheduler=scheduler,  
             callbacks=[
                dl.callbacks.CheckpointCallback(2),
                dl.callbacks.AccuracyCallback(),
                dl.callbacks.EarlyStoppingCallback(3), # stop training, if valid loss does not improve last 3 epochs
                dl.callbacks.WandbLogger(
                    project="text-augmentation"
                )
             ],
             monitoring_params={
                 'entity': 'msaidov',
                 'project': 'text-augmentation',
                 'name': 'augmented-lstm-translate',
                 'group': 'examples',
                 'config': {
                     'model': 'bilstm',
                     'optimizer': str(optimizer),
                     'scheduler': 'plateau',
                     'early_stop': 3,
                     'vocab_size': vocab_size,
                     'h_size': h_size,
                     'n_layers': n_layers,
                     'dropout': dropout,
                     'batch_size': batch_size,
                     'embed_dim': embed_dim,
                 },
             },
#              check=True, # set if you want to check pipeline for correctness, without actual training
             verbose=True)


can't resolve package from __spec__ or __package__, falling back on __name__ and __path__



1/10 * Epoch (train): 100% 1185/1185 [19:48<00:00,  1.00s/it, accuracy01=0.699, loss=0.698]
1/10 * Epoch (valid): 100% 20/20 [00:02<00:00,  7.34it/s, accuracy01=0.649, loss=0.805]
[2020-04-18 15:06:13,333] 
1/10 * Epoch 1 (_base): lr=0.0010 | momentum=0.9000
1/10 * Epoch 1 (train): accuracy01=0.6348 | loss=0.7971
1/10 * Epoch 1 (valid): accuracy01=0.7131 | loss=0.6808
2/10 * Epoch (train): 100% 1185/1185 [19:50<00:00,  1.00s/it, accuracy01=0.705, loss=0.668]
2/10 * Epoch (valid): 100% 20/20 [00:02<00:00,  7.15it/s, accuracy01=0.649, loss=0.775]
[2020-04-18 15:26:07,758] 
2/10 * Epoch 2 (_base): lr=0.0010 | momentum=0.9000
2/10 * Epoch 2 (train): accuracy01=0.7079 | loss=0.6758
2/10 * Epoch 2 (valid): accuracy01=0.7426 | loss=0.6195
Early exiting
3/10 * Epoch (train):   1% 9/1185 [00:11<22:13,  1.13s/it, accuracy01=0.721, loss=0.610]


unclosed <ssl.SSLSocket fd=93, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6, laddr=('172.28.0.2', 56668), raddr=('35.186.228.49', 443)>



In [28]:
#  Evaluate on the test setsssssssss

runner = dl.SupervisedRunner()
y_pred = runner.predict_loader(model, test_iter, verbose=True)
# Do not forget, that the NN compute the class logits, from which you actually have to make prediction
y_pred = np.argmax(y_pred, axis=1)
y_true = np.concatenate([x['targets'].cpu().numpy() for x in test_iter])

test_score = accuracy_score(y_true, y_pred)
print(test_score)

# send test score to wandb
wandb.log({'acc/test': test_score})


1/1 * Epoch (infer):   0% 0/20 [00:00<?, ?it/s][A
1/1 * Epoch (infer):   0% 0/20 [00:00<?, ?it/s][A
1/1 * Epoch (infer):   5% 1/20 [00:00<00:03,  5.04it/s][A
1/1 * Epoch (infer):   5% 1/20 [00:00<00:03,  5.04it/s][A
1/1 * Epoch (infer):  10% 2/20 [00:00<00:03,  5.51it/s][A
1/1 * Epoch (infer):  10% 2/20 [00:00<00:03,  5.51it/s][A
1/1 * Epoch (infer):  15% 3/20 [00:00<00:02,  6.15it/s][A
1/1 * Epoch (infer):  15% 3/20 [00:00<00:02,  6.15it/s][A
1/1 * Epoch (infer):  20% 4/20 [00:00<00:02,  6.83it/s][A
1/1 * Epoch (infer):  20% 4/20 [00:00<00:02,  6.83it/s][A
1/1 * Epoch (infer):  25% 5/20 [00:00<00:02,  6.83it/s][A
1/1 * Epoch (infer):  30% 6/20 [00:00<00:01,  7.48it/s][A
1/1 * Epoch (infer):  30% 6/20 [00:00<00:01,  7.48it/s][A
1/1 * Epoch (infer):  35% 7/20 [00:00<00:01,  7.65it/s][A
1/1 * Epoch (infer):  35% 7/20 [00:00<00:01,  7.65it/s][A
1/1 * Epoch (infer):  40% 8/20 [00:01<00:01,  8.15it/s][A
1/1 * Epoch (infer):  40% 8/20 [00:01<00:01,  8.15it/s][A
1/1 * Epoch 


unclosed file <_io.TextIOWrapper name='wandb-log-dir-augmented/wandb/run-20200418_144620-3n6anoep/wandb-metadata.json' mode='r' encoding='UTF-8'>


unclosed <socket.socket fd=81, family=AddressFamily.AF_INET, type=2049, proto=0, laddr=('0.0.0.0', 0)>


unclosed <ssl.SSLSocket fd=92, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6, laddr=('172.28.0.2', 36856), raddr=('35.186.228.49', 443)>

