In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install --upgrade wandb
!pip install catalyst
!pip install alchemy-catalyst
!wandb login c54b2fcb6b8ca2808f5be303a8a3b6e464f52cca

is_alchemy_used = True
%load_ext tensorboard

Requirement already up-to-date: wandb in /usr/local/lib/python3.6/dist-packages (0.8.27)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[32mSuccessfully logged in to Weights & Biases![0m


In [3]:
import wandb
wandb.init(project="text-augmentation")

W&B Run: https://app.wandb.ai/msaidov/text-augmentation/runs/8bpgvsw5

In [0]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [0]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import pdb
from torch import cuda
import torch.nn.functional as F
import sys
import torchtext
from torchtext import data
from torchtext.data import BPTTIterator, BucketIterator, Iterator
from torchtext import datasets
import torch.optim as O
from tqdm import tqdm
import datetime
import time
from catalyst.dl import SupervisedRunner
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [0]:
def get_device():
	if torch.cuda.is_available():
		return torch.device('cuda:0')
	else:
		return torch.device('cpu')

In [0]:
device = get_device()

In [0]:
LSTM_STACKING_NUMBER = 3
FULLY_CONNECTED_DIM = 2048
LSTM_INPUT_SIZE = 300
EMBEDDING_DIMENSION = 300
HIDDEN_DIMENSION = 512
OUTPUT_DIMENSION = 3
BATCH_SIZE = 128
MODEL_PATH = 'drive/My Drive/text-augmentation/log-directory/baseline-github.pt'
LOG_DIRECTORY = 'drive/My Drive/text-augmentation/log-directory/logs/'
LR_STEP = 0.001
EPOCHS_NUMBER = 5
DP_RATIO = 0.2
LOG_INTERVAL = 50
BPTT_LEN = 5
MINI_BATCH = 32

In [0]:
class BiLSTM(nn.Module):
	def __init__(self, vocab_size, dp_ratio=DP_RATIO,
	             embed_dim=EMBEDDING_DIMENSION, hidden_dim=HIDDEN_DIMENSION,
	             lstm_input_size=LSTM_INPUT_SIZE, fully_connected_dim=FULLY_CONNECTED_DIM):
		super(BiLSTM, self).__init__()
		self.vocab_size = vocab_size
		self.embed_dim = embed_dim
		self.dp_ratio = dp_ratio
		self.hidden_dim = hidden_dim
		self.lstm_input_size = lstm_input_size
		self.linear_input_size = 8 * self.hidden_dim
		self.fully_connected_dim = fully_connected_dim

		self.embedding = nn.Embedding(self.vocab_size, self.embed_dim)
		self.projection = nn.Linear(self.embed_dim, self.lstm_input_size)
		self.lstm = nn.LSTM(self.lstm_input_size, self.hidden_dim,
		                    LSTM_STACKING_NUMBER, bidirectional=True)
		self.out = nn.Sequential(
			nn.Linear(self.linear_input_size, self.fully_connected_dim),
			nn.Dropout(p=self.dp_ratio),
			nn.ReLU(),
			nn.Linear(self.fully_connected_dim, self.fully_connected_dim),
			nn.Dropout(p=self.dp_ratio),
			nn.ReLU(),
			nn.Linear(self.fully_connected_dim, self.fully_connected_dim),
			nn.Dropout(p=self.dp_ratio),
			nn.ReLU(),
			nn.Linear(self.fully_connected_dim, OUTPUT_DIMENSION)
		)
		pass

	def forward(self, batch):
		premise_embed = self.embedding(batch[0])
		hypothesis_embed = self.embedding(batch[1])
		premise_proj = F.relu(self.projection(premise_embed))
		hypothesis_proj = F.relu(self.projection(hypothesis_embed))
		encoded_premise, (h0_premise, c0_premise) = self.lstm(premise_proj)
		encoded_hypothesis, (h0_hypothesis, c0_hypothesis) = self.lstm(hypothesis_proj)

		h0_premise = h0_premise[-1, :, :]
		c0_premise = c0_premise[-1, :, :]
		h0_hypothesis = h0_hypothesis[-1, :, :]
		c0_hypothesis = c0_hypothesis[-1, :, :]
		h0_premise.unsqueeze_(-1)
		c0_premise.unsqueeze_(-1)
		h0_hypothesis.unsqueeze_(-1)
		c0_hypothesis.unsqueeze_(-1)
		h0_premise = h0_premise.expand(-1, -1, BATCH_SIZE)
		c0_premise = c0_premise.expand(-1, -1, BATCH_SIZE)
		h0_hypothesis = h0_hypothesis.expand(-1, -1, BATCH_SIZE)
		c0_hypothesis = c0_hypothesis.expand(-1, -1, BATCH_SIZE)
  
		h0_premise = h0_premise.transpose(2, 0)
		c0_premise = c0_premise.transpose(2, 0)
		h0_premise = h0_premise.transpose(2, 1)
		c0_premise = c0_premise.transpose(2, 1)
		h0_hypothesis = h0_hypothesis.transpose(2, 0)
		c0_hypothesis = c0_hypothesis.transpose(2, 0)
		h0_hypothesis = h0_hypothesis.transpose(2, 1)
		c0_hypothesis = c0_hypothesis.transpose(2, 1)

		h0_premise = h0_premise.mean(1)
		c0_premise = c0_premise.mean(1)
		h0_hypothesis = h0_hypothesis.mean(1)
		c0_hypothesis = c0_hypothesis.mean(1)
		encoded_premise = encoded_premise.mean(1)
		encoded_hypothesis = encoded_hypothesis.mean(1)
		
		lstm_premise = torch.cat((h0_premise, c0_premise), 1)
		ltsm_hypothesis = torch.cat((h0_hypothesis, c0_hypothesis), 1)

		premise = torch.cat((encoded_premise, lstm_premise), 1)
		hypothesis = torch.cat((encoded_hypothesis, ltsm_hypothesis), 1)
  
		combined = torch.cat((premise, hypothesis), 1)
		return self.out(combined)

In [0]:
inputs = data.Field(lower=True, tokenize='spacy', batch_first=True)
answers = data.LabelField(sequential=False, unk_token=None, is_target=True)

train, validate, test = datasets.SNLI.splits(inputs, answers)

In [0]:
class SNLI():
	def __init__(self, batch_size=BATCH_SIZE):
		self.inputs = inputs
		self.answers = answers
		self.train, self.validate, self.test = train, validate, test
		self.inputs.build_vocab(self.train, self.validate)
		self.answers.build_vocab(self.train)
		self.train_iterator, self.validate_iterator, self.test_iterator = data.BucketIterator.splits(
			  (self.train, self.validate, self.test), batch_size=batch_size, device=device)

	def vocab_size(self):
		return len(self.inputs.vocab)

	def out_dim(self):
		return len(self.answers.vocab)

	def labels(self):
		return self.answers.vocab.stoi

dataset = SNLI()

In [0]:
class BucketIteratorWrapper(DataLoader):
    __initialized__ = False

    def __init__(self, iterator: Iterator):
        self.batch_size = iterator.batch_size
        self.num_workers = 1
        self.collate_fn = None
        self.pin_memory = False
        self.drop_last = False
        self.timeout = 0
        self.worker_init_fn = None
        self.sampler = iterator
        self.batch_sampler = iterator
        self.__initialized__ = True

    def __iter__(self):
        return map(lambda batch: {
                    'features': (batch.premise, batch.hypothesis),
                    'targets': batch.label,
                }, self.batch_sampler.__iter__())

    def __len__(self):
        return len(self.batch_sampler)

In [0]:
train_iter = Iterator(train, batch_size=BATCH_SIZE, shuffle=True)
valid_iter = Iterator(validate, batch_size=BATCH_SIZE)

train_iter = BucketIteratorWrapper(train_iter)
valid_iter = BucketIteratorWrapper(valid_iter)
 
loaders = {'train': train_iter, 'valid': valid_iter}

In [0]:
class TrainAndValidate():
    def __init__(self, dataset, batch_size=BATCH_SIZE, embed_dim=EMBEDDING_DIMENSION,
                 hidden_dim=HIDDEN_DIMENSION, dp_ratio=DP_RATIO, epochs=EPOCHS_NUMBER,
                 lr=LR_STEP, save_path=MODEL_PATH):
        print("Training process has begun at: {}".format(datetime.datetime.now()))
        self.batch_size = batch_size
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.dp_ratio = dp_ratio
        self.epochs = epochs
        self.lr = lr
        self.dataset = dataset
        self.save_path = save_path
        self.log_interval = LOG_INTERVAL

        self.model = BiLSTM(self.dataset.vocab_size())
        self.model.to(device)
        wandb.watch(self.model)
        self.criterion = nn.CrossEntropyLoss(reduction='mean')
        self.optimizer = O.Adam(self.model.parameters(), lr=self.lr)
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer)
        self.best_accuracy = -1
        print("Resource preparation done: {}".format(datetime.datetime.now()))

    def save_model(self, current_accuracy):
        if current_accuracy > self.best_accuracy:
            self.best_accuracy = current_accuracy
            model_options = {'vocab_size' : self.dataset.vocab_size(),
                             'embed_dim' : EMBEDDING_DIMENSION,
                             'dp_ratio' : DP_RATIO, 
                             'hidden_dim' : HIDDEN_DIMENSION,
                             'out_dim': OUTPUT_DIMENSION
                            }
            torch.save({
                'accuracy': self.best_accuracy,
                'options': self.model_options,
                'model_dict': self.model.state_dict(),
                }, self.save_path)
        return

    def execute(self):
        n_correct, n_total, n_loss = 0, 0, 0
        print(f"Number of iterations: {len(self.dataset.train_iterator)}")
        runner = SupervisedRunner()
        runner.train(model=self.model, criterion=self.criterion, optimizer=self.optimizer,
                    scheduler=self.scheduler, loaders=loaders, logdir=LOG_DIRECTORY,
                    num_epochs=EPOCHS_NUMBER, verbose=True)
        return

In [15]:
task = TrainAndValidate(dataset)

Training process has begun at: 2020-02-15 01:04:20.185339
Resource preparation done: 2020-02-15 01:04:24.646069


In [16]:
%tensorboard --logdir 'drive/My Drive/text-augmentation/log-directory/logs/'

In [17]:
task.execute()

Number of iterations: 4292


1/5 * Epoch (train): 100% 4291/4292 [28:38<00:00,  2.61it/s, loss=0.795]

RuntimeError: ignored

In [0]:
torch.save({'model_dict': task.model.state_dict()}, 'drive/My Drive/text-augmentation/github-baseline.pt')