In [1]:
%load_ext autoreload
%autoreload 2

import sys

import torch

# Add the relative location of the kbclean sources to the module search path.
# The membership test keeps re-runs of this cell from appending the same entry
# more than once.
if r"../../../kb-data-cleaning/kbclean" not in sys.path:
    sys.path.append(r"../../../kb-data-cleaning/kbclean")

# Name of the method under test: it selects the YAML config file that is loaded
# next and, later, which model class is built ("seq2seq" -> Seq2Seq, any other
# value -> VSeq2Seq).
method = "vseq2seq"

In [2]:
import yaml

# Read the hyper-parameters for the selected method. The context manager closes
# the config file as soon as it has been parsed instead of leaving the handle
# open for the lifetime of the kernel.
# NOTE(review): yaml.FullLoader can construct more than plain scalars/containers;
# if the config only holds plain values, yaml.safe_load would be the safer call.
with open(f"../../config/{method}.yaml", "r") as config_file:
    hparams = yaml.load(config_file, Loader=yaml.FullLoader)
hparams

{'batch_size': 2000,
 'enc_emb_dim': 50,
 'dec_emb_dim': 50,
 'enc_hid_dim': 50,
 'dec_hid_dim': 50,
 'latent_dim': 10,
 'dropout_p': 0.5,
 'lr': 0.0005,
 'use_sm': False,
 'amp_level': 'O1',
 'teacher_forcing_ratio': 0.5,
 'max_length': 100}

In [3]:
from argparse import Namespace

import torch

# Wrap the config dict in a Namespace so values can be read as attributes
# (hparams.batch_size, ...). The isinstance guard makes the cell idempotent:
# without it, re-running the cell calls Namespace(**<Namespace>) and raises
# TypeError because a Namespace is not a mapping.
if not isinstance(hparams, Namespace):
    hparams = Namespace(**hparams)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import pandas as pd

# Load the test column file. dtype=str together with keep_default_na=False makes
# pandas keep every cell as the literal text found in the CSV rather than parsing
# numbers or converting empty / "NA"-like cells to NaN.
df = pd.read_csv("../../data/test/ed2_cols/Beers_abv.csv", dtype=str, keep_default_na=False)

# Values of the first column as a plain Python list.
data = df.iloc[:, 0].values.tolist()

In [5]:
import string

import regex as re
from torchnlp.encoders import LabelEncoder
from torchnlp.encoders.text import CharacterEncoder

# Characters that survive the filtering step in to_regex.
printable = set(string.printable)


def to_regex(s):
    """Collapse a string to its character-class pattern.

    Characters that are not in ``string.printable`` are dropped. Every
    remaining upper-case ASCII letter becomes ``"A"``, every lower-case ASCII
    letter becomes ``"a"`` and every digit becomes ``"0"``; all other
    characters are kept unchanged.

    Args:
        s: Input string.

    Returns:
        The pattern string, e.g. ``"Abc-123"`` -> ``"Aaa-000"``.
    """
    pattern = "".join(ch for ch in s if ch in printable)
    # Applied in the same order as before: upper-case, lower-case, digits.
    for char_class, placeholder in (("[A-Z]", "A"), ("[a-z]", "a"), ("[0-9]", "0")):
        pattern = re.sub(char_class, placeholder, pattern)
    return pattern


# Optional preprocessing, currently disabled: replace every value by its
# character-class pattern (see to_regex) before the encoder is built.
# data = [to_regex(val) for val in data]

# Character-level encoder built from all values in `data`.
# NOTE(review): append_eos=True presumably adds an end-of-sequence token to every
# encoded example (the decoded samples printed during training contain "</s>") —
# confirm against the torchnlp documentation.
char_encoder = CharacterEncoder(data, append_eos=True)

In [6]:
# Store the encoder's vocabulary size on the hyper-parameter namespace and
# display it. Presumably the model constructors read hparams.vocab_size —
# verify in models.auto_encoder.
hparams.vocab_size = char_encoder.vocab_size
hparams.vocab_size

17

In [7]:
from torch.utils.data import DataLoader, SequentialSampler, random_split
from torchnlp.encoders.text import stack_and_pad_tensors
from torchnlp.samplers import BucketBatchSampler

# Examples used for training/validation: empty strings are skipped and every
# remaining value is truncated to hparams.max_length characters. The limit comes
# from the loaded config (max_length is 100 there) instead of a second,
# hard-coded 100 that could silently drift from it.
data_with_labels = [
    example[: hparams.max_length] for example in data if example
]


def collate_fn(batch):
    """Encode a batch of raw strings with the shared character encoder.

    Args:
        batch: List of strings handed over by the DataLoader.

    Returns:
        A 3-tuple ``(encoded, seq_lengths, batch)``: the two values returned by
        ``char_encoder.batch_encode`` followed by the untouched input strings.
    """
    encoded, seq_lengths = char_encoder.batch_encode(batch)
    return encoded, seq_lengths, batch

# Seed torch's global RNG so the 70/30 split below is identical on every
# Restart & Run All; without it, train and validation membership changes from
# run to run. Note that this also fixes the RNG state for whatever runs
# afterwards in the notebook.
torch.manual_seed(0)

train_length = int(len(data_with_labels) * 0.7)
# random_split only needs len() and indexing, so the list is passed directly
# rather than through an extra list(...) copy.
train_dataset, val_dataset = random_split(
    data_with_labels, [train_length, len(data_with_labels) - train_length],
)

len(train_dataset)

1643

In [8]:
# Training loader: shuffle=True draws the examples in a new order every epoch.
# Without it the model sees the same fixed order each epoch, which matters as
# soon as the dataset no longer fits into a single batch.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=hparams.batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=16,
)

# Validation loader: order is left fixed.
val_dataloader = DataLoader(
    val_dataset, batch_size=hparams.batch_size, collate_fn=collate_fn, num_workers=16,
)

In [9]:
import random

from pytorch_lightning.callbacks import Callback


class PredictionCallback(Callback):
    """Print a handful of sample reconstructions at the end of every epoch.

    Args:
        data: Sequence of raw strings from which the samples are drawn.
    """

    def __init__(self, data):
        self.data = data

    def on_epoch_end(self, trainer, pl_module):
        """Run the model on 10 randomly drawn examples and print the result.

        Prints a list of ``(decoded model output, original string)`` pairs.

        Args:
            trainer: The running Trainer (unused).
            pl_module: The module being trained; called as
                ``pl_module(inputs, lengths)``.
        """
        # random.choices raises IndexError on an empty population.
        if len(self.data) == 0:
            return

        # Sampling is done with replacement, so fewer than 10 examples is fine.
        sampled_data = random.choices(self.data, k=10)
        inp, lengths, examples = collate_fn(sampled_data)

        # Run on whatever device the module's weights live on instead of
        # assuming CUDA, so the callback also works on a CPU-only run.
        module_device = next(pl_module.parameters()).device

        # Evaluate without dropout (the model contains Dropout layers) and
        # without building an autograd graph, then restore the previous mode so
        # training continues unaffected.
        was_training = pl_module.training
        pl_module.eval()
        try:
            with torch.no_grad():
                dec_outputs, _ = pl_module(
                    inp.to(module_device), lengths.to(module_device)
                )
        finally:
            pl_module.train(was_training)

        # Most likely token per position (argmax over dim 2 of the outputs).
        best_outputs = torch.argmax(dec_outputs, dim=2)
        print(list(zip(char_encoder.batch_decode(best_outputs, lengths), examples)))

In [10]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger

from models.auto_encoder import Seq2Seq, VSeq2Seq
# Build the model selected by `method`; both variants receive the
# hyper-parameters and the encoder's padding index.
if method == "seq2seq":
    seq2seq = Seq2Seq(hparams, char_encoder.padding_index)
else:
    seq2seq = VSeq2Seq(hparams, char_encoder.padding_index)

# NOTE(review): the GPU list is hard-coded to four devices; this cell fails on a
# machine with fewer GPUs.
trainer = Trainer(
    gpus=[0, 1, 2, 3],
    amp_level=hparams.amp_level,
    distributed_backend="dp",
    callbacks=[PredictionCallback(data_with_labels)],
    # Log under the name of the method actually being trained; a fixed
    # "vseq2seq" label would mislabel runs made with method == "seq2seq".
    logger=TensorBoardLogger("../../tt_logs", method),
    max_epochs=1,
)
trainer.fit(
    seq2seq, train_dataloader=train_dataloader, val_dataloaders=[val_dataloader]
)

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0,1,2,3]

   | Name                           | Type      | Params
---------------------------------------------------------
0  | attn                           | Attention | 7 K   
1  | attn.attn                      | Linear    | 7 K   
2  | attn.v                         | Linear    | 50    
3  | encoder                        | Encoder   | 36 K  
4  | encoder.embedding              | Embedding | 850   
5  | encoder.rnn                    | GRU       | 30 K  
6  | encoder.fc                     | Linear    | 5 K   
7  | encoder.dropout                | Dropout   | 0     
8  | decoder                        | Decoder   | 42 K  
9  | decoder.embedding              | Embedding | 850   
10 | decoder.rnn                    | GRU       | 30 K  
11 | decoder.fc_out                 | Linear    | 3 K   
12 | decoder.dropout                | Dropout   | 0     
13 | hidden2latent    

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

[('00.</s>7', '0.05'), ('040<unk></s>2', '0.049'), ('0985</s>', '0.09'), ('048<copy><unk>2', '0.068'), ('00.4%9', '0.07%'), ('04<pad>145', '0.053'), ('09.419', '0.052'), ('0<copy>84</s>', '0.08'), ('04<unk>91%9</s>44%454497%458<unk>', '0.052000000000000005%'), ('011419', '0.054')]



1

In [11]:
def collate_fn2(batch):
    """Encode a batch of raw strings and move the result to `device`.

    Uses the notebook-level `device` (CUDA when available, otherwise CPU)
    instead of an unconditional ``.cuda()``, which raises on a CPU-only
    machine. On a CUDA machine the result is the same as before.

    Args:
        batch: List of strings handed over by the DataLoader.

    Returns:
        ``(inputs, lengths)``: the two values returned by
        ``char_encoder.batch_encode``, both moved to `device`.
    """
    inputs, lengths = char_encoder.batch_encode(batch)
    return inputs.to(device), lengths.to(device)

# Loader over every value in `data` (empty strings included, so the encoded rows
# stay aligned with the input rows), truncated to hparams.max_length characters.
# The limit is read from the config (max_length is 100 there) rather than being
# repeated as a literal.
full_dataloader = DataLoader(
    [example[: hparams.max_length] for example in data],
    batch_size=hparams.batch_size,
    collate_fn=collate_fn2,
)

In [12]:
encoded_batches = []

# Encode in eval mode (dropout disabled) and without autograd bookkeeping, so
# the saved encodings are deterministic for a given model.
seq2seq.eval()
with torch.no_grad():
    # Loop variables are deliberately NOT called `data`: that would overwrite
    # the notebook-level list of raw strings with a tensor and break any
    # re-run of the earlier cells that read `data`.
    for batch_inputs, batch_lengths in full_dataloader:
        encoded_batches.append(
            seq2seq.encode(batch_inputs, batch_lengths).detach().cpu().numpy()
        )

In [13]:
# Sanity check: number of encoded batches and the array shape of the first one.
len(encoded_batches), encoded_batches[0].shape

(2, (2000, 20))

In [14]:
import numpy as np

# Join the per-batch arrays along the first axis into one array and write it to
# disk in NumPy's .npy format.
encoded_data = np.concatenate(encoded_batches, axis=0)
np.save("../../data/numpy/encoded.npy", encoded_data)