# Notebook para geração do módulo de Dataset usando PyTorch Lightning

O objetivo deste notebook é gerar um módulo de dataset com as seguintes funções:

- Separar o texto em treino, teste e validação.
- Gerar os labels automaticamente a partir dos textos do dataset em `data/IWSLT/raw`.

In [5]:
import pytorch_lightning as pl
import os
import torch
import transformers
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from typing import Dict, List, Tuple
from torch.utils.data import DataLoader, random_split

# Directory holding the raw IWSLT text files, given relative to the
# notebook's working directory.
dataset_path = Path('../../data/IWSLT/raw/')

In [6]:
# Pretrained uncased BERT tokenizer used to encode every sample.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Training split inside the raw IWSLT directory, kept as a plain string path.
filepath = str(dataset_path / 'train.txt')

# Punctuation marker token -> integer class id. Id 0 is left free for
# "no punctuation after this word".
punc_dict = {',COMMA': 1, '.PERIOD': 2, '?QUESTIONMARK': 3}

In [7]:
class IWSLTDataset(Dataset):
    """Punctuation-restoration dataset built from an IWSLT token file.

    The file is read as whitespace-separated tokens. A token that is a key of
    ``punc_dict`` is a punctuation marker that labels the word preceding it;
    every other token is a word. Words are grouped into consecutive chunks of
    exactly ``max_len`` words, and each chunk carries one integer label per
    word (0 means "no punctuation after this word").

    Args:
        path: Path of the text file to load.
        tokenizer: Object exposing ``encode_plus`` (e.g. a HuggingFace
            ``BertTokenizer``); used lazily in ``__getitem__``.
        max_len: Number of words per sample; must be at least 1.
        punc_dict: Mapping from punctuation marker token to label id.
        tok_max_len: Length every tokenized sample is padded/truncated to.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """

    def __init__(self, path, tokenizer, max_len, punc_dict, tok_max_len=278):
        if max_len < 1:
            raise ValueError(f'max_len must be >= 1, got {max_len}')

        self.path = path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.punc_dict = punc_dict
        self.tok_max_len = tok_max_len

        data = self._load_data(path)
        token_list, punc_list = self._preprocess_IWSLT(data)
        self.data, self.labels = self._tokens_to_sentence(token_list, punc_list, max_len)

    def _load_data(self, path):
        """Read the file at ``path`` and return its whitespace-separated tokens."""
        # Explicit encoding so the result does not depend on the host locale.
        with open(path, encoding='utf-8') as f:
            return f.read().split()

    def _preprocess_IWSLT(self, data):
        """Split a raw token stream into words and per-word punctuation labels.

        Returns:
            ``(token_list, punc_list)`` of equal length, where ``punc_list[i]``
            is the label of the punctuation following ``token_list[i]`` or 0.
        """
        token_list = []
        punc_list = []

        for token in data:
            # Use the mapping given to this instance, not a notebook global.
            if token in self.punc_dict:
                # A marker relabels the previous word. A marker with no
                # preceding word (file starts with punctuation) is skipped
                # instead of failing on an empty list.
                if punc_list:
                    punc_list[-1] = self.punc_dict[token]
            else:
                token_list.append(token)
                punc_list.append(0)
        return token_list, punc_list

    def _tokens_to_sentence(self, token_list, punc_list, max_len):
        """Group words and labels into chunks of exactly ``max_len`` words.

        A trailing chunk shorter than ``max_len`` is discarded so that every
        label list has the same length; a complete final chunk is kept.

        Returns:
            ``(phrases, labels)``: space-joined word chunks and the matching
            label lists.
        """
        phrases = []
        labels = []

        # Last index at which a full chunk can still start.
        last_full_start = len(token_list) - max_len
        for start in range(0, last_full_start + 1, max_len):
            end = start + max_len
            phrases.append(' '.join(token_list[start:end]))
            labels.append(punc_list[start:end])
        return phrases, labels

    def __len__(self):
        """Return the number of fixed-size samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it together with its labels.

        Returns:
            Dict with the raw ``sentence``, the ``input_ids`` and
            ``attention_mask`` tensors (batch dimension removed) and the
            ``target`` label tensor.
        """
        # Use the tokenizer given to this instance, not a notebook global.
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoded = self.tokenizer.encode_plus(
            self.data[idx],
            max_length=self.tok_max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        target = torch.LongTensor(self.labels[idx])

        # NOTE(review): ids and mask are cast to float as in the original
        # code; consumers needing integer ids must cast back with `.long()`.
        input_ids = encoded['input_ids'].float().squeeze(0)
        attention_mask = encoded['attention_mask'].float().squeeze(0)

        return {
            'sentence': self.data[idx],
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target': target,
        }


In [8]:
# Build the training dataset with 200 words per sample; `tok_max_len` keeps
# its default value.
ds = IWSLTDataset(filepath, tokenizer, 200, punc_dict)
        

In [9]:
class IWSLTDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/val/test IWSLT datasets.

    Args:
        train: Path of the training text file.
        test: Path of the test text file.
        val: Path of the validation text file.
        tokenizer: Tokenizer forwarded to every ``IWSLTDataset``.
        max_len: Number of words per sample.
        punc_dict: Mapping from punctuation marker token to label id.
        tok_max_len: Length every tokenized sample is padded/truncated to.
        batch_size: Batch size used by all three dataloaders.
    """

    def __init__(self, train, test, val, tokenizer, max_len, punc_dict, tok_max_len=278, batch_size=8):
        super().__init__()
        self.train_path = train
        self.test_path = test
        self.val_path = val
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.punc_dict = punc_dict
        self.tok_max_len = tok_max_len
        self.batch_size = batch_size

        # Populated by `prepare_data` and/or `setup`.
        self.train_dataset = None
        self.test_dataset = None
        self.val_dataset = None

    def _build_dataset(self, path):
        """Create an ``IWSLTDataset`` for ``path`` with this module's settings."""
        return IWSLTDataset(path, self.tokenizer, self.max_len,
                            self.punc_dict, self.tok_max_len)

    def prepare_data(self, sentence_size=200):
        """(Re)build the three datasets.

        ``sentence_size`` is accepted for backward compatibility and is not
        used; the chunk size comes from ``max_len``.
        """
        self.train_dataset = self._build_dataset(self.train_path)
        self.test_dataset = self._build_dataset(self.test_path)
        self.val_dataset = self._build_dataset(self.val_path)

    def setup(self, stage=None):
        """Build any dataset that does not exist yet.

        ``stage`` is accepted because Lightning calls ``setup(stage)``; it is
        optional so a bare ``setup()`` call keeps working. Datasets already
        created by ``prepare_data`` are reused.
        """
        if self.train_dataset is None:
            self.train_dataset = self._build_dataset(self.train_path)
        if self.test_dataset is None:
            self.test_dataset = self._build_dataset(self.test_path)
        if self.val_dataset is None:
            self.val_dataset = self._build_dataset(self.val_path)

    def train_dataloader(self):
        """Return the training dataloader (8 worker processes)."""
        # NOTE(review): samples are served in file order (no shuffle), as in
        # the original code -- confirm this is intended for training.
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=8)

    def val_dataloader(self):
        """Return the validation dataloader."""
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test dataloader."""
        # Fixed: the original read `self.test-dataset`, a subtraction
        # expression that fails at call time.
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

In [10]:
class Model(pl.LightningModule):
    """Toy LSTM model that predicts a vocabulary id for every input position.

    Args:
        hidden_size: Size of the LSTM hidden state.
        vocab_size: Number of output classes (30522 matches the vocabulary
            size originally hard-coded here).
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, hidden_size=50, vocab_size=30522, lr=0.02):
        super().__init__()
        self.lr = lr
        self.l1 = torch.nn.LSTM(1, hidden_size, batch_first=True)
        self.l2 = torch.nn.Linear(hidden_size, vocab_size)

    def _logits(self, x):
        """Return unnormalized per-position class scores for ``x``."""
        # The LSTM returns (output, state); only the output sequence is used.
        hidden, _ = self.l1(x)
        return self.l2(hidden)

    def forward(self, x):
        """Return per-position class probabilities (softmax over the last axis)."""
        return torch.softmax(self._logits(x), dim=-1)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss of reconstructing ``input_ids``.

        Returns:
            Dict with the scalar training loss under the ``'loss'`` key.
        """
        # One scalar feature per position: (batch, seq) -> (batch, seq, 1).
        x = batch['input_ids'].float().unsqueeze(2)
        y = batch['input_ids'].long()
        # `cross_entropy` applies log-softmax itself, so it must receive raw
        # logits rather than the probabilities returned by `forward`.
        logits = self._logits(x)
        # Move the class axis to dim 1, as `cross_entropy` expects.
        loss = torch.nn.functional.cross_entropy(logits.permute(0, 2, 1), y)
        # A plain dict instead of the version-specific `pl.TrainResult`.
        return {'loss': loss}

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [11]:
# Resolve the three split files inside the raw IWSLT directory.
train, test, val = (
    os.path.join(dataset_path, split_file)
    for split_file in ('train.txt', 'ref.txt', 'dev.txt')
)

# Data module: 200 words per sample, tokenized length 278, batches of 48.
dm = IWSLTDataModule(
    train=train,
    test=test,
    val=val,
    tokenizer=tokenizer,
    max_len=200,
    punc_dict=punc_dict,
    tok_max_len=278,
    batch_size=48,
)
dm.prepare_data()
dm.setup()

model = Model()

# TensorBoard logs are written under ./lightning_logs/version_1.
logger = TensorBoardLogger(save_dir=os.getcwd(), version=1, name='lightning_logs')

# Train for at most 20 epochs on a single GPU.
trainer = pl.Trainer(max_epochs=20, logger=logger, gpus=1)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0,1]


In [4]:
# Start training. `trainer`, `model` and `dm` are created in the setup cell,
# which must be executed in the current kernel session before this one.
trainer.fit(model, dm)

NameError: name 'trainer' is not defined