# Notebook para geração do módulo de Dataset usando Pytorch Lightning

O objetivo deste notebook é gerar um módulo de dataset com as seguintes funções:

- Separe o texto em treino, teste e validação.
- Gere os labels automaticamente a partir dos textos de dataset em `data/IWSLT/raw`.

In [1]:
import pytorch_lightning as pl
import os
from pathlib import Path
from typing import Dict, List, Tuple

dataset_path = Path('../../data/IWSLT/raw/')

In [2]:
class IWSLTDataset(pl.LightningDataModule):
    def __init__(self, dataset_path, tokenizer):
        super().__init__()
        self.dataset_path = dataset_path
        self.filenames = {
            'train': 'train.txt',
            'ref': 'ref.txt',
            'dev': 'dev.txt',
            'asr': 'asr.txt',
        }
    
    def _preprocess_IWSLT(self, data, punc):
    token_list = list()
    punc_list = list()
    
    for token in data:
        if token in punc_dict:
            punc_list.pop()
            punc_list.append(punc_dict[token])
        else:
            token_list.append(token)
            punc_list.append(0)
    return token_list, punc_list

    def _tokens_to_sentence(self, token_list, punc_list, sentence_size=200):
        phrases = list()
        labels = list()

        for i in range(0, len(token_list), sentence_size):
            j = i + sentence_size if sentence_size < len(token_list) else len(token_list)
            phrases.append(' '.join(token_list[i:j]))
            labels.append(punc_list[i:j])
        return phrases, labels

    def prepare_data(self):
        # called on 1 gpu
        pass

    def setup(self):
        # called on every GPU
        pass
        
    def train_dataloader(self):
        pass

    def val_dataloader(self):
        pass

    def test_dataloader(self, ref=True):
        pass

IndentationError: expected an indented block (<ipython-input-2-f28886035e51>, line 13)

In [3]:
import transformers
x = os.walk(dataset_path)
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

filepath = os.path.join(dataset_path, 'train.txt')

In [4]:
with open(filepath) as f:
    data = f.read()

data = data.split()

punc_dict = {
    ',COMMA':        1,
    '.PERIOD':       2,
    '?QUESTIONMARK': 3,
}

def preprocess_IWSLT(data: List[str], punc: Dict[str, int]) -> List[Tuple[str, int]]:
    token_list = list()
    punc_list = list()
    
    for token in data:
        if token in punc_dict:
            punc_list.pop()
            punc_list.append(punc_dict[token])
        else:
            token_list.append(token)
            punc_list.append(0)
    return token_list, punc_list


token_list, punc_list = preprocess_IWSLT(data, punc_dict)

In [5]:
max_len = 50
list(zip(token_list[:max_len], punc_list[:max_len]))
token_list[:max_len]


['it',
 'can',
 'be',
 'a',
 'very',
 'complicated',
 'thing',
 'the',
 'ocean',
 'and',
 'it',
 'can',
 'be',
 'a',
 'very',
 'complicated',
 'thing',
 'what',
 'human',
 'health',
 'is',
 'and',
 'bringing',
 'those',
 'two',
 'together',
 'might',
 'seem',
 'a',
 'very',
 'daunting',
 'task',
 'but',
 'what',
 'i',
 "'m",
 'going',
 'to',
 'try',
 'to',
 'say',
 'is',
 'that',
 'even',
 'in',
 'that',
 'complexity',
 'there',
 "'s",
 'some']

In [16]:
def tokens_to_sentence(token_list, punc_list, sentence_size=200):
    phrases = list()
    labels = list()

    for i in range(0, len(token_list), sentence_size):
        j = i + sentence_size if sentence_size < len(token_list) else len(token_list)
        phrases.append(' '.join(token_list[i:j]))
        labels.append(punc_list[i:j])
    return phrases, labels


In [18]:
phrases, labels = tokens_to_sentence(token_list, punc_list)

In [22]:
phrases[1], labels[1]

("into the water rolf bolin who was a professor at the hopkin 's marine station where i work wrote in the 1940s that the fumes from the scum floating on the inlets of the bay were so bad they turned lead-based paints black people working in these canneries could barely stay there all day because of the smell but you know what they came out saying they say you know what you smell you smell money that pollution was money to that community and those people dealt with the pollution and absorbed it into their skin and into their bodies because they needed the money we made the ocean unhappy we made people very unhappy and we made them unhealthy the connection between ocean health and human health is actually based upon another couple simple adages and i want to call that pinch a minnow hurt a whale the pyramid of ocean life now when an ecologist looks at the ocean i have to tell you we look at the ocean in a very different way and we see different things than when a regular person looks at 

In [23]:
tokenizer.batch_encode_plus(phrases[:1])

{'input_ids': [[101, 2009, 2064, 2022, 1037, 2200, 8552, 2518, 1996, 4153, 1998, 2009, 2064, 2022, 1037, 2200, 8552, 2518, 2054, 2529, 2740, 2003, 1998, 5026, 2216, 2048, 2362, 2453, 4025, 1037, 2200, 4830, 16671, 2075, 4708, 2021, 2054, 1045, 1005, 1049, 2183, 2000, 3046, 2000, 2360, 2003, 2008, 2130, 1999, 2008, 11619, 2045, 1005, 1055, 2070, 3722, 6991, 2008, 1045, 2228, 2065, 2057, 3305, 2057, 2064, 2428, 2693, 2830, 1998, 2216, 3722, 6991, 2024, 1050, 1005, 1056, 2428, 6991, 2055, 1996, 3375, 2671, 1997, 2054, 1005, 1055, 2183, 2006, 2021, 2477, 2008, 2057, 2035, 3492, 2092, 2113, 1998, 1045, 1005, 1049, 2183, 2000, 2707, 2007, 2023, 2028, 2065, 23603, 9932, 1050, 1005, 1056, 3407, 9932, 1050, 1005, 1056, 6343, 3407, 2057, 2113, 2008, 2157, 2057, 1005, 2310, 5281, 2008, 1998, 2065, 2057, 2074, 2202, 2008, 1998, 2057, 3857, 2013, 2045, 2059, 2057, 2064, 2175, 2000, 1996, 2279, 3357, 2029, 2003, 2008, 2065, 1996, 4153, 9932, 1050, 1005, 1056, 3407, 9932, 1050, 1005, 1056, 6343, 3407