# Script - data

In [1]:
from collections import namedtuple

import sentencepiece as spm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

### 데이터 출처: [The Stanford Natural Language Processing Group](https://nlp.stanford.edu/projects/nmt/)
- Train set
    - [train.en](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en)
    - [train.de](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de)
- Test set
    - [newstest2013.en](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.en)
    - [newstest2013.de](https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.de)

### Download
#### Train: WMT 2014 en-de
- WMT'14 EN
    - size: 0.65GB
    - \# of sents: 4.5M

In [2]:
# -nc (no-clobber): skip the download when ./data/train.en already exists, so
# re-running the notebook neither re-fetches ~615MB nor writes a train.en.1 copy.
!wget -nc https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en -P ./data

--2019-08-27 17:34:51--  https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 644937323 (615M) [text/plain]
Saving to: ‘./data/train.en’


2019-08-27 17:44:47 (1.03 MB/s) - ‘./data/train.en’ saved [644937323/644937323]



- WMT'14 DE
    - size: 0.72GB
    - \# of sents: 4.5M

In [3]:
# -nc (no-clobber): skip the download when ./data/train.de already exists, so
# re-running the notebook neither re-fetches ~684MB nor writes a train.de.1 copy.
!wget -nc https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de -P ./data

--2019-08-27 17:44:47--  https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 717610118 (684M) [text/plain]
Saving to: ‘./data/train.de’


2019-08-27 17:52:13 (1.54 MB/s) - ‘./data/train.de’ saved [717610118/717610118]



#### Test: newstest2013
- newstest13 EN
    - size: 356KB
    - \# of sents: 3000

In [4]:
# -nc (no-clobber): keep an existing ./data/newstest2013.en instead of
# downloading a duplicate newstest2013.en.1 on every re-run.
!wget -nc https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.en -P ./data

--2019-08-27 17:52:13--  https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.en
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 355027 (347K) [text/plain]
Saving to: ‘./data/newstest2013.en’


2019-08-27 17:52:15 (350 KB/s) - ‘./data/newstest2013.en’ saved [355027/355027]



- newstest13 DE
    - size: 410KB
    - \# of sents: 3000

In [5]:
# -nc (no-clobber): keep an existing ./data/newstest2013.de instead of
# downloading a duplicate newstest2013.de.1 on every re-run.
!wget -nc https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.de -P ./data

--2019-08-27 17:52:15--  https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/newstest2013.de
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405428 (396K) [text/plain]
Saving to: ‘./data/newstest2013.de’


2019-08-27 17:52:17 (387 KB/s) - ‘./data/newstest2013.de’ saved [405428/405428]



- - - 
### Build Vocabulary: SentencePiece
- vocab_size: 32000(32K)

#### WPM_EN
- model type: unigram(WPM)
- outputs
    - WPM_EN.model
    - WPM_EN.vocab

In [3]:
%%time
# Train a unigram SentencePiece model with a 32000-piece vocabulary on the
# English training set (output prefix WPM_EN). --pad_id=3 is the same id that
# NMTDataset._collate passes as padding_value.
spm.SentencePieceTrainer.Train('--input=./data/train.en \
--model_prefix=WPM_EN \
--vocab_size=32000 \
--character_coverage=1.0 \
--model_type=unigram \
--pad_id=3')

CPU times: user 8min 53s, sys: 4.26 s, total: 8min 57s
Wall time: 5min 30s


True

#### WPM_DE
- model type: unigram(WPM)
- outputs
    - WPM_DE.model
    - WPM_DE.vocab

In [6]:
%%time
# Train a unigram SentencePiece model with a 32000-piece vocabulary on the
# German training set (output prefix WPM_DE). --pad_id=3 is the same id that
# NMTDataset._collate passes as padding_value.
spm.SentencePieceTrainer.Train('--input=./data/train.de \
--model_prefix=WPM_DE \
--vocab_size=32000 \
--character_coverage=1.0 \
--model_type=unigram \
--pad_id=3')

CPU times: user 16min 29s, sys: 3.41 s, total: 16min 33s
Wall time: 6min 48s


True

#### BPE_EN
- model type: BPE
- outputs
    - BPE_EN.model
    - BPE_EN.vocab

In [7]:
%%time
# Train a BPE SentencePiece model with a 32000-piece vocabulary on the English
# training set (output prefix BPE_EN). --pad_id=3 is the same id that
# NMTDataset._collate passes as padding_value.
spm.SentencePieceTrainer.Train('--input=./data/train.en \
--model_prefix=BPE_EN \
--vocab_size=32000 \
--character_coverage=1.0 \
--model_type=bpe \
--pad_id=3')

CPU times: user 2min 44s, sys: 1.75 s, total: 2min 45s
Wall time: 2min 11s


True

#### BPE_DE
- model type: BPE
- outputs
    - BPE_DE.model
    - BPE_DE.vocab

In [9]:
%%time
# Train a BPE SentencePiece model with a 32000-piece vocabulary on the German
# training set (output prefix BPE_DE). --pad_id=3 is the same id that
# NMTDataset._collate passes as padding_value.
spm.SentencePieceTrainer.Train('--input=./data/train.de \
--model_prefix=BPE_DE \
--vocab_size=32000 \
--character_coverage=1.0 \
--model_type=bpe \
--pad_id=3')

CPU times: user 4min 38s, sys: 2.28 s, total: 4min 40s
Wall time: 4min 3s


True

- - -
### Tokenizer

In [2]:
class Tokenizer(object):
    """Thin wrapper around a SentencePiece processor loaded from a model file."""

    def __init__(self, model_path):
        """Create a SentencePiece processor and load the model at ``model_path``."""
        processor = spm.SentencePieceProcessor()
        processor.load(model_path)
        self.tokenizer = processor

    def encode(self, sentence):
        """Turn a sentence (str) into its list of piece ids."""
        piece_ids = self.tokenizer.encode_as_ids(sentence)
        return piece_ids

    def decode(self, ids):
        """Turn a list of piece ids back into a sentence (str)."""
        text = self.tokenizer.DecodeIds(ids)
        return text

### NMTDataset

In [25]:
# Record for one aligned sentence pair. Defined once at module level (instead of
# inside from_txt) so the type is not rebuilt on every call and can be looked up
# by name when instances are pickled.
SentPair = namedtuple('SentPair', ['id', 'src_sent', 'tar_sent'])


class NMTDataset(Dataset):
    """Parallel-corpus dataset whose items are raw (id, src_sent, tar_sent) tuples.

    Tokenization, BOS/EOS wrapping and padding are deferred to ``_collate``,
    which the DataLoaders built by ``to_dataloader`` use as ``collate_fn``.
    """

    # Special-token ids used when building batches (bos_id: 1 / eos_id: 2).
    # PAD_ID equals the --pad_id=3 the SentencePiece models in this notebook
    # were trained with; the tokenizers must agree with these values.
    BOS_ID = 1
    EOS_ID = 2
    PAD_ID = 3

    def __init__(self, sent_pairs, src_tokenizer, tar_tokenizer, device):
        """Store the sentence pairs and the objects needed to batch them.

        Args:
            sent_pairs: list of (id, src_sent, tar_sent) tuples.
            src_tokenizer: object with ``encode(sentence) -> list of ids`` for the source side.
            tar_tokenizer: same, for the target side.
            device: torch device on which ``_collate`` creates the batch tensors.
        """
        super().__init__()

        self.src_tokenizer = src_tokenizer
        self.tar_tokenizer = tar_tokenizer
        self.device = device
        self.sent_pairs = sent_pairs  # list of tuples

    @classmethod
    def from_txt(cls, src_path, tar_path, src_tokenizer, tar_tokenizer, device, encoding='utf-8'):
        """Build a dataset from two line-aligned text files.

        Line ``i`` of ``src_path`` is paired with line ``i`` of ``tar_path`` and
        given id ``i``. Lines are stored as read, including the trailing newline.

        Args:
            src_path: path of the source-language file.
            tar_path: path of the target-language file.
            src_tokenizer, tar_tokenizer, device: forwarded to ``__init__``.
            encoding: text encoding of both files. Explicit so the result does
                not depend on the platform's default locale.

        Returns:
            A new dataset instance.
        """
        sent_pairs = []
        with open(src_path, 'r', encoding=encoding) as src_file, \
                open(tar_path, 'r', encoding=encoding) as tar_file:
            # Iterate the file objects directly: readlines() would build two
            # extra full in-memory copies of the corpus before pairing.
            # NOTE: zip stops at the shorter file, so a line-count mismatch is
            # truncated silently.
            for id_, (src_sent, tar_sent) in enumerate(zip(src_file, tar_file)):
                sent_pairs.append(SentPair(id_, src_sent, tar_sent))

        return cls(sent_pairs, src_tokenizer, tar_tokenizer, device)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sent_pairs)

    def __getitem__(self, idx):
        """Return the raw (untokenized) sentence pair at ``idx``."""
        return self.sent_pairs[idx]

    def _preprocess(self, sent_pair):
        """Tokenize one sentence pair.

        Returns:
            (id, src_ids, src_len, tar_ids, tar_len) where ``tar_ids`` is wrapped
            in BOS/EOS and the lengths are the list lengths (EOS/BOS included
            for the target).
        """
        id_, src_sent, tar_sent = sent_pair
        src_ids = self.src_tokenizer.encode(src_sent)
        tar_ids = [self.BOS_ID] + self.tar_tokenizer.encode(tar_sent) + [self.EOS_ID]

        return id_, src_ids, len(src_ids), tar_ids, len(tar_ids)

    def _collate(self, batch):
        """Turn a list of raw sentence pairs into padded batch tensors.

        Returns:
            (ids, src_ids, src_len, tar_ids, tar_len): ``ids`` is a Python list;
            ``src_ids``/``tar_ids`` are long tensors of shape (batch, max_len)
            padded with ``PAD_ID``; ``src_len``/``tar_len`` are long tensors of
            shape (batch,) holding the unpadded lengths.
        """
        id_list = []
        src_ids_list = []
        src_len_list = []
        tar_ids_list = []
        tar_len_list = []

        # NOTE(review): tensors are created directly on self.device. If that is
        # a CUDA device and n_workers > 0, this runs inside worker processes;
        # confirm that is intended before enabling workers on GPU.
        for sent_pair in batch:
            id_, src_ids, src_len, tar_ids, tar_len = self._preprocess(sent_pair)
            id_list.append(id_)
            src_ids_list.append(torch.tensor(src_ids, dtype=torch.long, device=self.device))
            tar_ids_list.append(torch.tensor(tar_ids, dtype=torch.long, device=self.device))
            src_len_list.append(src_len)
            tar_len_list.append(tar_len)

        src_ids = nn.utils.rnn.pad_sequence(src_ids_list, batch_first=True, padding_value=self.PAD_ID)
        tar_ids = nn.utils.rnn.pad_sequence(tar_ids_list, batch_first=True, padding_value=self.PAD_ID)

        src_len = torch.tensor(src_len_list, dtype=torch.long, device=self.device)
        tar_len = torch.tensor(tar_len_list, dtype=torch.long, device=self.device)

        return id_list, src_ids, src_len, tar_ids, tar_len

    def _split(self, dataset, train_ratio=0.7):
        """Randomly split ``dataset`` into train and test subsets.

        Args:
            dataset: dataset to split.
            train_ratio: fraction of items assigned to the train subset; the
                train size is ``int(train_ratio * len(dataset))`` and the rest
                goes to the test subset.

        Returns:
            (train_dataset, test_dataset)

        Raises:
            ValueError: if ``train_ratio`` is outside [0, 1].
        """
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError('train_ratio must be between 0 and 1, got {}'.format(train_ratio))

        train_size = int(train_ratio * len(dataset))
        test_size = len(dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

        return train_dataset, test_dataset

    def to_dataloader(self, batch_size=128, n_workers=0, split=True, train_ratio=0.7, shuffle=False):
        """Wrap this dataset in DataLoader(s) that batch through ``_collate``.

        Args:
            batch_size: number of sentence pairs per batch.
            n_workers: passed to DataLoader as ``num_workers``.
            split: if True, return a (train, test) pair of loaders over a random
                split; otherwise a single loader over the whole dataset.
            train_ratio: train fraction used when ``split`` is True.
            shuffle: reshuffle every epoch. Applied to the train loader when
                ``split`` is True (the test loader is never shuffled) and to
                the single loader otherwise.

        Returns:
            (train_dataloader, test_dataloader) if ``split`` else a DataLoader.
        """
        if split:
            train_dataset, test_dataset = self._split(self, train_ratio=train_ratio)
            train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle,
                                          collate_fn=self._collate, num_workers=n_workers)
            test_dataloader = DataLoader(test_dataset, batch_size=batch_size,
                                         collate_fn=self._collate, num_workers=n_workers)

            return train_dataloader, test_dataloader

        return DataLoader(self, batch_size=batch_size, shuffle=shuffle,
                          collate_fn=self._collate, num_workers=n_workers)

- - -
### main

In [26]:
# Device handed to NMTDataset; its _collate creates every batch tensor on it.
device = torch.device('cpu')

In [27]:
%%time

# Load the BPE models (English source, German target) and read the parallel
# training files into memory as a list of raw sentence pairs.
src_tokenizer = Tokenizer('BPE_EN.model')
tar_tokenizer = Tokenizer('BPE_DE.model')
dataset = NMTDataset.from_txt('./data/train.en', './data/train.de', src_tokenizer, tar_tokenizer, device)

CPU times: user 8.18 s, sys: 898 ms, total: 9.08 s
Wall time: 9.07 s


In [28]:
%%time 

# split=True: randomly split the dataset 70/30 and get one DataLoader per part.
train_dataloader, valid_dataloader = dataset.to_dataloader(batch_size=8, n_workers=0, split=True)

CPU times: user 1.3 s, sys: 217 ms, total: 1.52 s
Wall time: 915 ms


In [29]:
def check_batch(dataloader):
    """Print the size of every field in the first batch of ``dataloader``.

    The first field is reported with ``len()``; every later field is reported
    through its ``.shape`` attribute.
    """
    first_batch = next(iter(dataloader))
    for position, field in enumerate(first_batch):
        size = len(field) if position == 0 else field.shape
        print(size)

In [30]:
# Sanity check: field sizes of the first training batch.
check_batch(train_dataloader)

8
torch.Size([8, 41])
torch.Size([8])
torch.Size([8, 40])
torch.Size([8])


In [31]:
# Sanity check: field sizes of the first batch from the held-out split.
check_batch(valid_dataloader)

8
torch.Size([8, 58])
torch.Size([8])
torch.Size([8, 59])
torch.Size([8])


In [32]:
%%time 

# split=False: a single DataLoader over the whole dataset, in file order.
dataloader = dataset.to_dataloader(batch_size=8, n_workers=0, split=False)

CPU times: user 97 µs, sys: 10 µs, total: 107 µs
Wall time: 118 µs


In [33]:
# Sanity check: field sizes of the first batch of the unsplit loader.
check_batch(dataloader)

8
torch.Size([8, 35])
torch.Size([8])
torch.Size([8, 54])
torch.Size([8])
