In [1]:
TRAIN_TOKENIZERS = False

WORD_TOKENIZER_FILE_NAME = './wtoken.json'
BPE_TOKENIZER_FILE_NAME = './bpetoken.json'

BPE_VOCAB_SIZE = 10000
WORD_LEVEL_VOCAB_SIZE = 5000

UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
SOS_TOKEN = "[SOS]"
EOS_TOKEN = "[EOS]"
ALL_TOKENS = [UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN]

ALL_TRAINING_DATA = [
    './cultural.txt',
    './economics.txt',
    './politics.txt',
    './sports.txt'
]

LM_TRAINING_DATA = ['./t.txt'] #ALL_TRAINING_DATA[:1]

# <div class="green">Tokenization</div>

In [2]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel, BPE
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.processors import TemplateProcessing

## <span class="blue">Word Tokenizer</span>

In [3]:
if TRAIN_TOKENIZERS:
    word_tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    word_tokenizer.pre_tokenizer = Whitespace()
    word_trainer = WordLevelTrainer(vocab_size=WORD_LEVEL_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    word_tokenizer.train(ALL_TRAINING_DATA, word_trainer)
    word_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    word_tokenizer.save(WORD_TOKENIZER_FILE_NAME)
else:
    word_tokenizer = Tokenizer.from_file(WORD_TOKENIZER_FILE_NAME)

In [4]:
word_tokenizer.id_to_token

<function Tokenizer.id_to_token(self, id)>

## <span class="blue">BPE Tokenizer</span>

In [5]:
if TRAIN_TOKENIZERS:
    bpe_tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_trainer = BpeTrainer(vocab_size=BPE_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    bpe_tokenizer.train(ALL_TRAINING_DATA, bpe_trainer)
    bpe_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    bpe_tokenizer.save(BPE_TOKENIZER_FILE_NAME)
else:
    bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_FILE_NAME)

## <span class="blue">Post Processing</span>

In [6]:
def add_post_processor_to(tokenizer: Tokenizer):
    tokenizer.post_processor = TemplateProcessing(
        single=f"{SOS_TOKEN} $0 {EOS_TOKEN}",
        special_tokens=[
            (X, tokenizer.token_to_id(X)) for X in [SOS_TOKEN, EOS_TOKEN]
        ]
    )
add_post_processor_to(word_tokenizer)
add_post_processor_to(bpe_tokenizer)

In [7]:
import copy
import tqdm
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizerFast

class TokenizedTextDataset(Dataset):
    def __init__(self, lines, bsz:int, tokenizer, tokenizer_args):                
        lines_tokenized = [tokenizer(line, **tokenizer_args)['input_ids'][0] for line in lines]
        self.__lines = []
        for i in tqdm.tqdm(range(0, len(lines_tokenized), bsz)):
            tensor = torch.stack(lines_tokenized[i: i+bsz]).T.to('cuda')
            self.__lines.append(tensor)
        
    def __len__(self):
        return len(self.__lines)
    
    def __getitem__(self, idx):
        tensor = self.__lines[idx]
        return tensor[:-1], tensor[1:].reshape(-1)


class TextDataset(Dataset):
    def __init__(self, corpus_files):
        dataset_lines = []

        for file_name in LM_TRAINING_DATA:
            with open(file_name, 'r') as f:
                dataset_lines += f.readlines()
        dataset_lines = [line.strip() for line in dataset_lines]
                
        self.__lines = dataset_lines
        
    def __len__(self):
        return len(self.__lines)
    
    def __getitem__(self, idx):
        return self.__lines[idx]
    
    def get_tokenized(self, bsz, tokenizer, **tokenizer_args):
        fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=copy.deepcopy(tokenizer))
        fast_tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
        return TokenizedTextDataset(self.__lines, bsz, fast_tokenizer, tokenizer_args)

In [8]:
dataset = TextDataset(LM_TRAINING_DATA)

In [9]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTMModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, bsz):
        super().__init__()
        dropout=0.5
        nlayers = 1
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers
        self.bsz = bsz

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.weight)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.lstm(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, self.bsz, self.nhid), weight.new_zeros(self.nlayers, self.bsz, self.nhid))

In [10]:
EPOCHS = 300
MAX_LENGTH = 128
BPTT = 32
CLIP = 0.25
HIDDEN_SIZE = 100
EMBEDING_SIZE = 100
BATCH_SIZE = 20
INITIAL_LR = 20

criterion = nn.NLLLoss()

In [11]:
import time

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach().to('cuda')
    else:
        return tuple(repackage_hidden(v) for v in h)

def train_epoch(epoch, tokenized_dataset:TokenizedTextDataset, model, criterion, batch_size, log_interval, lr):
    # Turn on training mode which enables dropout.
    total_loss = 0.
    start_time = time.time()
    hidden = model.init_hidden()
    for batch_idx in range(0, len(tokenized_dataset)):
        data, targets = tokenized_dataset[batch_idx]
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch_idx % log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch_idx, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


In [12]:
# Loop over epochs.
def train_all(dataset: TextDataset, tokenizer, save_interval, save_patch):
    dataset_tokenized = dataset.get_tokenized(BATCH_SIZE, tokenizer, truncation='longest_first', return_tensors="pt", max_length=MAX_LENGTH, padding='max_length')
    best_val_loss = None
    
    n_tokens = tokenizer.get_vocab_size()
    model = LSTMModel(n_tokens, EMBEDING_SIZE, HIDDEN_SIZE, BATCH_SIZE)
    model.to('cuda')
    model.train()
    
    for epoch in range(1, EPOCHS+1):
        epoch_start_time = time.time()
        train_epoch(epoch, dataset_tokenized, model, criterion, BATCH_SIZE, 1000, INITIAL_LR)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s |'.format(epoch, (time.time() - epoch_start_time)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if epoch % save_interval == 0:
            with open(save_patch, 'wb') as f:
                torch.save(model, f)
    with open(save_patch, 'wb') as f:
        torch.save(model, f)

In [13]:
train_all(dataset, word_tokenizer, 5, './lstm_word')

100%|██████████| 2000/2000 [00:02<00:00, 912.67it/s]


| epoch   1 |  1000 batches | lr 20.00 | ms/batch 10.01 | loss  2.86 | ppl    17.39
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 19.98s |
-----------------------------------------------------------------------------------------
| epoch   2 |  1000 batches | lr 20.00 | ms/batch  9.98 | loss  2.56 | ppl    12.93


KeyboardInterrupt: 

In [14]:
train_all(dataset, bpe_tokenizer, 5, './lstm_bpe')

100%|██████████| 2000/2000 [00:00<00:00, 22863.85it/s]


| epoch   1 |  1000 batches | lr 20.00 | ms/batch 15.33 | loss  3.77 | ppl    43.18
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 30.61s |
-----------------------------------------------------------------------------------------
| epoch   2 |  1000 batches | lr 20.00 | ms/batch 15.30 | loss  3.41 | ppl    30.19
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 30.57s |
-----------------------------------------------------------------------------------------


KeyboardInterrupt: 