In [161]:
import re
import pandas as pd
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import numpy as np
from functools import reduce

# Harry text

In [120]:
with open("data/Harry.txt", 'r') as f:
    text = f.read().lower()

In [122]:
text = re.sub(r'\n+', '\n', text)
text = re.sub(r'\npage.*\n', '\n', text)
text = re.sub(r'\n+', ' ', text)
text = re.sub(r'mr\.', 'mr ', text)
text = re.sub(r'mrs\.', 'mrs ', text)
text = re.sub(r',', '', text)
text = re.sub(r',', '', text)
text = re.sub(r'\.\.\.', ' . ', text)
text = re.sub(r'\.', ' .', text)
text = re.sub(r'\?', ' . ', text)
text = re.sub(r'\!', ' . ', text)
text = re.sub(r'\\', ' ', text)
text = re.sub(r'-', ' ', text)
text = re.sub(r'—', ' ', text)
text = re.sub(r'“', ' ', text)
text = re.sub(r'”', ' ', text)
text = re.sub(r'"', ' ', text)
text = re.sub(r"'", ' ', text)
text = re.sub(r'\(', ' ', text)
text = re.sub(r'\)', ' ', text)
text = re.sub(r':', ' ', text)
text = re.sub(r';', ' ', text)
text = re.sub(r'¦', ' ', text)
text = re.sub(r'‘', ' ', text)
text = re.sub(r'’', ' ', text)
text = re.sub(r'•', ' ', text)
text = re.sub(r'/', ' ', text)
text = re.sub(r'\s+', ' ', text)
text = text.strip()

# Example texts

In [774]:
def read_raw_text(filename):
    with open("data/" + filename, 'r') as f:
        txt = f.read().lower().split('\n')
    return ''.join(txt)

In [775]:
train_text = read_raw_text('train_example.en')
val_text = read_raw_text('val_example.en')
test_text = read_raw_text('test_example.en')

In [776]:
index_to_symbol = list(set(train_text))
symbol_to_index = {letter: index for index, letter in enumerate(index_to_symbol)}

In [777]:
def data_process(raw_text_iter):
    data = [torch.tensor([symbol_to_index[symb] for symb in raw_text_iter],
                       dtype=torch.long)]
    return torch.cat(tuple(filter(lambda t: t.numel() > 0, data)))

In [779]:
train_data = data_process(train_text)
val_data = data_process(val_text)
test_data = data_process(test_text)

In [782]:
def batchify(data, bsz=20):
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

In [783]:
train_data = batchify(train_data)
val_data = batchify(val_data)
test_data = batchify(test_data)

In [321]:
symbols = pd.DataFrame({'symbols' : list(set(text))})

In [784]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [785]:
class HarryTransformer(nn.Module):

    def __init__(self, n_token, n_inp, n_head, n_hid, n_layers, dropout=0.5):
        super(HarryTransformer, self).__init__()
        self.model_type = 'HarryTransformer'
        self.pos_encoder = PositionalEncoding(n_inp, dropout)
        encoder_layers = TransformerEncoderLayer(n_inp, n_head, n_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, n_layers)
        self.encoder = nn.Embedding(n_token, n_inp)
        self.n_inp = n_inp
        self.decoder = nn.Linear(n_inp, n_token)

        self.init_weights()

    def generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.n_inp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        output = self.decoder(output)
        return output

In [470]:
get_symbols = lambda data: reduce(lambda x, y: x + list(y), data , [])

In [471]:
symbolic_sentences = list(map(get_symbols, sentences))

In [472]:
padded_sentences = list(map(lambda x: x[:100], symbolic_sentences))

In [483]:
symbols_map = dict(zip(list(set(text)), range(len(set(text)))))

In [877]:
ntokens = len(symbol_to_index) # the size of vocabulary
emsize = 300 # embedding dimension
nhid = 500 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4 # the number of heads in the multiheadattention models
dropout = 0.4 # the dropout value
model = HarryTransformer(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [878]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

In [580]:
num_sentence = lambda s: list(map(lambda x: symbols_map[x], s))

In [584]:
numeric_sentences = list(map(num_sentence, padded_sentences))

In [588]:
from random import sample

indices = sample(range(len(numeric_sentences)), int(0.3 * len(numeric_sentences)))

val_indices = indices[0: int(len(indices) / 3)]
test_indices = indices[int(len(indices) / 3): ]
train_indices = list(filter(lambda x: x not in indices, list(range(len(numeric_sentences)))))

In [697]:
train_data = [sentence for i, sentence in enumerate(numeric_sentences) if i in train_indices]
val_data = [sentence for i, sentence in enumerate(numeric_sentences) if i in val_indices]
test_data = [sentence for i, sentence in enumerate(numeric_sentences) if i in test_indices]

In [596]:
concat = lambda l: reduce(lambda x, y: x + y, l, [])

In [602]:
train_concat = concat(train_data)
val_concat = concat(val_data)
test_concat = concat(test_data)

In [745]:
train_data = torch.tensor(train_concat[:len(train_concat) - (len(train_concat) % 20)]).view(20, -1).transpose(0, 1).to(device)
val_data = torch.tensor(val_concat[:len(val_concat) - (len(val_concat) % 20)]).view(20, -1).transpose(0, 1).to(device)
test_data = torch.tensor(test_concat[:len(test_concat) - (len(test_concat) % 20)]).view(20, -1).transpose(0, 1).to(device)

In [879]:
import time
from tqdm import tqdm

criterion = nn.CrossEntropyLoss()
lr = 0.03 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in tqdm(enumerate(range(0, train_data.size(0) - 1, bptt))):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        if data.size(0) != bptt:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, -1))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            if data.size(0) != bptt:
                src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [880]:
best_val_loss = float("inf")
epochs = 5 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, -1))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

202it [00:26,  7.50it/s]

| epoch   1 |   200/ 2585 batches | lr 0.03 | ms/batch 132.95 | loss  3.02 | ppl    -1.00


402it [00:53,  7.87it/s]

| epoch   1 |   400/ 2585 batches | lr 0.03 | ms/batch 134.11 | loss  2.56 | ppl    -1.00


602it [01:20,  7.53it/s]

| epoch   1 |   600/ 2585 batches | lr 0.03 | ms/batch 134.03 | loss  2.48 | ppl    -1.00


802it [01:47,  7.42it/s]

| epoch   1 |   800/ 2585 batches | lr 0.03 | ms/batch 136.03 | loss  2.43 | ppl    -1.00


1002it [02:14,  7.70it/s]

| epoch   1 |  1000/ 2585 batches | lr 0.03 | ms/batch 133.06 | loss  2.42 | ppl    -1.00


1202it [02:40,  7.95it/s]

| epoch   1 |  1200/ 2585 batches | lr 0.03 | ms/batch 132.56 | loss  2.39 | ppl    -1.00


1402it [03:07,  7.91it/s]

| epoch   1 |  1400/ 2585 batches | lr 0.03 | ms/batch 131.78 | loss  2.38 | ppl    -1.00


1602it [03:33,  7.57it/s]

| epoch   1 |  1600/ 2585 batches | lr 0.03 | ms/batch 130.99 | loss  2.37 | ppl    -1.00


1759it [03:54,  7.49it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\olegggatttor\appdata\local\programs\python\python38\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-880-02e15041ff84>", line 7, in <module>
    train()
  File "<ipython-input-879-ee1124f1d257>", line 19, in train
    output = model(data, src_mask)
  File "c:\users\olegggatttor\appdata\local\programs\python\python38\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-785-e769432dd371>", line 29, in forward
    output = self.transformer_encoder(src, src_mask)
  File "c:\users\olegggatttor\appdata\local\programs\python\python38\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
    result = self.forward(*input, **kwargs)
  File "c:\users\olegggatttor\appdata\local\programs\python\python38\lib\site-packages\torch\nn\modules\transfo

TypeError: object of type 'NoneType' has no len()

In [856]:
test_loss = evaluate(best_model, test_data)

In [857]:
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss,-1))
print('=' * 89)

| End of training | test loss  2.63 | test ppl    -1.00


In [875]:
def gen(pref, n=100):
    res = pref
    for i in range(n):
        sentence = list(res)
        num_test = torch.tensor(list(map(lambda x: num_sentence(x), sentence)))
        src_mask = best_model.generate_square_subsequent_mask(num_test.size(0)).to(device)
        pred = best_model(num_test, src_mask)[len(res) - 1, :].argmax().item()
        res += index_to_symbol[pred]
    return res

In [876]:
gen('and')

'and  e e  oe oe  oe oe oe oe   oe oe  oe oe oe oe oe oe oe a oe oe a           a oa oe oa oa o     t a '

In [736]:
sentence = [['h'], ['a'], ['r'], ['r']]
num_test = torch.tensor(list(map(num_sentence, sentence)))
src_mask = best_model.generate_square_subsequent_mask(num_test.size(0)).to(device)
pred = best_model(num_test, src_mask).view(-1)

In [738]:
num_test

tensor([[16],
        [21],
        [13],
        [13]])

In [739]:
data, targets = get_batch(num_test, 0)

In [740]:
data

tensor([[16],
        [21],
        [13]])

In [741]:
targets

tensor([21, 13, 13])