In [1]:
import re
import itertools
from collections import Counter
from itertools import chain

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import Layout


import unidecode
from tqdm.notebook import tqdm, trange

In [None]:
# inspired by https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [2]:
def show_tensor(X, title=''):
    plt.imshow(X.detach().numpy())
    plt.title(f'{title} {X.shape}')
    plt.show()

In [6]:
SENT_LENGTH = 7

def tokenize(s):
    words = re.split(r'\W+', s.strip())
    while not words[-1]:
        words.pop()
    return words


df = pd.read_csv('data/fra.txt', sep='\t', names=['eng', 'fra', 'license'])
df.drop(columns=['license'], inplace=True)

exclude_chars = r'[^a-z ]'
df['fra_ascii'] = df['fra'].map(unidecode.unidecode).map(str.lower)
df['fra_ascii'] = df['fra_ascii'].map(lambda s: re.sub(exclude_chars, '', s))
df['eng_ascii'] = df['eng'].map(unidecode.unidecode).map(str.lower)
df['eng_ascii'] = df['eng_ascii'].map(lambda s: re.sub(exclude_chars, '', s))
df['eng_list'] = df['eng_ascii'].map(tokenize)
df['fra_list'] = df['fra_ascii'].map(tokenize)

print('Whole:', len(df))

df = df[(df['fra_list'].map(len) <= SENT_LENGTH) & (df['eng_list'].map(len) <= SENT_LENGTH)]
print(f'Only short (<={SENT_LENGTH}):', len(df))

# Most common words filter
#most_common_eng = [r[0] for r in eng_words.most_common(10)]
#df = df[df['eng_ascii'].str.contains('|'.join(most_common_eng))]

SENT_LENGTH += 1  # add room for SOS and EOS tokens

Whole: 232736
Only short (<=7): 155267


In [7]:
EOS_token = 0
SOS_token = 1

special_tokens = ['<EOS>', '<SOS>']

fra_words = Counter()
for row in df['fra_list']:
    fra_words.update(row)
    
eng_words = Counter()
for row in df['eng_list']:
    eng_words.update(row)

fra_dict = {w: i for i, w in enumerate(chain.from_iterable([special_tokens, fra_words.keys()]))}
eng_dict = {w: i for i, w in enumerate(chain.from_iterable([special_tokens, eng_words.keys()]))}
fra_dict_rev = {v:k for k, v in fra_dict.items()}
eng_dict_rev = {v:k for k, v in eng_dict.items()}

for _ in range(len(df)):
    fra_words.update(special_tokens)
    eng_words.update(special_tokens)

fra_weights = [1/c for c in fra_words.values()]
eng_weights = [1/c for c in eng_words.values()]

print(f"{len(fra_dict)=}")
print(f"{len(eng_dict)=}")

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, train=True, batch_size=1, split=0.8):

        df = df.sample(frac=1, ignore_index=True, random_state=0)
        
        row_split = int(df.shape[0] * 0.8)
        print(f'Train split: {row_split}/{df.shape[0]}')
        if train:
            self.df = df[:row_split]
        else:
            self.df = df[row_split:]
        self.batch_size = batch_size

    def __len__(self):
        return len(self.df) // self.batch_size

    def __getitem__(self, idx):
        batch = self.df.iloc[idx * self.batch_size: (idx + 1) * self.batch_size]
        return batch['fra_list'], batch['eng_list']

len(fra_dict)=24420
len(eng_dict)=12539


In [None]:
BATCH_SIZE = 32
EMBED_SIZE = 128
INPUT_CLASES = len(fra_dict)
OUTPUT_CLASES = len(eng_dict)

print(f"{SENT_LENGTH=}")
print(f"{BATCH_SIZE=}")
print(f"{EMBED_SIZE=}")
print(f"{INPUT_CLASES=}")
print(f"{OUTPUT_CLASES=}")
print()

train_ds = Dataset(df, train=True, batch_size=BATCH_SIZE)

t = None

def assert_shape(t, dims):
    assert len(t.shape) == len(dims), f'Expected shape {dims}, got {t.shape}'
    for td, dd in zip(list(t.shape), dims):
        assert td == dd, f"Got shape: {list(t.shape)} , expected: {dims}"


class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()

        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        assert_shape(input, [BATCH_SIZE, SENT_LENGTH])

        input = self.embedding(input)
        embedded = self.dropout(input)
        output, hidden = self.gru(embedded)

        assert_shape(output, [BATCH_SIZE, SENT_LENGTH, EMBED_SIZE])
        assert_shape(hidden, [1, BATCH_SIZE, EMBED_SIZE])

        return output, hidden

class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super().__init__()

        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_output, encoder_hidden, target):

        assert_shape(encoder_output, [BATCH_SIZE, SENT_LENGTH, EMBED_SIZE])
        assert_shape(encoder_hidden, [1, BATCH_SIZE, EMBED_SIZE])

        if target is not None:
            assert_shape(target, [BATCH_SIZE, SENT_LENGTH])

        batch_size = encoder_output.shape[0]

        decoder_input = torch.empty(batch_size, 1, dtype=torch.long).fill_(SOS_token)
        decoder_hidden = encoder_hidden

        decoder_outputs = []

        for i in range(SENT_LENGTH):
            #print(i)
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target is not None:
                # train
                decoder_input = target[:, i].unsqueeze(1)
            else:
                # eval
                pred = decoder_output.argmax(dim=2)
                decoder_input = pred.detach()

        decoder_outputs = torch.cat(decoder_outputs, dim=1)

        # Required DOWNSTREAM
        assert_shape(decoder_outputs, [BATCH_SIZE, SENT_LENGTH, OUTPUT_CLASES])
        assert_shape(decoder_hidden, [1, BATCH_SIZE, EMBED_SIZE])
        
        #decoder_output = F.log_softmax(decoder_output, dim=-1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        assert_shape(input, [BATCH_SIZE, 1])
        assert_shape(hidden, [1, BATCH_SIZE, EMBED_SIZE])

        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden


class Translator(nn.Module):
    def __init__(self, input_size, output_size, hidden_size):
        super().__init__()
        self.encoder = Encoder(
            input_size=input_size,
            hidden_size=hidden_size,
        )
        self.decoder = Decoder(
            output_size=output_size,
            hidden_size=hidden_size,
        )

    def forward(self, input, target=None):        
        #print(f"{input.shape=}")
        output, hidden = self.encoder(input)
        output, hidden, target = self.decoder(output, hidden, target)
        return output, hidden

if False:
    fra_eng_translator = Translator(
        input_size=INPUT_CLASES,
        output_size=OUTPUT_CLASES,
        hidden_size=EMBED_SIZE
    )


    lr = 0.01
    optimizer = torch.optim.Adam(fra_eng_translator.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=1, mode='min')
    criterion = nn.CrossEntropyLoss(weight=torch.tensor(eng_weights), reduction='mean')

def stc_to_tensor(stcs: list, word_dict):
    t = torch.zeros([len(stcs), SENT_LENGTH], dtype=torch.int64)
    for i, s in enumerate(stcs):
        t[i, :len(s) + 1] = torch.tensor([word_dict[w] for w in s] + [EOS_token], dtype=torch.int64)
    return t

def eval(s):
    fra_eng_translator.eval()
    
    eval_input = stc_to_tensor(s, fra_dict)
    output, hidden = fra_eng_translator.forward(eval_input)
    eval_out = output.argmax(dim=2)
    #wt.value = f"{eval_input}"
    wt.value = f"{output[0].T}"

    outs = []
    
    for r in eval_out:
        words = []
        for c in r:
            if c == EOS_token:
                break
            words.append(eng_dict_rev[c.item()])

        outs.append(words)
    return outs

losses = []

w = widgets.Textarea(
    value='',
    description='Preview:',
    disabled=False,
    layout=Layout(width='1000px', height='400px')
)

wt = widgets.Textarea(
    value='',
    description='Eval:',
    disabled=False,
    layout=Layout(width='1000px', height='250px')
)


display(w)
display(wt)

def train():

    fra_eng_translator.train()
    
    epoch_losses = []
    pbar = tqdm(train_ds, total=len(train_ds), disable=False, leave=False)

    optimizer.zero_grad()

    for X, y in pbar:
        
        # reject because shape asserts
        if X.shape[0] != BATCH_SIZE:
            break

        input = stc_to_tensor(X, fra_dict)
        target = stc_to_tensor(y, eng_dict)

        output, hidden = fra_eng_translator.forward(input, target)

        # Required DOWNSTREAM
        # https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy
        output = output.movedim(1, 2)
        assert_shape(output, [BATCH_SIZE, OUTPUT_CLASES, SENT_LENGTH])
        assert_shape(target, [BATCH_SIZE, SENT_LENGTH])

        loss = criterion(output, target)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        evals = []
        for i in range(1):
            input, target = train_ds[i]
            output = eval(input)
            for i, t, o in zip(input.tolist(), target.tolist(), output):
                evals.append(f"{' '.join(i)} --- {' '.join(t)} >>> {' '.join(o)}")

        w.value = '\n'.join(evals)

        epoch_losses.append(loss.detach())

        sliding_mean_loss = np.mean(epoch_losses[-1000:])
        losses.append(sliding_mean_loss)
        

        epoch_losses.append(loss.detach())
        pbar.set_description(f"lr: {scheduler.get_last_lr()} Loss: {sliding_mean_loss:.10f}")

    scheduler.step(sliding_mean_loss)


for e in tqdm(range(100), desc="Epoch"):
    train()

SENT_LENGTH=8
BATCH_SIZE=32
EMBED_SIZE=128
INPUT_CLASES=24420
OUTPUT_CLASES=12539

Train split: 124213/155267


Textarea(value='', description='Preview:', layout=Layout(height='400px', width='1000px'))

Textarea(value='', description='Eval:', layout=Layout(height='250px', width='1000px'))

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/3881 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

