In [3]:
import os 
import torch 
from d2l import torch as d2l

## Preprocessing

In [6]:
with open("../data/fra-eng/fra.txt") as f:
    raw_text = f.read()

In [11]:
def preprocess(text):
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    no_space = lambda char, prev_char: char in ',.!?' and prev_char != ' '
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text.lower())]
    return ''.join(out)

In [18]:
text = preprocess(raw_text)
text[:100]

'go .\tva !\nhi .\tsalut !\nrun !\tcours !\nrun !\tcourez !\nwho ?\tqui ?\nwow !\tça alors !\nfire !\tau feu !\nhel'

In [19]:
def tokenize(text):
    """
    Turn the raw text into a tuple[list[list[str]]], where the outer
    tuple
    """
    eng, fra = [], []
    for line in text.split("\n"):
        parts = line.split("\t")
        if len(parts) == 2:
            p1, p2 = parts[0], parts[1]
            eng.append(p1.split(" ") + ["<eos>"])
            fra.append(p2.split(" ") + ["<eos>"])        
    return (eng, fra)

In [21]:
tokenized = tokenize(text)
tokenized[:10]

([['go', '.', '<eos>'],
  ['hi', '.', '<eos>'],
  ['run', '!', '<eos>'],
  ['run', '!', '<eos>'],
  ['who', '?', '<eos>'],
  ['wow', '!', '<eos>'],
  ['fire', '!', '<eos>'],
  ['help', '!', '<eos>'],
  ['jump', '.', '<eos>'],
  ['stop', '!', '<eos>'],
  ['stop', '!', '<eos>'],
  ['stop', '!', '<eos>'],
  ['wait', '!', '<eos>'],
  ['wait', '!', '<eos>'],
  ['go', 'on', '.', '<eos>'],
  ['go', 'on', '.', '<eos>'],
  ['go', 'on', '.', '<eos>'],
  ['hello', '!', '<eos>'],
  ['hello', '!', '<eos>'],
  ['i', 'see', '.', '<eos>'],
  ['i', 'try', '.', '<eos>'],
  ['i', 'won', '!', '<eos>'],
  ['i', 'won', '!', '<eos>'],
  ['i', 'won', '.', '<eos>'],
  ['oh', 'no', '!', '<eos>'],
  ['attack', '!', '<eos>'],
  ['attack', '!', '<eos>'],
  ['cheers', '!', '<eos>'],
  ['cheers', '!', '<eos>'],
  ['cheers', '!', '<eos>'],
  ['cheers', '!', '<eos>'],
  ['get', 'up', '.', '<eos>'],
  ['go', 'now', '.', '<eos>'],
  ['go', 'now', '.', '<eos>'],
  ['go', 'now', '.', '<eos>'],
  ['got', 'it', '!', '<eos>'