In [1]:
import numpy as np
import json

### *Helper Functions*

In [2]:
def json_save(filename, data):
    """Write ``data`` to ``filename`` as JSON with a 4-space indent.

    Missing parent directories of ``filename`` are created first, so saving
    into a folder that does not exist yet works instead of raising
    ``FileNotFoundError``.

    Args:
        filename: Destination path of the JSON file. An existing file is
            overwritten.
        data: Object to serialize; it must be serializable by ``json.dump``.

    Raises:
        TypeError: If ``data`` contains a value ``json.dump`` cannot encode.
        OSError: If the directory or the file cannot be created.
    """
    import os  # local import: the notebook's import cell does not provide os

    # Only path-like targets have a parent directory to create; anything
    # else (e.g. an already-open file descriptor) goes straight to open().
    if isinstance(filename, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

### *Data Preprocessing*

In [3]:
# Relative path of the text corpus to load.
datapath = 'fra-eng/fra.txt'

# Read the whole file as UTF-8 text and cut it into one string per line.
# Splitting on "\n" (rather than iterating the file) keeps a trailing empty
# element when the file ends with a newline.
with open(datapath, mode="r", encoding="utf-8") as f:
    corpus_text = f.read()
lines = corpus_text.split("\n")

In [4]:
# Parallel lists: entry i of 'input_text' pairs with entry i of 'target_text'.
data = dict(input_text=[], target_text=[])

In [5]:
# Distinct characters seen on each side of the corpus; both start out empty.
input_characters, target_characters = set(), set()

In [6]:
# Maximum number of sentence pairs taken from the top of the corpus.
num_samples = 5000

# Rebuild both lists from scratch so that re-running this cell does not
# append the same samples a second time.
data['input_text'] = []
data['target_text'] = []

# `len(lines) - 1` leaves out the last element of `lines`
# (presumably the empty string left by a trailing newline -- TODO confirm).
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split("\t")
    if len(fields) < 2:
        # Blank or malformed line: there is no input/target pair to keep.
        continue
    # Only the first two columns are used; any further columns are ignored,
    # so lines with two, three or more tab-separated fields all work.
    input_text, target_text = fields[:2]
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = "\t" + target_text + "\n"
    data['input_text'].append(input_text)
    data['target_text'].append(target_text)

In [7]:
# Persist the collected input/target sentence lists as JSON.
json_save(filename='data/data.json', data=data)

In [8]:
# Gather every distinct character of each sample into the two character sets.
# A set ignores members it already holds, so each string can be fed to
# update() directly.
for sample_index, source_sentence in enumerate(data['input_text']):
    input_characters.update(source_sentence)
    target_characters.update(data['target_text'][sample_index])

### *Insights about the data*

In [9]:
# Turn each character collection into a sorted list so every character has a
# fixed position.
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# Vocabulary size on each side.
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# Length, in characters, of the longest text on each side.
max_encoder_seq_length = max(map(len, data['input_text']))
max_decoder_seq_length = max(map(len, data['target_text']))

# Report the figures, one per line.
for label, value in (
    ("Number of samples:", len(data['input_text'])),
    ("Number of unique input tokens:", num_encoder_tokens),
    ("Number of unique output tokens:", num_decoder_tokens),
    ("Max sequence length for inputs:", max_encoder_seq_length),
    ("Max sequence length for outputs:", max_decoder_seq_length),
):
    print(label, value)

Number of samples: 5000
Number of unique input tokens: 68
Number of unique output tokens: 89
Max sequence length for inputs: 13
Max sequence length for outputs: 59


In [10]:
# Map each character to its position in the corresponding character list.
input_encoder = {
    symbol: position for position, symbol in enumerate(input_characters)
}
target_encoder = {
    symbol: position for position, symbol in enumerate(target_characters)
}

In [11]:
# Write both character-to-index tables to disk, input side first.
for encoder_path, encoder_table in (
    ('data/input-encoder.json', input_encoder),
    ('data/target-encoder.json', target_encoder),
):
    json_save(encoder_path, encoder_table)