In [1]:
import codecs
from tensorflow.keras.callbacks import EarlyStopping
import embedding_models
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import create_model

In [2]:
def _read_stripped_lines(path):
    """Read the UTF-8 text file at ``path`` and return its lines, each stripped of surrounding whitespace."""
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle.readlines()]


# Test-split files: English and Portuguese, one sentence per line.
test_eng = _read_stripped_lines("./data/test_eng")
test_pt = _read_stripped_lines("./data/test_pt")

# Train-split files, read the same way.
train_eng = _read_stripped_lines("./data/train_eng")
train_pt = _read_stripped_lines("./data/train_pt")

## Vectorization and Padding

In [4]:
# Length of every training sentence, measured in characters.
eng_lengths_char = list(map(len, train_eng))
pt_lengths_char = list(map(len, train_pt))

# 90th percentile of each length distribution (np.percentile yields a float).
percentile_90_eng_char = np.percentile(eng_lengths_char, 90)
percentile_90_pt_char = np.percentile(pt_lengths_char, 90)

print(f"90th percentile sentence length in characters for English: {percentile_90_eng_char}")
print(f"90th percentile sentence length in characters for Portuguese: {percentile_90_pt_char}")

# How many sentences fit within (<=) the percentile cut-off, per language.
num_eng_below_90th_char = sum(
    1 for n_chars in eng_lengths_char if n_chars <= percentile_90_eng_char
)
num_pt_below_90th_char = sum(
    1 for n_chars in pt_lengths_char if n_chars <= percentile_90_pt_char
)

# Corpus sizes, reported next to the counts above for comparison.
total_eng_sentences_char = len(eng_lengths_char)
total_pt_sentences_char = len(pt_lengths_char)

print(f"Number of English sentences at or below 90th percentile (character level): {num_eng_below_90th_char}")
print(f"Total number of English sentences: {total_eng_sentences_char}")

print(f"Number of Portuguese sentences at or below 90th percentile (character level): {num_pt_below_90th_char}")
print(f"Total number of Portuguese sentences: {total_pt_sentences_char}")

# The percentile (truncated to an int) becomes the fixed sequence length used
# when padding each language.
en_max_len_char = int(percentile_90_eng_char)
pt_max_len_char = int(percentile_90_pt_char)

print(f"Maximum sentence length for Portuguese (characters): {pt_max_len_char}")
print(f"Maximum sentence length for English (characters): {en_max_len_char}")


90th percentile sentence length in characters for English: 261.0
90th percentile sentence length in characters for Portuguese: 280.0
Number of English sentences at or below 90th percentile (character level): 72084
Total number of English sentences: 80000
Number of Portuguese sentences at or below 90th percentile (character level): 72034
Total number of Portuguese sentences: 80000
Maximum sentence length for Portuguese (characters): 280
Maximum sentence length for English (characters): 261


In [5]:

# Character-level tokenizers: one for the English input side, one for the
# Portuguese target side, so each language gets its own character vocabulary.
input_tokenizer = Tokenizer(char_level=True)
target_tokenizer = Tokenizer(char_level=True)

# Build each vocabulary from the training sentences only.
input_tokenizer.fit_on_texts(train_eng)
target_tokenizer.fit_on_texts(train_pt)

# Map every sentence to its list of integer character ids.
train_eng_seq = input_tokenizer.texts_to_sequences(train_eng)
train_pt_seq = target_tokenizer.texts_to_sequences(train_pt)

# Bring every sequence to the fixed per-language length.
# pad_sequences truncates from the FRONT by default (truncating='pre'), which
# would cut off the beginning of every sentence longer than the cap while the
# padding is appended at the end. truncating='post' is passed explicitly so
# over-long sentences keep their start and lose only their tail, consistent
# with padding='post'.
train_eng_seq_padded = pad_sequences(
    train_eng_seq,
    maxlen=en_max_len_char,
    padding='post',
    truncating='post',
)
train_pt_seq_padded = pad_sequences(
    train_pt_seq,
    maxlen=pt_max_len_char,
    padding='post',
    truncating='post',
)


In [6]:
# Two views of the same padded Portuguese matrix, offset by one position:
# the decoder input drops the last column and the target drops the first, so
# the target at step t is the character that follows the input at step t.
# NOTE(review): no start-of-sequence marker is added in the visible cells, so
# the first character of each sentence is never a prediction target - confirm
# this is intended.
train_pt_seq_input, train_pt_seq_target = (
    train_pt_seq_padded[:, :-1],
    train_pt_seq_padded[:, 1:],
)

## Load averaged GloVe embeddings as Char Embeddings

In [21]:
# Third argument handed to apply_embedding_matrix for both languages.
# Presumably the embedding vector size (the model cell also uses
# embedding_dim=100) - confirm against embedding_models.
CHAR_EMBEDDING_DIM = 100

# English side: load_char_embeddings() is called with its defaults.
char_embedding_index = embedding_models.load_char_embeddings()
char_embedding_eng = embedding_models.apply_embedding_matrix(
    char_embedding_index,
    input_tokenizer,
    CHAR_EMBEDDING_DIM,
)

# Portuguese side: char_embedding_index is rebound to the Portuguese index
# before the second matrix is built with the target tokenizer.
char_embedding_index = embedding_models.load_char_embeddings(language="portuguese")
char_embedding_pt = embedding_models.apply_embedding_matrix(
    char_embedding_index,
    target_tokenizer,
    CHAR_EMBEDDING_DIM,
)

## Train the Model

In [23]:
# Vocabulary sizes get one extra slot for index 0, which is not part of
# word_index.
encoder_vocab_size = len(input_tokenizer.word_index) + 1
decoder_vocab_size = len(target_tokenizer.word_index) + 1

# The decoder sequences are the padded Portuguese sequences shifted by one
# position, hence one element shorter than pt_max_len_char.
decoder_seq_len = pt_max_len_char - 1

model = create_model.create_seq2seq_charlevel_model(
    name="seq2seq_charlevel",
    source_vocab_size=encoder_vocab_size,
    target_vocab_size=decoder_vocab_size,
    embedding_matrix_encoder=char_embedding_eng,
    embedding_matrix_decoder=char_embedding_pt,
    max_len_encoder=en_max_len_char,
    max_len_decoder=decoder_seq_len,
    embedding_dim=100,
    latent_dim=256
)

# Integer targets (not one-hot) are used, hence the sparse loss.
model.compile(
    optimizer='adam',
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy'],
)

# Stop once validation accuracy has not improved for 3 epochs and roll back
# to the best weights seen.
callback = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)


In [24]:
# Two model inputs: the padded English sequences and the Portuguese sequences
# without their last position; the expected output is the Portuguese
# sequences without their first position.
fit_inputs = [train_eng_seq_padded, train_pt_seq_input]

# 20% of the samples are held out for validation; `callback` is the
# EarlyStopping instance created alongside the model.
history = model.fit(
    fit_inputs,
    train_pt_seq_target,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
    callbacks=[callback],
)

Epoch 1/5
[1m   3/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15:20[0m 924ms/step - accuracy: 0.3585 - loss: 3.2045

KeyboardInterrupt: 