# Using Neural Machine Translation to Translate Between German & English

In [None]:
import numpy as np
import tensorflow as tf

In [None]:
# Reading the whole raw sentence-pair file into one string.
# The encoding is stated explicitly: the German text contains non-ASCII
# characters (e.g. "ü", "ß") and the platform default is not always UTF-8.
with open("/content/drive/MyDrive/6 Spring 2024/CSC402/Chapter16/Neural Machine Translation/deu.txt", encoding="utf-8") as f:
    dict_text = f.read()

In [None]:
# Peek at the raw text: the sample below shows an English phrase, a German
# phrase and an attribution field, separated by tabs.
dict_text[:80] # Showing the first 80 characters

'Go.\tGeh.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (R'

In [None]:
# Code implemented from LofiAI to remove unnecessary text from file
def remove_text(input_file, output_file, start_char, end_char):
    """Copy ``input_file`` to ``output_file``, cutting one marked span per line.

    On each line, the text from the first occurrence of ``start_char`` up to
    and including the next occurrence of ``end_char`` is removed. Lines that
    do not contain both markers are copied unchanged. Every line that had a
    span removed is written with exactly one trailing newline.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (overwritten if it
            already exists; may be the same path as ``input_file``).
        start_char: Marker string at which the removed span begins.
        end_char: Marker string at which the removed span ends. May be a
            newline to cut everything through the end of the line.
    """
    # The whole input is read before the output is opened so that cleaning a
    # file in place (same input and output path) keeps working.
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(output_file, 'w', encoding='utf-8') as file:
        for line in lines:
            start_index = line.find(start_char)
            end_index = line.find(end_char, start_index + 1)
            if start_index == -1 or end_index == -1:
                file.write(line)
                continue

            # Skip the full end marker, not just its first character.
            updated_line = line[:start_index] + line[end_index + len(end_char):]
            # Restore the newline only when the cut swallowed it; otherwise
            # the line would be followed by a spurious blank line.
            if not updated_line.endswith('\n'):
                updated_line += '\n'
            file.write(updated_line)

In [None]:
# Code taken from LofiAI: strip the trailing attribution field from every line.
# Everything from the "\tCC-BY" marker through the end of the line is removed.
start_char = '\tCC-BY'
end_char = '\n'
input_file = '/content/drive/MyDrive/6 Spring 2024/CSC402/Chapter16/Neural Machine Translation/deu.txt'
output_file = '/content/drive/MyDrive/6 Spring 2024/CSC402/Chapter16/Neural Machine Translation/clean_du.txt'
remove_text(
    input_file=input_file,
    output_file=output_file,
    start_char=start_char,
    end_char=end_char,
)

In [None]:
# Reading the cleaned sentence-pair file into one string.
# The encoding is stated explicitly: the German text contains non-ASCII
# characters (e.g. "ü", "ß") and the platform default is not always UTF-8.
with open('/content/drive/MyDrive/6 Spring 2024/CSC402/Chapter16/Neural Machine Translation/clean_du.txt', encoding='utf-8') as f:
    du = f.read()

In [None]:
# Peek at the cleaned text: the sample below shows one tab-separated
# English/German pair per line, with no attribution field.
du[:80] # Showing the first 80 characters

'Go.\tGeh.\nHi.\tHallo!\nHi.\tGrüß Gott!\nRun!\tLauf!\nRun.\tLauf!\nWow!\tPotzdonner!\nWow!\tD'

In [None]:
# Splitting every line on the tab into an [English, German] pair.
# Lines without a tab (e.g. a blank line) are skipped and any field after the
# second is dropped, so each entry has exactly two items; otherwise the
# zip/unpack below would either fail or silently truncate.
pairs = [line.split("\t")[:2] for line in du.splitlines() if "\t" in line]
np.random.seed(42)  # extra code – ensures reproducibility on CPU

# Shuffling the pairs (in place) to help training
np.random.shuffle(pairs)

# Transposing the list of pairs into two aligned tuples: index i of
# sentences_en is the English phrase for index i of sentences_du
sentences_en, sentences_du = zip(*pairs)  # separates the pairs into 2 lists

In [None]:
# Showing the first three shuffled pairs as "English => German"
for idx in (0, 1, 2):
    english, german = sentences_en[idx], sentences_du[idx]
    print(english, "=>", german)

He was lying on the couch. => Er lag auf dem Sofa.
All generalizations are false, including this one. => Alle Verallgemeinerungen sind falsch, einschließlich dieser hier.
She dyed her white skirt red. => Sie färbte ihren weißen Rock rot.


In [None]:
# Vectorization settings shared by both languages:
# vocab_size caps the number of tokens each layer keeps,
# max_length is the fixed length of every output id sequence.
vocab_size = 1000
max_length = 50

# One TextVectorization layer per language (English source, German target)
text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_du = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

# adapt() builds each layer's vocabulary from the sentences it is given.
# The English layer sees the raw English sentences.
text_vec_layer_en.adapt(sentences_en)
# The German layer sees each sentence wrapped in start/end markers, so that
# "startofseq" and "endofseq" become part of the German vocabulary.
text_vec_layer_du.adapt([f"startofseq {s} endofseq" for s in sentences_du])

In [None]:
# In the output below, index 0 is the empty string and index 1 is '[UNK]';
# the actual English words start at index 2.
text_vec_layer_en.get_vocabulary()[:10] # Printing the first ten english vocab words

['', '[UNK]', 'tom', 'to', 'you', 'the', 'i', 'a', 'is', 'that']

In [None]:
# In the output below, the 'startofseq' and 'endofseq' markers added before
# adapt() appear right after '' and '[UNK]', ahead of the German words.
text_vec_layer_du.get_vocabulary()[:10] # Printing first ten german vocab words

['',
 '[UNK]',
 'startofseq',
 'endofseq',
 'ich',
 'tom',
 'nicht',
 'ist',
 'das',
 'sie']

In [None]:
# Splitting the shuffled sentences into training and validation sets:
# the first train_size pairs are for training, the rest for validation.
train_size = 100_000
train_en, valid_en = sentences_en[:train_size], sentences_en[train_size:]
train_du, valid_du = sentences_du[:train_size], sentences_du[train_size:]

# Encoder inputs: raw English strings
X_train = tf.constant(train_en)
X_valid = tf.constant(valid_en)
# Decoder inputs: German strings prefixed with the start marker
X_train_dec = tf.constant([f"startofseq {s}" for s in train_du])
X_valid_dec = tf.constant([f"startofseq {s}" for s in valid_du])
# Targets: German token ids with the end marker appended
Y_train = text_vec_layer_du([f"{s} endofseq" for s in train_du])
Y_valid = text_vec_layer_du([f"{s} endofseq" for s in valid_du])

In [None]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
# Declaring the two model inputs: each sample is a single string
# (English sentence for the encoder, German sentence for the decoder)
string_input_spec = dict(shape=[], dtype=tf.string)
encoder_inputs = tf.keras.layers.Input(**string_input_spec)
decoder_inputs = tf.keras.layers.Input(**string_input_spec)

In [None]:
embed_size = 128  # size of each token's embedding vector

# Turning the raw input strings into token-id sequences
encoder_input_ids = text_vec_layer_en(encoder_inputs)  # English ids
decoder_input_ids = text_vec_layer_du(decoder_inputs)  # German ids

# One embedding table per language; mask_zero=True makes id 0 act as a mask
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)

# Looking up the embeddings that feed the encoder and decoder
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# Encoder: a single LSTM layer that also returns its final states
encoder = tf.keras.layers.LSTM(units=512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
# The two final states are handed to the decoder as its starting state
encoder_state = [state_h, state_c]

In [None]:
# Decoder: an LSTM that outputs one vector per time step and
# starts from the encoder's final state
decoder = tf.keras.layers.LSTM(units=512, return_sequences=True)
decoder_outputs = decoder(
    decoder_embeddings,
    initial_state=encoder_state,
)

In [None]:
# Output layer: a softmax over the vocab_size token ids at every time step
output_layer = tf.keras.layers.Dense(units=vocab_size, activation="softmax")

Y_proba = output_layer(decoder_outputs)

In [None]:
# Assembling the encoder and decoder into one trainable Keras model
model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
# The targets are integer token ids, hence the sparse categorical loss
model.compile(
    optimizer="nadam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Training the model for a single epoch
model.fit(
    x=(X_train, X_train_dec),
    y=Y_train,
    epochs=1,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)



<keras.src.callbacks.History at 0x7bf2f148a680>

In [None]:
# Takes the encoder input and decodes the sentence - translates it
def translate(sentence_en):
    """Translate an English sentence to German, one word at a time.

    The trained ``model`` is run once per output position: the decoder is fed
    "startofseq" followed by the words predicted so far, and the most
    probable next word is appended. Decoding stops when "endofseq" is
    predicted or after ``max_length`` words.

    Args:
        sentence_en: The English sentence to translate, as a string.

    Returns:
        The predicted German words separated by spaces. Words the model
        cannot express appear as "[UNK]".
    """
    # Loop invariants: neither the German vocabulary nor the encoder input
    # changes between decoding steps, so build them once.
    vocabulary = text_vec_layer_du.get_vocabulary()
    X = np.array([sentence_en])  # encoder input

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])  # decoder input
        # Row 0 is the only sample in the batch; position word_idx holds the
        # probabilities for the word that follows the current prefix.
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

## Testing the translator

In [None]:
# Short, simple sentence; in the output below the last word comes back as [UNK].
translate("I like soccer")



'ich mag [UNK]'

Google translate: Ich mag Fußball

In [None]:
# Everyday sentence with punctuation; compare the output with the reference below.
translate("Hello, I am a student!")



'[UNK] ist ein [UNK]'

Google translate: Hallo, ich bin Student!

In [None]:
# Archaic wording, presumably chosen as a stress test; the output below is almost entirely [UNK].
translate("From the pomogranite in thine hand, Persephonie, thou shalt reign.")



'[UNK] die [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]'

Google translate: Aus dem Granatapfel in deiner Hand, Persephonie, sollst du herrschen.

### Improvements in future versions
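- Need to reduce the `[UNK]` (unknown) tokens in the output: check how many distinct words the corpus actually contains and whether that matches the `vocab_size` given to the vectorization layers
- The translations are not very good, so a vocabulary that covers more words may help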
- Do more epochs (use a better runtime)
  - I wasn't able to connect to a better runtime, so 1 epoch took 1 hour and 30 minutes, which is why I only did one
  - With more epochs, I would hope the translations would improve.