In [1]:
# Necessary and residual imports
import nltk
from nltk.corpus import udhr
import keras
from nltk import FreqDist
from gensim.models.word2vec import Word2Vec
import pandas as pd
import regex
import nltk.tokenize.casual
import tensorflow as tf

In [2]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import gc

import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load dataset: (from https://huggingface.co/datasets/versae/bibles)
# https://stackoverflow.com/questions/39263929/how-can-i-read-tar-gz-file-using-pandas-read-csv-with-gzip-compression-option
df = pd.read_csv('books_labels.tar.gz', compression='gzip', header=0, sep=',', quotechar='"', on_bad_lines='skip')

In [4]:
# Select Spanish texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
#spanish_df = df[df['language'] == 'SPA']['text']
#spanish_list = spanish_df.to_list() # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://ioflood.com/blog/dataframe-to-list-pandas/%23:~:text%3DIt%2520is%2520utilized%2520with%2520the,tolist()%2520.%26text%3DIn%2520the%2520example%2520above%252C%2520we,convert%2520it%2520into%2520a%2520list.&ved=2ahUKEwiP3deMnLSFAxVD4ckDHUo7BsEQFnoECA4QAw&usg=AOvVaw3GgfIVdsVo9Dxul02uata1

In [5]:
set(df['language'].to_list())

{'ALB',
 'ARA',
 'AZB',
 'BAQ',
 'BEL',
 'BLA',
 'BUL',
 'CEB',
 'CHA',
 'CHI',
 'CKB',
 'COP',
 'CZE',
 'DAN',
 'ENG',
 'ESP',
 'FIN',
 'FRE',
 'GAE',
 'GER',
 'GOT',
 'GRC',
 'HAT',
 'HEB',
 'HIN',
 'HUN',
 'ITA',
 'JAM',
 'KOR',
 'LAT',
 'MAR',
 'NDS',
 'NL_',
 'NOR',
 'PON',
 'POR',
 'RUM',
 'RUS',
 'SCR',
 'SHU',
 'SPA',
 'SWA',
 'SWE',
 'SYR',
 'TAM',
 'TGL',
 'THA',
 'TUR',
 'UND',
 'VIE',
 'WIU',
 'XKL',
 'YBY',
 'ZIA',
 nan}

In [6]:
set(df[df['language'] == 'ENG']['file_name_translation'].to_list())

{'SF_2004-04-25_ENG_MARCGAL_(THE EPISTLE TO THE GALATIANS(DETERING))',
 'SF_2009-01-20_ENG_ACV_(A CONSERVATIVE VERSION)',
 'SF_2009-01-20_ENG_ASV_(AMERICAN STANDARD VERSION)',
 'SF_2009-01-20_ENG_BBE_(BIBLE IN BASIC ENGLISH)',
 'SF_2009-01-20_ENG_BIBLE_AKJV_(AMERICAN KING JAMES VERSION)',
 'SF_2009-01-20_ENG_BIBLE_TYNDALE_(WILLAM TYNDALE BIBLE)',
 'SF_2009-01-20_ENG_DARBY_(DARBY BIBLE)',
 'SF_2009-01-20_ENG_DIAGLOT_(DIAGLOT NT - 1865)',
 'SF_2009-01-20_ENG_GB_(GENEVA BIBLE)',
 'SF_2009-01-20_ENG_JUBL2000_(ENGLISH JUBILEE 2000 BIBLE)',
 'SF_2009-01-20_ENG_KJ2000_(KING JAMES 2000)',
 'SF_2009-01-20_ENG_KJV_(KJV+)',
 'SF_2009-01-20_ENG_LEESER_(LEESER OLD TESTAMENT)',
 'SF_2009-01-20_ENG_LONT_(LIVING ORACLES NT)',
 "SF_2009-01-20_ENG_LXXE_(BRENTON'S ENGLISH SEPTUAGINT)",
 'SF_2009-01-20_ENG_MONT_(MONTGOMERY NEW TESTAMENT)',
 'SF_2009-01-20_ENG_NOYES_(NOYES TRANSLATION)',
 'SF_2009-01-20_ENG_NSB_(NEW SIMPLIFIED BIBLE)',
 'SF_2009-01-20_ENG_RNT_(THE RIVERSIDE NEW TESTAMENT)',
 'SF_2009-01-20

In [7]:
# Select Spanish texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
spanish_df = df[df['language'] == 'SPA']
portuguese_df = df[df['language'] == 'POR']
english_df = df[df['language'] == 'ENG']
german_df = df[df['language'] == 'GER']

spanish_df = spanish_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'Nueva Biblia de las Américas (NBLA)'] # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.reddit.com/r/Reformed/comments/xoye82/most_accurate_spanish_bible_translations_spanish/&ved=2ahUKEwj7qbSKtbuFAxV4IEQIHbm6B_UQFnoECCwQAQ&usg=AOvVaw2YZJ8eHalVoloIoaaATROH
spanish_df = spanish_df.set_index('id')
spanish_df = spanish_df.drop(columns=['books_labels.csv', 'codebook', 'language', 'book_file_name', 'file_name_translation', 'source', 'year', 'genre', 'genre-multilabel', 'testament', 'division'])
portuguese_df = portuguese_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'SF_2009-01-20_POR_ACF_(PORTUGUESE CORRIGIDA FIEL (1753_1995))']
portuguese_df = portuguese_df.set_index('id')
portuguese_df = portuguese_df.drop(columns=['books_labels.csv', 'codebook', 'language', 'book_file_name', 'file_name_translation', 'source', 'year', 'genre', 'genre-multilabel', 'testament', 'division'])
english_df = english_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'SF_2009-01-20_ENG_ASV_(AMERICAN STANDARD VERSION)'] # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.reddit.com/r/Reformed/comments/xoye82/most_accurate_spanish_bible_translations_spanish/&ved=2ahUKEwj7qbSKtbuFAxV4IEQIHbm6B_UQFnoECCwQAQ&usg=AOvVaw2YZJ8eHalVoloIoaaATROH
english_df = english_df.set_index('id')
english_df = english_df.drop(columns=['books_labels.csv', 'codebook', 'language', 'book_file_name', 'file_name_translation', 'source', 'year', 'genre', 'genre-multilabel', 'testament', 'division'])
german_df = german_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'SF_2009-01-20_GER_LUTH1912_(LUTHER 1912)']
german_df = german_df.set_index('id')
german_df = german_df.drop(columns=['books_labels.csv', 'codebook', 'language', 'book_file_name', 'file_name_translation', 'source', 'year', 'genre', 'genre-multilabel', 'testament', 'division'])


  spanish_df = spanish_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'Nueva Biblia de las Américas (NBLA)'] # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.reddit.com/r/Reformed/comments/xoye82/most_accurate_spanish_bible_translations_spanish/&ved=2ahUKEwj7qbSKtbuFAxV4IEQIHbm6B_UQFnoECCwQAQ&usg=AOvVaw2YZJ8eHalVoloIoaaATROH
  portuguese_df = portuguese_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'SF_2009-01-20_POR_ACF_(PORTUGUESE CORRIGIDA FIEL (1753_1995))']
  english_df = english_df.sort_values(by=['file_name_translation', 'id'])[df['file_name_translation'] == 'SF_2009-01-20_ENG_ASV_(AMERICAN STANDARD VERSION)'] # https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.reddit.com/r/Reformed/comments/xoye82/most_accurate_spanish_bible_translations_spanish/&ved=2ahUKEwj7qbSKtbuFAxV4IEQIHbm6B_UQFnoECCwQAQ&usg=AOvVaw2YZJ8eHalVoloIoaaATROH
  german_df = german_df.

In [8]:
spanish_df.head(5)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
b.1CH.001.001,"(A)Adán, Set, Enós,"
b.1CH.001.002,"Cainán, Mahalaleel, Jared,"
b.1CH.001.003,"Enoc, Matusalén, Lamec,"
b.1CH.001.004,"Noé, Sem, Cam y Jafet."
b.1CH.001.005,"(B)Los hijos de Jafet fueron Gomer, Magog, Mad..."


In [9]:
portuguese_df.head(5)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
b.1CH.001.001,"ADÃO, Sete, Enos,"
b.1CH.001.002,"Cainã, Maalaleel, Jerede,"
b.1CH.001.003,"Enoque, Matusalém, Lameque,"
b.1CH.001.004,"Noé, Sem, Cão e Jafé."
b.1CH.001.005,"Os filhos de Jafé foram: Gomer, Magogue, Madai..."


In [10]:
english_df.head(5)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
b.1CH.001.001,"Adam, Seth, Enosh,"
b.1CH.001.002,"Kenan, Mahalalel, Jared,"
b.1CH.001.003,"Enoch, Methuselah, Lamech,"
b.1CH.001.004,"Noah, Shem, Ham, and Japheth."
b.1CH.001.005,"The sons of Japheth: Gomer, and Magog, and Mad..."


In [11]:
german_df.head(5)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
b.1CH.001.001,"Adam, Seth, Enos,"
b.1CH.001.002,"Kenan, Mahalaleel, Jared,"
b.1CH.001.003,"Henoch, Methusalah, Lamech,"
b.1CH.001.004,"Noah, Sem, Ham, Japheth."
b.1CH.001.005,"Die Kinder Japheths sind diese: Gomer, Magog, ..."


In [12]:
spanish_df = spanish_df.rename(columns={'text':'spanish_text'})
portuguese_df = portuguese_df.rename(columns={'text':'portuguese_text'})
english_df = english_df.rename(columns={'text':'english_text'})
german_df = german_df.rename(columns={'text':'german_text'})

multi_df = spanish_df.join(portuguese_df)
multi_df = multi_df.join(english_df)
multi_df = multi_df.join(german_df)
multi_df = multi_df.dropna()

print("Spanish:", multi_df['spanish_text']['b.1CH.001.011'], "\nPortuguese:", multi_df['portuguese_text']['b.1CH.001.011'], "\nEnglish:", multi_df['english_text']['b.1CH.001.011'], "\nGerman:", multi_df['german_text']['b.1CH.001.011'])

spanish_list = multi_df['spanish_text'].to_list()
portuguese_list = multi_df['portuguese_text'].to_list()
english_list = multi_df['english_text'].to_list()
german_list = multi_df['german_text'].to_list()

print('Spanish:', spanish_list[11], 'Portuguese:', portuguese_list[11])

Spanish: (C)Y Mizrayim fue el padre del pueblo de Ludim, Anamim, Lehabim, Neftuhim, 
Portuguese: E Mizraim gerou aos ludeus e aos anameus e aos leabeus e aos naftueus, 
English: And Mizraim begat Ludim, and Anamim, and Lehabim, and Naphtuhim, 
German: Mizraim zeugte die Luditer, die Anamiter, die Lehabiter, die Naphthuhiter,
Spanish: Canaán fue el padre de Sidón su primogénito, y de Het, Portuguese: E Canaã gerou a Sidom seu primogênito, e a Hete,


In [13]:
# Format and tokenize lists to make corpi
spanish_list = [regex.sub(r'\([a-zA-z0-9]\)', '', item) for item in spanish_list]
spanish_corpus = [nltk.tokenize.casual_tokenize(item) for item in spanish_list]
print(spanish_corpus[0])
portuguese_corpus = [nltk.tokenize.casual_tokenize(item) for item in portuguese_list]
print(portuguese_corpus[0])
english_corpus = [nltk.tokenize.casual_tokenize(item) for item in english_list]
print(english_corpus[0])
german_corpus = [nltk.tokenize.casual_tokenize(item) for item in german_list]
print(german_corpus[0])

['Adán', ',', 'Set', ',', 'Enós', ',']
['ADÃO', ',', 'Sete', ',', 'Enos', ',']
['Adam', ',', 'Seth', ',', 'Enosh', ',']
['Adam', ',', 'Seth', ',', 'Enos', ',']


In [14]:
# Select Portuguese texts
# https://stackoverflow.com/questions/17424182/extracting-all-rows-from-pandas-dataframe-that-have-certain-value-in-a-specific
# portuguese_df = df[df['language'] == 'POR']['text']
# portuguese_list = portuguese_df.to_list()

In [15]:
# Garbage collect old variables
del df
del spanish_list, portuguese_list, spanish_df, portuguese_df, spanish_list, portuguese_list, spanish_df, portuguese_df, multi_df
gc.collect()

NameError: name 'spanish_list' is not defined

In [16]:
# Prepare vocabulary
spanish_encoder_texts, spanish_input_texts, spanish_target_texts = [], [], []
portuguese_encoder_texts, portuguese_input_texts, portuguese_target_texts = [], [], []
english_encoder_texts, english_input_texts, english_target_texts = [], [], []
german_encoder_texts, german_input_texts, german_target_texts = [], [], []
spanish_vocabulary = set()
portuguese_vocabulary = set()
english_vocabulary = set()
german_vocabulary = set()
spanish_start_token = '[SPSTART]'
portuguese_start_token = '[POSTART]'
english_start_token = '[ENSTART]'
german_start_token = '[GESTART]'
stop_token = '[END]'
unknown_token = '[UNK]'
pad_token = '[PAD]'
spanish_vocabulary.add(spanish_start_token)
spanish_vocabulary.add(stop_token)
spanish_vocabulary.add(unknown_token)
spanish_vocabulary.add(pad_token)
portuguese_vocabulary.add(portuguese_start_token)
portuguese_vocabulary.add(stop_token)
portuguese_vocabulary.add(unknown_token)
portuguese_vocabulary.add(pad_token)
english_vocabulary.add(english_start_token)
english_vocabulary.add(stop_token)
english_vocabulary.add(unknown_token)
english_vocabulary.add(pad_token)
german_vocabulary.add(german_start_token)
german_vocabulary.add(stop_token)
german_vocabulary.add(unknown_token)
german_vocabulary.add(pad_token)

for spanish_text in spanish_corpus:
    spanish_encoder_texts.append([spanish_start_token] + spanish_text + [stop_token])
    spanish_input_texts.append([spanish_start_token] + spanish_text)
    spanish_target_texts.append(spanish_text + [stop_token])
    for char in spanish_text:
        if char not in spanish_vocabulary:
            spanish_vocabulary.add(char)

for portuguese_text in portuguese_corpus:
    portuguese_encoder_texts.append([portuguese_start_token] + portuguese_text + [stop_token])
    portuguese_input_texts.append([portuguese_start_token] + portuguese_text)
    portuguese_target_texts.append(portuguese_text + [stop_token])
    for char in portuguese_text:
        if char not in portuguese_vocabulary:
            portuguese_vocabulary.add(char)

for english_text in english_corpus:
    english_encoder_texts.append([english_start_token] + english_text + [stop_token])
    english_input_texts.append([english_start_token] + english_text)
    english_target_texts.append(english_text + [stop_token])
    for char in english_text:
        if char not in english_vocabulary:
            english_vocabulary.add(char)

for german_text in german_corpus:
    german_encoder_texts.append([german_start_token] + german_text + [stop_token])
    german_input_texts.append([german_start_token] + german_text)
    german_target_texts.append(german_text + [stop_token])
    for char in german_text:
        if char not in german_vocabulary:
            german_vocabulary.add(char)

unified_vocabulary = spanish_vocabulary.union(portuguese_vocabulary).union(english_vocabulary).union(german_vocabulary)

print(len(spanish_vocabulary), len(portuguese_vocabulary), len(english_vocabulary), len(german_vocabulary), len(unified_vocabulary))
print(spanish_encoder_texts[0], portuguese_input_texts[0], portuguese_target_texts[0])

26888 29155 13471 21854 83611
['[SPSTART]', 'Adán', ',', 'Set', ',', 'Enós', ',', '[END]'] ['[POSTART]', 'ADÃO', ',', 'Sete', ',', 'Enos', ','] ['ADÃO', ',', 'Sete', ',', 'Enos', ',', '[END]']


In [17]:
# Finish vocabulary
spanish_vocabulary = sorted(spanish_vocabulary)
portuguese_vocabulary = sorted(portuguese_vocabulary)
english_vocabulary = sorted(english_vocabulary)
german_vocabulary = sorted(german_vocabulary)

# Define maxima
spanish_vocab_size = len(spanish_vocabulary)
portuguese_vocab_size = len(portuguese_vocabulary)
spanish_vocab_size = len(spanish_vocabulary)
portuguese_vocab_size = len(portuguese_vocabulary)
unified_vocab_size = len(unified_vocabulary)
max_spanish_seq_length = max([len(txt) for txt in spanish_target_texts])
max_portuguese_seq_length = max([len(txt) for txt in portuguese_target_texts])
max_english_seq_length = max([len(txt) for txt in english_target_texts])
max_german_seq_length = max([len(txt) for txt in german_target_texts])
max_unified_seq_length = max(max_spanish_seq_length, max_portuguese_seq_length, max_english_seq_length, max_german_seq_length)

# Create indicies
spanish_token_index = dict([(token, i) for i, token in
                          enumerate(spanish_vocabulary)])
portuguese_token_index = dict([(token, i) for i, token in
                          enumerate(portuguese_vocabulary)])
unified_token_index = dict([(token, i) for i, token in
                          enumerate(unified_vocabulary)])
reverse_spanish_token_index = dict([(i, token) for token, i in
                          spanish_token_index.items()])
reverse_portuguese_token_index = dict([(i, token) for token, i in
                          portuguese_token_index.items()])
reverse_unified_token_index = dict([(i, token) for token, i in
                          unified_token_index.items()])
print(max_spanish_seq_length, max_unified_seq_length, max_portuguese_seq_length)

114 114 109


In [18]:
def convert_text_to_indices(texts, token_index, max_seq_length, pad_token='[PAD]'):
    data = np.zeros((len(texts), max_seq_length),
                    dtype='int32')
    for i, text in enumerate(texts):
        for t, token in enumerate(text):
            data[i, t] = token_index[token]
        for t in range(len(text), max_seq_length):
            data[i, t] = token_index[pad_token]
    
    return data

In [19]:
import numpy as np

# Convert sentences to numpy arrays
spanish_encoder_input_data = convert_text_to_indices(spanish_encoder_texts, unified_token_index, max_spanish_seq_length+1)
spanish_decoder_input_data = convert_text_to_indices(spanish_input_texts, unified_token_index, max_spanish_seq_length)
spanish_decoder_target_data = convert_text_to_indices(spanish_target_texts, unified_token_index, max_spanish_seq_length)

portuguese_encoder_input_data = convert_text_to_indices(portuguese_encoder_texts, unified_token_index, max_portuguese_seq_length+1)
portuguese_decoder_input_data = convert_text_to_indices(portuguese_input_texts, unified_token_index, max_portuguese_seq_length)
portuguese_decoder_target_data = convert_text_to_indices(portuguese_target_texts, unified_token_index, max_portuguese_seq_length)

english_encoder_input_data = convert_text_to_indices(english_encoder_texts, unified_token_index, max_english_seq_length+1)
english_decoder_input_data = convert_text_to_indices(english_input_texts, unified_token_index, max_english_seq_length)
english_decoder_target_data = convert_text_to_indices(english_target_texts, unified_token_index, max_english_seq_length)

german_encoder_input_data = convert_text_to_indices(german_encoder_texts, unified_token_index, max_german_seq_length+1)
german_decoder_input_data = convert_text_to_indices(german_input_texts, unified_token_index, max_german_seq_length)
german_decoder_target_data = convert_text_to_indices(german_target_texts, unified_token_index, max_german_seq_length)

# spanish_encoder_input_data = np.zeros((len(spanish_encoder_texts), max_spanish_seq_length),
#                                dtype='int32')
# spanish_decoder_input_data = np.zeros((len(spanish_input_texts), max_spanish_seq_length),
#                                dtype='int32')
# spanish_decoder_target_data = np.zeros((len(spanish_target_texts), max_spanish_seq_length),
#                                dtype='int32')

# portuguese_encoder_input_data = np.zeros((len(portuguese_encoder_texts), max_portuguese_seq_length),
#                                dtype='int32')
# portuguese_decoder_input_data = np.zeros((len(portuguese_input_texts), max_portuguese_seq_length),
#                                dtype='int32')
# portuguese_decoder_target_data = np.zeros((len(portuguese_target_texts), max_portuguese_seq_length),
#                                dtype='int32')



# for i, (input_text, target_text) in enumerate(zip(spanish_input_texts, spanish_target_texts)):
#     for t, token in enumerate(input_text):
#         spanish_encoder_input_data[
#             i, t] = unified_token_index[token]
#     for t, token in enumerate(target_text):
#         spanish_decoder_input_data[
#             i, t] = unified_token_index[token]
#         if t > 0:
#             spanish_decoder_target_data[i, t - 1] = spanish_token_index[token]

# for i, (input_text, target_text) in enumerate(zip(portuguese_input_texts, portuguese_target_texts)):
#     for t, token in enumerate(input_text):
#         portuguese_encoder_input_data[
#             i, t] = unified_token_index[token]
#     for t, token in enumerate(target_text):
#         portuguese_decoder_input_data[
#             i, t] = unified_token_index[token]
#         if t > 0:
#             portuguese_decoder_target_data[i, t - 1] = portuguese_token_index[token]

print([reverse_unified_token_index[value] for value in portuguese_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in portuguese_decoder_input_data[0]])
print([reverse_unified_token_index[value] for value in portuguese_decoder_target_data[0]])
print([reverse_unified_token_index[value] for value in spanish_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in spanish_decoder_input_data[0]])
print([reverse_unified_token_index[value] for value in spanish_decoder_target_data[0]])
print([reverse_unified_token_index[value] for value in german_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in german_decoder_input_data[0]])
print([reverse_unified_token_index[value] for value in german_decoder_target_data[0]])
print([reverse_unified_token_index[value] for value in english_encoder_input_data[0]])
print([reverse_unified_token_index[value] for value in english_decoder_input_data[0]])
print([reverse_unified_token_index[value] for value in english_decoder_target_data[0]])

['[POSTART]', 'ADÃO', ',', 'Sete', ',', 'Enos', ',', '[END]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[POSTART]', 'ADÃO'

In [20]:
# Display some input
spanish_encoder_input_data

array([[47350, 31925, 40495, ..., 78318, 78318, 78318],
       [47350, 81800, 40495, ..., 78318, 78318, 78318],
       [47350, 51664, 40495, ..., 78318, 78318, 78318],
       ...,
       [47350, 61402, 44224, ..., 78318, 78318, 78318],
       [47350, 53346, 77023, ..., 78318, 78318, 78318],
       [47350, 26332, 29722, ..., 78318, 78318, 78318]], dtype=int32)

In [21]:
# Define hyperparameters
batch_size = 128
epochs = 10
num_neurons = 256

In [22]:
num_layers = 2
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [23]:
# Import from customized file
from transformers import *

In [24]:
# Create the transformers from the encoder and decoder
transformer = Transformer(num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=unified_vocab_size,
    target_vocab_size=unified_vocab_size,
    dropout_rate=dropout_rate)
# spanish_transformer = ComposedTransformer(encoder, spanish_decoder, spanish_vocab_size)
# portuguese_transformer = ComposedTransformer(encoder, portuguese_decoder, portuguese_vocab_size)

2024-04-24 01:04:23.872683: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2024-04-24 01:04:23.872707: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2024-04-24 01:04:23.872712: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2024-04-24 01:04:23.872733: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-24 01:04:23.872746: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [25]:
# Define learning rate
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [26]:
# Compile both models
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [27]:
# Prepare training data
# dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data[1000:2000], portuguese_decoder_input_data[1000:2000]), portuguese_decoder_target_data[1000:2000]))
# batched_dataset = dataset.batch(batch_size)
# spanish_dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data, spanish_decoder_input_data), spanish_decoder_target_data))
# portuguese_dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data, spanish_decoder_input_data), spanish_decoder_target_data))
# spanish_batched_dataset = spanish_dataset.batch(batch_size)
# portuguese_batched_dataset = portuguese_dataset.batch(batch_size)

In [28]:
# Give a summary of the transformer
transformer.summary()

In [29]:
# Define a translator
class Translator(tf.Module):
  def __init__(self, transformer, token_index, reverse_token_index, input_start_char, output_start_char):
    self.transformer = transformer
    self.token_index = token_index
    self.reverse_token_index = reverse_token_index
    self.input_start_char = input_start_char
    self.output_start_char = output_start_char

  def __call__(self, sentence, max_length):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    # assert isinstance(sentence, tf.Tensor)
    # if len(sentence.shape) == 0:
    #   sentence = sentence[tf.newaxis]
    tokenized_sentence = [self.input_start_char] + nltk.tokenize.casual_tokenize(sentence) + ['[END]']
    sentence_tensor = np.zeros(max_length, dtype='int64')
    for t, token in enumerate(tokenized_sentence):
      sentence_tensor[t] = self.token_index[token]
    #sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence_tensor[tf.newaxis]
    
    # As the output language is English, initialize the output with the
    # English `[START]` token.
    #start_end = self.tokenizers.en.tokenize([''])[0]
    start = self.token_index[self.output_start_char]
    end = self.token_index['[END]']

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())[tf.newaxis]
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0][0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`. https://www.tensorflow.org/api_docs/python/tf/cast
    text = [item.numpy() for item in output]  # Shape: `()`.

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    # self.transformer([encoder_input, output[:,:-1]], training=False)
    # attention_weights = self.transformer.decoder.last_attn_scores

    return text

In [30]:
# Train the transformers
tf.config.run_functions_eagerly(True)
# transformer.fit(batched_dataset, epochs=epochs)

In [31]:
print(len(spanish_encoder_input_data), len(portuguese_decoder_input_data), len(portuguese_decoder_target_data))

28391 28391 28391


In [45]:
sentence = 'Estas son las palabras de Amós, que era un pastor de Tecoa'
translator = Translator(transformer, unified_token_index, reverse_unified_token_index, '[SPSTART]', '[POSTART]')
# portuguese_translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, unified_token_index, reverse_unified_token_index)
portuguese_translated_text = translator(
    sentence,20)
for i in portuguese_translated_text:
    try:
        print(reverse_unified_token_index[i], end=' ')
    except Exception:
        print(i)
        continue

[POSTART] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [32]:
for t in range(7):
    print(f"Epoch {t}:")
    for i in range(1):
        dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data, portuguese_decoder_input_data), portuguese_decoder_target_data))
        batched_dataset = dataset.batch(int(batch_size / (2**t)))
        transformer.fit(batched_dataset)
        # dataset = tf.data.Dataset.from_tensor_slices(((spanish_encoder_input_data[i*2000:i*2000+2000], german_decoder_input_data[i*2000:i*2000+2000]), german_decoder_target_data[i*2000:i*2000+2000]))
        # batched_dataset = dataset.batch(batch_size)
        # transformer.fit(batched_dataset)
        # dataset = tf.data.Dataset.from_tensor_slices(((english_encoder_input_data[i*2000:i*2000+2000], spanish_decoder_input_data[i*2000:i*2000+2000]), spanish_decoder_target_data[i*2000:i*2000+2000]))
        # batched_dataset = dataset.batch(batch_size)
        # transformer.fit(batched_dataset)
        # dataset = tf.data.Dataset.from_tensor_slices(((portuguese_encoder_input_data[i*2000:i*2000+2000], portuguese_decoder_input_data[i*2000:i*2000+2000]), portuguese_decoder_target_data[i*2000:i*2000+2000]))
        # batched_dataset = dataset.batch(batch_size)
        # transformer.fit(batched_dataset)
    sentence = 'Estas son las palabras de Amós, que era un pastor de Tecoa'
    translator = Translator(transformer, unified_token_index, reverse_unified_token_index, '[SPSTART]', '[POSTART]')
    # portuguese_translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, unified_token_index, reverse_unified_token_index)
    portuguese_translated_text = translator(
        sentence,20)
    for i in portuguese_translated_text:
        try:
            print(reverse_unified_token_index[i], end=' ')
        except Exception:
            print(i)
            continue
        #transformer.save_weights(f'./models/spanish-portuguese_full_epoch{t}.weights.h5')

Epoch 0:




[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m874s[0m 4s/step - loss: 11.3238 - masked_accuracy: 2.1533e-05




[POSTART] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] Epoch 1:
[1m444/444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m705s[0m 2s/step - loss: 6.5240 - masked_accuracy: 0.7295
[POSTART] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] Epoch 2:
[1m440/888[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m6:45[0m 906ms/step - loss: 1.5746 - masked_accuracy: 0.7731

KeyboardInterrupt: 

In [50]:
tf.saved_model.save(transformer, export_dir=f'./translators/{t}')
transformer = tf.saved_model.load(export_dir='./translators/0')

INFO:tensorflow:Assets written to: ./translators/1/assets


INFO:tensorflow:Assets written to: ./translators/1/assets


In [28]:
# spanish_transformer.save_weights('./models/spanish120epoch.weights.h5')
# portuguese_transformer.save_weights('./models/portuguese120epoch.weights.h5')
#transformer.save_weights('./models/spanish-portuguese_full_epoch59.weights.h5')
# transformer.save('./models/spanish-portuguese_full_test.keras') # https://discuss.tensorflow.org/t/save-a-tensorflow-model-with-a-transformer-layer/7221
# transformer = tf.keras.models.load_model('./models/spanish-portuguese_full_test.keras') # https://discuss.tensorflow.org/t/save-a-tensorflow-model-with-a-transformer-layer/7221
transformer.load_weights('./models/spanish-portuguese_full_epoch9.weights.h5')

  trackable.load_own_variables(weights_store.get(inner_path))


In [51]:
# Define a translator
class Translator(tf.Module):
  def __init__(self, transformer, input_token_index, reverse_input_token_index, output_token_index, reverse_output_token_index):
    self.transformer = transformer
    self.input_token_index = input_token_index
    self.output_token_index = output_token_index
    self.reverse_input_token_index = reverse_input_token_index
    self.reverse_output_token_index = reverse_output_token_index

  def __call__(self, sentence, max_length):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    # assert isinstance(sentence, tf.Tensor)
    # if len(sentence.shape) == 0:
    #   sentence = sentence[tf.newaxis]
    tokenized_sentence = ['[SPSTART]'] + nltk.tokenize.casual_tokenize(sentence) + ['[END]']
    sentence_tensor = np.zeros(max_length, dtype='int64')
    for t, token in enumerate(tokenized_sentence):
      sentence_tensor[t] = self.input_token_index[token]
    #sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence_tensor[tf.newaxis]
    
    # As the output language is English, initialize the output with the
    # English `[START]` token.
    #start_end = self.tokenizers.en.tokenize([''])[0]
    start = self.input_token_index['[POSTART]']
    end = self.output_token_index['[END]']

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)
    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())[tf.newaxis]
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0][0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`. https://www.tensorflow.org/api_docs/python/tf/cast
    text = [item.numpy() for item in output]  # Shape: `()`.

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    # self.transformer([encoder_input, output[:,:-1]], training=False)
    # attention_weights = self.transformer.decoder.last_attn_scores

    return text

In [52]:
# Test the translator
sentence = 'Estas son las palabras de Amós, que era un pastor de Tecoa'
translator = Translator(transformer, unified_token_index, reverse_unified_token_index, unified_token_index, reverse_unified_token_index)
# portuguese_translator = Translator(portuguese_transformer, unified_token_index, reverse_unified_token_index, unified_token_index, reverse_unified_token_index)
portuguese_translated_text = translator(
    sentence,15)
# portuguese_translated_text = portuguese_translator(
#     sentence,15)

TypeError: '_UserObject' object is not callable

In [31]:
# Print the raw lists
#print(spanish_translated_text)
print(portuguese_translated_text)

[43282, 26068, 21446, 8956, 2923, 1776, 44249, 33554, 44249, 14713, 13664, 19175, 40547, 48550, 16274, 19832]


In [32]:
# Print the sentences themselves
# for i in range(20):
#     try:
#         print(reverse_spanish_token_index[spanish_translated_text[i]], end=' ')
#     except Exception:
#         continue
# print()
for i in portuguese_translated_text:
    try:
        print(reverse_unified_token_index[i], end=' ')
    except Exception:
        print(i)
        continue

[POSTART] progresarán hendidura envidies composto ventre exacta intentaram exacta hendiste revés sereditas zelo secou Maldade - 

In [43]:
del translator
gc.collect()

2867