In [1]:
import pathlib
import random
import json
from tensorflow import keras
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from nltk.translate import ibm1, AlignedSent, Alignment, PhraseTable, StackDecoder
from collections import defaultdict
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import numpy as np



In [3]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [4]:
with open(text_file, encoding="utf8") as f:
    lines = f.read().split("\n")[:-1]
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    text_pairs.append((spa, eng))
    

In [5]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [6]:
train_spa_texts = [pair[0] for pair in train_pairs]
train_eng_texts = [pair[1] for pair in train_pairs]
tokenized_eng_sents = [word_tokenize(i) for i in train_eng_texts]
tokenized_spa_sents = [word_tokenize(i) for i in train_spa_texts]

In [7]:
print(train_spa_texts[:10])
print("===============")
print(tokenized_spa_sents[:10])

['Tom sacó la billetera y pagó la cuenta.', 'Pasaron dos semanas.', 'Tom abrió la cerveza.', 'No hay una sola alma en los estacionamientos.', '¿Cuánto tiempo trabajó Tom acá?', 'Tiende a enfadarse por nada.', 'Demasiados dulces engordan.', 'Algo que deberías saber acerca de mí es que soy un animal de costumbres.', '¿Lo tienes todo?', 'Pasé el fin de semana leyendo una larga novela.']
[['Tom', 'sacó', 'la', 'billetera', 'y', 'pagó', 'la', 'cuenta', '.'], ['Pasaron', 'dos', 'semanas', '.'], ['Tom', 'abrió', 'la', 'cerveza', '.'], ['No', 'hay', 'una', 'sola', 'alma', 'en', 'los', 'estacionamientos', '.'], ['¿Cuánto', 'tiempo', 'trabajó', 'Tom', 'acá', '?'], ['Tiende', 'a', 'enfadarse', 'por', 'nada', '.'], ['Demasiados', 'dulces', 'engordan', '.'], ['Algo', 'que', 'deberías', 'saber', 'acerca', 'de', 'mí', 'es', 'que', 'soy', 'un', 'animal', 'de', 'costumbres', '.'], ['¿Lo', 'tienes', 'todo', '?'], ['Pasé', 'el', 'fin', 'de', 'semana', 'leyendo', 'una', 'larga', 'novela', '.']]


In [8]:
spanish_json = json.dumps(tokenized_spa_sents)
f1 = open("spanish.json","w")
f1.write(spanish_json)
f1.close()
english_json = json.dumps(tokenized_eng_sents)
f2 = open("english.json","w")
f2.write(english_json)
f2.close()

In [9]:
from IBM_Model1.IBM_Model1 import IBM

ibm_ourversion = IBM()
ibm_ourversion.load()

en_list = ibm_ourversion.getEnglishDict().keys()
spa_list = ibm_ourversion.getSpanishDict().keys()
# translationTable = ibm_ourversion.getTranslationTable()


Loaded Language Maps
Loaded Spanish to English Matrix


In [14]:
from tqdm import tqdm

def translation_model_generation():
    bilingual_text = []
    english_file = open("english.json","r")
    english_text = english_file.read()
    english_list = json.loads(english_text)
    spanish_file = open("spanish.json","r")
    spanish_text = spanish_file.read()
    spanish_list = json.loads(spanish_text)
    for iter in zip(english_list, spanish_list): #opposite
        bilingual_text.append(AlignedSent(iter[0],iter[1]))
    ibm1_model = ibm1.IBMModel1(bilingual_text, 3)
    return ibm1_model

def language_model_generation():
    spanish_file = open("english.json","r") #opposite
    spanish_text = spanish_file.read()
    spanish_list = json.loads(spanish_text)
    fdist = defaultdict(lambda: 1e-300)
    # words = []
    # for sentence in spanish_list:
    #     for w in sentence:
    #         words.append(w)
    # fdist = nltk.FreqDist(words)
    fdist = nltk.FreqDist(w for sentence in spanish_list for w in sentence)

    #fdist.setdefault(lambda:1e-300, 1e-300)
    language_model = type('',(object,),{'probability_change':lambda self,context,phrase:np.log(fdist[phrase]),'probability':lambda self,phrase:np.log(fdist[phrase])})()
    return language_model

def phrase_table_generation(ibm1_model):
    phrase_table = PhraseTable()
    for english_word in en_list:
        for spanish_word in spa_list:
            # phrase_table.add((spanish_word,),(english_word,), np.log(ibm1_model.translation_table[english_word][spanish_word]))
            print(ibm1_model.translation_table[english_word][spanish_word], ibm_ourversion.translate(english_word, spanish_word))
    # # for chinese_word in ibm1_model.translation_table.keys():
    # for english_word in tqdm(en_list):
    #     for spanish_word in spa_list:
    #         phrase_table.add((spanish_word,),(english_word,), np.log(ibm_ourversion.translate(english_word,spanish_word)))
    

    return phrase_table



In [15]:
translation_model = translation_model_generation()

In [16]:
phrase_table = phrase_table_generation(translation_model)

KeyError: None

In [18]:
phrase_table.translations_for(('Me',))[0:2]

[PhraseTableEntry(trg_phrase=('I',), log_prob=-0.1707006859897912),
 PhraseTableEntry(trg_phrase=('me',), log_prob=-2.24894505153687)]

In [19]:
language_model = language_model_generation()

In [20]:
stack_decoder1 = StackDecoder(phrase_table,language_model) 

In [21]:
print(stack_decoder1.translate(['Yo', 'Soy','bueno']))

  language_model = type('',(object,),{'probability_change':lambda self,context,phrase:np.log(fdist[phrase]),'probability':lambda self,phrase:np.log(fdist[phrase])})()


["'m", 'good']


In [22]:
test_spa_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_spa_texts)
    tokens = list(word_tokenize(input_sentence))
    translated = stack_decoder1.translate(tokens)
    print(input_sentence)
    #print(tokens)
    #print(translated)
    print(' '.join(translated))
    print("===================")

  language_model = type('',(object,),{'probability_change':lambda self,context,phrase:np.log(fdist[phrase]),'probability':lambda self,phrase:np.log(fdist[phrase])})()
