In [None]:
import json
import pathlib
import random
from collections import Counter, defaultdict

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.translate import ibm1,AlignedSent,Alignment,PhraseTable,StackDecoder
from tensorflow import keras

In [None]:
# Download and extract the English-Spanish sentence-pair archive, then point
# `text_file` at the corpus file expected in the "spa-eng" folder beside it.
archive_path = keras.utils.get_file(
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    fname="spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(archive_path).parent.joinpath("spa-eng", "spa.txt")

In [None]:
# Read the corpus: each line holds "English<TAB>Spanish". Whatever follows
# the final newline is dropped by the [:-1] slice.
with open(text_file, encoding="utf8") as corpus_file:
    corpus_text = corpus_file.read()
lines = corpus_text.split("\n")[:-1]

# Keep every pair Spanish-first, i.e. (spa, eng).
text_pairs = [(spa, eng) for eng, spa in (line.split("\t") for line in lines)]

In [None]:
# Shuffle with a fixed seed so the train/validation/test split (and the
# corpus files later written from the training part) is the same on every
# run. A private Random instance leaves the global `random` state alone, and
# sampling into a new list instead of shuffling `text_pairs` in place keeps
# this cell idempotent when it is re-run.
SPLIT_SEED = 42
shuffled_pairs = random.Random(SPLIT_SEED).sample(text_pairs, len(text_pairs))

# 15% validation, 15% test, the remainder for training.
num_val_samples = int(0.15 * len(shuffled_pairs))
num_train_samples = len(shuffled_pairs) - 2 * num_val_samples
train_pairs = shuffled_pairs[:num_train_samples]
val_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = shuffled_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

In [None]:
# Separate the training pairs into per-language sentence lists, then split
# every sentence into word tokens (English first, then Spanish).
train_spa_texts = [spa_text for spa_text, _ in train_pairs]
train_eng_texts = [eng_text for _, eng_text in train_pairs]
tokenized_eng_sents = list(map(word_tokenize, train_eng_texts))
tokenized_spa_sents = list(map(word_tokenize, train_spa_texts))

In [None]:
# Show the first ten raw Spanish training sentences next to their tokenized form.
print(
    train_spa_texts[:10],
    "===============",
    tokenized_spa_sents[:10],
    sep="\n",
)

In [None]:
# Persist the tokenized training sentences so the model-building functions
# can reload them from disk. `with` closes each file even if serialization
# fails, and json.dump writes straight to the file instead of first building
# the whole corpus as one in-memory string.
with open("spanish.json", "w", encoding="utf-8") as spanish_file:
    json.dump(tokenized_spa_sents, spanish_file)
with open("english.json", "w", encoding="utf-8") as english_file:
    json.dump(tokenized_eng_sents, english_file)

In [None]:
def translation_model_generation(
    english_path="english.json", spanish_path="spanish.json", iterations=10
):
    """Train an IBM Model 1 translation model for Spanish -> English.

    Args:
        english_path: JSON file holding a list of tokenized English sentences.
        spanish_path: JSON file holding the tokenized Spanish sentences, in
            the same order as the English ones.
        iterations: Number of EM training iterations.

    Returns:
        The trained ``ibm1.IBMModel1``. Its ``translation_table`` is indexed
        as ``[english_word][spanish_word]``.

    Raises:
        ValueError: If the two corpora contain a different number of
            sentences, which means the files are no longer parallel.
    """
    with open(english_path, "r", encoding="utf-8") as english_file:
        english_sentences = json.load(english_file)
    with open(spanish_path, "r", encoding="utf-8") as spanish_file:
        spanish_sentences = json.load(spanish_file)

    # zip() would silently drop the tail of the longer corpus; fail loudly
    # instead so a stale or partial file is noticed.
    if len(english_sentences) != len(spanish_sentences):
        raise ValueError(
            "Parallel corpora differ in length: "
            f"{len(english_sentences)} English vs "
            f"{len(spanish_sentences)} Spanish sentences"
        )

    # AlignedSent takes (target words, source words). English is the target
    # language here, so it goes first even though we translate *from* Spanish.
    bilingual_text = [
        AlignedSent(english_words, spanish_words)
        for english_words, spanish_words in zip(english_sentences, spanish_sentences)
    ]
    return ibm1.IBMModel1(bilingual_text, iterations)

def language_model_generation(corpus_path="english.json", unseen_probability=1e-300):
    """Build a unigram language model of the target (English) language.

    The decoder scores candidate *output*, so the model must be estimated
    from the English corpus, not the Spanish one.

    Args:
        corpus_path: JSON file holding a list of tokenized sentences.
        unseen_probability: Probability assigned to a word that never occurs
            in the corpus, so its log-probability stays finite.

    Returns:
        An object exposing ``probability(phrase)`` and
        ``probability_change(context, phrase)``. Both return the natural-log
        probability of ``phrase`` under the unigram model, i.e. the sum of
        the log relative frequencies of its words. ``context`` is ignored
        because a unigram model has no history.
    """
    with open(corpus_path, "r", encoding="utf-8") as corpus_file:
        sentences = json.load(corpus_file)

    word_counts = Counter(word for sentence in sentences for word in sentence)
    total_words = sum(word_counts.values())
    unseen_log_probability = float(np.log(unseen_probability))

    def word_log_probability(word):
        count = word_counts.get(word, 0)
        if count == 0:
            return unseen_log_probability
        return float(np.log(count / total_words))

    def phrase_log_probability(phrase):
        # The decoder hands over phrases as tuples of words. Looking such a
        # tuple up directly in a per-word frequency table always yields 0
        # (and log(0) == -inf), so score the phrase word by word instead.
        words = (phrase,) if isinstance(phrase, str) else phrase
        return sum(word_log_probability(word) for word in words)

    class _UnigramLanguageModel:
        """Unigram model with the two methods the stack decoder calls."""

        def probability_change(self, context, phrase):
            return phrase_log_probability(phrase)

        def probability(self, phrase):
            return phrase_log_probability(phrase)

    return _UnigramLanguageModel()

def phrase_table_generation(ibm1_model, phrase_table=None):
    """Turn an IBM model's word translation table into a decoder phrase table.

    Args:
        ibm1_model: Trained model whose ``translation_table`` is indexed as
            ``[english_word][spanish_word]`` and holds probabilities.
        phrase_table: Optional existing table to extend; any object with an
            ``add(src_phrase, trg_phrase, log_prob)`` method works. A new
            ``PhraseTable`` is created when omitted.

    Returns:
        The phrase table, with one single-word entry per usable
        (Spanish source, English target) pair, scored by natural-log
        probability.
    """
    if phrase_table is None:
        phrase_table = PhraseTable()

    for english_word, spanish_probabilities in ibm1_model.translation_table.items():
        for spanish_word, probability in spanish_probabilities.items():
            # IBM models align against an artificial NULL source token,
            # represented as None. It is not a real word and must not
            # become a source phrase.
            if english_word is None or spanish_word is None:
                continue
            # log() of a non-positive value is -inf/NaN and would poison
            # every score computed from this entry.
            if probability <= 0:
                continue
            phrase_table.add(
                (spanish_word,), (english_word,), float(np.log(probability))
            )

    return phrase_table



In [None]:
# Train the IBM Model 1 word-translation model from the saved english.json / spanish.json corpora.
translation_model = translation_model_generation()

In [None]:
# Convert the model's translation table into a phrase table of log-probabilities for the decoder.
phrase_table = phrase_table_generation(translation_model)

In [None]:
# Sanity check: show the first two translation entries stored for the source phrase ('Me',).
phrase_table.translations_for(('Me',))[:2]

In [None]:
# Build the word-frequency language model from the English corpus file.
language_model = language_model_generation()

In [None]:
# Combine the phrase table and the language model into a stack decoder.
stack_decoder1 = StackDecoder(phrase_table,language_model) 

In [None]:
# Smoke test: decode a short, already-tokenized Spanish sentence.
print(stack_decoder1.translate(['Soy','bueno']))

In [None]:
# Decode five randomly chosen Spanish test sentences, printing each source
# sentence, the decoder's space-joined output, and a separator line.
test_spa_texts = [spa_text for spa_text, _eng_text in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_spa_texts)
    tokens = list(word_tokenize(input_sentence))
    translated = stack_decoder1.translate(tokens)
    print(
        input_sentence,
        ' '.join(translated),
        "===================",
        sep="\n",
    )