In [1]:
import pathlib
import random
import json
from tensorflow import keras
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
from nltk.translate import ibm1, AlignedSent, Alignment, PhraseTable, StackDecoder
from collections import defaultdict
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\4Ever\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [3]:
with open(text_file, encoding="utf8") as f:
    lines = f.read().split("\n")[:-1]  
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    text_pairs.append((spa, eng))
    

In [4]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

118964 total pairs
83276 training pairs
17844 validation pairs
17844 test pairs


In [5]:
train_spa_texts = [pair[0] for pair in train_pairs]
train_eng_texts = [pair[1] for pair in train_pairs]
tokenized_eng_sents = [word_tokenize(i) for i in train_eng_texts]
tokenized_spa_sents = [word_tokenize(i) for i in train_spa_texts]

In [7]:
spanish_json = json.dumps(tokenized_spa_sents)
f1 = open("spanish.json","w")
f1.write(spanish_json)
f1.close()
english_json = json.dumps(tokenized_eng_sents)
f2 = open("english.json","w")
f2.write(english_json)
f2.close()

In [10]:
from IBM_Model1.IBM_Model1 import IBM

ibm_ourversion = IBM(IBM.TRAINIG_MODE)
# ibm_ourversion.load()
ibm_ourversion.train()

Henaaa
Iteration :  0
Finished first loop
finish 
Iteration :  1
Finished first loop
finish 
Iteration :  2
Finished first loop
finish 
Iteration :  3
Finished first loop
finish 
Iteration :  4
Finished first loop
finish 
Iteration :  5
Finished first loop
finish 
Iteration :  6
Finished first loop
finish 
Iteration :  7
Finished first loop
finish 
Iteration :  8
Finished first loop
finish 
Iteration :  9
Finished first loop
finish 
Iteration :  10
Finished first loop
finish 
Converged


In [30]:
from tqdm import tqdm

def translation_model_generation(): #remove
    bilingual_text = []
    english_file = open("english.json","r")
    english_text = english_file.read()
    english_list = json.loads(english_text)
    spanish_file = open("spanish.json","r")
    spanish_text = spanish_file.read()
    spanish_list = json.loads(spanish_text)
    for iter in zip(english_list, spanish_list):
        # Alignment must have mapping order
        # FIXME SEARCH ON IMPLEMENTING THE ALIGNMENT CORRECTLY  
        bilingual_text.append(AlignedSent(iter[0],iter[1]))
    ibm1_model = ibm1.IBMModel1(bilingual_text, 10)
    return ibm1_model

def language_model_generation():
    english_file = open("english.json","r")
    english_text = english_file.read()
    english_list = json.loads(english_text)

    fdist = defaultdict(lambda: 1e-300)
    #fdist = nltk.FreqDist(w for sentence in english_list for w in sentence)
    for sentence in english_list:
         for word in sentence:
            fdist[word] += 1
    language_model = type('', (object,),{'probability_change':lambda self,context,phrase:np.log(fdist[phrase]),'probability':lambda self,phrase:np.log(fdist[phrase])})()
    return language_model


def phrase_table_generation(ibm1_model):
    phrase_table = PhraseTable()
    translation_table = ibm_ourversion.getTranslationTable() #change to ibm1_model
    for english_word in tqdm(translation_table):
        for spanish_word in translation_table[english_word].keys():
            phrase_table.add((spanish_word,),(english_word,), np.log(translation_table[english_word][spanish_word]))
   
    return phrase_table



In [12]:
# translation_model = translation_model_generation()

In [31]:
phrase_table = phrase_table_generation(ibm_ourversion)

100%|██████████| 13622/13622 [04:43<00:00, 48.06it/s]


In [27]:
language_model = language_model_generation()

In [28]:

stack_decoder1 = StackDecoder(phrase_table,language_model) 

In [29]:
print(stack_decoder1.translate([word.lower() for word in ['Yo', 'Soy','bueno']]) )
print(stack_decoder1.translate([word.lower() for word in ['Ella', 'debe', 'ser', 'feliz']]))
print(stack_decoder1.translate([word.lower() for word in ['Yo', 'no', 'comprendo']]))
print(stack_decoder1.translate([word.lower() for word in ['Hola', 'me', 'llamo', 'Juan']]))
print(stack_decoder1.translate([word.lower() for word in ['Por', 'favor', 'habla', 'más', 'despacio']]))

['i', 'im', 'good']
['she', 'must', 'be', 'happy']
['i', 'not', 'understand']
['hello', 'me', 'call', 'john']
['for', 'please', 'speaks', 'more', 'slowly']


In [17]:
test_spa_texts = [pair[0] for pair in test_pairs]
for _ in range(5):
    input_sentence = random.choice(test_spa_texts)
    tokens = list(word_tokenize(input_sentence))
    translated = stack_decoder1.translate(tokens)
    print(input_sentence)
    #print(tokens)
    #print(translated)
    print(' '.join(translated))
    print("===================")

  language_model = type('',(object,),{'probability_change':lambda self,context,phrase:np.log(fdist[phrase]),'probability':lambda self,phrase:np.log(fdist[phrase])})()


Ella debe ser feliz.



KeyboardInterrupt: 