In [1]:
import ast
import json
import math

import datasets
import IBM1 as ibm1
import debug_helpers

# Display the value of every expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Toy corpus (no NULL words): train with EM using the default settings,
# printing the likelihood table per iteration, then list the learned lexicon.
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data()
s_t_pairs  # echoed in the cell output because ast_node_interactivity is "all"
lprobs, _, _ = ibm1.EM(
    s_t_pairs,
    s_vocabulary,
    t_vocabulary,
    fn_debug=debug_helpers.print_likelihood,
)

debug_helpers.print_lexicon_probs(None, lprobs, None, None)

[(['das', 'Haus'], ['the', 'house']),
 (['das', 'Buch'], ['the', 'book']),
 (['ein', 'Buch'], ['a', 'book'])]

iteration  log_likelihood  likelihood  AER
1 -4.159 0.016 0.00000
2 -1.151 0.316 0.00000
3 -0.842 0.431 0.00000
4 -0.586 0.557 0.00000
5 -0.390 0.677 0.00000
6 -0.252 0.777 0.00000
7 -0.159 0.853 0.00000
8 -0.100 0.905 0.00000
9 -0.062 0.940 0.00000
10 -0.039 0.961 0.00000
das the 0.9933053397165424
das house 0.0046110887522225865
das book 0.0020835715312351013
Haus the 0.08276408100718743
Haus house 0.9172359189928124
Buch the 0.0020835715312350995
Buch book 0.9933053397165423
Buch a 0.004611088752222586
ein a 0.9172359189928124
ein book 0.08276408100718743



In [3]:
# Toy corpus where every source sentence starts with a <NULL> token:
# 20 EM iterations, with AER scored on one hand-aligned validation pair.
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data_null_words()
s_t_pairs

# Single validation pair: (source tokens incl. <NULL> at index 0, target tokens).
val_sentence_pairs = [
    (
        ['<NULL>', 'Buch', 'klein', 'das', 'Haus'],
        ['the', 'small', 'house', 'book'],
    ),
]
# Links are (source index, 1-based target index), e.g. (3, 1) ties 'das' to 'the'.
# Two identical link sets are supplied for the pair — presumably the "sure" and
# "possible" alignments that AER expects; TODO confirm against ibm1.EM.
ref_alignments = [
    [
        {(3, 1), (2, 2), (4, 3), (1, 4)},
        {(3, 1), (2, 2), (4, 3), (1, 4)},
    ],
]

lprobs, _, _ = ibm1.EM(
    s_t_pairs, s_vocabulary, t_vocabulary, 20,
    val_sentence_pairs, ref_alignments, debug_helpers.print_likelihood,
)
debug_helpers.print_lexicon_probs(None, lprobs, None, None)

# QUESTION: how can the likelihood be bigger than 1?
# NOTE(review): a true probability cannot exceed 1, so the printed value is
# presumably unnormalised (e.g. the alignment-prior factor is left out of the
# likelihood computation) — confirm in IBM1 / debug_helpers.

[(['<NULL>', 'das', 'Haus'], ['the', 'house']),
 (['<NULL>', 'das', 'Buch'], ['the', 'book']),
 (['<NULL>', 'ein', 'Buch'], ['a', 'book']),
 (['<NULL>', 'ein', 'Haus'], ['a', 'small', 'house']),
 (['<NULL>', 'mein', 'Buch'], ['my', 'small', 'book'])]

iteration  log_likelihood  likelihood  AER
1 -8.318 0.000 0.25000
2 -3.214 0.040 0.14286
3 -2.152 0.116 0.14286
4 -1.373 0.253 0.14286
5 -0.865 0.421 0.14286
6 -0.544 0.580 0.14286
7 -0.338 0.713 0.14286
8 -0.202 0.817 0.14286
9 -0.109 0.896 0.14286
10 -0.045 0.956 0.14286
11 -0.000 1.000 0.14286
12 0.031 1.032 0.14286
13 0.054 1.055 0.14286
14 0.070 1.072 0.14286
15 0.081 1.085 0.14286
16 0.090 1.094 0.14286
17 0.096 1.101 0.14286
18 0.101 1.106 0.14286
19 0.104 1.110 0.14286
20 0.107 1.113 0.14286
<NULL> the 0.002703782024438147
<NULL> house 0.0047664944727924
<NULL> book 0.23131096714592353
<NULL> a 0.0049385156306393915
<NULL> small 0.756279473085454
<NULL> my 7.676407527728933e-07
das the 0.9999984381034837
das house 1.5359743961192934e-06
das book 2.5922120144793404e-08
Haus the 4.917775659561484e-07
Haus house 0.9992092174629266
Haus a 8.766652745862759e-07
Haus small 0.0007894140942328188
Buch the 1.2226865092224424e-08
Buch book 0.9999961375254085
Buch a 2.2861551201309313e-08

In [4]:
# Full run: 30 EM iterations on the training corpus; the validation pairs and
# their reference alignments are passed in so AER is reported per iteration.
val_sentence_pairs, _, _ = datasets.validation_data()
reference_alignments = datasets.validation_alignments()

s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
lprobs, log_lhoods, AERs = ibm1.EM(
    s_t_pairs,
    s_vocabulary,
    t_vocabulary,
    30,
    val_sentence_pairs,
    reference_alignments,
    debug_helpers.print_likelihood,
)

iteration  log_likelihood  likelihood  AER
1 -34858025.333 0.000 0.37678
2 -8405523.373 0.000 0.33749
3 -5424561.055 0.000 0.32981
4 -4514727.763 0.000 0.32852
5 -4203772.133 0.000 0.33044
6 -4064695.086 0.000 0.33237
7 -3991189.402 0.000 0.32915
8 -3947933.360 0.000 0.32787
9 -3920480.549 0.000 0.32755
10 -3902021.286 0.000 0.32177
11 -3889024.767 0.000 0.32177
12 -3879524.166 0.000 0.32081
13 -3872364.267 0.000 0.31717
14 -3866831.922 0.000 0.31814
15 -3862467.595 0.000 0.32008
16 -3858963.685 0.000 0.32008
17 -3856108.787 0.000 0.32008
18 -3853754.064 0.000 0.32008
19 -3851789.648 0.000 0.32008
20 -3850132.320 0.000 0.32008
21 -3848721.998 0.000 0.32008
22 -3847513.280 0.000 0.31814
23 -3846470.406 0.000 0.31814
24 -3845564.802 0.000 0.31911
25 -3844772.642 0.000 0.31814
26 -3844075.743 0.000 0.32008
27 -3843459.838 0.000 0.32008
28 -3842912.388 0.000 0.32008
29 -3842423.821 0.000 0.32008
30 -3841986.155 0.000 0.32008


In [11]:
# Presumably the log-likelihood of the training pairs under the final lexicon
# probabilities (the result is close to the last value in the training table).
# NOTE(review): log_likelihood_data is neither defined nor imported in the
# visible cells, and this cell's execution count (11) is out of sequence —
# presumably it is ibm1.log_likelihood_data or came from a deleted cell.
# Confirm and qualify the call, otherwise this raises NameError on
# Restart & Run All.
log_likelihood_data(s_t_pairs, lprobs)

-3841592.7124227234

In [5]:
# Spot-check the trained lexicon: the recorded output lists 15 word pairs
# (presumably each word with its most probable translation — TODO confirm).
debug_helpers.print_learned_translations(lprobs, 15)

<NULL> .
meets à
worst pire
transparent transparent
documented documentée
handedness commissaire
amazed étonné
rationalize rationaliser
squander dilapident
clawed gouvernement
mud salir
558 558
flagged signalées
boardwalk promenade
Disclosure la


In [6]:
# Persist the learned lexicon probabilities as JSON so they can be reloaded
# without retraining. The with-block closes (and therefore flushes) the file;
# the previous bare open() left the handle open, so the dump could be
# incomplete on disk until the kernel exited.
fname = 'probs_ibm1_iter30.txt'
with open(fname, 'w') as file:
    json.dump(lprobs, file)

In [None]:
# Reload the lexicon probabilities from the JSON file named by fname, closing
# the file handle once parsing is done.
# NOTE(review): the variable is called "iter_10" but fname points at the
# 30-iteration dump ('probs_ibm1_iter30.txt') — presumably a leftover name;
# confirm before renaming, since later cells may use it.
with open(fname) as file:
    probs_ibm1_iter_10 = json.load(file)

In [8]:
# Display the AER values returned by the training run's ibm1.EM call.
AERs

[0.32007759456838025]

In [7]:
# Save the training metrics as the plain str() of each object, one file per
# metric. Each write() call's return value (characters written) is echoed in
# the cell output.
for metric_fname, metric_values in (
    ("loglhoods_ibm1_iter30.txt", log_lhoods),
    ("AERs_ibm1_iter30.txt", AERs),
):
    with open(metric_fname, "w") as file:
        file.write(str(metric_values))

21

21

In [None]:
# Parse the first line of the file as a Python literal.
# ast.literal_eval accepts only literals (lists, numbers, strings, ...), so
# unlike the eval() used previously it cannot execute code that happens to be
# in the file.
# NOTE(review): "test.txt" is not written by any visible cell — presumably this
# was meant to read back one of the *_ibm1_iter30.txt metric files, which are
# saved as str(list); confirm the intended path.
with open("test.txt", "r") as file:
    data2 = ast.literal_eval(file.readline())