In [1]:
import math
import datasets
import IBM1 as ibm1
#import aer

# Display the value of every top-level expression in a cell, not only the
# last one (e.g. the bare `s_t_pairs` lines in the cells below).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Helper functions to output useful information after each 
# EM iteration

def print_likelihood(i, lprobs, log_likelihood, aer):
    """Print one row of the per-iteration EM progress table.

    Used in this notebook as the after-iteration callback passed to
    ``ibm1.EM``. A header row is printed first when ``i == 1``.

    Args:
        i: Iteration number; the header is emitted when it equals 1.
        lprobs: Unused here; accepted so the signature matches
            ``print_lexicon_probs``.
        log_likelihood: Log-likelihood reported for this iteration.
        aer: Alignment error rate reported for this iteration.
    """
    try:
        likelihood = math.exp(log_likelihood)
    except OverflowError:
        # math.exp raises for arguments above ~709.78. A reporting helper
        # must not abort the EM run, so show the value as infinity instead.
        likelihood = math.inf
    if i == 1:
        print('iteration  log_likelihood  likelihood  AER')
    print(f'{i} {log_likelihood:.3f} {likelihood:.3f} {aer:.3f}')

def print_lexicon_probs(i, lprobs, log_likelihood, aer):
    """Print every strictly positive entry of a nested probability table.

    ``lprobs`` is a two-level mapping (outer key -> inner key -> value).
    Each positive entry is printed as ``outer inner value`` on its own line,
    followed by a single blank line. ``i``, ``log_likelihood`` and ``aer``
    are ignored; they are accepted so the signature matches
    ``print_likelihood``.
    """
    positive_entries = (
        (outer_key, inner_key, prob)
        for outer_key, row in lprobs.items()
        for inner_key, prob in row.items()
        if prob > 0
    )
    for entry in positive_entries:
        print(*entry)
    print()

In [3]:
# Run EM on the toy example (no validation data is passed here).
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data()
# Bare expression: displayed in the cell output because
# ast_node_interactivity is set to "all" in the first cell.
s_t_pairs
# Only the first element of the returned triple is kept; print_likelihood is
# passed as the fn_after_iter callback and prints one table row per call.
(lprobs, _, _) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 fn_after_iter = print_likelihood)

# Dump the non-zero entries of the learned table; the callback-style
# arguments other than lprobs are not used by the printer, hence None.
print_lexicon_probs(None, lprobs, None, None)

[(['das', 'Haus'], ['the', 'house']),
 (['das', 'Buch'], ['the', 'book']),
 (['ein', 'Buch'], ['a', 'book'])]

iteration  log_likelihood  likelihood  AER
1 -4.159 0.016 0.000
2 -1.151 0.316 0.000
3 -0.842 0.431 0.000
4 -0.586 0.557 0.000
5 -0.390 0.677 0.000
6 -0.252 0.777 0.000
7 -0.159 0.853 0.000
8 -0.100 0.905 0.000
9 -0.062 0.940 0.000
10 -0.039 0.961 0.000
11 -0.025 0.975 0.000
12 -0.017 0.983 0.000
13 -0.012 0.989 0.000
14 -0.008 0.992 0.000
15 -0.006 0.994 0.000
16 -0.005 0.995 0.000
17 -0.004 0.996 0.000
18 -0.003 0.997 0.000
19 -0.003 0.997 0.000
20 -0.002 0.998 0.000
das the 0.9999876443477212
das house 9.729458618407095e-06
das book 2.6261936603029776e-06
Haus the 0.031542072170447605
Haus house 0.9684579278295524
Buch the 2.6261936603029755e-06
Buch book 0.9999876443477214
Buch a 9.729458618407092e-06
ein a 0.9684579278295524
ein book 0.031542072170447605



In [4]:
# Run EM on the toy example with NULL words, passing a one-sentence
# validation set and its reference alignments to ibm1.EM.
val_sentence_pairs = [(
    ['<NULL>', 'Buch', 'klein', 'das', 'Haus'], 
    ['the', 'small', 'house', 'book']
)]
# One entry per validation pair, holding two identical link sets.
# NOTE(review): presumably the (sure, probable) gold alignments, with links
# written as (target position, source position) counted from 1 and '<NULL>'
# at source position 0 -- confirm against ibm1 / aer.
ref_alignments = [[
    {(1, 3), (2, 2), (3, 4), (4, 1)}, 
    {(1, 3), (2, 2), (3, 4), (4, 1)}
]]
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data_null_words()
# Bare expression: displayed because ast_node_interactivity is "all".
s_t_pairs
(lprobs, _, _) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 val_sentence_pairs, ref_alignments, print_likelihood)
print_lexicon_probs(None, lprobs, None, None)

# TODO / open question: the log-likelihood printed for this run becomes
# positive from iteration 12 onwards, i.e. the "likelihood" exceeds 1, which a
# probability cannot do. Check how ibm1.EM computes the reported value.

# Example alignment (disabled):
# alignment positions start at 1, the NULL word is ignored
#sentence_pair
#alignment = ibm1.align(lprobs, sentence_pair)
#alignment
#{(1, 3), (3, 4), (4, 1)}


[(['<NULL>', 'das', 'Haus'], ['the', 'house']),
 (['<NULL>', 'das', 'Buch'], ['the', 'book']),
 (['<NULL>', 'ein', 'Buch'], ['a', 'book']),
 (['<NULL>', 'ein', 'Haus'], ['a', 'small', 'house']),
 (['<NULL>', 'mein', 'Buch'], ['my', 'small', 'book'])]

iteration  log_likelihood  likelihood  AER
1 -8.318 0.000 0.250
2 -3.214 0.040 0.143
3 -2.152 0.116 0.143
4 -1.373 0.253 0.143
5 -0.865 0.421 0.143
6 -0.544 0.580 0.143
7 -0.338 0.713 0.143
8 -0.202 0.817 0.143
9 -0.109 0.896 0.143
10 -0.045 0.956 0.143
11 -0.000 1.000 0.143
12 0.031 1.032 0.143
13 0.054 1.055 0.143
14 0.070 1.072 0.143
15 0.081 1.085 0.143
16 0.090 1.094 0.143
17 0.096 1.101 0.143
18 0.101 1.106 0.143
19 0.104 1.110 0.143
20 0.107 1.113 0.143
<NULL> the 0.002703782024438147
<NULL> house 0.0047664944727924
<NULL> book 0.23131096714592353
<NULL> a 0.0049385156306393915
<NULL> small 0.756279473085454
<NULL> my 7.676407527728933e-07
das the 0.9999984381034837
das house 1.5359743961192934e-06
das book 2.5922120144793404e-08
Haus the 4.917775659561484e-07
Haus house 0.9992092174629266
Haus a 8.766652745862759e-07
Haus small 0.0007894140942328188
Buch the 1.2226865092224424e-08
Buch book 0.9999961375254085
Buch a 2.2861551201309313e-08
Buch my 3.3186398260819336e-06
Buch sma

In [None]:
# Run EM on the training data set, passing the validation sentence pairs and
# their NAACL-format reference alignments to ibm1.EM.
# NOTE(review): `aer` is imported here rather than in the first cell, where
# the same import is commented out -- consider moving it to the top.
import aer

# Relative path: resolved against the notebook's working directory.
val_naacl_path = 'validation/dev.wa.nonullalign'
val_sentence_pairs, _, _ = datasets.validation_data()
reference_alignments = aer.read_naacl_alignments(val_naacl_path)    

s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
# All three returned values are kept this time.
# NOTE(review): presumably log_lhoods and AERs hold one value per EM
# iteration -- confirm against ibm1.EM.
(lprobs, log_lhoods, AERs) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, val_sentence_pairs,
                 reference_alignments, print_likelihood)