In [1]:
import math
import datasets
import IBM1 as ibm1
import aer
from align import align

# Make IPython display the value of every bare expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Some helper functions to print useful information after each 
# EM iteration

def print_likelihood(i, lprobs, log_likelihood):
    """Print one progress row: iteration, log-likelihood and its exponential.

    A column header is printed before the row when ``i == 1``. Both numeric
    columns are formatted with three decimals.

    Args:
        i: Iteration number; the header is emitted only when it equals 1.
        lprobs: Not used here; accepted so the signature matches
            ``print_lexicon_probs``.
        log_likelihood: Value to report; ``math.exp`` of it fills the last
            column.
    """
    # Computed before anything is printed, as a failure in math.exp must not
    # leave a dangling header line.
    likelihood = math.exp(log_likelihood)
    if i == 1:
        header = 'iteration  log_likelihood  likelihood'
        print(header)
    row = f'{i} {log_likelihood:.3f} {likelihood:.3f}'
    print(row)

def print_lexicon_probs(i, lprobs, log_likelihood):
    """Print every strictly positive entry of a two-level mapping.

    Each printed line holds the outer key, the inner key and the stored
    value, separated by spaces. A blank line is printed after all entries.

    Args:
        i: Not used here; accepted so the signature matches
            ``print_likelihood``.
        lprobs: Mapping of outer key -> mapping of inner key -> number.
        log_likelihood: Not used here.
    """
    for s_word, translations in lprobs.items():
        for t_word, prob in translations.items():
            # Entries that are zero (or negative) are left out.
            if prob > 0:
                print(s_word, t_word, prob)
    print()

In [12]:
# Run EM on the toy example.
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data()
s_t_pairs  # shown in the output because ast_node_interactivity is "all"
# print_likelihood is handed to EM as its fourth argument; the recorded
# output shows it printing one row per iteration.
lprobs = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 print_likelihood)
lprobs

[(['das', 'Haus'], ['the', 'house']),
 (['das', 'Buch'], ['the', 'book']),
 (['ein', 'Buch'], ['a', 'book'])]

iteration  log_likelihood  likelihood
1 -4.159 0.016
2 -1.151 0.316
3 -0.842 0.431
4 -0.586 0.557
5 -0.390 0.677
6 -0.252 0.777
7 -0.159 0.853
8 -0.100 0.905
9 -0.062 0.940
10 -0.039 0.961
11 -0.025 0.975
12 -0.017 0.983
13 -0.012 0.989
14 -0.008 0.992
15 -0.006 0.994
16 -0.005 0.995
17 -0.004 0.996
18 -0.003 0.997
19 -0.003 0.997
20 -0.002 0.998


defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>>,
            {'Buch': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<lambda>>,
                         {'a': 9.729458618407092e-06,
                          'book': 0.9999876443477214,
                          'the': 2.6261936603029755e-06}),
             'Haus': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<lambda>>,
                         {'house': 0.9684579278295524,
                          'the': 0.031542072170447605}),
             'das': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<lambda>>,
                         {'book': 2.6261936603029776e-06,
                          'house': 9.729458618407095e-06,
                          'the': 0.9999876443477212}),
             'ein': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<l

In [13]:
# Run EM on the toy example with NULL words.
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data_null_words()
s_t_pairs
lprobs = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, print_likelihood)
lprobs

# QUESTION: how can likelihood be bigger than 1?
# NOTE(review): in the recorded run the printed log_likelihood turns positive
# from iteration 12 on, so exp(log_likelihood) exceeds 1. A probability cannot
# do that, so the value reported by ibm1.EM is presumably not a normalised
# log-probability -- TODO check how it is computed in IBM1.

# Example alignment.
# Alignment positions start at 1; the NULL word is ignored.
# NOTE(review): judging by the recorded result {(1, 3), (3, 4), (4, 1)}, each
# pair looks like (target position, source position) -- confirm against align.
sentence_pair = (
    ['<NULL>', 'Buch', 'grose', 'das', 'Haus'], 
    ['the', 'small', 'house', 'book']
)
sentence_pair
align(lprobs, sentence_pair) 

[(['<NULL>', 'das', 'Haus'], ['the', 'house']),
 (['<NULL>', 'das', 'Buch'], ['the', 'book']),
 (['<NULL>', 'ein', 'Buch'], ['a', 'book']),
 (['<NULL>', 'ein', 'Haus'], ['a', 'small', 'house']),
 (['<NULL>', 'mein', 'Buch'], ['my', 'small', 'book'])]

iteration  log_likelihood  likelihood
1 -8.318 0.000
2 -3.214 0.040
3 -2.152 0.116
4 -1.373 0.253
5 -0.865 0.421
6 -0.544 0.580
7 -0.338 0.713
8 -0.202 0.817
9 -0.109 0.896
10 -0.045 0.956
11 -0.000 1.000
12 0.031 1.032
13 0.054 1.055
14 0.070 1.072
15 0.081 1.085
16 0.090 1.094
17 0.096 1.101
18 0.101 1.106
19 0.104 1.110
20 0.107 1.113


defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>>,
            {'<NULL>': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<lambda>>,
                         {'a': 0.0049385156306393915,
                          'book': 0.23131096714592353,
                          'house': 0.0047664944727924,
                          'my': 7.676407527728933e-07,
                          'small': 0.756279473085454,
                          'the': 0.002703782024438147}),
             'Buch': defaultdict(<function IBM1._initialize_lexicon_probabilities.<locals>.<lambda>.<locals>.<lambda>>,
                         {'a': 2.2861551201309313e-08,
                          'book': 0.9999961375254085,
                          'my': 3.3186398260819336e-06,
                          'small': 5.087463490999096e-07,
                          'the': 1.2226865092224424e-08}),
             'Haus': defaultdict(<function IBM1._initialize_lexic

(['<NULL>', 'Buch', 'grose', 'das', 'Haus'], ['the', 'small', 'house', 'book'])

{(1, 3), (3, 4), (4, 1)}

In [None]:
# Run EM on training data set
s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
lprobs = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, print_likelihood)

In [None]:
# NOTE(review): `predictions` is not assigned in any cell visible here --
# confirm where it is defined before running this cell.
predictions[0:3]
# predictions
# NOTE(review): the literal below looks like a sketch of the expected layout
# (a list with one set of (i, j) tuples per entry), not real data. As written
# it is still evaluated: `...` is the Ellipsis object and `{}` is an empty
# dict, not an empty set.
[
    {(5,7), (1,2), ...},
    {},
    ...
]

In [None]:
# gold_sets[0:3]
# e.g. (17, 15),
#      (17, 18),
# (x, y) means x generates y

In [None]:
# Relative path of the alignment file handed to aer.read_naacl_alignments;
# it is resolved against the current working directory.
path = 'validation/dev.wa.nonullalign'

gold_sets = aer.read_naacl_alignments(path)
# Display one element of the loaded data.
# NOTE(review): the layout of gold_sets is not visible here -- confirm what
# index [5][1] selects.
gold_sets[5][1]
