In [1]:
import math
import datasets
import IBM1 as ibm1
#import aer

# pretty print variabeles on line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Helper functions to output useful information after each 
# EM iteration

def print_likelihood(i, lprobs, log_likelihood, aer):
    likelihood = math.exp(log_likelihood)
    if i == 1:
        print('iteration  log_likelihood  likelihood  AER')
    print(f'{i} {log_likelihood:.3f} {likelihood:.3f} {aer:.5f}')

def print_lexicon_probs(i, lprobs, log_likelihood, aer):
    for s in lprobs.keys():
        for t in lprobs[s].keys():
            if lprobs[s][t] > 0:
                print (s, t, lprobs[s][t])
    print()

In [None]:
# Run EM on toy example
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data()
s_t_pairs
(lprobs, _, _) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 fn_after_iter = print_likelihood)

print_lexicon_probs(None, lprobs, None, None)

In [3]:
# Run EM on toy example with NULL words
val_sentence_pairs = [(
    ['<NULL>', 'Buch', 'klein', 'das', 'Haus'], 
    ['the', 'small', 'house', 'book']
)]
ref_alignments = [[
    {(1, 3), (2, 2), (3, 4), (4, 1)}, 
    {(1, 3), (2, 2), (3, 4), (4, 1)}
]]
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data_null_words()
s_t_pairs
(lprobs, _, _) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 val_sentence_pairs, ref_alignments, print_likelihood)
print_lexicon_probs(None, lprobs, None, None)

# QUESTION: how can likelihood be bigger than 1?

# example alignment
# alignment positions start at 1, NULL word is ignored 
#sentence_pair
#alignment = ibm1.align(lprobs, sentence_pair)
#alignment
#{(1, 3), (3, 4), (4, 1)}


[(['<NULL>', 'das', 'Haus'], ['the', 'house']),
 (['<NULL>', 'das', 'Buch'], ['the', 'book']),
 (['<NULL>', 'ein', 'Buch'], ['a', 'book']),
 (['<NULL>', 'ein', 'Haus'], ['a', 'small', 'house']),
 (['<NULL>', 'mein', 'Buch'], ['my', 'small', 'book'])]

iteration  log_likelihood  likelihood  AER
1 -8.318 0.000 0.25000
2 -3.214 0.040 0.14286
3 -2.152 0.116 0.14286
4 -1.373 0.253 0.14286
5 -0.865 0.421 0.14286
6 -0.544 0.580 0.14286
7 -0.338 0.713 0.14286
8 -0.202 0.817 0.14286
9 -0.109 0.896 0.14286
10 -0.045 0.956 0.14286
<NULL> the 0.02257393014477732
<NULL> house 0.03853211664967519
<NULL> book 0.25879136482282894
<NULL> a 0.04036246446938904
<NULL> small 0.6388299741911422
<NULL> my 0.0009101497221873396
das the 0.9982745611336503
das house 0.001521101649152167
das book 0.00020433721719753526
Haus the 0.0005024136009538736
Haus house 0.9648451634494088
Haus a 0.0008770198645035952
Haus small 0.03377540308513371
Buch the 9.294634178534926e-05
Buch book 0.9952068965931067
Buch a 0.00017007146758518473
Buch my 0.0035000676358473257
Buch small 0.0010300179616753772
ein a 0.9665889462253223
ein book 0.00012498978556698626
ein small 0.03230203858017701
ein house 0.0009840254089337642
mein my 0.7703666901172609
mein small 0.22670748409840

In [None]:
# Run EM on training data set (no AER)

s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
(lprobs, log_lhoods, _) = ibm1.EM(s_t_pairs, s_vocabulary, 
        t_vocabulary, fn_after_iter = print_likelihood)

In [4]:
# Run EM on training data set
import aer

val_naacl_path = 'validation/dev.wa.nonullalign'
val_sentence_pairs, _, _ = datasets.validation_data()
reference_alignments = aer.read_naacl_alignments(val_naacl_path)    

s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
(lprobs, log_lhoods, AERs) = ibm1.EM(s_t_pairs, s_vocabulary, t_vocabulary, val_sentence_pairs,
                 reference_alignments, print_likelihood)

iteration  log_likelihood  likelihood  AER
1 -34858025.333 0.000 0.79829
2 -8405523.373 0.000 0.79195
3 -5424561.055 0.000 0.79038
4 -4514727.763 0.000 0.78613
5 -4203772.133 0.000 0.78998
6 -4064695.086 0.000 0.79191
7 -3991189.402 0.000 0.78861
8 -3947933.360 0.000 0.78785
9 -3920480.549 0.000 0.79094
10 -3902021.286 0.000 0.79094


In [15]:
def print_example_alignments(lprobs, n = 100):
    split = math.floor(len(lprobs.keys())/n)
    i = 0
    for t_word in lprobs.keys():
        if i%split == 0:
            max_sword = max(lprobs[t_word], key=lprobs[t_word].get)
            print (t_word, max_sword)
        i += 1
print_example_alignments(lprobs)

<NULL> .
industrialized industrialisés
billion milliards
informed informé
responsible responsable
Honourable honorable
bravery bravoure
Acadian acadienne
hometown ville
places endroits
reorganizing ait
scepticism scepticisme
writing écrit
broader plus
outcome résultat
declining baisse
reminds rappelle
Mouton à
deliberate délibérée
218 218
math calcul
hide cacher
solvents solvants
severely gravement
rusting attaqué
nefarious savais
abdication abdication
hepatitis hépatite
Cross Croix
scapegoat bouc
minority minorité
orientations orientations
televisions allumaient
offensive choquant
Page Page
diesel diesel
simplified simplifier
dash clignotant
readjustment un
1947 1947
Truro Truro
Blais Blais
LAKES GRANDS
tracks inspire
5,500 5
wasted gaspillé
Jimmy Jimmy
Zarya module
minions laquais
galling exaspérant
infusion injection
Leslie Leslie
zealous exagérée
Thai thaïlandais
Regulation acquisition
1.65 1,65
ploughing engloutir
resemble même
signifying qui
Zaltzman Zaltzman
Job pour
Vastel Vast

In [6]:
ef_pairs = val_sentence_pairs[5:15]
ef_reference_alignments = reference_alignments[5:15]
ef_predicted_alignments = ibm1.align(lprobs, ef_pairs)
AER = aer.calculate_AER(ef_reference_alignments, ef_predicted_alignments)



In [7]:
AER

0.8161764705882353

In [14]:
print(ef_pairs[0][0])
print(ef_pairs[0][1])

['<NULL>', 'I', 'never', 'met', 'a', 'street', 'hooker', 'who', 'wanted', 'to', 'be', 'there', '.']
['je', 'ne', 'ai', 'jamais', 'rencontré', 'une', 'seule', 'prostituée', 'de', 'rue', 'qui', 'voulait', 'exercer', 'un', 'tel', 'métier', '.']


In [9]:
ef_predicted_alignments[0]

{(1, 1),
 (2, 2),
 (3, 1),
 (4, 2),
 (5, 3),
 (6, 4),
 (7, 3),
 (8, 1),
 (9, 9),
 (10, 5),
 (11, 7),
 (12, 8),
 (13, 5),
 (14, 4),
 (15, 2),
 (16, 1),
 (17, 12)}

In [10]:
ef_reference_alignments[0]

[{(1, 1), (2, 2), (2, 4), (4, 6), (5, 10), (6, 8), (7, 11), (8, 12), (12, 17)},
 {(1, 1),
  (2, 2),
  (2, 4),
  (3, 3),
  (3, 5),
  (4, 6),
  (5, 9),
  (5, 10),
  (6, 8),
  (7, 11),
  (8, 12),
  (9, 13),
  (9, 14),
  (9, 15),
  (9, 16),
  (10, 13),
  (10, 14),
  (10, 15),
  (10, 16),
  (11, 13),
  (11, 14),
  (11, 15),
  (11, 16),
  (12, 17)}]

In [5]:
import json
fname = 'probs_ibm1_iter10.txt'
json.dump(lprobs, open(fname,'w'))

In [None]:
probs_ibm1_iter_10 = json.load(open(fname))