In [1]:
import math
import datasets
import debug_helpers

import collections
import aer

# pretty print variabeles on line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [103]:
def _initialize_jump_probs(sentence_pairs):
    max_jump = 0
    for (s_sentence, t_sentence) in sentence_pairs:
        s_length = len(s_sentence) - 1 #ignore NULL
        t_length = len(t_sentence)
        jump_1 = abs(1 - math.floor(s_length))
        jump_2 = abs(s_length - math.floor(s_length/t_length))
        max_jump = max(jump_1, jump_2, max_jump)
    init_prob = 1. / (2*max_jump + 1 + 1) # last item is special NULL jump prob
    return [init_prob] * (2*max_jump + 1 + 1)

# s_pos and s_length for source sentence including special NULL word
def get_jump_prob_index(s_pos, t_pos, s_length, t_length, jump_probs):
    if s_pos == 0:
        return len(jump_probs) - 1
    jump = int(s_pos - math.floor((t_pos + 1) * (s_length - 1) / t_length))
    max_jump = int((len(jump_probs) - 2)/2)
    jump_prob_index = jump + max_jump
    if jump_prob_index < 0:
        raise IndexError('Index is expected to be positive.')
    return jump_prob_index

def get_jump_prob(s_pos, t_pos, s_length, t_length, jump_probs):
    jump_prob_index = get_jump_prob_index(s_pos, t_pos, s_length, t_length, jump_probs)
    return jump_probs[jump_prob_index]

In [111]:
def get_alignment_prob(s_pos, t_pos, s_length, t_length, jump_probs):
    jump_prob = get_jump_prob(
        s_pos, t_pos, s_length, t_length, jump_probs)
    sum_jump_probs = sum([
        get_jump_prob(
            s_word_pos, t_pos, s_length, t_length, jump_probs
        ) for s_word_pos in range(s_length)])
    return jump_prob / sum_jump_probs

In [2]:
# {Hause: { book:0.25, ...}, ...}
# read: the probability of 'book' given 'Haus' is 0.25
def _initialize_lexicon_probabilities(source_vocabulary, target_vocabulary):
    p_init = 1./len(target_vocabulary)
    lexicon_probabilities = collections.defaultdict(
        lambda: collections.defaultdict(lambda: p_init))
    return lexicon_probabilities

In [3]:
def _likelihood_target_word(s_sentence, t_word, lprobs):
    return sum([lprobs[s_word][t_word] for s_word in s_sentence])

In [7]:
def align(lprobs, sentence_pairs):
    if isinstance(sentence_pairs, tuple):
        return _align_sentence_pair(lprobs, sentence_pairs)
    return [ _align_sentence_pair(lprobs, sentence_pair) for sentence_pair in sentence_pairs ]

def _align_sentence_pair(lprobs, sentence_pair):
    s_sentence = sentence_pair[0]
    t_sentence = sentence_pair[1]
    best_alignment = set()
    for j, t_word in enumerate(t_sentence):
        best_align_prob = -1
        best_align_pos = -1
        for i, s_word in enumerate(s_sentence):
            if s_word not in lprobs.keys() or t_word not in lprobs[s_word].keys():
                continue # ignore unseen source and target words
            align_prob = lprobs[s_word][t_word] #p(t|s)
            if align_prob >= best_align_prob:
                best_align_pos = i
                best_align_prob = align_prob
        if (best_align_pos > 0): # Leave out NULL-alignments (and alignments between unseen words)
            best_alignment.add((best_align_pos, j + 1)) # word positions start at 1
    return best_alignment


In [8]:
def EM(s_t_pairs, s_vocabulary, t_vocabulary, max_iterations = 10,
        val_sentence_pairs = None, reference_alignments = None, fn_debug = None):
    lprobs = _initialize_lexicon_probabilities(s_vocabulary, t_vocabulary)
    i = 0
    log_likelihoods = []
    AERs = []
    while i < max_iterations:
        # initialize
        log_likelihood = 0
        AER = 0
        counts_t_given_s = collections.defaultdict(lambda: collections.defaultdict(int))
        total_s = collections.defaultdict(int)

        # calculate counts and log likelihood
        for (s_sentence, t_sentence) in s_t_pairs:
            s_length = len(s_sentence)
            t_length = len(t_sentence)
            for t_pos, t_word in enumerate(t_sentence):
                # normalization factor
                s_total_t = _likelihood_target_word(s_sentence, t_word, lprobs)
                # likelihood_target_position: sum thing
                # s_total_t = product of both
                log_likelihood += math.log(s_total_t)
                for s_pos, s_word in enumerate(s_sentence):
                    update = lprobs[s_word][t_word]/s_total_t
                    counts_t_given_s[s_word][t_word] += update
                    total_s[s_word] += update
        
        # store log_likelihood and AER values
        log_likelihoods.append(log_likelihood)
        if val_sentence_pairs and reference_alignments:
            predicted_alignments = align(lprobs, val_sentence_pairs)
            AER = aer.calculate_AER(reference_alignments, predicted_alignments)
            AERs.append(AER)

        # print debug info
        if fn_debug:
            fn_debug(i, lprobs, log_likelihood, AER)

        # update probabilities
        for s in lprobs.keys():
            for t in lprobs[s].keys():
                lprobs[s][t] = counts_t_given_s[s][t]/total_s[s]

        # update iteration number
        i += 1
    return lprobs, log_likelihoods, AERs


In [9]:
# Run EM on toy example
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data()
s_t_pairs
(lprobs, _, _) = EM(s_t_pairs, s_vocabulary, t_vocabulary, 
                 fn_debug = debug_helpers.print_likelihood)

debug_helpers.print_lexicon_probs(None, lprobs, None, None)

[(['das', 'Haus'], ['the', 'house']),
 (['das', 'Buch'], ['the', 'book']),
 (['ein', 'Buch'], ['a', 'book'])]

iteration  log_likelihood  likelihood  AER
0 -4.159 0.016 0.00000
1 -1.151 0.316 0.00000
2 -0.842 0.431 0.00000
3 -0.586 0.557 0.00000
4 -0.390 0.677 0.00000
5 -0.252 0.777 0.00000
6 -0.159 0.853 0.00000
7 -0.100 0.905 0.00000
8 -0.062 0.940 0.00000
9 -0.039 0.961 0.00000
das the 0.9933053397165424
das house 0.0046110887522225865
das book 0.0020835715312351013
Haus the 0.08276408100718743
Haus house 0.9172359189928124
Buch the 0.0020835715312350995
Buch book 0.9933053397165423
Buch a 0.004611088752222586
ein a 0.9172359189928124
ein book 0.08276408100718743



In [10]:
# Run EM on toy example with NULL words
val_sentence_pairs = [(
    ['<NULL>', 'Buch', 'klein', 'das', 'Haus'], 
    ['the', 'small', 'house', 'book']
)]
ref_alignments = [[
    {(3, 1), (2, 2), (4, 3), (1, 4)}, 
    {(3, 1), (2, 2), (4, 3), (1, 4)}
]]
s_t_pairs, s_vocabulary, t_vocabulary = datasets.example_data_null_words()
s_t_pairs
(lprobs, _, _) = EM(
    s_t_pairs, s_vocabulary, t_vocabulary, 20,
    val_sentence_pairs, ref_alignments, debug_helpers.print_likelihood)
debug_helpers.print_lexicon_probs(None, lprobs, None, None)

[(['<NULL>', 'das', 'Haus'], ['the', 'house']),
 (['<NULL>', 'das', 'Buch'], ['the', 'book']),
 (['<NULL>', 'ein', 'Buch'], ['a', 'book']),
 (['<NULL>', 'ein', 'Haus'], ['a', 'small', 'house']),
 (['<NULL>', 'mein', 'Buch'], ['my', 'small', 'book'])]

iteration  log_likelihood  likelihood  AER
0 -8.318 0.000 0.75000
1 -3.214 0.040 0.25000
2 -2.152 0.116 0.14286
3 -1.373 0.253 0.14286
4 -0.865 0.421 0.14286
5 -0.544 0.580 0.14286
6 -0.338 0.713 0.14286
7 -0.202 0.817 0.14286
8 -0.109 0.896 0.14286
9 -0.045 0.956 0.14286
10 -0.000 1.000 0.14286
11 0.031 1.032 0.14286
12 0.054 1.055 0.14286
13 0.070 1.072 0.14286
14 0.081 1.085 0.14286
15 0.090 1.094 0.14286
16 0.096 1.101 0.14286
17 0.101 1.106 0.14286
18 0.104 1.110 0.14286
19 0.107 1.113 0.14286
<NULL> the 0.002703782024438147
<NULL> house 0.0047664944727924
<NULL> book 0.23131096714592353
<NULL> a 0.0049385156306393915
<NULL> small 0.756279473085454
<NULL> my 7.676407527728933e-07
das the 0.9999984381034837
das house 1.5359743961192934e-06
das book 2.5922120144793404e-08
Haus the 4.917775659561484e-07
Haus house 0.9992092174629266
Haus a 8.766652745862759e-07
Haus small 0.0007894140942328188
Buch the 1.2226865092224424e-08
Buch book 0.9999961375254085
Buch a 2.2861551201309313e-08


In [None]:
# Run EM on training data set with AER on validation set
val_sentence_pairs, _, _ = datasets.validation_data()
reference_alignments = datasets.validation_alignments()    

s_t_pairs, s_vocabulary, t_vocabulary = datasets.training_data()
(lprobs, log_lhoods, AERs) = EM(
    s_t_pairs, s_vocabulary, t_vocabulary, 30,
    val_sentence_pairs, reference_alignments, 
    debug_helpers.print_likelihood)