__Word Alignment Assignment__

Your task is to learn word alignments for the data provided with this Python Notebook. 

Start by running the 'train' function below and implementing the assertions which will fail. Then consider the following improvements to the baseline model:
* Is the TranslationModel parameterized efficiently?
* What form of PriorModel would help here? (Currently the PriorModel is uniform.)
* How could you use a Hidden Markov Model to model word alignment indices? (There's an implementation of simple HMM below to help you start.)
* How could you initialize more complex models from simpler ones?
* How could you model words that are not aligned to anything?

Grades will be assigned as follows*:

 AER below on blinds   |  Grade 
----------|-------------
 0.5 - 0.6 |   1 
 0.4 - 0.5 |   2 
 0.35 - 0.4 |  3    
 0.3 - 0.35 |  4    
 0.25 - 0.3 |  5   
 
You should save the notebook with the final scores for 'dev' and 'test' test sets.

*__Note__: Students who submitted a version of this assignment last year will have a 0.05 AER handicap, i.e to get a grade of 5, they will need to get an AER below 0.25.


In [1]:
# This cell contains the generative models that you may want to use for word alignment.
# Currently only the TranslationModel is at all functional.

import numpy as np
from collections import defaultdict
from copy import deepcopy

class TranslationModel:
    "Models conditional distribution over trg words given a src word."

    def __init__(self, src_corpus, trg_corpus, identity_matrix, hard_align=False):
        self.identity_matrix = identity_matrix
        self.num_unique_src_tokens = identity_matrix.shape[0]
        self.num_unique_trg_tokens = identity_matrix.shape[1]
        self._trg_given_src_probs = np.ones((self.num_unique_src_tokens,
                                             self.num_unique_trg_tokens)) / self.num_unique_trg_tokens
        self._src_trg_counts = np.zeros((self.num_unique_src_tokens, self.num_unique_trg_tokens))
        self.hard_align = hard_align

    def get_params(self):
        return self._trg_given_src_probs

    def get_conditional_prob(self, src_token, trg_token):
        "Return the conditional probability of trg_token given src_token."
        return self._trg_given_src_probs[src_token][trg_token]

    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        "Returns matrix with t[i][j] = p(f_j|e_i)."
        return self._trg_given_src_probs[np.ix_(src_tokens, trg_tokens)]

    def collect_statistics(self, src_tokens, trg_tokens, posterior_matrix, hmm=False):
        "Accumulate counts of translations from: posterior_matrix[j][i] = p(a_j=i|e, f)"
#         assert posterior_matrix.shape == (len(trg_tokens), len(src_tokens))
        # assert False, "Implement collection of statistics here."
        self._src_trg_counts[np.ix_(src_tokens, trg_tokens)] += posterior_matrix
       
        
    def recompute_parameters(self):
        "Reestimate parameters and reset counters."
        # assert False, "Implement reestimation of parameters from counters here."
        self._trg_given_src_probs = self._src_trg_counts / np.sum(self._src_trg_counts, axis=1, keepdims=True)
        self._src_trg_counts = np.zeros((self.num_unique_src_tokens, self.num_unique_trg_tokens))
        if self.hard_align:
            self._trg_given_src_probs[self.identity_matrix.row, self.identity_matrix.col] = 1.0


class PriorModel:
    "Models the prior probability of an alignment given only the sentence lengths and token indices."

    def __init__(self, src_corpus, trg_corpus):
        "Add counters and parameters here for more sophisticated models."
        self._distance_counts = {}
        self._distance_probs = {}

    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        return np.ones((src_length, trg_length)) * 1.0 / src_length
    
    def get_prior_prob(self, src_index, trg_index, src_length, trg_length):
        "Returns a uniform prior probability."
        return 1.0 / src_length

    def collect_statistics(self, src_length, trg_length, posterior_matrix):
        "Extract the necessary statistics from this matrix if needed."
        pass

    def recompute_parameters(self):
        "Reestimate the parameters and reset counters."
        pass
    
    
class ComplexPriorModel:
    "Models the prior probability of an alignment given the sentence lengths and token indices."

    def __init__(self, src_corpus, trg_corpus, use_null=False,
                 src_phi=0.5, trg_phi=0.5, src_null_index=0, trg_null_index=0):
        "Add counters and parameters here for more sophisticated models."
        self.num_src_indices = np.max(list(map(len, src_corpus)))
        self.num_trg_indices = np.max(list(map(len, trg_corpus)))
        self._distance_counts = defaultdict(lambda: 
                                            np.zeros((self.num_src_indices,
                                                      self.num_trg_indices)))
        self._distance_probs = defaultdict(lambda:
                                           np.ones((self.num_src_indices,
                                                    self.num_trg_indices)) / self.num_trg_indices)
        self.src_phi = src_phi
        self.trg_phi = trg_phi
        self.src_null_index = src_null_index
        self.trg_null_index = trg_null_index
        self.use_null = use_null

    def get_prior_prob(self, src_index, trg_index, src_length, trg_length):
        "Returns a uniform prior probability."
        return self._distance_probs[(src_length, trg_length)][src_index, trg_index]

    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        return (self._distance_probs[(src_length, trg_length)]
                [np.ix_(np.arange(src_length), np.arange(trg_length))])

    def collect_statistics(self, src_tokens, trg_tokens, posterior_matrix):
        "Extract the necessary statistics from this matrix if needed."
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        src_indices = np.arange(src_length)
        trg_indices = np.arange(trg_length)
        (self._distance_counts[(src_length, trg_length)]
         [np.ix_(src_indices, trg_indices)]) += posterior_matrix

    def recompute_parameters(self):
        "Reestimate the parameters and reset counters."
        for key in self._distance_counts:
            denoms = np.sum(self._distance_counts[key], axis=0, keepdims=True)
            self._distance_probs[key] = self._distance_counts[key] / denoms
            if self.use_null:
                self._distance_probs[key][self.src_null_index, :] *= self.src_phi
                self._distance_probs[key][:self.src_null_index, :] *= (1 - self.src_phi)
                self._distance_probs[key][(self.src_null_index + 1):, :] *= (1 - self.src_phi)
                self._distance_probs[key][:, self.trg_null_index] *= self.trg_phi
                self._distance_probs[key][:, :self.trg_null_index] *= (1 - self.trg_phi)
                self._distance_probs[key][:, (self.trg_null_index + 1):] *= (1 - self.trg_phi)
            self._distance_counts[key] = np.zeros((self.num_src_indices, self.num_trg_indices))
        

class ImprovedComplexPriorModel:
    "Models the prior probability of an alignment given the sentence lengths and token indices."

    def __init__(self, src_corpus, trg_corpus, num_indices=10,
                 use_null=False, src_phi=0.5, trg_phi=0.5, src_null_index=0, trg_null_index=0):
        "Add counters and parameters here for more sophisticated models."
        self.num_src_indices = num_indices
        self.num_trg_indices = num_indices
        self._distance_counts = np.zeros((self.num_src_indices, self.num_trg_indices))
        self._distance_probs = np.ones((self.num_src_indices,
                                        self.num_trg_indices)) / self.num_trg_indices
        self.src_phi = src_phi
        self.trg_phi = trg_phi
        self.src_null_index = src_null_index
        self.trg_null_index = trg_null_index
        self.use_null = use_null

    def get_prior_prob(self, src_index, trg_index, src_length, trg_length):
        "Returns a uniform prior probability."
        return self._distance_probs[int(trg_index / trg_length * self.num_trg_indices),
                                    int(src_index / src_length * self.num_src_indices)]
    
    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        squeezed_src_indices = np.array(list(map(lambda x: int(x / src_length * self.num_src_indices),
                                        np.arange(src_length))))
        squeezed_trg_indices = np.array(list(map(lambda x: int(x / trg_length * self.num_trg_indices),
                                np.arange(trg_length))))
        return self._distance_probs[np.ix_(squeezed_src_indices, squeezed_trg_indices)]

    def collect_statistics(self, src_tokens, trg_tokens, posterior_matrix):
        "Extract the necessary statistics from this matrix if needed."
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        squeezed_src_indices = np.array(list(map(lambda x: int(x / src_length * self.num_src_indices),
                                        np.arange(src_length))))
        squeezed_trg_indices = np.array(list(map(lambda x: int(x / trg_length * self.num_trg_indices),
                                np.arange(trg_length))))
        self._distance_counts[np.ix_(squeezed_src_indices, squeezed_trg_indices)] += posterior_matrix

    def recompute_parameters(self):
        "Reestimate the parameters and reset counters."
        denoms = np.sum(self._distance_counts, axis=0, keepdims=True)
        self._distance_probs = self._distance_counts / denoms
        if self.use_null:
            self._distance_probs[self.src_null_index, :] *= self.src_phi
            self._distance_probs[:self.src_null_index, :] *= (1 - self.src_phi)
            self._distance_probs[(self.src_null_index + 1):, :] *= (1 - self.src_phi)
            self._distance_probs[:, self.trg_null_index] *= self.trg_phi
            self._distance_probs[:, :self.trg_null_index] *= (1 - self.trg_phi)
            self._distance_probs[:, (self.trg_null_index + 1):] *= (1 - self.trg_phi)
        self._distance_counts = np.zeros((self.num_src_indices, self.num_trg_indices))

class TransitionModel:
    "Models the prior probability of an alignment conditioned on previous alignment."

    def __init__(self, src_corpus, trg_corpus):
        "Add counters and parameters here for more sophisticated models."
        self.num_src_indices = np.max(list(map(len, src_corpus)))
        self.alignment_probs_given_prev = dict()
        self.alignment_counts = dict()

    def get_parameters_for_sentence_pair(self, src_tokens, trg_tokens):
        "Retrieve the parameters for this sentence pair: A[k, i] = p(a_{j} = i|a_{j-1} = k)"
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        if src_length not in self.alignment_probs_given_prev:
            self.alignment_probs_given_prev[src_length] = np.ones((src_length, src_length)) / src_length
        return self.alignment_probs_given_prev[src_length]
                
    def collect_statistics(self, src_tokens, trg_tokens, bigram_posteriors):
        "Extract statistics from the bigram posterior[i][j]: p(a_{t-1} = i, a_{t} = j| e, f)"
        src_length = len(src_tokens)
        trg_length = len(trg_tokens)
        if src_length not in self.alignment_counts:
            self.alignment_counts[src_length] = np.zeros((src_length, src_length))
        self.alignment_counts[src_length] += np.sum(bigram_posteriors, axis=2)
        
        
    def recompute_parameters(self):
        "Recompute the transition matrix"
        for length in self.alignment_counts:
            denoms = np.sum(self.alignment_counts[length], axis=0, keepdims=True)
            self.alignment_probs_given_prev[length] = self.alignment_counts[length] / denoms
            self.alignment_counts[length] = np.zeros((length, length))

In [4]:
# This cell contains the framework for training and evaluating a model using EM.

from utils import read_parallel_corpus, extract_test_set_alignments, score_alignments
from itertools import starmap
from math import log
from scipy.sparse import coo_matrix
import editdistance
import multiprocessing
import os
import functools


def infer_posteriors(src_tokens, trg_tokens, prior_model, translation_model, hmm=False):
    "Compute the posterior probability p(a_j=i | f, e) for each target token f_j given e and f."
    # HINT: An HMM will require more complex statistics over the hidden alignments.
    P = prior_model.get_parameters_for_sentence_pair(src_tokens, trg_tokens)
    T = translation_model.get_parameters_for_sentence_pair(src_tokens, trg_tokens) # t[i][j] = P(f_j|e_i)
    # assert False, "Compute the posterior distribution over src indices for each trg word."
    if hmm:
        initial_distribution = np.ones(len(src_tokens)) / len(src_tokens)
        bigram_posterior_matrix = np.zeros((len(src_tokens), len(src_tokens), len(trg_tokens)))
        unigram_posterior_matrix = np.zeros((len(trg_tokens), len(src_tokens)))
        alpha, beta, sentence_marginal_log_likelihood = forward_backward(initial_distribution, P, T)
        
        unigram_posterior_matrix = alpha * beta
        denoms = np.sum(unigram_posterior_matrix, axis=0, keepdims=True)
        unigram_posterior_matrix /= denoms
        
        bigram_posterior_matrix = (alpha[:, None, :-1] * P[:, :, None] *
                                   beta[None, :, 1:] * T[None, :, 1:])
        denoms = np.sum(bigram_posterior_matrix, axis=(0, 1), keepdims=True)
        bigram_posterior_matrix /= denoms
        return unigram_posterior_matrix, bigram_posterior_matrix, sentence_marginal_log_likelihood
    posterior_matrix = P * T
    denoms = np.sum(posterior_matrix, axis=0, keepdims=True)
    posterior_matrix /= denoms
    sentence_marginal_log_likelihood = np.sum(np.log(denoms))
    return posterior_matrix, sentence_marginal_log_likelihood

def collect_expected_statistics(src_corpus, trg_corpus, prior_model, translation_model, hmm=False):
    "E-step: infer posterior distribution over each sentence pair and collect statistics."
    corpus_log_likelihood = 0.0
    for src_tokens, trg_tokens in zip(src_corpus, trg_corpus):
        # Infer posterior
        if hmm:
            unigram_posteriors, bigram_posteriors, log_likelihood = infer_posteriors(
                src_tokens, trg_tokens, prior_model, translation_model, hmm=hmm)
            prior_model.collect_statistics(src_tokens, trg_tokens, bigram_posteriors)
            translation_model.collect_statistics(src_tokens, trg_tokens, unigram_posteriors)
        else:
            posteriors, log_likelihood = infer_posteriors(src_tokens, trg_tokens, prior_model,
                                                          translation_model, hmm=hmm)
            # Collect statistics in each model.
            prior_model.collect_statistics(src_tokens, trg_tokens, posteriors)
            translation_model.collect_statistics(src_tokens, trg_tokens, posteriors)
        # Update log prob
        corpus_log_likelihood += log_likelihood
    return corpus_log_likelihood

def estimate_models(src_corpus, trg_corpus, prior_model, translation_model,
                    num_iterations, hmm=False, use_null=False,
                    src_null_index=0, trg_null_index=0):
    "Estimate models iteratively using EM."
    for iteration in range(num_iterations):
        # E-step
        corpus_log_likelihood = collect_expected_statistics(src_corpus, trg_corpus,
                                                            prior_model, translation_model, hmm=hmm)
        # M-step
        prior_model.recompute_parameters()
        translation_model.recompute_parameters()
        if iteration > 0:
            print("corpus log likelihood: %1.3f" % corpus_log_likelihood)
            aligned_corpus = align_corpus(src_corpus, trg_corpus,
                                          prior_model, translation_model, hmm=hmm,
                                          use_null=use_null, src_null_index=src_null_index,
                                          trg_null_index=trg_null_index)
            evaluate(extract_test_set_alignments(aligned_corpus))
    return prior_model, translation_model


def get_alignments_from_posterior(posteriors, hmm=False, use_null=False,
                                  src_null_index=0, trg_null_index=0):
    "Returns the MAP alignment for each target word given the posteriors."
    # HINT: If you implement an HMM, you may want to implement a better algorithm here.
    alignments = {}
    for trg_index, src_index in enumerate(np.argmax(posteriors, 0)):
        if src_index == src_null_index or trg_index == trg_null_index:
            continue
        if use_null:
            src_index -= 1
            trg_index -= 1
        if trg_index not in alignments:
            alignments[trg_index] = {}
        alignments[trg_index][src_index] = '*'
    return alignments

def align_corpus(src_corpus, trg_corpus, prior_model, translation_model, hmm=False,
                 use_null=False, src_null_index=0, trg_null_index=0):
    "Align each sentence pair in the corpus in turn."
    aligned_corpus = []
    for src_tokens, trg_tokens in zip(src_corpus, trg_corpus):
        if hmm:
            posteriors, _, _, = infer_posteriors(src_tokens, trg_tokens, prior_model,
                                                            translation_model, hmm=hmm)
        else:
            posteriors, _ = infer_posteriors(src_tokens, trg_tokens, prior_model,
                                             translation_model, hmm=hmm)
        alignments = get_alignments_from_posterior(posteriors, hmm=hmm, use_null=use_null,
                                                   src_null_index=src_null_index,
                                                   trg_null_index=trg_null_index)
        aligned_corpus.append((src_tokens, trg_tokens, alignments))
    return aligned_corpus

def initialize_models(src_corpus, trg_corpus, identity_matrix, translation_model_cls,
                      prior_model_cls, translation_model_=None, prior_model_=None,
                      hard_align=False, **prior_params):
    prior_model = (prior_model_cls(src_corpus, trg_corpus, **prior_params)
                   if prior_model_ is None else prior_model_)
    translation_model = (translation_model_cls(src_corpus, trg_corpus, identity_matrix, hard_align)
                         if translation_model_ is None else translation_model_)
    return prior_model, translation_model

def load_lemmas(filenames):
    word_to_lemma = {}
    for filename in filenames:
        with open(filename) as fin:
            for line in fin:
                lemma, word = line.strip().split()
                word_to_lemma[word] = lemma
    return word_to_lemma
            
            
def normalize_corpus(corpus, use_null=False, null_token="<null>",
                     use_lemmas=False, lemmas_files=[], use_hashing=False, num_buckets=3000):
    if use_lemmas:
        word_to_lemma = load_lemmas(lemmas_files)
        corpus = [list(map(lambda word: word_to_lemma.get(word.lower(), word.lower()), tokens))
                  for tokens in corpus]
    unique_tokens = sorted(set(token for tokens in corpus for token in tokens))
    if use_null:
        unique_tokens = [null_token] + unique_tokens
    token_to_idx = {token: idx for idx, token in enumerate(unique_tokens)}
    null_index = token_to_idx.get(null_token, None)
    normalized_corpus = []
    for tokens in corpus:
        token_indices = [token_to_idx[token] for token in tokens]
        if use_hashing:
            offset = 1 if use_null else 0
            token_indices = [offset + (hash(token) % num_buckets) for token in tokens]
        else:
            token_indices = [token_to_idx[token] for token in tokens]
        if use_null:
            token_indices = [null_index] + token_indices
            
        normalized_corpus.append(token_indices)
    return normalized_corpus, unique_tokens, null_index

def calc_trg_indices(src_data, unique_trg_tokens, use_editdistance,
                     use_hashing, num_buckets, use_null):
    trg_indices = []
    src_idx, src_token = src_data
    offset = 1 if use_null else 0
    if use_hashing:
        trg_tokens_with_indices = map(lambda token: (offset + (hash(token) % num_buckets), token),
                                      unique_trg_tokens)
    else:
        trg_tokens_with_indices = enumerate(unique_trg_tokens)
    for trg_idx, trg_token in trg_tokens_with_indices:
        if (src_token == trg_token or
            (use_editdistance and
             (editdistance.eval(src_token, trg_token) / len(src_token)) < 0.2)):
            trg_indices.append(trg_idx)
    return trg_indices, src_idx, src_token

def calc_identity_matrix(unique_src_tokens, unique_trg_tokens, use_editdistance,
                         use_hashing, num_buckets, use_null):
    iis = []
    js = []
    values = []
    offset = 1 if use_null else 0
    with multiprocessing.Pool(8) as pool:
        map_func = functools.partial(calc_trg_indices,
                                     unique_trg_tokens=unique_trg_tokens,
                                     use_editdistance=use_editdistance,
                                     use_hashing=use_hashing,
                                     num_buckets=num_buckets,
                                     use_null=use_null)
        if use_hashing:
            src_tokens_with_indices = map(lambda token: (offset + (hash(token) % num_buckets), token),
                                          unique_src_tokens)
        else:
            src_tokens_with_indices = enumerate(unique_src_tokens)
        for trg_indices, src_idx, src_token in pool.imap(map_func, src_tokens_with_indices):
            iis.extend([src_idx] * len(trg_indices))
            js.extend(trg_indices)
            values.extend([1.0] * len(trg_indices))
    if use_hashing:
        shape = (offset + num_buckets, offset + num_buckets)
    else:
        shape = (len(unique_src_tokens), len(unique_trg_tokens))
    return coo_matrix((values, (iis, js)), shape=shape)

            

def normalize(src_corpus, trg_corpus, use_null=False,
              src_null_token="<src_null>", trg_null_token="<trg_null>",
              use_editdistance=False, use_lemmas=False, lemmas_folder="lemmatization-lists",
              use_hashing=False, num_buckets=3000):
    # assert False, "Apply some normalization here to reduce the numbers of parameters."
    (normalized_src, 
     unique_src_tokens,
     src_null_index) = normalize_corpus(src_corpus, use_null, src_null_token,
                                        use_lemmas, [os.path.join(lemmas_folder,
                                                                  "lemmatization-en.txt")],
                                        use_hashing, num_buckets)
    (normalized_trg,
     unique_trg_tokens,
     trg_null_index) = normalize_corpus(trg_corpus, use_null, trg_null_token,
                                        use_lemmas, [os.path.join(lemmas_folder,
                                                                  "lemmatization-sl.txt"),
                                                     os.path.join(lemmas_folder,
                                                                  "lemmatization-sk.txt"),
                                                     os.path.join(lemmas_folder,
                                                                  "lemmatization-cs.txt")],
                                        use_hashing, num_buckets)
    identity_matrix = calc_identity_matrix(unique_src_tokens, unique_trg_tokens,
                                           use_editdistance, use_hashing, num_buckets, use_null)
    return normalized_src, normalized_trg, identity_matrix, src_null_index, trg_null_index

def train(num_iterations, translation_model_cls=TranslationModel, prior_model_cls=PriorModel,
          translation_model=None, prior_model=None, hmm=False, hard_align=False,
          src_null_token="<src_null>", trg_null_token="<trg_null>", use_editdistance=False,
          use_lemmas=False, lemmas_folder="lemmatization-lists",
          use_hashing=False, num_buckets=3000, **prior_params):
    src_corpus, trg_corpus, _ = read_parallel_corpus('en-cs.all')
    use_null = prior_params.get("use_null", False)
    if translation_model is not None:
        use_editdistance = False
    (src_corpus, trg_corpus, identity_matrix,
     src_null_index, trg_null_index) = normalize(src_corpus, trg_corpus,
                                                 use_null, src_null_token, trg_null_token,
                                                 use_editdistance, use_lemmas, lemmas_folder,
                                                 use_hashing, num_buckets)
    if use_null and not hmm and prior_model_cls != PriorModel:
        prior_params["src_null_index"] = src_null_index
        prior_params["trg_null_index"] = trg_null_index
    if use_null and (hmm or prior_model_cls == PriorModel):
        del prior_params["use_null"]
    prior_model, translation_model = initialize_models(src_corpus, trg_corpus, identity_matrix,
                                                       translation_model_cls, prior_model_cls,
                                                       translation_model, prior_model, hard_align,
                                                       **prior_params)
    prior_model, translation_model = estimate_models(src_corpus, trg_corpus, prior_model,
                                                     translation_model, num_iterations,
                                                     hmm=hmm, use_null=use_null,
                                                     src_null_index=src_null_index,
                                                     trg_null_index=trg_null_index)    
    aligned_corpus = align_corpus(src_corpus, trg_corpus, prior_model, translation_model,
                                  hmm=hmm, use_null=use_null,
                                  src_null_index=src_null_index, trg_null_index=trg_null_index)
    return extract_test_set_alignments(aligned_corpus), translation_model, prior_model

def evaluate(candidate_alignments):
    src_dev, trg_dev, wa_dev = read_parallel_corpus('en-cs-wa.dev', has_alignments=True)
    src_test, trg_test, wa_test = read_parallel_corpus('en-cs-wa.test', has_alignments=True)
    print('dev: recall %1.3f; precision %1.3f; aer %1.3f' % score_alignments(wa_dev, candidate_alignments['dev']))
    print('test: recall %1.3f; precision %1.3f; aer %1.3f' % score_alignments(wa_test, candidate_alignments['test']))            

# Experimenting with different models

Let's start with a simple IBM Model 1:

In [5]:
test_alignments, _, _ = train(5)
evaluate(test_alignments)

corpus log likelihood: -1389685.929
dev: recall 0.424; precision 0.384; aer 0.598
test: recall 0.424; precision 0.379; aer 0.601
corpus log likelihood: -1238510.650
dev: recall 0.465; precision 0.419; aer 0.560
test: recall 0.464; precision 0.413; aer 0.564
corpus log likelihood: -1173020.847
dev: recall 0.480; precision 0.431; aer 0.547
test: recall 0.477; precision 0.423; aer 0.553
corpus log likelihood: -1147238.949
dev: recall 0.487; precision 0.437; aer 0.540
test: recall 0.485; precision 0.429; aer 0.546
dev: recall 0.487; precision 0.437; aer 0.540
test: recall 0.485; precision 0.429; aer 0.546


Now we will add hard-alignment explicitly setting alignment probability of identical tokens to 1.

In [6]:
test_alignments, _, _ = train(5, hard_align=True)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535


Hard alignment lowered our AER by 0.01. We will use it in all later experiments.

Now let's try IBM Model 2 with a prior that depends on word positions in a sentence.

In [7]:
test_alignments, _, _ = train(5, prior_model_cls=ComplexPriorModel, hard_align=True)
evaluate(test_alignments)



corpus log likelihood: -1340071.228
dev: recall 0.447; precision 0.404; aer 0.577
test: recall 0.455; precision 0.405; aer 0.572
corpus log likelihood: -1045077.915
dev: recall 0.505; precision 0.456; aer 0.522
test: recall 0.509; precision 0.454; aer 0.521
corpus log likelihood: -853536.594
dev: recall 0.525; precision 0.475; aer 0.502
test: recall 0.528; precision 0.471; aer 0.503
corpus log likelihood: -759527.242
dev: recall 0.532; precision 0.482; aer 0.495
test: recall 0.534; precision 0.478; aer 0.497
dev: recall 0.532; precision 0.482; aer 0.495
test: recall 0.534; precision 0.478; aer 0.497


More complex prior certainly improved our AER.

Now we pretrain translation model with IBM Model 1 for 2 epochs and then
train IBM Model 2 using pretrained translation model.

In [10]:
_, translation_model1, _ = train(2, hard_align=True)
test_alignments, _, _  = train(5, prior_model_cls=ComplexPriorModel,
                               translation_model=translation_model1)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587




corpus log likelihood: -926555.541
dev: recall 0.538; precision 0.486; aer 0.491
test: recall 0.541; precision 0.483; aer 0.491
corpus log likelihood: -790209.314
dev: recall 0.546; precision 0.493; aer 0.483
test: recall 0.548; precision 0.489; aer 0.484
corpus log likelihood: -730269.906
dev: recall 0.548; precision 0.496; aer 0.480
test: recall 0.548; precision 0.490; aer 0.484
corpus log likelihood: -701341.783
dev: recall 0.548; precision 0.496; aer 0.480
test: recall 0.550; precision 0.491; aer 0.482
dev: recall 0.548; precision 0.496; aer 0.480
test: recall 0.550; precision 0.491; aer 0.482


Pretrained model produces better AER than a model without pretraining.

For now our ComplexPrior depended on sentence lengths. We remove that dependency in ImprovedComplexPrior by using only relative position of the word in a sentence: $relative\_pos = \frac{word\_index}{sentence\_length}$. To simplify things, we introduce buckets, each of which will be responsible for one area of a sentence then we will use mapping from a relative position in a sentence to a bucket.
We can calculate bucket numbers from relative positions as follows: $$bucket\_number = \lfloor{relative\_pos \cdot num\_buckets}\rfloor$$

For example in a sentence "Quick brown | fox jumps | over the | lazy dog" where bucket borders are depicted using '|' token, the word 'jumps' has index 3 and therefore its relative position is $\frac{3}{8} = 0.375$ and its bucket number is $\lfloor0.375 * 4\rfloor = 1$

In our improved prior we use bucket indices instead of word indices, that way we reduce the number of parameters in our model.

In [12]:
test_alignments, _, _  = train(5, prior_model_cls=ImprovedComplexPriorModel,
                               hard_align=True)
evaluate(test_alignments)

corpus log likelihood: -1059004.252
dev: recall 0.519; precision 0.473; aer 0.506
test: recall 0.526; precision 0.473; aer 0.503
corpus log likelihood: -881497.886
dev: recall 0.581; precision 0.529; aer 0.447
test: recall 0.586; precision 0.528; aer 0.446
corpus log likelihood: -763555.219
dev: recall 0.593; precision 0.542; aer 0.435
test: recall 0.599; precision 0.542; aer 0.432
corpus log likelihood: -702661.794
dev: recall 0.592; precision 0.542; aer 0.435
test: recall 0.598; precision 0.541; aer 0.433
dev: recall 0.592; precision 0.542; aer 0.435
test: recall 0.598; precision 0.541; aer 0.433


Let's try to pretrain IBM Model 2 with improved prior using Model 1.

In [15]:
_, translation_model1, _ = train(2, hard_align=True)
test_alignments, _, _  = train(5, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -810766.506
dev: recall 0.599; precision 0.545; aer 0.430
test: recall 0.606; precision 0.546; aer 0.427
corpus log likelihood: -734935.129
dev: recall 0.602; precision 0.550; aer 0.426
test: recall 0.607; precision 0.548; aer 0.425
corpus log likelihood: -691945.668
dev: recall 0.600; precision 0.549; aer 0.428
test: recall 0.607; precision 0.549; aer 0.425
corpus log likelihood: -668485.180
dev: recall 0.598; precision 0.548; aer 0.429
test: recall 0.604; precision 0.548; aer 0.427
dev: recall 0.598; precision 0.548; aer 0.429
test: recall 0.604; precision 0.548; aer 0.427


Now, let's use HMM to train our alignments

In [18]:
test_alignments, _, _  = train(10, prior_model_cls=TransitionModel, hmm=True,
                               hard_align=True)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.405; precision 0.368; aer 0.615
test: recall 0.407; precision 0.364; aer 0.616
corpus log likelihood: -1164938.254
dev: recall 0.437; precision 0.399; aer 0.584
test: recall 0.442; precision 0.398; aer 0.582
corpus log likelihood: -1039038.072
dev: recall 0.447; precision 0.412; aer 0.572
test: recall 0.455; precision 0.413; aer 0.568
corpus log likelihood: -940864.392
dev: recall 0.448; precision 0.414; aer 0.571
test: recall 0.456; precision 0.416; aer 0.566
corpus log likelihood: -873764.473
dev: recall 0.448; precision 0.414; aer 0.570
test: recall 0.458; precision 0.417; aer 0.564
corpus log likelihood: -837671.646
dev: recall 0.444; precision 0.411; aer 0.574
test: recall 0.457; precision 0.416; aer 0.566
corpus log likelihood: -819925.310
dev: recall 0.442; precision 0.409; aer 0.576
test: recall 0.454; precision 0.414; aer 0.568
corpus log likelihood: -810752.141
dev: recall 0.442; precision 0.409; aer 0.576
test: recall 0.454; 

As we can see, HMM starts to diverge after 5-6 iterations, so it's no use to train it longer.

Trying to add pretraining with Model 1 to HMM

In [19]:
_, translation_model1, _ = train(2, hard_align=True)
test_alignments, _, _  = train(6, prior_model_cls=TransitionModel,
                               translation_model=translation_model1,
                               hmm=True)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1094438.564
dev: recall 0.534; precision 0.486; aer 0.492
test: recall 0.539; precision 0.483; aer 0.492
corpus log likelihood: -991391.332
dev: recall 0.536; precision 0.492; aer 0.488
test: recall 0.542; precision 0.490; aer 0.486
corpus log likelihood: -908109.303
dev: recall 0.536; precision 0.493; aer 0.488
test: recall 0.542; precision 0.492; aer 0.485
corpus log likelihood: -855545.814
dev: recall 0.534; precision 0.492; aer 0.489
test: recall 0.540; precision 0.490; aer 0.487
corpus log likelihood: -827657.895
dev: recall 0.532; precision 0.491; aer 0.491
test: recall 0.538; precision 0.489; aer 0.489
dev: recall 0.532; precision 0.491; aer 0.491
test: recall 0.538; precision 0.489; aer 0.489


Let's try to optimise parameters of Model 2 with Model 1 pretraining:

In [20]:
_, translation_model1, _ = train(10, hard_align=True)
test_alignments, _, _  = train(15, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
corpus log likelihood: -1126356.169
dev: recall 0.500; precision 0.448; aer 0.528
test: recall 0.498; precision 0.441; aer 0.533
corpus log likelihood: -1121308.055
dev: recall 0.502; precision 0.449; aer 0.527
test: recall 0.500; precision 0.443; aer 0.531
corpus log likelihood: -1118403.061
dev: recall 0.504; precision 0.451; aer 0.525
test: recall 0.502; precision 0.444; aer 0.529
corpus log likelihood: -1116584.283
dev: recall 0.505; precision 0.452; aer 0.524
test: recall 0.

Let's increase the number of buckets (default was 10)

In [22]:
_, translation_model1, _ = train(10, hard_align=True)
test_alignments, _, _  = train(15, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1,
                               num_indices=15)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
corpus log likelihood: -1126356.169
dev: recall 0.500; precision 0.448; aer 0.528
test: recall 0.498; precision 0.441; aer 0.533
corpus log likelihood: -1121308.055
dev: recall 0.502; precision 0.449; aer 0.527
test: recall 0.500; precision 0.443; aer 0.531
corpus log likelihood: -1118403.061
dev: recall 0.504; precision 0.451; aer 0.525
test: recall 0.502; precision 0.444; aer 0.529
corpus log likelihood: -1116584.283
dev: recall 0.505; precision 0.452; aer 0.524
test: recall 0.

In [25]:
_, translation_model1, _ = train(10, hard_align=True)
test_alignments, _, _  = train(15, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1,
                               num_indices=20)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
corpus log likelihood: -1126356.169
dev: recall 0.500; precision 0.448; aer 0.528
test: recall 0.498; precision 0.441; aer 0.533
corpus log likelihood: -1121308.055
dev: recall 0.502; precision 0.449; aer 0.527
test: recall 0.500; precision 0.443; aer 0.531
corpus log likelihood: -1118403.061
dev: recall 0.504; precision 0.451; aer 0.525
test: recall 0.502; precision 0.444; aer 0.529
corpus log likelihood: -1116584.283
dev: recall 0.505; precision 0.452; aer 0.524
test: recall 0.

In [26]:
_, translation_model1, _ = train(10, hard_align=True)
test_alignments, _, _  = train(15, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1,
                               num_indices=25)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
corpus log likelihood: -1126356.169
dev: recall 0.500; precision 0.448; aer 0.528
test: recall 0.498; precision 0.441; aer 0.533
corpus log likelihood: -1121308.055
dev: recall 0.502; precision 0.449; aer 0.527
test: recall 0.500; precision 0.443; aer 0.531
corpus log likelihood: -1118403.061
dev: recall 0.504; precision 0.451; aer 0.525
test: recall 0.502; precision 0.444; aer 0.529
corpus log likelihood: -1116584.283
dev: recall 0.505; precision 0.452; aer 0.524
test: recall 0.

All versions of Model 2 with pretraining start to diverge after 5-6 iterations, so there is no point in training it further. Model with num_buckets=20 gives the best AER

Now let's use the best chained pretraining model to pretrain HMM

In [29]:
_, translation_model1, _ = train(10, hard_align=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20)
test_alignments, _, _ = train(5, prior_model_cls=TransitionModel,
                              translation_model=translation_model2, hmm=True)
evaluate(test_alignments)

corpus log likelihood: -1340071.228
dev: recall 0.440; precision 0.396; aer 0.584
test: recall 0.439; precision 0.391; aer 0.587
corpus log likelihood: -1215103.192
dev: recall 0.478; precision 0.429; aer 0.549
test: recall 0.476; precision 0.423; aer 0.553
corpus log likelihood: -1158635.449
dev: recall 0.491; precision 0.441; aer 0.536
test: recall 0.488; precision 0.433; aer 0.542
corpus log likelihood: -1136220.022
dev: recall 0.497; precision 0.447; aer 0.530
test: recall 0.495; precision 0.440; aer 0.535
corpus log likelihood: -1126356.169
dev: recall 0.500; precision 0.448; aer 0.528
test: recall 0.498; precision 0.441; aer 0.533
corpus log likelihood: -1121308.055
dev: recall 0.502; precision 0.449; aer 0.527
test: recall 0.500; precision 0.443; aer 0.531
corpus log likelihood: -1118403.061
dev: recall 0.504; precision 0.451; aer 0.525
test: recall 0.502; precision 0.444; aer 0.529
corpus log likelihood: -1116584.283
dev: recall 0.505; precision 0.452; aer 0.524
test: recall 0.

# Adding data normalization

Now that we've experimented with different models, we can improve them further by modifying our data.

### Using NULL

We start with adding NULL tokens to source and target sentences, so that our models have the option of not aligning the word anywhere.

In [30]:
test_alignments, _, _ = train(5, hard_align=True, use_null=True)
evaluate(test_alignments)

corpus log likelihood: -1396500.374
dev: recall 0.435; precision 0.393; aer 0.588
test: recall 0.436; precision 0.388; aer 0.590
corpus log likelihood: -1264417.109
dev: recall 0.476; precision 0.428; aer 0.550
test: recall 0.473; precision 0.422; aer 0.555
corpus log likelihood: -1206642.928
dev: recall 0.490; precision 0.440; aer 0.537
test: recall 0.486; precision 0.433; aer 0.543
corpus log likelihood: -1183555.582
dev: recall 0.495; precision 0.445; aer 0.532
test: recall 0.493; precision 0.439; aer 0.537
dev: recall 0.495; precision 0.445; aer 0.532
test: recall 0.493; precision 0.439; aer 0.537


In [31]:
test_alignments, _, _ = train(5, prior_model_cls=ComplexPriorModel, hard_align=True,
                              use_null=True)
evaluate(test_alignments)



corpus log likelihood: -1777093.628
dev: recall 0.443; precision 0.401; aer 0.580
test: recall 0.452; precision 0.402; aer 0.575
corpus log likelihood: -1465544.316
dev: recall 0.502; precision 0.454; aer 0.524
test: recall 0.505; precision 0.451; aer 0.524
corpus log likelihood: -1260819.822
dev: recall 0.522; precision 0.473; aer 0.505
test: recall 0.526; precision 0.470; aer 0.505
corpus log likelihood: -1148278.615
dev: recall 0.530; precision 0.480; aer 0.497
test: recall 0.533; precision 0.477; aer 0.498
dev: recall 0.530; precision 0.480; aer 0.497
test: recall 0.533; precision 0.477; aer 0.498


In [32]:
test_alignments, _, _ = train(5, prior_model_cls=ImprovedComplexPriorModel, hard_align=True,
                              use_null=True, num_indices=20)
evaluate(test_alignments)

corpus log likelihood: -1666450.611
dev: recall 0.516; precision 0.469; aer 0.510
test: recall 0.527; precision 0.473; aer 0.502
corpus log likelihood: -1477549.541
dev: recall 0.589; precision 0.537; aer 0.439
test: recall 0.597; precision 0.537; aer 0.436
corpus log likelihood: -1335835.561
dev: recall 0.608; precision 0.557; aer 0.420
test: recall 0.621; precision 0.561; aer 0.412
corpus log likelihood: -1254862.775
dev: recall 0.610; precision 0.560; aer 0.417
test: recall 0.621; precision 0.564; aer 0.410
dev: recall 0.610; precision 0.560; aer 0.417
test: recall 0.621; precision 0.564; aer 0.410


In [33]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True)
test_alignments, _, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                               translation_model=translation_model1,
                               num_indices=20, use_null=True)
evaluate(test_alignments)

corpus log likelihood: -1396500.374
dev: recall 0.435; precision 0.393; aer 0.588
test: recall 0.436; precision 0.388; aer 0.590
corpus log likelihood: -1264417.109
dev: recall 0.476; precision 0.428; aer 0.550
test: recall 0.473; precision 0.422; aer 0.555
corpus log likelihood: -1206642.928
dev: recall 0.490; precision 0.440; aer 0.537
test: recall 0.486; precision 0.433; aer 0.543
corpus log likelihood: -1183555.582
dev: recall 0.495; precision 0.445; aer 0.532
test: recall 0.493; precision 0.439; aer 0.537
corpus log likelihood: -1173258.251
dev: recall 0.500; precision 0.449; aer 0.528
test: recall 0.498; precision 0.442; aer 0.533
corpus log likelihood: -1167925.793
dev: recall 0.501; precision 0.450; aer 0.527
test: recall 0.499; precision 0.444; aer 0.531
corpus log likelihood: -1164826.667
dev: recall 0.503; precision 0.451; aer 0.525
test: recall 0.501; precision 0.445; aer 0.530
corpus log likelihood: -1162869.478
dev: recall 0.504; precision 0.453; aer 0.524
test: recall 0.

In [34]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True)
test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True)
evaluate(test_alignments)

corpus log likelihood: -1396500.374
dev: recall 0.435; precision 0.393; aer 0.588
test: recall 0.436; precision 0.388; aer 0.590
corpus log likelihood: -1264417.109
dev: recall 0.476; precision 0.428; aer 0.550
test: recall 0.473; precision 0.422; aer 0.555
corpus log likelihood: -1206642.928
dev: recall 0.490; precision 0.440; aer 0.537
test: recall 0.486; precision 0.433; aer 0.543
corpus log likelihood: -1183555.582
dev: recall 0.495; precision 0.445; aer 0.532
test: recall 0.493; precision 0.439; aer 0.537
corpus log likelihood: -1173258.251
dev: recall 0.500; precision 0.449; aer 0.528
test: recall 0.498; precision 0.442; aer 0.533
corpus log likelihood: -1167925.793
dev: recall 0.501; precision 0.450; aer 0.527
test: recall 0.499; precision 0.444; aer 0.531
corpus log likelihood: -1164826.667
dev: recall 0.503; precision 0.451; aer 0.525
test: recall 0.501; precision 0.445; aer 0.530
corpus log likelihood: -1162869.478
dev: recall 0.504; precision 0.453; aer 0.524
test: recall 0.

Null tokens improved AER, but not by much.

### Using lemmas and edidistance

Now we reduce number of different words in our corpora by mapping them to their lowercase lemmas. Also we improve hard-alignment by setting alignment probability to 1 when tokens have small editdistance compared to the source word length (for example $\frac{edit\_distance}{source\_word\_length} < 0.2$) 

Only using editdistance and nulls:

In [36]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True, use_editdistance=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True)
test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True)
evaluate(test_alignments)

corpus log likelihood: -1390590.154
dev: recall 0.439; precision 0.396; aer 0.584
test: recall 0.443; precision 0.395; aer 0.583
corpus log likelihood: -1259842.567
dev: recall 0.479; precision 0.431; aer 0.547
test: recall 0.480; precision 0.427; aer 0.549
corpus log likelihood: -1203119.442
dev: recall 0.494; precision 0.444; aer 0.534
test: recall 0.492; precision 0.438; aer 0.537
corpus log likelihood: -1180431.644
dev: recall 0.499; precision 0.449; aer 0.528
test: recall 0.499; precision 0.443; aer 0.532
corpus log likelihood: -1170298.505
dev: recall 0.503; precision 0.452; aer 0.525
test: recall 0.503; precision 0.447; aer 0.527
corpus log likelihood: -1165042.450
dev: recall 0.505; precision 0.453; aer 0.523
test: recall 0.505; precision 0.449; aer 0.526
corpus log likelihood: -1161982.313
dev: recall 0.506; precision 0.454; aer 0.522
test: recall 0.507; precision 0.450; aer 0.524
corpus log likelihood: -1160046.289
dev: recall 0.508; precision 0.456; aer 0.520
test: recall 0.

Adding lemmas from [this repo](https://github.com/michmech/lemmatization-lists) (needs to be cloned and placed alongside this notebook). Using english lemmas for english and Czech, Slovak and Slovene lemmas for Czech.

In [37]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True, use_editdistance=True,
                                 use_lemmas=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True, use_lemmas=True)
test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True, use_lemmas=True)
evaluate(test_alignments)

corpus log likelihood: -1362528.658
dev: recall 0.549; precision 0.490; aer 0.483
test: recall 0.551; precision 0.487; aer 0.484
corpus log likelihood: -1206680.969
dev: recall 0.577; precision 0.516; aer 0.457
test: recall 0.578; precision 0.510; aer 0.459
corpus log likelihood: -1144326.083
dev: recall 0.586; precision 0.523; aer 0.448
test: recall 0.586; precision 0.518; aer 0.451
corpus log likelihood: -1122879.060
dev: recall 0.591; precision 0.528; aer 0.443
test: recall 0.591; precision 0.522; aer 0.447
corpus log likelihood: -1113927.201
dev: recall 0.595; precision 0.532; aer 0.439
test: recall 0.593; precision 0.525; aer 0.444
corpus log likelihood: -1109406.550
dev: recall 0.597; precision 0.533; aer 0.438
test: recall 0.596; precision 0.527; aer 0.442
corpus log likelihood: -1106804.159
dev: recall 0.598; precision 0.533; aer 0.437
test: recall 0.597; precision 0.528; aer 0.441
corpus log likelihood: -1105162.271
dev: recall 0.598; precision 0.533; aer 0.437
test: recall 0.

### Adding hashing

To further decrease the number of parameters we simplify things by mapping words to indices using hash function: $word\_idx = hash(word)\ \%\ num\_buckets$.
The final model has num_buckets=3000.

In [38]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True, use_editdistance=True,
                                 use_lemmas=True, use_hashing=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True, use_lemmas=True, use_hashing=True)
test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True, use_lemmas=True,
                                                  use_hashing=True)



corpus log likelihood: -1433400.254
dev: recall 0.540; precision 0.484; aer 0.490
test: recall 0.548; precision 0.486; aer 0.486
corpus log likelihood: -1310437.416
dev: recall 0.557; precision 0.498; aer 0.475
test: recall 0.563; precision 0.498; aer 0.472
corpus log likelihood: -1248885.513
dev: recall 0.564; precision 0.504; aer 0.468
test: recall 0.570; precision 0.503; aer 0.466
corpus log likelihood: -1225062.843
dev: recall 0.570; precision 0.509; aer 0.463
test: recall 0.573; precision 0.506; aer 0.463
corpus log likelihood: -1214351.537
dev: recall 0.574; precision 0.512; aer 0.460
test: recall 0.577; precision 0.509; aer 0.460
corpus log likelihood: -1208728.004
dev: recall 0.578; precision 0.515; aer 0.456
test: recall 0.578; precision 0.510; aer 0.459
corpus log likelihood: -1205440.348
dev: recall 0.579; precision 0.517; aer 0.455
test: recall 0.579; precision 0.511; aer 0.458
corpus log likelihood: -1203367.560
dev: recall 0.580; precision 0.518; aer 0.454
test: recall 0.

Hashing didn't help  to decrease AER, now let's try to do several runs of HMM using fresh TransitionModels, because Translation Model will keep improving and Transition model won't diverge.

In [39]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True, use_editdistance=True,
                                 use_lemmas=True, use_hashing=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True, use_lemmas=True, use_hashing=True)
test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True, use_lemmas=True,
                                                  use_hashing=True)

test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model_hmm,
                                                  hmm=True, use_null=True, use_lemmas=True,
                                                  use_hashing=True)

test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model_hmm,
                                                  hmm=True, use_null=True, use_lemmas=True,
                                                  use_hashing=True)

evaluate(test_alignments)



corpus log likelihood: -1433400.254
dev: recall 0.540; precision 0.484; aer 0.490
test: recall 0.548; precision 0.486; aer 0.486
corpus log likelihood: -1310437.416
dev: recall 0.557; precision 0.498; aer 0.475
test: recall 0.563; precision 0.498; aer 0.472
corpus log likelihood: -1248885.513
dev: recall 0.564; precision 0.504; aer 0.468
test: recall 0.570; precision 0.503; aer 0.466
corpus log likelihood: -1225062.843
dev: recall 0.570; precision 0.509; aer 0.463
test: recall 0.573; precision 0.506; aer 0.463
corpus log likelihood: -1214351.537
dev: recall 0.574; precision 0.512; aer 0.460
test: recall 0.577; precision 0.509; aer 0.460
corpus log likelihood: -1208728.004
dev: recall 0.578; precision 0.515; aer 0.456
test: recall 0.578; precision 0.510; aer 0.459
corpus log likelihood: -1205440.348
dev: recall 0.579; precision 0.517; aer 0.455
test: recall 0.579; precision 0.511; aer 0.458
corpus log likelihood: -1203367.560
dev: recall 0.580; precision 0.518; aer 0.454
test: recall 0.

Let's see how far we can push this:

In [40]:
_, translation_model1, _ = train(10, hard_align=True, use_null=True, use_editdistance=True,
                                 use_lemmas=True, use_hashing=True)
_, translation_model2, _  = train(6, prior_model_cls=ImprovedComplexPriorModel,
                                  translation_model=translation_model1,
                                  num_indices=20, use_null=True, use_lemmas=True, use_hashing=True)

test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                  translation_model=translation_model2,
                                                  hmm=True, use_null=True, use_lemmas=True,
                                                  use_hashing=True)
num_repetitions = 5
for i in range(num_repetitions):
    test_alignments, translation_model_hmm, _ = train(5, prior_model_cls=TransitionModel,
                                                      translation_model=translation_model_hmm,
                                                      hmm=True, use_null=True, use_lemmas=True,
                                                      use_hashing=True)

evaluate(test_alignments)



corpus log likelihood: -1433400.254
dev: recall 0.540; precision 0.484; aer 0.490
test: recall 0.548; precision 0.486; aer 0.486
corpus log likelihood: -1310437.416
dev: recall 0.557; precision 0.498; aer 0.475
test: recall 0.563; precision 0.498; aer 0.472
corpus log likelihood: -1248885.513
dev: recall 0.564; precision 0.504; aer 0.468
test: recall 0.570; precision 0.503; aer 0.466
corpus log likelihood: -1225062.843
dev: recall 0.570; precision 0.509; aer 0.463
test: recall 0.573; precision 0.506; aer 0.463
corpus log likelihood: -1214351.537
dev: recall 0.574; precision 0.512; aer 0.460
test: recall 0.577; precision 0.509; aer 0.460
corpus log likelihood: -1208728.004
dev: recall 0.578; precision 0.515; aer 0.456
test: recall 0.578; precision 0.510; aer 0.459
corpus log likelihood: -1205440.348
dev: recall 0.579; precision 0.517; aer 0.455
test: recall 0.579; precision 0.511; aer 0.458
corpus log likelihood: -1203367.560
dev: recall 0.580; precision 0.518; aer 0.454
test: recall 0.

Well, repeating HMM over and over didn't improve the AER significantly, so I guess the best model is the previous one (with only 3 repetitions of HMM)

In [17]:
# Discrete HMM with scaling. You may want to use this if you decide to implement an HMM.
# The parameters for this HMM will still need to be provided by the models above.

def forward(pi, A, O):
    S, T = O.shape
    alpha = np.zeros((S, T))
    scaling_factors = np.zeros(T)
    
    # base case
    alpha[:, 0] = pi * O[:, 0]
    scaling_factors[0] = np.sum(alpha[:, 0])
    alpha[:, 0] /= scaling_factors[0] 
    
    # recursive case
    for t in range(1, T):
        alpha[:, t] = np.dot(alpha[:, t-1], A[:, :]) * O[:, t]

        # Normalize at each step to prevent underflow.
        scaling_factors[t] = np.sum(alpha[:, t])
        alpha[:, t] /= scaling_factors[t]

    return (alpha, scaling_factors)

def backward(pi, A, O, forward_scaling_factors):
    S, T = O.shape
    beta = np.zeros((S, T))

    # base case
    beta[:, T-1] = 1 / forward_scaling_factors[T-1]
    
    # recursive case
    for t in range(T-2, -1, -1):
        beta[:, t] = np.sum(beta[:, t+1] * A[:, :] * O[:, t+1], 1) / forward_scaling_factors[t]

    return beta

def forward_backward(pi, A, O):
    alpha, forward_scaling_factors = forward(pi, A, O)
    beta = backward(pi, A, O, forward_scaling_factors)
    return alpha, beta, np.sum(np.log(forward_scaling_factors))
