In [1]:
# Baseline: run toy_aligner.py on the dev token files, write its output to toy.dev.wa, then have eval.py score toy.dev.wa against test/en-cs.wa.dev.
!python toy_aligner.py  test/en-cs.en.dev.tokens test/en-cs.cs.dev.tokens > toy.dev.wa
!python eval.py test/en-cs.en.dev.tokens test/en-cs.cs.dev.tokens test/en-cs.wa.dev toy.dev.wa 

recall 0.057; precision 0.064; aer 0.939


In [7]:
import codecs

import word_alignment_partial as wap
import eval as eval_

def initialize_models(src_corpus, trg_corpus, prior_cls, translation_cls):
    """Instantiate the prior and translation models for one parallel corpus.

    Both classes are called as ``cls(src_corpus, trg_corpus)``; the prior
    model is built first.

    Returns:
        A ``(prior_model, translation_model)`` tuple.
    """
    model_classes = (prior_cls, translation_cls)
    return tuple(build(src_corpus, trg_corpus) for build in model_classes)

def learn_models(src_path, trg_path, prior_cls, translation_cls, num_iterations):
    """Read a parallel corpus, estimate the models on it, and align it.

    Args:
        src_path: path handed to ``read_all_tokens`` for the source side.
        trg_path: path handed to ``read_all_tokens`` for the target side.
        prior_cls: class used to build the prior model.
        translation_cls: class used to build the translation model.
        num_iterations: iteration count forwarded to ``wap.estimate_models``.

    Returns:
        Whatever ``wap.align_corpus`` returns for the normalized corpus and
        the estimated models.
    """
    raw_src = read_all_tokens(src_path)
    raw_trg = read_all_tokens(trg_path)
    src_sentences, trg_sentences = normalize(raw_src, raw_trg)

    # Freshly constructed models are the starting point for estimation.
    initial_prior, initial_translation = initialize_models(
        src_sentences, trg_sentences, prior_cls, translation_cls)
    fitted_prior, fitted_translation = wap.estimate_models(
        src_sentences, trg_sentences,
        initial_prior, initial_translation, num_iterations)

    return wap.align_corpus(src_sentences, trg_sentences,
                            fitted_prior, fitted_translation)

def save_alignments(alignments, output_prefix):
    """Hand `alignments` and `output_prefix` to output_alignments_per_test_set.

    Returns:
        The result of that call, unchanged.
    """
    outcome = output_alignments_per_test_set(alignments, output_prefix)
    return outcome

def eval_alignments(src_path, trg_path, reference_path, candidate_path):
    """Score candidate alignments against a reference and print the result.

    Args:
        src_path: UTF-8 source-side text file; each line is stripped and
            whitespace-split into tokens.
        trg_path: UTF-8 target-side text file, read the same way.
        reference_path: file parsed with ``eval_.parse_alignments`` as the
            reference alignments.
        candidate_path: file parsed with ``eval_.parse_alignments`` as the
            candidate alignments.

    Returns:
        The value of ``eval_.score(reference, candidate)``. It is printed with
        a three-field format (recall, precision, aer), so it must be a
        3-tuple of numbers.

    Raises:
        AssertionError: if ``eval_.validate`` rejects the reference or the
            candidate alignments for these corpora.
    """
    def read_tokens(path):
        # The context manager closes the handle even if decoding fails
        # part-way; the bare codecs.open() iteration used before leaked it.
        with codecs.open(path, 'r', 'utf8') as handle:
            return [line.strip().split() for line in handle]

    src_corpus = read_tokens(src_path)
    trg_corpus = read_tokens(trg_path)
    reference = eval_.parse_alignments(reference_path)
    candidate = eval_.parse_alignments(candidate_path)

    # Name the offending file so a failed validation is actionable.
    assert eval_.validate(src_corpus, trg_corpus, reference), \
        "reference alignments failed validation: %s" % (reference_path,)
    assert eval_.validate(src_corpus, trg_corpus, candidate), \
        "candidate alignments failed validation: %s" % (candidate_path,)

    score = eval_.score(reference, candidate)
    print("recall %1.3f; precision %1.3f; aer %1.3f" % score)
    return score

In [6]:
!mkdir model_alignments

In [10]:
class ModelTester(object):
    """Callable harness: learn alignment models, save the alignments, score them.

    Calling an instance as ``tester(prior_cls, model_cls, prefix)`` runs
    ``learn_models`` on the learning corpus, saves the resulting alignments
    through ``save_alignments``, and returns the score that
    ``eval_alignments`` computes for ``candidate_path(prefix)``.
    Every setting is a class attribute, so a subclass or an instance can
    override it.
    """
    # Parallel corpus the models are estimated on.
    learn_src_path = "small/en-cs.en.all.tokens.10K"
    learn_trg_path = "small/en-cs.cs.all.tokens.10K"

    # Token files passed to the evaluator.
    test_src_path = "test/en-cs.en.dev.tokens"
    test_trg_path = "test/en-cs.cs.dev.tokens"

    # Reference alignments the candidate is scored against.
    reference = "test/en-cs.wa.dev"

    # Iteration count forwarded to learn_models.
    num_iterations = 10

    @staticmethod
    def candidate_path(prefix):
        """Return the path of the candidate alignment file for `prefix`."""
        return "model_alignments/" + prefix + ".dev.wa"

    def __call__(self, prior_cls, model_cls, prefix):
        """Learn, save and evaluate alignments for one model configuration.

        Args:
            prior_cls: prior model class forwarded to ``learn_models``.
            model_cls: translation model class forwarded to ``learn_models``.
            prefix: name used to build the candidate alignment path.

        Returns:
            The score returned by ``eval_alignments``.
        """
        # Read the setting through self: class attributes are not in scope
        # inside a method, so the bare name raised NameError (or silently
        # picked up an unrelated global).
        alignments = learn_models(self.learn_src_path, self.learn_trg_path,
                                  prior_cls, model_cls, self.num_iterations)
        path = self.candidate_path(prefix)

        # save_alignments receives the candidate path without its suffix;
        # presumably the writer appends ".dev.wa" itself -- TODO confirm.
        # Stripping the suffix (rather than cutting at the first '.') keeps
        # prefixes that contain dots intact.
        suffix = ".dev.wa"
        if path.endswith(suffix):
            output_prefix = path[:-len(suffix)]
        else:
            # Overridden candidate_path with another layout: keep the
            # previous first-dot rule.
            output_prefix = path.split('.')[0]
        save_alignments(alignments, output_prefix)

        score = eval_alignments(self.test_src_path, self.test_trg_path,
                                self.reference, path)
        return score

In [11]:
# Tester instance with the default settings; call it as tester(prior_cls, model_cls, prefix).
tester = ModelTester()