In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import with_statement

import collections
import gzip
import itertools
import os
import re
import random
import numpy as np

from sklearn import metrics
from sklearn import svm

# Seed Python's RNG so that the random candidate pairs drawn in pro() are
# the same on every run of the notebook.
random.seed(10)

### Import the Spacy library

[spaCy](http://spacy.io) is an NLP library that is ready for use in production settings. We will use it to extract features from our data.

In [None]:
import spacy

# Load one spaCy pipeline per language. `en_nlp` is applied to the English
# sentences and `de_nlp` to the German ones in the cells below.
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

### Import the NLTK library

We will use the [NLTK library](http://www.nltk.org) for other features we want to extract from our data. We will also use the BLEU scoring algorithm that the library provides.

In [None]:
from nltk.translate.bleu_score import modified_precision, corpus_bleu
from nltk.util                 import ngrams

### Evaluation metrics

In [None]:
def BLEU(reference,candidate):
    """
    Score a candidate sentence against a single reference sentence.

    Note: despite its name, this returns NLTK's *modified 4-gram precision*
    (one ingredient of BLEU), not a full BLEU score.

    reference: the reference translation (a string or a list of tokens)
    candidate: the candidate translation (a string or a list of tokens)

    Strings are split on whitespace before scoring, so that the 4-grams are
    built over words. Passing a raw string to NLTK would build them over
    single characters instead.

    returns: the modified 4-gram precision as a float
    """
    def tokens(sentence):
        # `bytes` is `str` on Python 2; `type(u'')` is the unicode text type
        # on both Python 2 and 3. Anything else is assumed to be tokenised.
        if isinstance(sentence, (bytes, type(u''))):
            return sentence.split()
        return sentence

    return float(modified_precision([tokens(reference)],tokens(candidate),n=4))

In [None]:
# Emits a single empty line; this cell has no other effect.
print()

### Features

In [None]:
def pos_feature(s,nlp,n=1,simple_pos=False):
    """
    Compute the POS feature vector given a sentence and an instance of spaCy.
    The POS feature vector is a vector which indicates, per possible n-gram
    of POS-tags of the language, what ratio of the tag n-grams in the
    sentence equal that n-gram.

    s         : input sentence
    nlp       : instance of spaCy nlp
    n         : the size of the n-grams over which the vector is built
    simple_pos: if True use the coarse-grained tags (`tok.pos_`, with the tag
                set taken from `spacy.parts_of_speech.NAMES`); otherwise use
                the fine-grained tags (`tok.tag_`, with the tag set taken
                from `nlp.tagger.tag_names`)

    returns: a list of floats with one entry per *ordered* n-tuple of tags
             (len(tagset) ** n entries). All entries are 0.0 when the
             sentence contains fewer than n tokens.
    """
    doc = nlp(s,tag=True,parse=False,entity=False)

    # Compute the PoS-tags using spaCy.
    if simple_pos:
        pos_tags  = [tok.pos_ for tok in doc]
        pos_sible = list(spacy.parts_of_speech.NAMES.values())
    else:
        pos_tags  = [tok.tag_ for tok in doc]
        pos_sible = list(nlp.tagger.tag_names)

    # Observed n-grams: every run of n consecutive tags, as a tuple.
    observed = list(zip(*[pos_tags[i:] for i in range(n)]))

    # Possible n-grams: every ordered n-tuple of tags. A Cartesian product is
    # required here; combinations would leave out n-grams that repeat a tag
    # (e.g. NOUN NOUN) or that occur in the reverse order of the tag set.
    possible = itertools.product(pos_sible,repeat=n)

    counts = collections.Counter(observed)
    total  = float(len(observed))
    if total == 0:
        # Empty (or too short) sentence: avoid dividing by zero.
        return [0.0 for _ in possible]
    return [counts[gram] / total for gram in possible]

In [None]:
# Sample sentences (one English, one German) used by the example cells below.
en_s = u'Hello, world. Here are two sentences.'
de_s = u'Ich bin ein Berliner.'

In [None]:
# Example: print the simple unigram PoS-feature for an English sentence.
# The result is materialised as a list so that len() and printing also work
# when pos_feature hands back a lazy iterable (Python 3 `map`).
en_pos = list(pos_feature(en_s,en_nlp,n=1,simple_pos=True))
print(len(en_pos))
print([round(x,2) for x in en_pos])

In [None]:
# Example: print the complex unigram POS-feature for a German sentence.
# The result is materialised as a list so that len() and printing also work
# when pos_feature hands back a lazy iterable (Python 3 `map`).
de_pos = list(pos_feature(de_s,de_nlp,n=1))
print(len(de_pos))
print([round(x,2) for x in de_pos])

In [None]:
# Example: print the first 100 values of the simple bigram POS-feature for a German sentence.
# The result is materialised as a list so that len() and slicing also work
# when pos_feature hands back a lazy iterable (Python 3 `map`).
de_pos = list(pos_feature(de_s,de_nlp,n=2,simple_pos=True))
print(len(de_pos))
print([round(x,2) for x in de_pos[:100]])

In [None]:
# Example: print the first 24 values in the sentence vector for an English sentence.
en_doc = en_nlp(en_s)
# Print the dimensionality of the document vector first, then a short prefix.
print(len(en_doc.vector))
print(en_doc.vector[:24])

In [None]:
# Example: print the first 24 values in the sentence vector for a German sentence.
de_doc = de_nlp(de_s)
# Print the dimensionality of the document vector first, then a short prefix.
print(len(de_doc.vector))
print(de_doc.vector[:24])

### Loading the data

In [None]:
# Paths to all data files, expressed as module-level constants.
# DATA_DIR is resolved against the current working directory.
DATA_DIR = os.path.abspath(os.path.join('..','data'))


def _data_file(name):
    """Return the path of the file called `name` inside DATA_DIR."""
    return os.path.join(DATA_DIR,name)


# Model weights.
BASELINE_WEIGHTS = _data_file('baseline.weights')

# Development set.
DEV_BEST   = _data_file('nlp2-dev.1000best')
DEV_DE     = _data_file('nlp2-dev.de')
DEV_EN_PLF = _data_file('nlp2-dev.en.pw.plf-100')
DEV_EN     = _data_file('nlp2-dev.en.s')

# Test set.
TEST_BEST   = _data_file('nlp2-test.1000best')
TEST_DE     = _data_file('nlp2-test.de')
TEST_EN_PLF = _data_file('nlp2-test.en.pw.plf-100')
TEST_EN     = _data_file('nlp2-test.en.s')

In [None]:
def parse_candidate(s):
    """
    Parse a candidate translation (a line from the 1000-best files) into
    a tuple containing (in order):

        k:              the 0-based sentence id           (int)
        source:         the source sentence               (str)
        target:         the translated sentence           (str)
        segments:       the segments and their alignments (list[(str,(int,int))])
        feature_vector: the feature vector                ({str: list[float]})
        score:          the score assigned by MOSES       (float)
        alignments:     the alignments                    ([(int,int)])

    Note: alignments in the "segments" field are pairs of states in the
    input lattice, whereas the alignments in the "alignments" field are
    pairs of a state in the input lattice together with the position of
    the output word.

    Every sequence in the result is a real list, so it can be indexed and
    iterated more than once on both Python 2 and Python 3.

    Raises ValueError when the line does not consist of exactly six
    ' ||| '-separated fields (e.g. the empty string read at end of file).
    """
    # Lines obtained via readline() still carry their line terminator; drop
    # it so that it does not end up inside the last field.
    fields = s.rstrip('\r\n').split(' ||| ')
    if len(fields) != 6:
        raise ValueError('expected 6 fields separated by " ||| ", got %d: %r'
                         % (len(fields), s))
    k, segments_and_alignments, feature_vector, score, alignments, source = fields

    # Parse an id as an integer
    k = int(k)

    # Split the translation into alternating segment texts (even positions)
    # and their "|from-to|" alignment markers (odd positions).
    pieces = [piece.strip()
              for piece in re.split(r'\|(\d+\-\d+)\|', segments_and_alignments)]
    segment_texts = pieces[0::2]

    # The split leaves an empty piece after the final marker; skip empty
    # pieces so that the sentence has no stray leading/trailing spaces.
    target = ' '.join(text for text in segment_texts if text)

    # Parse a candidate translation (with alignments) into a list of segments.
    segment_alignments = [tuple(int(state) for state in span.split('-'))
                          for span in pieces[1::2]]
    segments = list(zip(segment_texts, segment_alignments))

    # Parse a feature vector string into a dictionary: names sit at the odd
    # positions of the split, their whitespace-separated values right after.
    feature_parts  = re.split(r'([A-Za-z]+0?)=', feature_vector)
    feature_names  = feature_parts[1::2]
    feature_values = [[float(value) for value in values.split()]
                      for values in feature_parts[2::2]]
    feature_map    = dict(zip(feature_names, feature_values))

    # Parse a score as a float.
    score = float(score)

    # Parse an alignment string into a list of tuples. split() without an
    # argument tolerates repeated spaces and an empty alignment field.
    alignments = [tuple(int(index) for index in pair.split('-'))
                  for pair in alignments.split()]

    return (k, source, target, segments, feature_map, score, alignments)

In [None]:
# Load the development data: the first `dev_limit` source sentences, their
# reference translations, and the n-best candidate list of each sentence.
dev_limit = 10

# Sentences are stored without their trailing newline; islice stops cleanly
# if a file holds fewer than `dev_limit` lines.
with open(DEV_EN, 'r') as f:
    inputs = [line.rstrip('\n') for line in itertools.islice(f, dev_limit)]

with open(DEV_DE, 'r') as f:
    references = [line.rstrip('\n') for line in itertools.islice(f, dev_limit)]

with open(DEV_BEST,'r') as f:
    # Blank lines are skipped; parsing is lazy, so reading stops shortly
    # after the last candidate set that is needed.
    parsed = (parse_candidate(line) for line in f if line.strip())
    # Consecutive lines that share a sentence id form one candidate set.
    by_sentence = itertools.groupby(parsed, key=lambda candidate: candidate[0])
    # Keep exactly `dev_limit` candidate sets (or fewer at end of file).
    candidates = [list(group) for _, group in itertools.islice(by_sentence, dev_limit)]

In [None]:
# Load the test data: the first `test_limit` source sentences, their
# reference translations, and the n-best candidate list of each sentence.
test_limit = 10

# Sentences are stored without their trailing newline; islice stops cleanly
# if a file holds fewer than `test_limit` lines.
with open(TEST_EN, 'r') as f:
    test_inputs = [line.rstrip('\n') for line in itertools.islice(f, test_limit)]

with open(TEST_DE, 'r') as f:
    test_references = [line.rstrip('\n') for line in itertools.islice(f, test_limit)]

with open(TEST_BEST,'r') as f:
    # Blank lines are skipped; parsing is lazy, so reading stops shortly
    # after the last candidate set that is needed.
    parsed = (parse_candidate(line) for line in f if line.strip())
    # Consecutive lines that share a sentence id form one candidate set.
    by_sentence = itertools.groupby(parsed, key=lambda candidate: candidate[0])
    # Keep exactly `test_limit` candidate sets (or fewer at end of file).
    test_candidates = [list(group) for _, group in itertools.islice(by_sentence, test_limit)]

In [None]:
# Example: print all relevant information for sentence with id #2.
print(inputs[2])
print(references[2])
# Unpack the first candidate stored for that sentence; the field order is the
# one returned by parse_candidate.
(k, source, target, segments, feature_map, score, alignments) = candidates[2][0]
# print(segments)
print(feature_map.values())
# print(score)

### Constructing feature vector for two sentences

In [None]:
def feature_vector(e, c1, c2):
    """
    Build the feature vector describing a pair of candidate translations.

    e  : source sentence (not used by the current feature set; kept so the
         signature stays stable if source-side features are added)
    c1 : first candidate translation, a tuple as returned by parse_candidate
    c2 : second candidate translation, a tuple as returned by parse_candidate

    returns: a flat list of numbers, laid out as
             [sum of c1's decoder features, sum of c2's decoder features]
             followed by the POS feature vector of c1's target sentence and
             then that of c2's target sentence.
    """
    (_, _, t1, _, f1, _, _) = c1
    (_, _, t2, _, f2, _, _) = c2

    def as_text(sentence):
        # Byte strings are decoded as UTF-8; text that already is unicode is
        # passed through (calling .decode on it would fail or mangle it).
        if isinstance(sentence, bytes):
            return sentence.decode('utf-8')
        return sentence

    def feature_total(feature_map):
        # Sum of every value of every decoder feature of one candidate.
        return sum(sum(values) for values in feature_map.values())

    pos1 = pos_feature(as_text(t1),de_nlp)
    pos2 = pos_feature(as_text(t2),de_nlp)

    # The vector must be flat: nesting the POS vectors as list elements would
    # produce ragged rows that cannot be turned into a numeric matrix.
    return list(itertools.chain([feature_total(f1),feature_total(f2)],pos1,pos2))

In [None]:
def training_label(ref, c1, c2):
    """
    Binary training label for a pair of candidate translations.

    ref: the reference translation
    c1 : first candidate, a 7-tuple whose third field is the target sentence
    c2 : second candidate, same layout as c1

    returns: 1 when BLEU(ref, .) of the first candidate is strictly greater
             than that of the second one, 0 otherwise (ties included)
    """
    _, _, first_target, _, _, _, _ = c1
    _, _, second_target, _, _, _, _ = c2

    first_wins = BLEU(ref, first_target) > BLEU(ref, second_target)
    return 1 if first_wins else 0

### The PRO algorithm

In [None]:
def pro_corpus(inputs, references, candidates, sample_size=10):
    """
    Build a pairwise-ranking data set for a whole corpus by calling pro()
    on every sentence.

    inputs     : the source sentences
    references : the reference translations, indexed like `inputs`
    candidates : the candidate sets, indexed like `inputs`
    sample_size: number of candidate pairs to sample per sentence

    returns: a tuple (x, y) of two equally long lists, the feature vectors
             and their labels. Both are empty when nothing was sampled.

    Raises IndexError when `references` or `candidates` is shorter than
    `inputs`.
    """
    data = []

    for i, e in enumerate(inputs):
        # extend() appends in place instead of copying the list every time.
        data.extend(pro(e, references[i], candidates[i], sample_size))

    # zip(*[]) yields nothing, which cannot be unpacked into two values.
    if not data:
        return ([], [])

    (x,y) = zip(*data)
    return (list(x), list(y))

In [None]:
def pro(e, g, c, sample_size=10):
    """
    Sample training examples for one sentence: each example pairs the
    feature vector of two distinct, randomly drawn candidates with the
    label saying whether the first one is the better translation.

    e          : the source sentence
    g          : the reference translation
    c          : the list of candidate translations for this sentence
    sample_size: number of candidate pairs to draw

    returns: a list of (feature_vector, label) tuples. The list is empty
             when fewer than two candidates are available, because no pair
             of distinct candidates exists in that case.
    """
    data = []

    # With a single candidate the "pick a different one" loop below could
    # never terminate, and with none there is nothing to draw from.
    if len(c) < 2:
        return data

    last = len(c) - 1
    for _ in range(sample_size):

        # Randomly pick two candidates that are not the same
        j1 = random.randint(0,last)
        j2 = j1
        while j2 == j1:
            j2 = random.randint(0,last)

        training_example = (feature_vector(e, c[j1],c[j2]), training_label(g,c[j1],c[j2]))

        data.append(training_example)

    return data

### Train SVM 

In [None]:
# Build the training set from the development data, sampling 15 candidate
# pairs per sentence.
(train_x, train_y) = pro_corpus(inputs, references, candidates, 15)

In [None]:
# Fit a linear SVM that predicts, for a pair of candidates, the label
# produced by training_label (1: first candidate is better, 0: otherwise).
clf = svm.LinearSVC()
clf.fit(train_x, train_y) 

In [None]:
# Generating test data    
# Sample 10 candidate pairs per test sentence, classify them, and report
# per-class precision/recall against the labels derived from the references.
(test_x, test_y) = pro_corpus(test_inputs, test_references, test_candidates, 10)
predicted = clf.predict(test_x)
print(metrics.classification_report(test_y, predicted))

### Reranking the test data

In [None]:
def best_sentences(inputs, candidates, clf):
    """
    Pick, for every input sentence, the translation that best_sentence()
    selects from its candidate set.

    inputs    : the source sentences
    candidates: the candidate sets, indexed like `inputs`
    clf       : the trained pairwise classifier

    returns: a list with one selected translation (str) per input sentence
    """
    return [best_sentence(e, candidates[i], clf) for i, e in enumerate(inputs)]

In [None]:
def best_sentence(e, c, clf):
    """
    Select the best translation of one sentence according to the pairwise
    classifier.

    e  : the source sentence
    c  : the list of candidate translations (tuples as returned by
         parse_candidate); must not be empty
    clf: the trained classifier; label 1 for feature_vector(e, x, y) means
         "x is a better translation than y"

    returns: the target sentence (str) of the selected candidate

    Raises IndexError when `c` is empty.

    The candidates are scanned once, keeping a running champion that is
    replaced whenever the classifier prefers the challenger. This needs
    len(c) - 1 classifier calls and does not rely on the predictions forming
    a consistent ordering.
    """
    best = c[0]

    for challenger in c[1:]:
        # Label 1 <=> the first candidate of the pair is the better one.
        if clf.predict([feature_vector(e,challenger,best)])[0] == 1:
            best = challenger

    (_, _, s, _, _, _, _) = best
    return s

In [None]:
# corpus_bleu works on token sequences: every hypothesis is a list of tokens
# and every sentence has a list of tokenised references. Split on whitespace,
# otherwise the n-grams would be built over single characters.
bleu_references = [[x.split()] for x in test_references]
bleu_hypotheses = [h.split() for h in best_sentences(test_inputs, test_candidates, clf)]

In [None]:
# Corpus-level BLEU of the reranked test translations against the references.
# (The variable is spelled `blue`; it holds the BLEU score.)
blue = corpus_bleu(bleu_references, bleu_hypotheses) 
print(blue)