In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import with_statement

import collections
import gzip
import os
import re
import random
import numpy as np

from sklearn import metrics
from sklearn import svm

random.seed(10)

### Import the Spacy library

[Spacy](http://spacy.io) is an NLP library which is ready to be used in production settings. This library will help us find features for our data.

In [3]:
import spacy

en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

### Import the NLTK library

We will use the [NLTK library](NLTK) for other features we want to extract from our data. As well as use the BLUE scoring algorithm in the library

In [4]:
import nltk

In [5]:
en_s = u'Hello, world. Here are two sentences.'
de_s = u'Ich bin ein Berliner.'

### Evaluation metrics

In [84]:
def BLEU(reference,candidate):
    """
    Compute the BLEU score for a given candidate sentence, with respect to a
    given reference sentence.

    reference: the reference translation
    candidate: the candidate translation
    """
    return float(
        nltk.translate.bleu_score.modified_precision([reference],candidate,n=2))

### Features

In [7]:
def pos_feature(s,nlp):
    """
    Compute the POS feature vector given a sentence and an instance of spaCy.
    The POS feature vector is a vector which indicates, per POS-tag of the
    language, what ratio of the words in the sentence have this POS-tag.

    s  : input sentence
    nlp: instance of spaCy nlp
    """
    doc       = nlp(s,tag=True,parse=False,entity=False)
    pos_count = collections.Counter([tok.tag_ for tok in doc])
    return map(lambda tag: pos_count[tag] / len(doc), nlp.tagger.tag_names)

In [8]:
# Example: print the POS-feature for an English sentence.
print(len(en_nlp.tagger.tag_names))
print(map(lambda x: round(x,2),pos_feature(en_s,en_nlp)))

56
[0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [9]:
# Example: print the POS-feature for a German sentence.
print(len(de_nlp.tagger.tag_names))
print(map(lambda x: round(x,2),pos_feature(de_s,de_nlp)))

57
[0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [10]:
# Example: print the first 24 values in the sentence vector for an English sentence.
en_doc = en_nlp(en_s)
print(len(en_doc.vector))
print(en_doc.vector[:24])

300
[ 0.07028833 -0.00575205  0.02307617  0.0293571   0.0150436   0.03114259
  0.00853428  0.00191485  0.00394045 -0.02803375  0.04562261 -0.00354294
 -0.04505194  0.0158382  -0.01503225 -0.06313571 -0.06844621 -0.06096174
 -0.0268664  -0.0046172  -0.031034   -0.00546252  0.0032963  -0.04281867]


In [11]:
# Example: print the first 24 values in the sentence vector for a German sentence.
de_doc = de_nlp(de_s)
print(len(de_doc.vector))
print(de_doc.vector[:24])

300
[ 0.04531119 -0.0290378   0.23427622  0.17914079 -0.19891641  0.1339044
 -0.16393779 -0.40263137 -0.04548381 -0.20328541  0.22776499 -0.1040916
 -0.1286144   0.078522    0.16846181 -0.33073679  0.34232721 -0.29092079
 -0.17509019  0.0541686   0.0833232  -0.27929959  0.0028818   0.15957041]


### Loading the data

In [12]:
# Create constants for the paths to all data files.
DATA_DIR         = os.path.abspath(os.path.join('..','data'))
BASELINE_WEIGHTS = os.path.join(DATA_DIR,'baseline.weights')
DEV_BEST         = os.path.join(DATA_DIR,'nlp2-dev.1000best')
DEV_DE           = os.path.join(DATA_DIR,'nlp2-dev.de')
DEV_EN_PLF       = os.path.join(DATA_DIR,'nlp2-dev.en.pw.plf-100')
DEV_EN           = os.path.join(DATA_DIR,'nlp2-dev.en.s')
TEST_BEST        = os.path.join(DATA_DIR,'nlp2-test.1000best')
TEST_DE          = os.path.join(DATA_DIR,'nlp2-test.de')
TEST_EN_PLF      = os.path.join(DATA_DIR,'nlp2-test.en.pw.plf-100')
TEST_EN          = os.path.join(DATA_DIR,'nlp2-test.en.s')

In [13]:
def parse_candidate(s):
    """
    Parse a candidate translation (a line from the 1000-best files) into
    a tuple containing (in order):
    
        k:              the 0-based sentence id           (int)
        source:         the source sentence               (str)
        target:         the translated sentence           (str)
        segments:       the segments and their alignments (list[(str,(int,int))])
        feature_vector: the feature vector                ({str: list[float]})
        score:          the score assigned by MOSES       (float)
        alignments:     the alignments                    ([(int,int)])

    Note: alignments in the "segments" field are pairs of states in the
    input lattice, whereas the alignments in the "alignments" field are
    pairs of a state in the input lattice together with the position of
    the output word.
    """
    k, segments_and_alignments, feature_vector, score, alignments, source = s.split(' ||| ')
    
    # Parse an id as an integer
    k = int(k)
    
    # Parse a candidate translation (with alignments) into a sentence.
    segments_and_alignments = map(lambda s: s.strip(),
                                  re.split(r'\|(\d+\-\d+)\|', segments_and_alignments))
    segments = segments_and_alignments[0::2]
    target = ' '.join(segments)
    
    # Parse a candidate translation (with alignments) into a list of segments.
    segment_alignments = map(lambda s: tuple(map(int,s.strip().split('-'))), 
                             segments_and_alignments[1::2])
    segments = zip(segments,segment_alignments)
    
    # Parse a feature vector string into a dictionary.
    feature_vector = re.split(r'([A-Za-z]+0?)=', feature_vector)
    feature_names  = feature_vector[1::2]
    feature_values = map(lambda s: map(float,s.strip().split()), feature_vector[2::2])
    feature_map = dict(zip(feature_names,feature_values))
    
    # Parse a score as a float.
    score = float(score)
    
    # Parse an alignment string into a list of tuples.
    alignments = map(lambda s: tuple(map(int,s.split('-'))), alignments.strip().split(' '))
    
    return (k, source, target, segments, feature_map, score, alignments)

In [50]:
# Load the development data.
dev_limit = 1000

with open(DEV_EN, 'r') as f:
    inputs = [f.readline() for i in range(0, dev_limit)]
    
with open(DEV_DE, 'r') as f:
    references = [f.readline() for i in range(0, dev_limit)]
    
with open(DEV_BEST,'r') as f:
    candidates = []
    candidate_set = []
    i = 0
    while True:
        candidate = parse_candidate(f.readline())
        if candidate[0] == i:
            candidate_set.append(candidate)
        else:
            candidates.append(candidate_set)
            candidate_set = [candidate]
            i = candidate[0]
        if i > dev_limit:
            break

In [51]:
# Load the development data.
test_limit = 100

with open(TEST_EN, 'r') as f:
    test_inputs = [f.readline() for i in range(0, dev_limit)]
    
with open(TEST_DE, 'r') as f:
    test_references = [f.readline() for i in range(0, dev_limit)]
    
with open(TEST_BEST, 'r') as f:
    test_candidates = []
    candidate_set = []
    i = 0
    while True:
        candidate = parse_candidate(f.readline())
        if candidate[0] == i:
            candidate_set.append(candidate)
        else:
            test_candidates.append(candidate_set)
            candidate_set = [candidate]
            i = candidate[0]
        if i > dev_limit:
            break

In [52]:
# Example: print all relevant information for sentence with id #2.
print(inputs[2])
print(references[2])
(k, source, target, segments, feature_map, score, alignments) = candidates[2][0]
# print(segments)
print(feature_map.values())
# print(score)

two sets of lights so close to one another : intentional or just a silly error ?

Zwei Anlagen so nah beieinander : Absicht oder Schildbürgerstreich ?

[[0.0], [-88.3261], [10.0], [-27.4501, -35.3568, -17.6299, -29.1119], [0.838325, 0.0794625, 0.0794625], [0.0, 0.0, 0.0, 0.0], [-14.0], [-107.313], [0.0]]


### Constructing feature vector for two sentences

In [68]:
def feature_vector(e, c1, c2):
    (_, _, t1, _, f1, s1, _) = c1
    (_, _, t2, _, f2, s2, _) = c2
    
    return sum(f1.values(), []) + sum(f2.values(), []) + [s1, s2]
    

In [69]:
feature_vector = feature_vector(inputs[2],candidates[2][0],candidates[2][1])
print(feature_vector)

[0.0, -88.3261, 10.0, -27.4501, -35.3568, -17.6299, -29.1119, 0.838325, 0.0794625, 0.0794625, 0.0, 0.0, 0.0, 0.0, -14.0, -107.313, 0.0, 0.0, -89.0222, 10.0, -27.9478, -37.5129, -15.8729, -29.8851, 0.844537, 0.07325, 0.07325, 0.0, 0.0, 0.0, 0.0, -14.0, -107.313, 0.0]


In [91]:
def training_label(ref, c1, c2):
    
    (_, _, t1, _, _, _, _) = c1
    (_, _, t2, _, _, _, _) = c2
    
    if (BLEU(ref, t1) > BLEU(ref, t2)):
        return 1
    
    return 0

In [92]:
training_label = training_label(references[2], candidates[2][0], candidates[2][500])
print(training_label)

1


### The PRO algorithm

In [55]:
def pro_corpus(inputs, references, candidates, sample_size=10):
    
    data = []
    
    for i, e in enumerate(inputs):
        g = references[i]
        c = candidates[i]
        data = data + pro(e, g, c, sample_size)
    
    (x,y) = zip(*data)
    return (list(x), list(y))
        
        

In [56]:
def pro(e, g, c, sample_size=10):
    
    data = []
    
    for i in range(0,sample_size):
        
        # Randomly pick two candidates that are not the same
        j1 = j2 = random.randint(0,len(c)-1)
        while j1 == j2:
            j2 = random.randint(0,len(c)-1)
            
        training_example = ( feature_vector(e,c[j1],c[j2]), training_label(g,c[j1],c[j2]) )
        
        data.append(training_example)
    
    return data
        

### Train SVM 

In [57]:
(train_x, train_y) = pro_corpus(inputs, references, candidates, 100)

In [58]:
clf = svm.LinearSVC()
clf.fit(train_x, train_y) 

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [59]:
# Generating test data    
(test_x, test_y) = pro_corpus(test_inputs, test_references, test_candidates, 5)
predicted = clf.predict(test_x)
print(metrics.classification_report(test_y, predicted))

             precision    recall  f1-score   support

          0       0.51      0.98      0.67      2541
          1       0.61      0.03      0.05      2459

avg / total       0.56      0.51      0.37      5000



### Reranking the test data

In [60]:
def best_sentences(inputs, candidates, clf):
    sentences = []
    
    for i, e in enumerate(inputs):
        j = i + 1000
        c = candidates[i]
        sentences.append(best_sentence(e, c, clf))
    
    return sentences

In [61]:
def best_sentence(e, c, clf):
    
    def compare(x, y):
        if clf.predict([feature_vector(e,x,y)]) == [0]:
            return -1
        else:
            return 1
        
    (_, _, s, _, _, _, _) = sorted(c, cmp=compare)[0]
    return s

In [62]:
bleu_references = [[x] for x in test_references]
bleu_hypotheses = best_sentences(test_inputs, test_candidates, clf)

In [63]:
blue = nltk.translate.bleu_score.corpus_bleu(bleu_references, bleu_hypotheses) 
print(blue)

0.594249827189
