In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import with_statement

### Import the spaCy library

[spaCy](http://spacy.io) is a production-ready NLP library. We will use it to extract features from our data.

In [4]:
import spacy

# Load spaCy once per language: 'en' for the English examples and 'de' for
# the German examples. Both objects are reused by the example cells below.
en_nlp = spacy.load('en')
de_nlp = spacy.load('de')

In [5]:
import collections
import gzip
import nltk
import os
import re

ImportError: No module named nltk

In [None]:
# Sample sentences (one English, one German) used by the example cells.
en_s, de_s = (
    u'Hello, world. Here are two sentences.',
    u'Ich bin ein Berliner.',
)

In [None]:
def BLEU(reference,candidate,n):
    """
    Score a candidate translation against a single reference translation.

    The value returned is the result of NLTK's `modified_precision` for
    n-grams of size `n`, converted to a plain float.

    reference: the reference translation
    candidate: the candidate translation
    n        : the size of the ngrams
    """
    # The single reference is wrapped in a one-element list of references.
    references = [reference]
    precision  = nltk.translate.bleu_score.modified_precision(
        references, candidate, n=n)
    return float(precision)

In [None]:
def pos_feature(s,nlp):
    """
    Compute the POS feature vector given a sentence and an instance of spaCy.
    The POS feature vector is a vector which indicates, per POS-tag of the
    language, what ratio of the words in the sentence have this POS-tag.

    s  : input sentence
    nlp: instance of spaCy nlp

    Returns a list of floats with one entry per tag in
    `nlp.tagger.tag_names`, in that order. A sentence that yields no tokens
    produces an all-zero vector instead of raising ZeroDivisionError.
    """
    doc       = nlp(s,tag=True,parse=False,entity=False)
    tag_names = nlp.tagger.tag_names
    n_tokens  = len(doc)

    # Nothing to count: every ratio is zero by definition.
    if n_tokens == 0:
        return [0.0 for _ in tag_names]

    pos_count = collections.Counter(tok.tag_ for tok in doc)

    # float() keeps this a true division even where `from __future__ import
    # division` is not in effect; the list comprehension guarantees a real
    # list on Python 3, where map() is lazy.
    total = float(n_tokens)
    return [pos_count[tag] / total for tag in tag_names]

In [None]:
# Example: print the POS-feature for an English sentence.
# First the number of POS tags (the length of the feature vector) ...
print(len(en_nlp.tagger.tag_names))
# ... then the vector itself, rounded to two decimals. list() forces the
# values to be printed on Python 3 as well, where map() returns an iterator.
print(list(map(lambda x: round(x,2),pos_feature(en_s,en_nlp))))

In [None]:
# Example: print the POS-feature for a German sentence.
# First the number of POS tags (the length of the feature vector) ...
print(len(de_nlp.tagger.tag_names))
# ... then the vector itself, rounded to two decimals. list() forces the
# values to be printed on Python 3 as well, where map() returns an iterator.
print(list(map(lambda x: round(x,2),pos_feature(de_s,de_nlp))))

In [None]:
# Example: print the first 24 values in the sentence vector for an English sentence.
en_doc = en_nlp(en_s)
# Line 1: length of the sentence vector; line 2: its first 24 values.
print(len(en_doc.vector), en_doc.vector[:24], sep='\n')

In [None]:
# Example: print the first 24 values in the sentence vector for a German sentence.
de_doc = de_nlp(de_s)
# Line 1: length of the sentence vector; line 2: its first 24 values.
print(len(de_doc.vector), de_doc.vector[:24], sep='\n')

In [None]:
# Create constants for the paths to all data files.
# Root data directory: `../data`, resolved against the current working directory.
DATA_DIR = os.path.abspath(os.path.join('..', 'data'))

# Weights file.
BASELINE_WEIGHTS = os.path.join(DATA_DIR, 'baseline.weights')

# Development-set files.
DEV_EN     = os.path.join(DATA_DIR, 'nlp2-dev.en.s.gz')
DEV_EN_PLF = os.path.join(DATA_DIR, 'nlp2-dev.en.pw.plf-100.gz')
DEV_DE     = os.path.join(DATA_DIR, 'nlp2-dev.de.gz')
DEV_BEST   = os.path.join(DATA_DIR, 'nlp2-dev.1000best.gz')

# Test-set files (same naming scheme as the development set).
TEST_EN     = os.path.join(DATA_DIR, 'nlp2-test.en.s.gz')
TEST_EN_PLF = os.path.join(DATA_DIR, 'nlp2-test.en.pw.plf-100.gz')
TEST_DE     = os.path.join(DATA_DIR, 'nlp2-test.de.gz')
TEST_BEST   = os.path.join(DATA_DIR, 'nlp2-test.1000best.gz')

In [None]:
# Load the development data.
# Each gzip file is read completely; every variable ends up as a list of lines.
with gzip.open(DEV_EN, 'r') as f:
    inputs = f.readlines()

with gzip.open(DEV_DE, 'r') as f:
    references = f.readlines()

with gzip.open(DEV_BEST, 'r') as f:
    candidates = f.readlines()

In [None]:
def parse_candidate(s):
    """
    Parse a candidate translation (a line from the 1000-best files) into
    a tuple containing (in order):

        k:              the 0-based sentence id           (int)
        source:         the source sentence               (str)
        target:         the translated sentence           (str)
        segments:       the segments and their alignments (list[(str,(int,int))])
        feature_vector: the feature vector                ({str: list[float]})
        score:          the score assigned by MOSES       (float)
        alignments:     the alignments                    ([(int,int)])

    Note: alignments in the "segments" field are pairs of states in the
    input lattice, whereas the alignments in the "alignments" field are
    pairs of a state in the input lattice together with the position of
    the output word.

    The line must consist of exactly six fields separated by ' ||| ';
    otherwise the unpacking below raises ValueError. `source` is returned
    exactly as found in the last field (a trailing newline is not removed).
    All sequence-valued results are real lists on both Python 2 and 3, and
    an empty alignments field yields an empty list.
    """
    k, segments_and_alignments, feature_vector, score, alignments, source = s.split(' ||| ')

    # Parse an id as an integer
    k = int(k)

    # Parse a candidate translation (with alignments) into a sentence.
    # re.split with a capturing group interleaves the text between the
    # |i-j| markers (even positions) with the captured "i-j" (odd positions).
    pieces = [piece.strip()
              for piece in re.split(r'\|(\d+\-\d+)\|', segments_and_alignments)]
    segment_texts = pieces[0::2]
    target = ' '.join(segment_texts)

    # Parse a candidate translation (with alignments) into a list of segments.
    # zip stops at the shorter sequence, so text after the last marker is not
    # paired with anything.
    segment_alignments = [tuple(int(state) for state in span.split('-'))
                          for span in pieces[1::2]]
    segments = list(zip(segment_texts, segment_alignments))

    # Parse a feature vector string into a dictionary.
    # Splitting on "Name=" leaves names at odd positions and the
    # whitespace-separated values that follow each name at even positions.
    feature_parts  = re.split(r'([A-Za-z]+0?)=', feature_vector)
    feature_names  = feature_parts[1::2]
    feature_values = [[float(value) for value in values.split()]
                      for values in feature_parts[2::2]]
    feature_vector = dict(zip(feature_names, feature_values))

    # Parse a score as a float.
    score = float(score)

    # Parse an alignment string into a list of tuples.
    # split() without an argument tolerates repeated spaces and returns []
    # for an empty field instead of [''] (which would fail in int('')).
    alignments = [tuple(int(index) for index in pair.split('-'))
                  for pair in alignments.split()]

    return (k, source, target, segments, feature_vector, score, alignments)

In [None]:
# Example: print all relevant information for sentence with id #2.
SENTENCE_ID = 2     # 0-based id of the sentence to inspect
N_BEST      = 1000  # candidates per sentence, going by the '1000best' file name
# NOTE(review): indexing with SENTENCE_ID * N_BEST assumes every sentence has
# exactly N_BEST candidates stored contiguously in the file -- confirm.
print(inputs[SENTENCE_ID])
print(references[SENTENCE_ID])
for fld in parse_candidate(candidates[SENTENCE_ID * N_BEST]):
    print(fld)