In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import with_statement

In [5]:
import spacy

# Load both spaCy pipelines up front (English first, then the 'de' model) so
# that every later cell can reuse the same two instances.
en_nlp, de_nlp = spacy.load('en'), spacy.load('de')

In [12]:
import collections
import gzip
import nltk
import os

In [6]:
# Small sample inputs used by the example cells further down: one German
# sentence and one English text.
de_s = u'Ich bin ein Berliner.'
en_s = u'Hello, world. Here are two sentences.'

In [8]:
def pos_feature(s,nlp):
    """
    Compute the POS feature vector given a sentence and an instance of spaCy.
    The POS feature vector is a vector which indicates, per POS-tag of the
    language, what ratio of the tokens in the sentence have this POS-tag.

    s  : input sentence (passed unchanged to `nlp`)
    nlp: instance of spaCy nlp; it is called with tagging enabled and with
         parsing and entity recognition disabled, and `nlp.tagger.tag_names`
         determines the length and order of the result

    Returns a list of floats with one entry per name in
    `nlp.tagger.tag_names`, in that order. If the sentence yields no tokens,
    a list of zeros is returned instead of dividing by zero.
    """
    doc       = nlp(s,tag=True,parse=False,entity=False)
    tag_names = nlp.tagger.tag_names
    n_tokens  = len(doc)
    if n_tokens == 0:
        # Nothing was tokenised: every ratio is 0 by definition.
        return [0.0 for _ in tag_names]
    pos_count = collections.Counter(tok.tag_ for tok in doc)
    # Divide by an explicit float so the ratio is a true division even if the
    # `from __future__ import division` cell was not run, and build a real
    # list so the result can be indexed and reused on Python 3 as well.
    total = float(n_tokens)
    return [pos_count[tag] / total for tag in tag_names]

In [9]:
def BLEU(reference,candidate,n):
    """
    Score a candidate translation against a single reference translation.

    Note: despite the name, this delegates to NLTK's `modified_precision`, so
    the value returned is the modified n-gram precision for the single n-gram
    order `n`, not a complete BLEU score.

    reference: the reference translation (wrapped in a one-element list
               before being handed to NLTK)
    candidate: the candidate translation
    n        : the size of the ngrams

    Returns the precision converted to a float.
    NOTE(review): NLTK presumably expects both translations as token
    sequences rather than raw strings -- confirm against the callers.
    """
    precision_fn = nltk.translate.bleu_score.modified_precision
    references   = [reference]
    score        = precision_fn(references,candidate,n=n)
    return float(score)

In [27]:
# Example: print the number of POS tags known to the English tagger, followed
# by the POS-feature for an English sentence (rounded to 2 decimals).
print(len(en_nlp.tagger.tag_names))
# list(...) makes the values print on Python 3 too, where map() is lazy.
print(list(map(lambda x: round(x,2),pos_feature(en_s,en_nlp))))

56
[0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [28]:
# Example: print the number of POS tags known to the German tagger, followed
# by the POS-feature for a German sentence (rounded to 2 decimals).
print(len(de_nlp.tagger.tag_names))
# list(...) makes the values print on Python 3 too, where map() is lazy.
print(list(map(lambda x: round(x,2),pos_feature(de_s,de_nlp))))

57
[0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [26]:
# Example: for a German sentence, show the length of the sentence vector and
# then its first 24 values.
de_doc = de_nlp(de_s)
print(len(de_doc.vector), de_doc.vector[:24], sep='\n')

300
[ 0.04531119 -0.0290378   0.23427622  0.17914079 -0.19891641  0.1339044
 -0.16393779 -0.40263137 -0.04548381 -0.20328541  0.22776499 -0.1040916
 -0.1286144   0.078522    0.16846181 -0.33073679  0.34232721 -0.29092079
 -0.17509019  0.0541686   0.0833232  -0.27929959  0.0028818   0.15957041]
