## 1. Extract features with spaCy

In [1]:
# Import the spaCy model package and load it as `nlp`, the callable used in
# the cells below to parse raw text (loading may take up to 20 sec)

import en_core_web_md

nlp = en_core_web_md.load()

In [2]:
# Parse the three-sentence sample passage; the result, `text`, is a Doc object
text = nlp(
    "Jack Sparrow was a legendary pirate and the irreverent "
    "trickster of the Caribbean. Jack's first love was the sea. "
    "His second love was his beloved ship, the Black Pearl."
)

In [3]:
# text.sents -> sentences
# The loop variable is deliberately not called `sentence`: that name is bound
# to a parsed Doc in the visualization cell further down and is read by the
# feature-extraction cells, so re-running this cell must not overwrite it.
for sent in text.sents:
    print(sent)

Jack Sparrow was a legendary pirate and the irreverent trickster of the Caribbean.
Jack's first love was the sea.
His second love was his beloved ship, the Black Pearl.


In [4]:
# text.ents -> entities; print each entity's text, its .label_ and its
# start / end token positions as one aligned row
entity_row = "{:<20}{:10}{:5}{:5}".format
for ent in text.ents:
    print(entity_row(ent.text, ent.label_, ent.start, ent.end))

Jack Sparrow        PERSON        0    2
Caribbean           LOC          12   13
Jack                PERSON       14   15
first               ORDINAL      16   17
second              ORDINAL      23   24
the Black Pearl     FAC          30   33


In [5]:
# A Doc is a sequence of Token objects; iterating a sentence yields its tokens.
# Columns: i, idx, text, lemma_, shape_, is_punct, "followed by a space".
# The width-only format spec renders the two booleans as 0 / 1.
token_row = "{:<5}{:<7}{:12}{:12}{:10}{:5}{:5}".format
for tok in list(text.sents)[1]:
    followed_by_space = tok.whitespace_ == " "
    print(token_row(tok.i, tok.idx, tok.text, tok.lemma_, tok.shape_,
                    tok.is_punct, followed_by_space))

14   83     Jack        Jack        Xxxx          0    0
15   87     's          's          'x            0    1
16   90     first       first       xxxx          0    1
17   96     love        love        xxxx          0    1
18   101    was         be          xxx           0    1
19   105    the         the         xxx           0    1
20   109    sea         sea         xxx           0    0
21   112    .           .           .             1    1


In [6]:
# Part-of-speech information carried by each Token of the second sentence:
# token index, text, tag_ (e.g. NNP) and pos_ (e.g. PROPN)
for tok in list(text.sents)[1]:
    columns = (tok.i, tok.text, tok.tag_, tok.pos_)
    print("{:<5}{:12}{:7}{:7}".format(*columns))

14   Jack        NNP    PROPN  
15   's          POS    PART   
16   first       JJ     ADJ    
17   love        NN     NOUN   
18   was         VBD    VERB   
19   the         DT     DET    
20   sea         NN     NOUN   
21   .           .      PUNCT  


In [7]:
# Dependency information carried by each Token of the second sentence:
# its relation label (dep_) and the text of its parent token (head)
second_sentence = list(text.sents)[1]
for tok in second_sentence:
    parent = tok.head
    print("{:<5}{:12}{:12}{:10}".format(tok.i, tok.text, tok.dep_, parent.text))

14   Jack        poss        love      
15   's          case        Jack      
16   first       amod        love      
17   love        nsubj       was       
18   was         ROOT        was       
19   the         det         sea       
20   sea         attr        was       
21   .           punct       was       


In [8]:
# Each Token also exposes its children; list them for every token of the
# second sentence that has at least one child
for tok in list(text.sents)[1]:
    child_texts = [child.text for child in tok.children]
    if child_texts:
        print("Children of \"{}\": {}".format(tok.text, child_texts))

Children of "Jack": ["'s"]
Children of "love": ['Jack', 'first']
Children of "was": ['love', 'sea', '.']
Children of "sea": ['the']


In [9]:
from spacy import displacy

# Visualize a sentence parse.
# NOTE: `sentence` is reused by the feature-extraction cells further down.
sentence = nlp("Jack Sparrow was a legendary pirate and the irreverent trickster of the Caribbean.")

# Only options understood by the dependency visualizer are passed; the former
# "font_size" entry is not a displaCy option and was silently ignored.
displacy.render(sentence, style='dep', jupyter=True,
                options={"collapse_punct": False, "font": "Arial", "distance": 110})

In [10]:
# Visualize the entities.
# "collapse_punct" and "distance" are settings of the dependency view and have
# no effect on the entity visualizer, so no options are passed here.
displacy.render(text, style='ent', jupyter=True)

## 2. Encode features with sklearn

In [11]:
from sklearn.feature_extraction import DictVectorizer

In [12]:
# Collect features for each word in the sentence

def feature_extractor(sentence, i):
    """Build the feature dict for the token at position ``i`` of ``sentence``.

    ``sentence`` must support ``len()`` and integer indexing, and its items
    must expose ``text``, ``lemma_`` and ``tag_``.

    Returned keys, in insertion order:

    * ``lemma`` / ``tag``: lemma and tag of the token itself.
    * ``lemma-1`` / ``tag-1``: lemma and tag of the preceding token, or
      ``"<S>"`` for the first token.
    * ``shape``: ``"title"``, ``"upper"``, ``"lower"`` or ``"unk"``, from the
      first of ``str.istitle`` / ``str.isupper`` / ``str.islower`` that holds
      for the token text.
    * ``right-bigram``: texts of the next two tokens joined by ``"_"``, or
      ``"NONE"`` when fewer than two tokens follow.
    """
    current = sentence[i]
    word = current.text
    has_left_neighbour = i > 0

    # The predicates are tried in order, so a string that is both title-cased
    # and upper-case (e.g. a single capital letter) is reported as "title".
    shape = "unk"
    for label, matches in (("title", word.istitle),
                           ("upper", word.isupper),
                           ("lower", word.islower)):
        if matches():
            shape = label
            break

    # The bigram is only built when both following tokens exist.
    if i < len(sentence) - 2:
        right_bigram = "_".join((sentence[i + 1].text, sentence[i + 2].text))
    else:
        right_bigram = "NONE"

    return {
        "lemma": current.lemma_,
        "tag": current.tag_,
        "lemma-1": sentence[i - 1].lemma_ if has_left_neighbour else "<S>",
        "tag-1": sentence[i - 1].tag_ if has_left_neighbour else "<S>",
        "shape": shape,
        "right-bigram": right_bigram,
    }

In [13]:
# One feature dict per token of `sentence` (the Doc parsed in the
# dependency-visualization cell)
data = [feature_extractor(sentence, position) for position, _ in enumerate(sentence)]

# Spot-check the features of the token at index 3
print(data[3])

{'lemma': 'a', 'tag': 'DT', 'lemma-1': 'be', 'tag-1': 'VBD', 'shape': 'lower', 'right-bigram': 'legendary_pirate'}


In [14]:
# Vectorize the features: every distinct "name=value" pair gets its own column
vec = DictVectorizer()

# fit_transform() yields a sparse matrix; .toarray() turns it into a dense
# array before printing
x = vec.fit_transform(data)
x = x.toarray()
print(x)

[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 1

In [15]:
# The full feature set.
# The fitted `feature_names_` list is read directly: get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, while this attribute is
# available on every fitted DictVectorizer and holds the same list.
print("{} features were generated\n".format(len(vec.feature_names_)))
print(vec.feature_names_)

58 features were generated

['lemma-1=<S>', 'lemma-1=Caribbean', 'lemma-1=Jack', 'lemma-1=Sparrow', 'lemma-1=a', 'lemma-1=and', 'lemma-1=be', 'lemma-1=irreverent', 'lemma-1=legendary', 'lemma-1=of', 'lemma-1=pirate', 'lemma-1=the', 'lemma-1=trickster', 'lemma=.', 'lemma=Caribbean', 'lemma=Jack', 'lemma=Sparrow', 'lemma=a', 'lemma=and', 'lemma=be', 'lemma=irreverent', 'lemma=legendary', 'lemma=of', 'lemma=pirate', 'lemma=the', 'lemma=trickster', 'right-bigram=Caribbean_.', 'right-bigram=NONE', 'right-bigram=Sparrow_was', 'right-bigram=a_legendary', 'right-bigram=and_the', 'right-bigram=irreverent_trickster', 'right-bigram=legendary_pirate', 'right-bigram=of_the', 'right-bigram=pirate_and', 'right-bigram=the_Caribbean', 'right-bigram=the_irreverent', 'right-bigram=trickster_of', 'right-bigram=was_a', 'shape=lower', 'shape=title', 'shape=unk', 'tag-1=<S>', 'tag-1=CC', 'tag-1=DT', 'tag-1=IN', 'tag-1=JJ', 'tag-1=NN', 'tag-1=NNP', 'tag-1=VBD', 'tag=.', 'tag=CC', 'tag=DT', 'tag=IN', 'tag=JJ',

In [16]:
# Features for one token of a sentence the vectorizer was not fitted on.
# NOTE(review): index 5 selects "the" (the listing in the next cell shows
# lemma=the and shape=lower); the capitalised "Sea" is token 6 — confirm
# which token was intended.
# Feature values not seen during fit (here the right-bigram) leave all of
# their columns at 0.
new_word = feature_extractor(sentence=nlp("Jack's first love was the Sea."), i=5)

print(vec.transform(new_word).toarray())

[[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 1. 0. 0. 0. 0. 0.]]


In [17]:
# Pair every feature name with its value in the encoded vector of `new_word`.
# `feature_names_` replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2; the attribute holds the same list.
encoded = vec.transform(new_word).toarray()[0]
for name, value in zip(vec.feature_names_, encoded):
    print(name, value)

lemma-1=<S> 0.0
lemma-1=Caribbean 0.0
lemma-1=Jack 0.0
lemma-1=Sparrow 0.0
lemma-1=a 0.0
lemma-1=and 0.0
lemma-1=be 1.0
lemma-1=irreverent 0.0
lemma-1=legendary 0.0
lemma-1=of 0.0
lemma-1=pirate 0.0
lemma-1=the 0.0
lemma-1=trickster 0.0
lemma=. 0.0
lemma=Caribbean 0.0
lemma=Jack 0.0
lemma=Sparrow 0.0
lemma=a 0.0
lemma=and 0.0
lemma=be 0.0
lemma=irreverent 0.0
lemma=legendary 0.0
lemma=of 0.0
lemma=pirate 0.0
lemma=the 1.0
lemma=trickster 0.0
right-bigram=Caribbean_. 0.0
right-bigram=NONE 0.0
right-bigram=Sparrow_was 0.0
right-bigram=a_legendary 0.0
right-bigram=and_the 0.0
right-bigram=irreverent_trickster 0.0
right-bigram=legendary_pirate 0.0
right-bigram=of_the 0.0
right-bigram=pirate_and 0.0
right-bigram=the_Caribbean 0.0
right-bigram=the_irreverent 0.0
right-bigram=trickster_of 0.0
right-bigram=was_a 0.0
shape=lower 1.0
shape=title 0.0
shape=unk 0.0
tag-1=<S> 0.0
tag-1=CC 0.0
tag-1=DT 0.0
tag-1=IN 0.0
tag-1=JJ 0.0
tag-1=NN 0.0
tag-1=NNP 0.0
tag-1=VBD 1.0
tag=. 0.0
tag=CC 0.0
tag=DT