## Vectorization of features

In [1]:
text = ["Peter Piper picked a peck of pickled peppers.",
        "A peck of pickled peppers Peter Piper picked.",
        "If Peter Piper picked a peck of pickled peppers, where's the peck of pickled peppers Peter Piper picked?"]

### Count Vectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# by default: tokenization, lowercasing, punctuation is completely ignored
c_vec = CountVectorizer()
x = c_vec.fit(text)

In [3]:
print(c_vec.get_feature_names())

['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']


In [4]:
x.transform(text).toarray()

array([[0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 2, 2, 2, 2, 2, 2, 2, 1, 1]])

In [5]:
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0]])

#### What if I want lemmas? And no lowercasing?

In [6]:
import en_core_web_md
from spacy import displacy

nlp = en_core_web_md.load()

In [7]:
data = [nlp(sent) for sent in text]
data = [" ".join([token.lemma_ for token in sent]) for sent in data]

In [8]:
c_vec2 = CountVectorizer(lowercase=False, stop_words='english',
                         tokenizer=lambda x: x.split(' '))
x = c_vec2.fit(data)



In [9]:
print(c_vec2.get_feature_names())

[',', '.', '?', 'Peter', 'Piper', 'peck', 'pepper', 'pick', 'pickle']


In [10]:
x.transform(data).toarray()

array([[0, 1, 0, 1, 1, 1, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1, 1, 1],
       [1, 0, 1, 2, 2, 2, 2, 2, 2]])

In [11]:
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 1, 0]])

### TF-IDF Vectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# bacisally it's CountVectorizer followed by TfidfTransformer
tfidf_vec = TfidfVectorizer()
x = tfidf_vec.fit(text)

In [13]:
print(tfidf_vec.get_feature_names())

['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']


In [14]:
x.transform(text).toarray()

array([[0.        , 0.37796447, 0.37796447, 0.37796447, 0.37796447,
        0.37796447, 0.37796447, 0.37796447, 0.        , 0.        ],
       [0.        , 0.37796447, 0.37796447, 0.37796447, 0.37796447,
        0.37796447, 0.37796447, 0.37796447, 0.        , 0.        ],
       [0.27986767, 0.33058871, 0.33058871, 0.33058871, 0.33058871,
        0.33058871, 0.33058871, 0.33058871, 0.27986767, 0.27986767]])

In [15]:
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[0.76749457, 0.        , 0.        , 0.45329466, 0.45329466,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

### DictVectorizer

In [16]:
from sklearn.feature_extraction import DictVectorizer

In [17]:
doc = nlp("I like nice, cute, and fluffy kittens")

In [18]:
displacy.render(doc, style='dep', options={"collapse_punct": True, "font": "Arial",
                                           "font_size": "40", "distance": 110},
                jupyter=True)

In [19]:
# Collect features for each word in the sentence

def extract_features(sentence):
    data = []
    for i in range(len(sentence)):
        features = dict()
        features["lemma"] = sentence[i].lemma_
        features["is_capitalized"] = sentence[i].text.istitle()
        features["word-1"] = sentence[i-1].text if i > 0 else "NONE"
        features["parent"] = sentence[i].dep_ + "_" + sentence[i].head.lemma_
        features["right-bigram"] = sentence[i+1].text + "_" + sentence[i+2].text \
            if i < (len(sentence) - 2) else "NONE"
        data.append(features)
    return data

data = extract_features(doc)

In [20]:
dict_vec = DictVectorizer()
x = dict_vec.fit(data)

In [21]:
# The full feature set:
print("{} features were generated\n".format(len(dict_vec.get_feature_names())))
print(dict_vec.get_feature_names())

34 features were generated

['is_capitalized', 'lemma=,', 'lemma=-PRON-', 'lemma=and', 'lemma=cute', 'lemma=fluffy', 'lemma=kitten', 'lemma=like', 'lemma=nice', 'parent=ROOT_like', 'parent=amod_kitten', 'parent=cc_cute', 'parent=conj_cute', 'parent=conj_nice', 'parent=dobj_like', 'parent=nsubj_like', 'parent=punct_cute', 'parent=punct_nice', 'right-bigram=,_and', 'right-bigram=,_cute', 'right-bigram=NONE', 'right-bigram=and_fluffy', 'right-bigram=cute_,', 'right-bigram=fluffy_kittens', 'right-bigram=like_nice', 'right-bigram=nice_,', 'word-1=,', 'word-1=I', 'word-1=NONE', 'word-1=and', 'word-1=cute', 'word-1=fluffy', 'word-1=like', 'word-1=nice']


In [22]:
new_text = nlp("Mary called Peter.")
dict_vec.transform(extract_features(new_text)).toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])