# Spacy library examples

https://spacy.io/

In [1]:
import os
import sys
import logging
import warnings
import copy
import re
import json
import itertools
from operator import attrgetter
import inspect
import collections
import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from spacy.lang.en import English
import en_core_web_sm

from IPython.display import display, HTML

warnings.filterwarnings('ignore')
np.random.seed(42)

In [2]:
def as_table(obj_iter, variables):
    dct = { var:[] for var in variables}
    for obj in obj_iter:
        for var in variables:
            attr = attrgetter(var)(obj)
            if (not isinstance(attr, str)) and isinstance(attr, collections.Iterable):
                attr = list(attr)
            dct[var].append(attr)
    return pd.DataFrame(data=dct, columns=variables)

In [3]:
# Load the english model
# Disabling the 'tagger' will result in different lemmatization
nlp = spacy.load('en')
# Alternatives
# nlp = spacy.load('en', disable=['parser', 'ner'])
# nlp = en_core_web_sm.load()
# nlp = English()
print('NLP pipeline: ', nlp.pipe_names)

doc = nlp(u"The company Apple is looking at buying U.K. startups for $1 billion. I'm not.")

print('doc: ', doc)

df_tokens = as_table(
    doc, ['text', 'lemma_', 'pos_', 'norm_', 'tag_', 'dep_', 'children', 'shape_', 'is_alpha', 'is_stop'])
display(df_tokens)
print([t.lemma_ for t in doc])

displacy.render(doc, style='dep', jupyter=True)

NLP pipeline:  ['tagger', 'parser', 'ner']
doc:  The company Apple is looking at buying U.K. startups for $1 billion. I'm not.


Unnamed: 0,text,lemma_,pos_,norm_,tag_,dep_,children,shape_,is_alpha,is_stop
0,The,the,DET,the,DT,det,[],Xxx,True,False
1,company,company,NOUN,company,NN,compound,[The],xxxx,True,False
2,Apple,apple,PROPN,apple,NNP,nsubj,[company],Xxxxx,True,False
3,is,be,VERB,is,VBZ,aux,[],xx,True,True
4,looking,look,VERB,looking,VBG,ROOT,"[Apple, is, at, .]",xxxx,True,False
5,at,at,ADP,at,IN,prep,[buying],xx,True,True
6,buying,buy,VERB,buying,VBG,pcomp,"[startups, for]",xxxx,True,False
7,U.K.,u.k.,PROPN,u.k.,NNP,compound,[],X.X.,False,False
8,startups,startup,NOUN,startups,NNS,dobj,[U.K.],xxxx,True,False
9,for,for,ADP,for,IN,prep,[billion],xxx,True,True


['the', 'company', 'apple', 'be', 'look', 'at', 'buy', 'u.k.', 'startup', 'for', '$', '1', 'billion', '.', '-PRON-', 'be', 'not', '.']


  "__main__", mod_spec)
  "__main__", mod_spec)


In [4]:
df_ents = as_table(
    doc.ents, ['text', 'start_char', 'end_char', 'label_'])
display(df_ents)

displacy.render(doc, style='ent', jupyter=True)

Unnamed: 0,text,start_char,end_char,label_
0,Apple,12,17,ORG
1,U.K.,39,43,GPE
2,$1 billion,57,67,MONEY


In [5]:
df_np = as_table(
    doc.noun_chunks, ['text', 'root.text', 'root.dep_', 'root.head.text'])
display(df_np)

Unnamed: 0,text,root.text,root.dep_,root.head.text
0,The company Apple,Apple,nsubj,looking
1,U.K. startups,startups,dobj,buying
2,I,I,nsubj,'m


In [6]:
df_sent = as_table(
    doc.sents, ['text'])
display(df_sent)

df_sent = as_table(
    (word for sent in doc.sents for word in sent), ['text', 'orth_', 'tag_', 'head.i', 'dep_'])
display(df_sent)

Unnamed: 0,text
0,The company Apple is looking at buying U.K. st...
1,I'm not.


Unnamed: 0,text,orth_,tag_,head.i,dep_
0,The,The,DT,1,det
1,company,company,NN,2,compound
2,Apple,Apple,NNP,4,nsubj
3,is,is,VBZ,4,aux
4,looking,looking,VBG,4,ROOT
5,at,at,IN,4,prep
6,buying,buying,VBG,5,pcomp
7,U.K.,U.K.,NNP,8,compound
8,startups,startups,NNS,6,dobj
9,for,for,IN,6,prep


In [7]:
print(spacy.about.__version__)
s = u"The company Apple is looking at buying U.K. startups for $1 billion. I'm not."

nlp = spacy.load('en', disable=['parser', 'ner'])
doc = nlp(s)
print('With tagger: ', [t.lemma_ for t in doc])  # Lemma's are lowercase

nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
doc = nlp(s)
print('Without tagger: ', [t.lemma_ for t in doc])  # Lemma's captial letters are kept

2.0.5
With tagger:  ['the', 'company', 'apple', 'be', 'look', 'at', 'buy', 'u.k.', 'startup', 'for', '$', '1', 'billion', '.', '-PRON-', 'be', 'not', '.']
Without tagger:  ['The', 'company', 'Apple', 'be', 'look', 'at', 'buy', 'U.K.', 'startups', 'for', '$', '1', 'billion', '.', '-PRON-', 'be', 'not', '.']


In [8]:
print(spacy.about.__version__)
nlp = spacy.load('en')
for s in ["The store", "the store"]:
    doc = nlp(s)
    print('\n{}'.format(s))
    for t in doc:
        print('{}\t{}'.format(t.text, t.is_stop))

2.0.5

The store
The	False
store	False

the store
the	True
store	False


In [9]:
from spacy.lang.en.stop_words import STOP_WORDS
print(STOP_WORDS)

{'an', 'mine', 'since', 'because', 'rather', 'afterwards', 'into', 'regarding', 'always', 'our', 'upon', 'wherein', 'much', 'along', 'yet', 'seeming', 'enough', 'about', 'name', 'several', 'them', 'until', 'whereupon', 'now', 'toward', 'back', 'cannot', 'well', 'whence', 'when', 'towards', 'even', 'why', 'might', 'quite', 'hereupon', 'perhaps', 'give', 'becomes', 'full', 'nowhere', 'being', 'i', 'front', 'per', 'which', 'with', 'neither', 'around', 'against', 'his', 'from', 'three', 'both', 'were', 'hundred', 'ca', 'nobody', 'below', 'hereby', 'whenever', 'five', 'almost', 'no', 'also', 'forty', 'they', 'whose', 'across', 'have', 'be', 'mostly', 'alone', 'here', 'you', 'please', 'latter', 'less', 'whatever', 'fifteen', 'beside', 'at', 'these', 'often', 'either', 'under', 'its', 'unless', 'before', 'few', 'itself', 'seemed', 'yourself', 'more', 'put', 'ten', 'throughout', 'and', 'than', 'noone', 'elsewhere', 'was', 'we', 'between', 'who', 'among', 'except', 'seem', 'sixty', 'each', 'tho