In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
text = "The rain in Spain falls mainly on the plain."

In [4]:
doc = nlp(text)

In [5]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_stop)

The the DET True
rain rain NOUN False
in in ADP True
Spain Spain PROPN False
falls fall VERB False
mainly mainly ADV False
on on ADP True
the the DET True
plain plain NOUN False
. . PUNCT False


Created a doc from the text. This doc -- nlp(text) -- is a container for a document and all of its annotations. We then iterated through the doc to see what spaCy parsed. Would be helpful to have this as a df.

In [6]:
import pandas as pd

In [7]:
cols = ("text", "lemma", "POS", "explain", "stopword")

rows = []

for token in doc:
    row = [token.text, # raw text
           token.lemma_, # a root form of the word
           token.pos_, # determines part of speech of the lemma
           spacy.explain(token.pos_), # explains pos 
           token.is_stop # flag for whether the word is a stopword = a common word to filter out
          ]
    rows.append(row)

df = pd.DataFrame(rows, columns = cols)

df

Unnamed: 0,text,lemma,POS,explain,stopword
0,The,the,DET,determiner,True
1,rain,rain,NOUN,noun,False
2,in,in,ADP,adposition,True
3,Spain,Spain,PROPN,proper noun,False
4,falls,fall,VERB,verb,False
5,mainly,mainly,ADV,adverb,False
6,on,on,ADP,adposition,True
7,the,the,DET,determiner,True
8,plain,plain,NOUN,noun,False
9,.,.,PUNCT,punctuation,False


In [8]:
from spacy import displacy

In [9]:
## visualze the parse tree for the doc
displacy.render(doc, style = "dep")

Handling multiple sentences now.

In [10]:
text = "We were all out at the zoo one day, I was doing some acting, walking on the railing of the gorilla exhibit. I fell in! Everyone screamed and Tommy jumped in after me, forgetting that he had blueberries in his front pocket... The gorillas just went wild."

doc = nlp(text)

# can individually access the sentences
# they are broken up by punctuation?
for sentence in doc.sents:
    print(">", sentence)

> We were all out at the zoo one day, I was doing some acting, walking on the railing of the gorilla exhibit.
> I fell in!
> Everyone screamed and Tommy jumped in after me, forgetting that he had blueberries in his front pocket...
> The gorillas just went wild.


When spaCy creates a document, it uses a principle of non-destructive tokenization meaning that the tokens, sentences, etc., are simply indexes into a long array. In other words, they don't carve the text stream into little pieces. So each sentence is a span with a start and an end index into the document:

In [11]:
for sentence in doc.sents:
    print(">", sentence.start, sentence.end)

> 0 25
> 25 29
> 29 48
> 48 54


In [12]:
# so we can index into the doc array to pull the toens for one sentence:
doc[48:54]

The gorillas just went wild.

In [13]:
# or index into a specific token, such as the verb `went` in the last sentence:
tok = doc[51]
print(tok.text, tok.lemma_, tok.pos_)


went go VERB


In [14]:
# after parsing a document
# can pull out all the noun chunks

text = "Steve Jobs and Steve Wozniak incorporated Apple Computer on January 3, 1977, in Cupertino, California."
doc = nlp(text)

for chunk in doc.noun_chunks:
    print(chunk.text)

Steve Jobs
Steve Wozniak
Apple Computer
January
Cupertino
California


In [15]:
# can take this further and identify named entities = proper nouns within the text
for entity in doc.ents:
    print(entity.text, entity.label_)

# GPE = Geopolitical entity 

Steve Jobs PERSON
Steve Wozniak PERSON
Apple Computer ORG
January 3, 1977 DATE
Cupertino GPE
California GPE


In [16]:
# can visualize named entities

displacy.render(doc, style="ent")

More generally, one can link lemmas to resouces that describe their meanings like went -> go. Use WordNet.

In [17]:
import nltk

In [18]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /Users/Nida/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# nlp.pipe_names
# nlp.add_pipe('WordnetAnnotator')

spaCy runs as a pipeline and allows means for customizing parts of the pipeline in use. Next we will add the WordnetAnnotator from the `spacy-wordnet` project. 

In [19]:
from spacy_wordnet.wordnet_annotator import WordnetAnnotator


In [20]:
nlp = spacy.load("en_core_web_sm")


In [21]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [None]:
# @Language.factory('spacywordnet')
# def spacy_wordnet(nlp, name):
#     return WordnetAnnotator()

# #nlp.max_length = 2000000
# nlp.add_pipe('spacy_wordnet', after='tagger')

In [22]:
token = nlp("withdraw")[0]

In [None]:
token._.

In [23]:
token._.wordnet.synsets()

AttributeError: [E046] Can't retrieve unregistered extension attribute 'wordnet'. Did you forget to call the `set_extension` method?

In [None]:
token._.wordnet.lemmas()

In [None]:
token._.wordnet.wordnet_domains()

In [None]:
import scattertext as st

In [None]:

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat', category_name='Democratic', not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank
)
open('./demo_compact.html', 'w').write(html)


Generates an html file that views the data as a scattertext plot.

In [24]:
print(spacy.__version__)

3.0.6
