# Intro to spaCy

Generally, attributes whose names end with an underscore return strings; the same attributes without the underscore return an ID.

In [2]:
from spacy.lang.en import English

In [3]:
# Create the nlp object by instantiating the English language class.
# The cells below call nlp on a string to obtain a Doc.
nlp = English()

In [10]:
# Hello-world example: run a short greeting through the nlp object
greeting = "Hello world!"
doc = nlp(greeting)

# A Doc can be iterated token by token; print each token's text on its own line
for tok in doc:
    print(tok.text)

Hello
world
!


## Token Slicing & Subsetting

**Tokens**

In [11]:
# Token indexing

# Index into the Doc to get a single Token (positions start at 0, so 1 is the second token)
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

world


**Spans**

The example below selects the tokens from position 1 up to, but not including, position 4.

In [12]:
# A slice from the Doc is a Span object
# format is doc[start:end]; the end position is not included
# NOTE: the "Hello world!" Doc has only three tokens (positions 0-2), so this
# slice simply runs to the last token and the printed text is "world!"
span = doc[1:4]

# Get the span text via the .text attribute
print(span.text)

world!


## Token attributes

- `i` gives the index
- `text` gives the *token* text
- `is_alpha` - boolean; if alphabetic
- `is_punct` - boolean; if punctuation
- `like_num` - boolean; if the token resembles a number. Works for "ten" or "10".

In [6]:
# Process a short sentence containing words, a currency symbol, a digit and punctuation
doc = nlp("It costs $5.")

# Gather each attribute across all tokens before printing
indices = [tok.i for tok in doc]
texts = [tok.text for tok in doc]
alpha_flags = [tok.is_alpha for tok in doc]
punct_flags = [tok.is_punct for tok in doc]
num_flags = [tok.like_num for tok in doc]

# Position and text of every token
print('Index:   ', indices)
print('Text:    ', texts)

# Boolean flags, one entry per token
print('is_alpha:', alpha_flags)
print('is_punct:', punct_flags)
print('like_num:', num_flags)

Index:    [0, 1, 2, 3, 4]
Text:     ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


## Example - Find all % in the doc

In [7]:
from spacy.lang.en import English

nlp = English()

# Input text, written as two adjacent literals that Python joins into one string
text = (
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Process the text
doc = nlp(text)

In [8]:
# Report every number-like token that is immediately followed by a "%" token.
for token in doc:
    # Skip tokens that do not resemble a number
    if not token.like_num:
        continue
    # Position of the token that follows this one
    next_index = token.i + 1
    # A number-like token at the very end of the Doc has no successor;
    # indexing past the end would raise IndexError, so skip it.
    if next_index >= len(doc):
        continue
    # Check if the next token's text equals '%'
    if doc[next_index].text == "%":
        print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


# More Examples

**Print the 'doc'**

In [16]:
# Import the English language class
from spacy.lang.en import English

# Build the nlp object
nlp = English()

# Run a sample sentence through nlp to obtain a Doc
sentence = "This is a sentence."
doc = nlp(sentence)

# Print the document text via the .text attribute
print(doc.text)

This is a sentence.


**More span/slicing examples**

In [17]:
# Process the text
sentence = "I like tree kangaroos and narwhals."
doc = nlp(sentence)

# Take both slices of the Doc first (the end position is not included)
tree_kangaroos = doc[2:4]  # "tree kangaroos"
tree_kangaroos_and_narwhals = doc[2:6]  # "tree kangaroos and narwhals" (without the ".")

# Print the text of each slice, shorter one first
for phrase in (tree_kangaroos, tree_kangaroos_and_narwhals):
    print(phrase.text)

tree kangaroos
tree kangaroos and narwhals


## Tokenize by sentences

In [20]:
import spacy
from spacy.lang.en import English

nlp = English()

# Add the 'sentencizer' component to the pipeline.
# spaCy 3 changed add_pipe to take the component's registered string name,
# while spaCy 2 expects a component object built with create_pipe, so use
# the call that matches the installed major version.
if int(spacy.__version__.split(".")[0]) >= 3:
    nlp.add_pipe('sentencizer')
else:
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

doc = nlp("""Senegal retained top spot on the continent, moving up two places to reach 20th in the world - their best ever ranking. 
Nigeria, who won bronze in Egypt, went up 12 places to 33 on the global list and third in Africa. Tunisia, the other semi-finalists at the Nations Cup, were second in Africa, behind Senegal, but moved down four places to 29th in the world. 
Surprise quarter-finalists Madagascar were rewarded for their impressive run in Egypt, moving up 12 places to 96th overall. 
Benin - who knocked out Morocco in the last-16 - went up six places to 82nd in the world with Morocco also going up six places to 41st in the world and fifth in Africa.
Nations Cup hosts Egypt went up nine spots to make the top 50, moving up to 49th overall. 
Ghana are just below the Pharaohs in 7th on the African list having maintained their position of 50th in the world.""")

# Create a list holding the text of every sentence in the Doc
sents_list = [sent.text for sent in doc.sents]
print(sents_list)

['Senegal retained top spot on the continent, moving up two places to reach 20th in the world - their best ever ranking.', '\nNigeria, who won bronze in Egypt, went up 12 places to 33 on the global list and third in Africa.', 'Tunisia, the other semi-finalists at the Nations Cup, were second in Africa, behind Senegal, but moved down four places to 29th in the world.', '\nSurprise quarter-finalists Madagascar were rewarded for their impressive run in Egypt, moving up 12 places to 96th overall.', '\nBenin - who knocked out Morocco in the last-16 - went up six places to 82nd in the world with Morocco also going up six places to 41st in the world and fifth in Africa.', '\nNations Cup hosts Egypt went up nine spots to make the top 50, moving up to 49th overall.', '\nGhana are just below the Pharaohs in 7th on the African list having maintained their position of 50th in the world.']


In [21]:
# The easiest solution: materialise doc.sents directly into a list.
# Unlike sents_list in the previous cell, the elements here are the sentence
# objects themselves rather than their .text strings.
list(doc.sents)

[Senegal retained top spot on the continent, moving up two places to reach 20th in the world - their best ever ranking.,
 
 Nigeria, who won bronze in Egypt, went up 12 places to 33 on the global list and third in Africa.,
 Tunisia, the other semi-finalists at the Nations Cup, were second in Africa, behind Senegal, but moved down four places to 29th in the world.,
 
 Surprise quarter-finalists Madagascar were rewarded for their impressive run in Egypt, moving up 12 places to 96th overall.,
 
 Benin - who knocked out Morocco in the last-16 - went up six places to 82nd in the world with Morocco also going up six places to 41st in the world and fifth in Africa.,
 
 Nations Cup hosts Egypt went up nine spots to make the top 50, moving up to 49th overall.,
 
 Ghana are just below the Pharaohs in 7th on the African list having maintained their position of 50th in the world.]

## Remove Stopwords

In [7]:
#Stop words
#importing stop words from English language.
import spacy
# Import the collection by its full module path: a bare `import spacy` does not
# by itself guarantee that the spacy.lang.en.stop_words submodule is loaded,
# so attribute access through `spacy.lang...` only works if an earlier cell
# happened to import it.
from spacy.lang.en.stop_words import STOP_WORDS

spacy_stopwords = STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
# NOTE(review): if STOP_WORDS is an unordered set, which ten words appear
# here can differ between kernel sessions.
print('First ten stop words: %s' % list(spacy_stopwords)[:10])

Number of stop words: 326
First ten stop words: ['used', 'more', 'take', 'the', 'herein', 'get', 'to', 'me', 'that', 'until']


In [8]:
# Remove Stop words
from spacy.lang.en.stop_words import STOP_WORDS

# Keep only the tokens of `doc` (processed in an earlier cell) that are not
# flagged as stop words; the list holds the tokens themselves, not strings.
filtered_sent = [word for word in doc if not word.is_stop]
print("Filtered Sentence:",filtered_sent)

Filtered Sentence: [Senegal, retained, spot, continent, ,, moving, places, reach, 20th, world, -, best, ranking, ., 
, Nigeria, ,, won, bronze, Egypt, ,, went, 12, places, 33, global, list, Africa, ., Tunisia, ,, semi, -, finalists, 
, Nations, Cup, ,, second, Africa, ,, Senegal, ,, moved, places, 29th, world, ., Surprise, 
, quarter, -, finalists, Madagascar, rewarded, impressive, run, Egypt, ,, moving, 12, places, 96th, overall, ., Benin, -, knocked, 
, Morocco, last-16, -, went, places, 82nd, world, Morocco, going, places, 41st, world, 
, fifth, Africa, ., Nations, Cup, hosts, Egypt, went, spots, 50, ,, moving, 49th, overall, ., Ghana, 
, Pharaohs, 7th, African, list, having, maintained, position, 50th, world, .]


## displaCy

Nice visual representation of NER (`style="ent"`), syntactic dependencies (`style="dep"`), etc.

In [22]:
import spacy

# Load the small English model
nlp = spacy.load('en_core_web_sm')

# News passage used as input for the entity visualisation
article = """Senegal retained top spot on the continent, moving up two places to reach 20th in the world - their best ever ranking. 
Nigeria, who won bronze in Egypt, went up 12 places to 33 on the global list and third in Africa. Tunisia, the other semi-finalists at the Nations Cup, were second in Africa, behind Senegal, but moved down four places to 29th in the world. 
Surprise quarter-finalists Madagascar were rewarded for their impressive run in Egypt, moving up 12 places to 96th overall. 
Benin - who knocked out Morocco in the last-16 - went up six places to 82nd in the world with Morocco also going up six places to 41st in the world and fifth in Africa.
Nations Cup hosts Egypt went up nine spots to make the top 50, moving up to 49th overall. 
Ghana are just below the Pharaohs in 7th on the African list having maintained their position of 50th in the world."""
doc = nlp(article)

# Render the entity ("ent") view inline in the notebook
spacy.displacy.render(doc, style="ent", jupyter=True)

In [16]:
# Process a short sentence with the nlp object defined in an earlier cell
sentence = "This is a sentence."
doc = nlp(sentence)

# Render the dependency ("dep") view inline in the notebook
spacy.displacy.render(doc, style="dep", jupyter=True)