In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk

In [2]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4 MB)
[K     |████████████████████████████████| 96.4 MB 1.2 MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-py3-none-any.whl size=98051301 sha256=e51905d6cedca70d4acdb897ed9034a4c760da2d12b6bf935ec22a0485e7f56d
  Stored in directory: /tmp/pip-ephem-wheel-cache-am5v9le8/wheels/69/c5/b8/4f1c029d89238734311b3269762ab2ee325a42da2ce8edb997
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [3]:
s1 = "I like apples & bananas too!"

In [4]:
# Load the small English pipeline; every example below runs through this `nlp` object.
# NOTE(review): the download cell above fetches en_core_web_md, not en_core_web_sm.
# This line presumably relies on the small model being preinstalled in the
# environment — confirm, or download it explicitly.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Loading a model other than the small one: the installed model package is
# imported as a regular module and loaded through its own load() function.
# NOTE(review): nlp2 is not referenced by any later cell visible in this notebook.
import en_core_web_md
nlp2 = en_core_web_md.load()

# Tokenization:

In [6]:
doc = nlp(s1)
for token in doc:
  print(token)

I
like
apples
&
bananas
too
!


In [7]:
doc[1:5]

like apples & bananas

In [8]:
print(type(doc))
len(doc)

<class 'spacy.tokens.doc.Doc'>


7

# Stemming & Lemmatization:
### spaCy does not ship a stemmer; it offers <u>lemmatization</u> instead, which reduces words to real dictionary forms. The stemming examples below therefore use NLTK.

In [9]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [10]:
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']

In [11]:
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language='english')

In [12]:
# Porter stemmer: show every sample word next to the stem it is reduced to.
for word in words:
  porter_stem = p_stemmer.stem(word)
  print(f"{word} = {porter_stem}")

run = run
runner = runner
running = run
ran = ran
runs = run
easily = easili
fairly = fairli


In [13]:
# Snowball stemmer: same comparison as for the Porter stemmer, word by word.
for word in words:
  snowball_stem = s_stemmer.stem(word)
  print(f"{word} = {snowball_stem}")

run = run
runner = runner
running = run
ran = ran
runs = run
easily = easili
fairly = fair


In [14]:
# Lemmatization with spaCy: pair every token of `doc` with its lemma.
for token in doc:
  lemma = token.lemma_
  print(f"{token} = {lemma}")

I = -PRON-
like = like
apples = apple
& = &
bananas = banana
too = too
! = !


# Vocabulary Matching (Rule/Pattern-Based & Phrase Matching):

In [15]:
# import Matcher library:
# https://spacy.io/usage/rule-based-matching/#matcher
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [16]:
# https://explosion.ai/demos/matcher
# Each pattern is a list of per-token attribute dicts: here, a word followed by punctuation.
# 'LOWER' is compared against the lower-cased token text, so its value must itself
# be lower case; a capitalised value could never match.
pattern_1 = [{'LOWER': 'apples'}, {'IS_PUNCT': True}]
pattern_2 = [{'LOWER': 'bananas'}, {'IS_PUNCT': True}]

In [17]:
# A match rule consists of:
# 1) a key (string ID) naming the rule
# 2) an optional on_match callback (none is passed here)
# 3) one or more patterns, given as a list

matcher.add('Apples & Bananas', [pattern_1, pattern_2])

In [18]:
find_matches = matcher(doc)
find_matches
# Each match is a tuple (match_id, start, end): match_id is resolved back to the
# rule key via nlp.vocab.strings below; start/end are token indices into `doc`
# (used as doc[start:end], so `end` is exclusive).

[(6552546660629211649, 2, 4)]

In [19]:
# Report each match by resolving its hash back to the rule key it was added under.
# `match_id` is used instead of `id` so the builtin id() is not shadowed for the
# rest of the notebook session. `start`/`end` are token offsets into `doc`
# (the matched text would be doc[start:end]); only the rule name is printed here.
for match_id, start, end in find_matches:
  string = nlp.vocab.strings[match_id]
  print(string)

Apples & Bananas


In [20]:
# Remove matching:
matcher.remove("Apples & Bananas")

### Phrase Matching

In [21]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [22]:
phrase_list = ["Barack Obama", "Angela Merkel", "Washington D.C."]

In [23]:
# Convert each phrase into a Doc pattern.
# The PhraseMatcher above is created with its default settings and matches on the
# verbatim token text, so only tokenization is required: make_doc avoids running
# the tagger/parser/NER on every phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]
phrase_patterns

[Barack Obama, Angela Merkel, Washington D.C.]

In [24]:
# Register all phrase patterns under a single key, passing them as a list — the
# same call style already used for the token Matcher earlier in this notebook
# (replaces the legacy `key, None, *patterns` form).
matcher.add("TerminologyList", phrase_patterns)

In [25]:
doc_3 = nlp("German Chancellor Angela Merkel and US President Barack Obama converse in the Oval Office inside the White House in Washington D.C.")

In [26]:
find_matches = matcher(doc_3)
find_matches
# Each match is a tuple (match_id, start, end): match_id is resolved back to the
# rule key via nlp.vocab.strings below; start/end are token indices into `doc_3`
# (used as doc_3[start:end], so `end` is exclusive).

[(3766102292120407359, 2, 4),
 (3766102292120407359, 7, 9),
 (3766102292120407359, 19, 21)]

In [27]:
# Report each match: resolve the hash back to its rule key and slice the matched
# tokens out of the document.
# `match_id` is used instead of `id` so the builtin id() is not shadowed for the
# rest of the notebook session.
for match_id, start, end in find_matches:
  string = nlp.vocab.strings[match_id]
  span = doc_3[start:end]
  print(string, span)

TerminologyList Angela Merkel
TerminologyList Barack Obama
TerminologyList Washington D.C.


# Parts-Of-Speech (POS) Tagging:

In [28]:
# https://spacy.io/usage/linguistic-features#pos-tagging
s2 = "Apple is looking at buying U.K. startup for $1 billion"
doc_4 = nlp(s2)

In [29]:
# For every token show its POS label together with a readable explanation of its tag.
for token in doc_4:
  pos_label = token.pos_
  tag_meaning = spacy.explain(token.tag_)
  print(f"Token: {token} = POS: {pos_label} => {tag_meaning}")

Token: Apple = POS: PROPN => noun, proper singular
Token: is = POS: AUX => verb, 3rd person singular present
Token: looking = POS: VERB => verb, gerund or present participle
Token: at = POS: ADP => conjunction, subordinating or preposition
Token: buying = POS: VERB => verb, gerund or present participle
Token: U.K. = POS: PROPN => noun, proper singular
Token: startup = POS: NOUN => noun, singular or mass
Token: for = POS: ADP => conjunction, subordinating or preposition
Token: $ = POS: SYM => symbol, currency
Token: 1 = POS: NUM => cardinal number
Token: billion = POS: NUM => cardinal number


### Visualizing

In [30]:
# https://spacy.io/usage/visualizers
from spacy import displacy

In [31]:
displacy.render(docs=doc_4, style='dep', jupyter=True, options={'distance': 150})

# Named Entity Recognition:

In [32]:
# https://spacy.io/usage/linguistic-features#named-entities
s2 = "Apple is looking at buying U.K. startup for $1 billion"
doc_4 = nlp(s2)
doc_4.ents

(Apple, U.K., $1 billion)

In [33]:
# List each named entity with its label and a human-readable description of that label.
for ent in doc_4.ents:
  label = ent.label_
  print(ent, label, spacy.explain(label))

Apple ORG Companies, agencies, institutions, etc.
U.K. GPE Countries, cities, states
$1 billion MONEY Monetary values, including unit


In [34]:
# Consider this sentence:
s3 = 'facebook is hiring from the USA'
doc_5 = nlp(s3)

doc_5.ents

(USA,)

In [35]:
# The model did not tag "facebook" (only USA was found above), so the entity is
# added manually.
from spacy.tokens import Span
# Look up the ID the vocabulary's string store holds for the label text 'ORG'.
ORG = doc_5.vocab.strings['ORG']
# Span over token 0 only (start=0, end=1), i.e. the word "facebook", labelled ORG.
new_ent = Span(doc_5, 0, 1, label=ORG)

In [36]:
# Append the new entity to the entities the model already found.
# NOTE(review): this cell is not idempotent — running it a second time would add
# the same span again; presumably spaCy rejects duplicate/overlapping entity
# spans — confirm before re-running.
doc_5.ents = list(doc_5.ents) + [new_ent]

In [37]:
doc_5.ents

(facebook, USA)

### Visualizing NER

In [38]:
from spacy import displacy
displacy.render(docs=doc_5, style='ent', jupyter=True)

In [39]:
# Visualize only specific NERs:
displacy.render(docs=doc_5, style='ent', jupyter=True, options={'ents': ['ORG']})

# Sentence Segmentation:

In [40]:
s1 = "This is a sentence. This is a second sentence. This is the last sentence."
# With the stock pipeline spaCy does NOT start a new sentence after the semicolon (;)
# in s2 — see the segmentation output below; a custom rule is added later to split there.
s2 = "This is a sentence. This is a U.K. sentence; this is the last sentence."

In [41]:
doc_6 = nlp(s1)
doc_7 = nlp(s2)

In [42]:
for sentence in doc_6.sents:
  print(sentence)

This is a sentence.
This is a second sentence.
This is the last sentence.


In [43]:
for sentence in doc_7.sents:
  print(sentence)

This is a sentence.
This is a U.K. sentence; this is the last sentence.


In [44]:
def set_custom_boundaries(doc):
  """Mark the token following every semicolon as the start of a new sentence.

  Used as a pipeline component (the notebook registers it with before='parser').

  Args:
    doc: the document to annotate; it is modified in place.

  Returns:
    The same `doc` object that was passed in.
  """
  # The last token is excluded from the scan because it has no successor to flag.
  semicolon_positions = [tok.i for tok in doc[:-1] if tok.text == ';']
  for position in semicolon_positions:
    doc[position + 1].is_sent_start = True
  return doc

In [45]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [46]:
# Register the custom boundary component so it runs before the parser.
# The guard keeps this cell idempotent: the component is registered under the name
# 'set_custom_boundaries', and adding a second component with an already-used
# name makes add_pipe raise, so re-running the cell would otherwise fail.
if 'set_custom_boundaries' not in nlp.pipe_names:
  nlp.add_pipe(set_custom_boundaries, before='parser')
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [47]:
# Re-process s2 so the document is built by the pipeline that now includes the
# custom boundary component, then list the resulting sentences.
doc_7 = nlp(s2)
for sentence in doc_7.sents:
  print(sentence)

This is a sentence.
This is a U.K. sentence;
this is the last sentence.
