In [1]:
import spacy
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

This is a notebook that reproduces the code in this article: https://pub.towardsai.net/nlp-zero-to-hero-with-python-2df6fcebff6e

Required packages: Spacy and NLTK.

To download the English language library for spacy:

`python -m spacy download en`

In [2]:
# Loading the english language library
load_en = spacy.load('en_core_web_sm')

In [3]:
# Word tokenisation
string = 'I\'m going to meet M.S. Dhoni.'
words = load_en(string)
for tokens in words:
    print(tokens.text)

I
'm
going
to
meet
M.S.
Dhoni
.


In [4]:
# Porter Stemming - reducing words to their root
pot_stem = PorterStemmer()
words = ['happy', 'happier', 'happiest', 'happiness', 'breathing', 'fairly']
for word in words:
    print(word + '----->' + pot_stem.stem(word))

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fairli


In [5]:
# Snowball Stemming
snow_stem = SnowballStemmer(language='english')
for word in words:
    print(word + '----->' + snow_stem.stem(word))

happy----->happi
happier----->happier
happiest----->happiest
happiness----->happi
breathing----->breath
fairly----->fair


In [6]:
# Lemmatisation (same objective as stemming, but using morphological analysis instead of heuristics)
string = load_en('I\'m happy in this happiest place with all happiness. It feels how happier we are')
for lem_word in string:
    print(lem_word.text, '\t', lem_word.pos_, '\t', lem_word.lemma, '\t', lem_word.lemma_)

I 	 PRON 	 4690420944186131903 	 I
'm 	 VERB 	 10382539506755952630 	 be
happy 	 ADJ 	 244022080605231780 	 happy
in 	 ADP 	 3002984154512732771 	 in
this 	 DET 	 1995909169258310477 	 this
happiest 	 ADJ 	 244022080605231780 	 happy
place 	 NOUN 	 7512738811199700769 	 place
with 	 ADP 	 12510949447758279278 	 with
all 	 DET 	 13409319323822384369 	 all
happiness 	 NOUN 	 2779265004918961325 	 happiness
. 	 PUNCT 	 12646065887601541794 	 .
It 	 PRON 	 10239237003504588839 	 it
feels 	 VERB 	 5741770584995928333 	 feel
how 	 ADV 	 16331095434822636218 	 how
happier 	 ADJ 	 244022080605231780 	 happy
we 	 PRON 	 16064069575701507746 	 we
are 	 AUX 	 10382539506755952630 	 be


In [7]:
# default english stop words in Spacy
print(load_en.Defaults.stop_words)

{'whereafter', 'over', 'wherein', 'also', 'about', 'anyone', 'former', 'per', 'make', 'he', 'others', 'still', 'into', 'back', 'nevertheless', 'always', 'so', 'his', 'we', 'myself', 'except', 'by', 'get', 'ca', 'during', 'some', 'it', 'last', 'sometime', 'three', 'these', 'their', 'sixty', 'if', 'something', 'amount', 'off', 'too', 'else', 'anywhere', 'thru', '’d', 'how', 'thereafter', 'enough', 'twelve', 'there', 'becoming', 'show', 'at', 'already', 'to', 'much', '‘re', 'rather', 'him', 'what', 'herself', 'thereby', 'keep', 'call', 'himself', 'sometimes', 'seem', '‘d', 'or', 'would', 'toward', 'without', 'here', 'have', 'does', 'least', 'almost', '‘ll', 'across', 'upon', 'an', 'four', 'one', 'its', 'other', 'has', 'hereafter', 'be', 'hundred', 'such', 'behind', 'however', 'third', 'somehow', "'m", 'serious', 'around', 'seems', 'throughout', '‘s', 'fifty', 'your', 'latterly', 'out', 'mostly', 'for', 'fifteen', 'then', 'via', 'beyond', 'next', 'may', 'although', 'go', 'might', 'n‘t', 'b

In [8]:
# Part of speech analysis
string = load_en('This is an example of Python code')

In [9]:
print(string[3])
# Coarse tag
print(string[3].pos_)
# Fine tag
print(string[3].tag_)

example
NOUN
NN


In [10]:
# Counting by coarse tag
string.count_by(spacy.attrs.POS)

{90: 2, 87: 1, 92: 2, 85: 1, 96: 1}

In [11]:
# The keys in the dictionary above are tag ids, and the values are the counts for each tag
# What tag is tag 90? (we have two of those)
string.vocab[90].text

'DET'

In [12]:
# Named entity recognition
string = load_en('Lewis, who lived in England by that time, worked for the Department of Work and Pensions')
if string.ents:
    for ner in string.ents:
        print(ner.text + ' - '+ ner.label_ + ' - ' + 
               str(spacy.explain(ner.label_)))
else:
    print('No Entity Found')

England - GPE - Countries, cities, states
the Department of Work and Pensions - ORG - Companies, agencies, institutions, etc.
