# NLP in python using NLTK

## Tokenize

In [1]:
# here is the string which will be used to demonstrate tokenization by word and sentence

# import the relevant tokenizer from nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

example_string = """Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult."""

In [2]:
# download the punkt tokenizer for sentence tokenization
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# tokenize the string by sentence
sent_tokenize(example_string)

["Muad'Dib learned rapidly because his first training was in how to learn.",
 'And the first lesson of all was the basic trust that he could learn.',
 "It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult."]

In [4]:
len(word_tokenize(example_string))

53

In [5]:
words = word_tokenize(example_string)
sentences = sent_tokenize(example_string)

## Filtering stop words

In [6]:
# download stop words and import relevant library
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kanna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
worf_quote = "Sir, I protest. I am not a merry man!"

In [8]:
words = word_tokenize(worf_quote)
words

['Sir', ',', 'I', 'protest', '.', 'I', 'am', 'not', 'a', 'merry', 'man', '!']

In [9]:
# set stop words to english
stop_words = set(stopwords.words('english'))

filtered_words = []

In [10]:
for word in words:
    if word.casefold() not in stop_words:
        filtered_words.append(word)

filtered_words

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

In [11]:
# alternative using list comprehension
[word for word in words if word.casefold() not in stop_words]

['Sir', ',', 'protest', '.', 'merry', 'man', '!']

## stemming

In [12]:
#import the stemmer
from nltk.stem import PorterStemmer

In [13]:
stemmer = PorterStemmer()

In [14]:
string_for_stemming = """The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do."""


In [15]:
words = word_tokenize(string_for_stemming)

In [16]:
[stemmer.stem(word) for word in words]

['the',
 'crew',
 'of',
 'the',
 'uss',
 'discoveri',
 'discov',
 'mani',
 'discoveri',
 '.',
 'discov',
 'is',
 'what',
 'explor',
 'do',
 '.']

## Tag POS

In [17]:
sagan_quote = "If you wish to make an apple pie from scratch, you must first invent the universe."


In [18]:
words = word_tokenize(sagan_quote)

In [19]:
# download required libraries for pos tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kanna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\kanna\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [20]:
# tag the words
nltk.pos_tag(words)

[('If', 'IN'),
 ('you', 'PRP'),
 ('wish', 'VBP'),
 ('to', 'TO'),
 ('make', 'VB'),
 ('an', 'DT'),
 ('apple', 'NN'),
 ('pie', 'NN'),
 ('from', 'IN'),
 ('scratch', 'NN'),
 (',', ','),
 ('you', 'PRP'),
 ('must', 'MD'),
 ('first', 'VB'),
 ('invent', 'VB'),
 ('the', 'DT'),
 ('universe', 'NN'),
 ('.', '.')]

In [21]:
# look up the meaning of the tags
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [22]:
jabberwocky_excerpt = """'Twas brillig, and the slithy toves did gyre and gimble in the wabe: all mimsy were the borogoves, and the mome raths outgrabe."""

In [23]:
nltk.pos_tag(word_tokenize(jabberwocky_excerpt))

[("'T", 'NN'),
 ('was', 'VBD'),
 ('brillig', 'VBN'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('slithy', 'JJ'),
 ('toves', 'NNS'),
 ('did', 'VBD'),
 ('gyre', 'NN'),
 ('and', 'CC'),
 ('gimble', 'JJ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('wabe', 'NN'),
 (':', ':'),
 ('all', 'DT'),
 ('mimsy', 'NNS'),
 ('were', 'VBD'),
 ('the', 'DT'),
 ('borogoves', 'NNS'),
 (',', ','),
 ('and', 'CC'),
 ('the', 'DT'),
 ('mome', 'JJ'),
 ('raths', 'NNS'),
 ('outgrabe', 'RB'),
 ('.', '.')]

## lematize

In [24]:
# download the wordnet library for lemmatization
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kanna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
lematizer = WordNetLemmatizer()

In [26]:
lematizer.lemmatize('scarves')

'scarf'

In [27]:
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [28]:
words = word_tokenize(string_for_lemmatizing)

In [29]:
[lematizer.lemmatize(word) for word in words]

['The', 'friend', 'of', 'DeSoto', 'love', 'scarf', '.']

In [30]:
# what happens when we try to lematize a word that is very different from its base for example, the word "worst"
lematizer.lemmatize('worst')

'worst'

In [31]:
# we can use the pos parameter to specify the part of speech of the word
lematizer.lemmatize('worst', pos='a')

'bad'

In [32]:
# however some words such as best do not have a different form when they are adjectives
lematizer.lemmatize('best', pos='a')

'best'

## chunking

In [33]:
lotr_quote = "It's a dangerous business, Frodo, going out your door."


In [34]:
words = word_tokenize(lotr_quote)

In [35]:
# pos tag the words
pos_tag_words = nltk.pos_tag(words)

In [36]:
# regular expression for chunking
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

In [37]:
# create a parser
chunk_parser = nltk.RegexpParser(chunk_grammar)

In [41]:
# create a tree
chunked = chunk_parser.parse(pos_tag_words)

In [43]:
chunked

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [('It', 'PRP'), ("'s", 'VBZ'), Tree('NP', [('a', 'DT'), ('dangerous', 'JJ'), ('business', 'NN')]), (',', ','), ('Frodo', 'NNP'), (',', ','), ('going', 'VBG'), ('out', 'RP'), ('your', 'PRP$'), Tree('NP', [('door', 'NN')]), ('.', '.')])