# NLP in Python using NLTK

## Tokenize

In [None]:
# import nltk and its word- and sentence-level tokenizers
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# the string used to demonstrate tokenization by word and by sentence
example_string = """Muad'Dib learned rapidly because his first training was in how to learn. And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn, and how many more believe learning to be difficult."""

In [None]:
# download the Punkt tokenizer data used by word_tokenize / sent_tokenize;
# newer NLTK releases load it from 'punkt_tab' instead of the pickled 'punkt',
# so fetch both to keep this notebook working across NLTK versions
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# tokenize the example string by sentence; the result is the cell's output
sent_tokenize(example_string)

In [None]:
# count the tokens produced by word-level tokenization of the example string
len(word_tokenize(example_string))

In [None]:
# keep both tokenizations of the example string: by sentence and by word
sentences = sent_tokenize(example_string)
words = word_tokenize(example_string)

## Filtering stop words

In [None]:
# download the stop words corpus, then import it from nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# sample sentence used to demonstrate stop word filtering
worf_quote = "Sir, I protest. I am not a merry man!"

In [None]:
# tokenize the quote by word and display the resulting tokens
words = word_tokenize(worf_quote)
words

In [None]:
# build a set of the English stop words (used for membership tests below)
stop_words = set(stopwords.words('english'))

# list that will collect the tokens that are not stop words
filtered_words = []

In [None]:
# start from an empty list every time this cell runs; otherwise re-running
# the cell would append the same tokens again and duplicate the results
filtered_words = []

# keep each token whose case-folded form is not a stop word
# (casefold() makes the comparison case-insensitive for capitalized tokens)
for word in words:
    if word.casefold() not in stop_words:
        filtered_words.append(word)

# display the tokens that survived the filter
filtered_words

In [None]:
# alternative using a list comprehension: same filter as the loop above,
# but the result is only displayed, not stored in a variable
[word for word in words if word.casefold() not in stop_words]

## Stemming

In [None]:
#import the stemmer
from nltk.stem import PorterStemmer

In [None]:
# create a Porter stemmer instance to apply to each token
stemmer = PorterStemmer()

In [None]:
# sample text containing several word forms that start with "discover"
string_for_stemming = """The crew of the USS Discovery discovered many discoveries. Discovering is what explorers do."""


In [None]:
# tokenize the sample text by word
words = word_tokenize(string_for_stemming)

In [None]:
# stem every token; the resulting list is the cell's output
[stemmer.stem(word) for word in words]

## Tag POS

In [None]:
# sample sentence used to demonstrate part-of-speech (POS) tagging
sagan_quote = "If you wish to make an apple pie from scratch, you must first invent the universe."


In [None]:
# tokenize the quote by word
words = word_tokenize(sagan_quote)

In [None]:
# download the resources needed for POS tagging and for looking up tag meanings;
# newer NLTK releases look for 'averaged_perceptron_tagger_eng' and
# 'tagsets_json' instead of the pickled 'averaged_perceptron_tagger' and
# 'tagsets', so fetch both variants to work across NLTK versions
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('tagsets')
nltk.download('tagsets_json')

In [None]:
# tag each token with its part of speech
nltk.pos_tag(words)

In [None]:
# look up the meaning of a tag in the Penn Treebank tag set (here: NNP)
nltk.help.upenn_tagset('NNP')

In [None]:
# excerpt containing invented words (e.g. "slithy", "toves") to try the tagger on
jabberwocky_excerpt = """'Twas brillig, and the slithy toves did gyre and gimble in the wabe: all mimsy were the borogoves, and the mome raths outgrabe."""

In [None]:
# tokenize the excerpt by word and POS-tag the tokens in one step
nltk.pos_tag(word_tokenize(jabberwocky_excerpt))

## Lemmatize

In [None]:
# download the WordNet corpus for lemmatization
nltk.download('wordnet')

# import the WordNet-based lemmatizer
from nltk.stem import WordNetLemmatizer

In [None]:
# create a lemmatizer instance to reuse in the cells below
lematizer = WordNetLemmatizer()

In [None]:
# lemmatize a single plural word
lematizer.lemmatize('scarves')

In [None]:
# sample sentence used to demonstrate lemmatizing a whole list of tokens
string_for_lemmatizing = "The friends of DeSoto love scarves."

In [None]:
# tokenize the sample sentence by word
words = word_tokenize(string_for_lemmatizing)

In [None]:
# lemmatize every token; no pos argument is passed, so the lemmatizer's default is used
[lematizer.lemmatize(word) for word in words]

In [None]:
# what happens when we try to lemmatize a word that looks very different
# from its base form, for example the word "worst"?
lematizer.lemmatize('worst')

In [None]:
# we can use the pos parameter to specify the word's part of speech
# (here 'a', i.e. treat "worst" as an adjective)
lematizer.lemmatize('worst', pos='a')

In [None]:
# however, some words, such as "best", do not have a different form
# when they are treated as adjectives
lematizer.lemmatize('best', pos='a')

## Chunking

In [None]:
# sample sentence used to demonstrate chunking
lotr_quote = "It's a dangerous business, Frodo, going out your door."


In [None]:
# tokenize the quote by word
words = word_tokenize(lotr_quote)

In [None]:
# POS-tag the tokens; the chunk parser below works on these tagged tokens
pos_tag_words = nltk.pos_tag(words)

In [None]:
# regular-expression grammar for chunking: an NP chunk is an optional
# determiner (DT), then any number of adjectives (JJ), then one noun (NN)
chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

In [None]:
# create a chunk parser from the grammar
chunk_parser = nltk.RegexpParser(chunk_grammar)

In [None]:
# parse the POS-tagged tokens with the chunk parser to build the chunk tree
chunked = chunk_parser.parse(pos_tag_words)

In [None]:
# display the result of the chunk parse
chunked