# NLTK Tutorial

### Examples to learn the basics of nltk module (Natural Language ToolKit)

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize 

download de alguns módulos específicos do NLTK

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mariv\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mariv\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mariv\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\mariv\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

## Sentence Tokenization

In [9]:
text = "God is Great! I won a lottery."
tokenized = sent_tokenize(text)
tokenized

['God is Great!', 'I won a lottery.']

## Word Tokenization

In [10]:
text = "God is Great! I won a lottery."
tokenized = word_tokenize(text)
tokenized

['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']

## Stop Words

In [11]:
stop_words = set(stopwords.words('english'))
print(stop_words)

{"mustn't", 'hasn', 'after', 'because', 'while', 'was', 'against', "aren't", 'some', 'is', 'each', 'he', 'with', 'can', 'ours', 'by', 'has', 'our', 'she', "weren't", 'there', 'aren', 'in', 'ma', 's', 're', 'yourself', "didn't", "needn't", 'into', 'them', 'shan', 'again', 'if', 'how', "won't", 'this', "haven't", 'yourselves', 'i', 'which', 'are', 'of', 'same', 'above', 'out', 'my', 'will', 'from', 'own', 'until', 'needn', "mightn't", 'were', 'its', 'few', 'weren', 'very', 'theirs', 'why', "you've", 't', 'as', 'who', 'or', 'had', 'did', 'on', 'do', "shan't", "you'll", 'than', 'then', "should've", "that'll", "don't", 'other', 'when', 'and', 'up', 'off', 'yours', 'her', 'any', 'his', 'down', 'more', "wasn't", 'won', 'those', 'further', 'once', 'isn', 'no', "hadn't", 'doesn', 'don', 'whom', 'it', "hasn't", 'below', 'should', 'o', "isn't", 'm', "she's", "it's", 'these', 'before', 'be', 'about', 'not', 'didn', 'having', 'll', 'mustn', 'itself', 'nor', "you're", 'their', 'haven', 'doing', 'for

In [15]:
filtered_words = [word for word in tokenized if word not in stop_words]
print("Tokenized words: ", tokenized)
print("Filtered Sentence: ", filtered_words)

Tokenized words:  ['God', 'is', 'Great', '!', 'I', 'won', 'a', 'lottery', '.']
Filtered Sentence:  ['God', 'Great', '!', 'I', 'lottery', '.']


## Stemming

Stemming is the process of producing morphological variants of a root/base word. Stemming programs are commonly referred to as stemming algorithms or stemmers. A stemming algorithm reduces the words “chocolates”, “chocolatey”, and “choco” to the root word, “chocolate” and “retrieval”, “retrieved”, “retrieves” reduce to the stem “retrieve”.

In [17]:
example_words = ['connect', 'connected', 'connecting', 'connection']
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in example_words]

print("Tokenized words: ", example_words)
print("Stemmed Sentence: ", stemmed_words)

Tokenized words:  ['connect', 'connected', 'connecting', 'connection']
Stemmed Sentence:  ['connect', 'connect', 'connect', 'connect']


### SnowBall

In [20]:
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [24]:
example_words = ['conexão', 'conectado', 'conectar', 'conectando']
ps = SnowballStemmer('portuguese')
stemmed_words = [ps.stem(w) for w in example_words]

print("Tokenized words: ", example_words)
print("Stemmed Sentence: ", stemmed_words)

Tokenized words:  ['conexão', 'conectado', 'conectar', 'conectando']
Stemmed Sentence:  ['conexã', 'conect', 'conect', 'conect']


## Lemmatization

takes into consideration the morphological analysis of the words. To do so, it is necessary to have detailed dictionaries which the algorithm can look through to link the form back to its lemma.

In [27]:
stemmer = PorterStemmer()
print(stemmer.stem('stones'))
print(stemmer.stem('speaking'))
print(stemmer.stem('are'))
print(stemmer.stem('geese'))
print(stemmer.stem('went'))

stone
speak
are
gees
went


In [30]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('stones'))
print(lemmatizer.lemmatize('speaking', pos='v'))
print(lemmatizer.lemmatize('are', pos='v'))
print(lemmatizer.lemmatize('geese'))
print(lemmatizer.lemmatize('went', pos='v'))

stone
speak
be
goose
go


## POS Tagging

It is a method of identifying words as nouns, verbs, adjectives, adverbs, etc

In [32]:
sent = "Albert Einstein was born in Ulm, Germany in 1879."

tokens = nltk.word_tokenize(sent)
print("Sentence:", tokens)

nltk.pos_tag(tokens)

Sentence: ['Albert', 'Einstein', 'was', 'born', 'in', 'Ulm', ',', 'Germany', 'in', '1879', '.']


[('Albert', 'NNP'),
 ('Einstein', 'NNP'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('in', 'IN'),
 ('Ulm', 'NNP'),
 (',', ','),
 ('Germany', 'NNP'),
 ('in', 'IN'),
 ('1879', 'CD'),
 ('.', '.')]

## N-Gram

N-gram can be defined as the contiguous sequence of n items from a given sample of text or speech. 

In [36]:
from nltk import bigrams
string_bigrams = list(bigrams(tokenized))
print(string_bigrams)

[('God', 'is'), ('is', 'Great'), ('Great', '!'), ('!', 'I'), ('I', 'won'), ('won', 'a'), ('a', 'lottery'), ('lottery', '.')]
