# Introduction To Natural Language Processing with NLTK

## Tokenization

In [1]:
from nltk import sent_tokenize, word_tokenize

### 01. Sentence Tokenization

In [2]:
text = "Success is not final. Failure is not fatal. It is the courage to continue that counts."
print(text)

Success is not final. Failure is not fatal. It is the courage to continue that counts.


In [3]:
sentence_tokens = sent_tokenize(text)
print(sentence_tokens)

['Success is not final.', 'Failure is not fatal.', 'It is the courage to continue that counts.']


In [4]:
for sentence in sentence_tokens:
    print(sentence)

Success is not final.
Failure is not fatal.
It is the courage to continue that counts.


### 02. Word Tokenization

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
sentence = "Let's see how the tokenizer split's this!"

In [7]:
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [8]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer, WhitespaceTokenizer

In [9]:
tree_tokenizer = TreebankWordTokenizer()
word_punct_tokenizer = WordPunctTokenizer()
white_space_tokenizer = WhitespaceTokenizer()

In [10]:
word_tokens = tree_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'s", 'see', 'how', 'the', 'tokenizer', 'split', "'s", 'this', '!']


In [11]:
word_tokens = word_punct_tokenizer.tokenize(sentence)
print(word_tokens)

['Let', "'", 's', 'see', 'how', 'the', 'tokenizer', 'split', "'", 's', 'this', '!']


In [12]:
word_tokens = white_space_tokenizer.tokenize(sentence)
print(word_tokens)

["Let's", 'see', 'how', 'the', 'tokenizer', "split's", 'this!']


## Stemming and Lemmatization

### 01. Stemming

In [13]:
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

In [14]:
# Run three inflections of "lie" through the Porter stemmer, one result per line.
porter_stemmer = PorterStemmer()
for inflected_word in ('lying', 'lies', 'lied'):
    print(porter_stemmer.stem(inflected_word))

lie
lie
lie


In [15]:
# Same three inflections of "lie", this time through the Lancaster stemmer.
lancaster_stemmer = LancasterStemmer()
for inflected_word in ('lying', 'lies', 'lied'):
    print(lancaster_stemmer.stem(inflected_word))

lying
lie
lied


In [16]:
# Same three inflections of "lie", this time through the English Snowball stemmer.
snowball_stemmer = SnowballStemmer('english')
for inflected_word in ('lying', 'lies', 'lied'):
    print(snowball_stemmer.stem(inflected_word))

lie
lie
lie


### 02. Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

In [18]:
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))

running


In [19]:
def lemmatize(word):
    """Print the WordNet lemma of ``word`` under four part-of-speech tags.

    The word is lemmatized as a verb ("v"), a noun ("n"), an adverb ("r")
    and an adjective ("a"), in that order; each result is printed on its
    own line behind a label naming the part of speech.

    Args:
        word: The string to lemmatize.

    Returns:
        None. The results are written to standard output.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    # (label printed in front of the lemma, WordNet POS code) in output order.
    labelled_pos_codes = (
        ("verb form: ", "v"),
        ("noun form: ", "n"),
        ("adverb form: ", "r"),
        ("adjective form: ", "a"),
    )
    for label, pos_code in labelled_pos_codes:
        print(label + wordnet_lemmatizer.lemmatize(word, pos=pos_code))

In [20]:
lemmatize("ears")

verb form: ears
noun form: ear
adverb form: ears
adjective form: ears


In [21]:
lemmatize("running")

verb form: run
noun form: running
adverb form: running
adjective form: running


### 03. Stemming vs Lemmatization 

In [22]:
# One stemmer and one lemmatizer, used by the comparison cells that follow.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [23]:
# Stem three inflections of "deactivate", one result per line.
for inflected_word in ("deactivating", "deactivated", "deactivates"):
    print(stemmer.stem(inflected_word))

deactiv
deactiv
deactiv


In [24]:
# Lemmatize (as verbs) the same three inflections that the stemming cell uses,
# so the two outputs can be compared word for word. The original cell passed
# "deactivating" three times, which compared nothing.
for inflected_word in ("deactivating", "deactivated", "deactivates"):
    print(lemmatizer.lemmatize(inflected_word, pos="v"))

deactivate
deactivate
deactivate


In [25]:
# Stem a mixed bag of words, one result per line.
for sample_word in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple'):
    print(stemmer.stem(sample_word))

stone
speak
bedroom
joke
lisa
purpl


In [26]:
# Lemmatize the same words with the lemmatizer's default part of speech,
# one result per line.
for sample_word in ('stones', 'speaking', 'bedroom', 'jokes', 'lisa', 'purple'):
    print(lemmatizer.lemmatize(sample_word))

stone
speaking
bedroom
joke
lisa
purple


## Part-of-Speech (POS) Tagging

In [27]:
from nltk import pos_tag

In [28]:
sentence = "The hardest choices require the strongest wills"

In [29]:
sentence_tokens = word_tokenize(sentence)
print(sentence_tokens)

['The', 'hardest', 'choices', 'require', 'the', 'strongest', 'wills']


In [30]:
pos_tag(sentence_tokens)

[('The', 'DT'),
 ('hardest', 'JJS'),
 ('choices', 'NNS'),
 ('require', 'VBP'),
 ('the', 'DT'),
 ('strongest', 'JJS'),
 ('wills', 'NNS')]

## Chunking

In [31]:
from nltk import RegexpParser

In [32]:
sentence = "the big visious dog barked at the small feeble cat"
# sentence = "the little yellow hard tight dog barked at the cat"

In [33]:
# Chunk grammar written as a regular expression over POS tags.
# The single rule labels a chunk "NP" when it sees: an optional determiner
# (<DT>?), then zero or more adjectives (<JJ>*), then one token tagged <NN>.
# The trailing "# NP" inside the string is part of the grammar text itself.
grammar = ('''NP: {<DT>?<JJ>*<NN>} # NP''')

In [34]:
chunkParser = RegexpParser(grammar)
tagged = pos_tag(word_tokenize(sentence))
tagged

[('the', 'DT'),
 ('big', 'JJ'),
 ('visious', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('small', 'JJ'),
 ('feeble', 'JJ'),
 ('cat', 'NN')]

In [35]:
tree = chunkParser.parse(tagged)

In [36]:
for subtree in tree.subtrees():
    print(subtree)

(S
  (NP the/DT big/JJ visious/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT small/JJ feeble/JJ cat/NN))
(NP the/DT big/JJ visious/JJ dog/NN)
(NP the/DT small/JJ feeble/JJ cat/NN)


## Stop Word Removal

In [37]:
from nltk.corpus import stopwords

In [38]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [39]:
sentence = "Success is not final. Failure is not fatal. It is the courage to continue that counts."

In [40]:
word_tokens = word_tokenize(sentence)
print(word_tokens)

['Success', 'is', 'not', 'final', '.', 'Failure', 'is', 'not', 'fatal', '.', 'It', 'is', 'the', 'courage', 'to', 'continue', 'that', 'counts', '.']


In [41]:
# Fetch the stop-word list once and keep it as a set. The original called
# stopwords.words('english') inside the loop, once per token, and tested
# membership against a list (a linear scan each time).
english_stop_words = set(stopwords.words('english'))

# Keep every token that is not a stop word, preserving the original order.
# Matching is case-sensitive, so a capitalised token such as 'It' is kept
# unless that exact capitalised spelling is in the list.
clean_tokens = [token for token in word_tokens if token not in english_stop_words]

In [42]:
print(clean_tokens)

['Success', 'final', '.', 'Failure', 'fatal', '.', 'It', 'courage', 'continue', 'counts', '.']


## Named Entity Recognition

In [43]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [44]:
sentence = "Mark who works at Yahoo and John who works at Google decided to meet at New York City"

In [45]:
# Named-entity chunking works on POS-tagged tokens, so run the pipeline in
# three explicit steps: tokenize, tag, then chunk.
ner_tokens = word_tokenize(sentence)
ner_tagged_tokens = pos_tag(ner_tokens)
print(ne_chunk(ner_tagged_tokens))

(S
  (PERSON Mark/NNP)
  who/WP
  works/VBZ
  at/IN
  (ORGANIZATION Yahoo/NNP)
  and/CC
  (PERSON John/NNP)
  who/WP
  works/VBZ
  at/IN
  (ORGANIZATION Google/NNP)
  decided/VBD
  to/TO
  meet/VB
  at/IN
  (GPE New/NNP York/NNP City/NNP))


## Interface to WordNet

In [46]:
from nltk.corpus import wordnet

In [47]:
wordnet.synsets("computer")

[Synset('computer.n.01'), Synset('calculator.n.01')]

In [48]:
syn = wordnet.synset('computer.n.01')
syn.lemma_names()

['computer',
 'computing_machine',
 'computing_device',
 'data_processor',
 'electronic_computer',
 'information_processing_system']

In [49]:
syn.definition()

'a machine for performing calculations automatically'

In [50]:
wordnet.synset("car.n.01").examples()

['he needs a car to get to work']

In [51]:
# Collect the name of every lemma in every synset of "large".
# A comprehension keeps the loop variables local, so the notebook-level `syn`
# (the 'computer.n.01' synset bound in an earlier cell) is no longer
# overwritten when this cell runs.
synonyms = [
    lemma.name()
    for large_synset in wordnet.synsets('large')
    for lemma in large_synset.lemmas()
]
print(synonyms)

['large', 'large', 'big', 'large', 'bombastic', 'declamatory', 'large', 'orotund', 'tumid', 'turgid', 'big', 'large', 'magnanimous', 'big', 'large', 'prominent', 'large', 'big', 'enceinte', 'expectant', 'gravid', 'great', 'large', 'heavy', 'with_child', 'large', 'large', 'boastfully', 'vauntingly', 'big', 'large']


In [52]:
# For every lemma of every synset of "large" that has at least one antonym,
# record the name of its first antonym.
# - `antonyms()[:1]` yields the first antonym, or nothing when the list is
#   empty, so antonyms() is called once per lemma instead of twice.
# - A comprehension keeps the loop variables local, so the notebook-level `syn`
#   bound in an earlier cell is not overwritten, and the ambiguous
#   single-letter name `l` is gone.
antonyms = [
    first_antonym.name()
    for large_synset in wordnet.synsets("large")
    for lemma in large_synset.lemmas()
    for first_antonym in lemma.antonyms()[:1]
]
print(antonyms)

['small', 'little']
