# Module 2 (Python 3)

## Basic NLP Tasks with NLTK

In [1]:
import nltk
# download the text corpora
# nltk.download() with no arguments opens NLTK's interactive downloader, which
# waits for user input and therefore stalls an unattended "Restart & Run All".
# Requesting the 'book' collection by id keeps the cell non-interactive; it is
# the collection that backs nltk.book (text1..text9, sent1..sent9).
# NOTE(review): later cells also need tokenizer, POS-tagger, tagset-help and
# WordNet data - presumably part of the same collection; confirm on the NLTK
# version in use.
nltk.download('book', quiet=True)

# The star import is kept deliberately: later cells use names it provides
# (text1, sent1, sents, ...) without importing them individually.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


### Counting vocabulary of words

In [2]:
# Explore the data preloaded by `from nltk.book import *` (text1, sent1, sents).
# Only the value of the final expression is rendered as the cell result;
# sents() additionally prints its own listing.
text1 # <Text: Moby Dick by Herman Melville 1851>
sents() # print the first sentence of every text (sent1 ... sent9)
sent1 # ['Call', 'me', 'Ishmael', '.']

len(sent1) # 4 tokens - the final '.' counts as a token
len(text1) # 260819 tokens

set(text1) # the distinct tokens of text1, i.e. its vocabulary
list(set(sent1))[:2] # two of the distinct tokens of sent1 (not text1); a set is unordered, so which two you get may vary

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


['Call', 'Ishmael']

### Frequency of words

In [54]:
# Count how often each token of sent1 occurs.
# FreqDist is not imported by name in the visible cells; presumably it arrives
# through `from nltk.book import *`.
dist = FreqDist(sent1) # frequency of each token
dist # FreqDist({'.': 1, 'Call': 1, 'Ishmael': 1, 'me': 1})

vocab1 = dist.keys() # the distinct tokens (the actual words)
vocab1 # dict_keys(['Call', 'me', 'Ishmael', '.'])
list(vocab1)[:2] # ['Call', 'me']

dist['me'] # how many times a particular word occurs # 1

# Keep the tokens that are longer than one character and occur at least once.
# Every token here occurs once, so the length test is what drops '.'.
freqwords = [w for w in vocab1 if len(w) > 1 and dist[w] >= 1]
freqwords # ['Call', 'me', 'Ishmael']

['Call', 'me', 'Ishmael']

### Normalization and stemming

In [3]:
# Different surface forms of the same word
input1 = "List listed lists listing listings"

words1 = input1.lower().split(' ') # normalization: lowercase, then split on single spaces
words1 # ['list', 'listed', 'lists', 'listing', 'listings']

porter = nltk.PorterStemmer() # stemmer used to find the root form (stem) of each word
[porter.stem(t) for t in words1] # ['list', 'list', 'list', 'list', 'list']
# Note: stemming collapses every form to 'list'; you may want to keep 'listing' distinct from 'list'

['list', 'list', 'list', 'list', 'list']

### Lemmatization

In [65]:
# Stemming can return strings that are not valid words.
# The phrase and its tokens get their own names so that text2 keeps referring
# to the "Sense and Sensibility" Text loaded by `from nltk.book import *`;
# the same name is also no longer reused for two different kinds of object.
udhr_phrase = 'Universal Delcaration of Human Right'
udhr_tokens = udhr_phrase.split(' ')
# `porter` is the PorterStemmer created in the stemming cell above.
[porter.stem(t) for t in udhr_tokens] # ['univers', 'delcar', 'of', 'human', 'right']

# Lemmatization returns valid words where it knows them; a token it cannot
# find is returned unchanged, which is why the misspelled 'Delcaration' comes
# back exactly as typed.
WNlemma = nltk.WordNetLemmatizer()
[WNlemma.lemmatize(t) for t in udhr_tokens] # ['Universal', 'Delcaration', 'of', 'Human', 'Right']

['Universal', 'Delcaration', 'of', 'Human', 'Right']

### Tokenization

In [68]:
# Recall: splitting a sentence into words/tokens on spaces
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ') # ['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']
# Note: plain splitting leaves the full stop attached to the last word ('bed.')

# NLTK has a built-in tokenizer
nltk.word_tokenize(text11) #['Children','should',"n't",'drink','a','sugary','drink','before','bed','.']
# Note: be aware that "shouldn't" is split into 'should' and "n't", and the final '.' becomes its own token

# Sentence splitting
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
nltk.sent_tokenize(text12) 
# The periods inside 'U.S.' and '$2.99' are not treated as sentence boundaries:
# ['This is the first sentence.',
# 'A gallon of milk in the U.S. costs $2.99.',
# 'Is this the third sentence?',
# 'Yes, it is!']

['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

In [6]:
### Advanced NLP Tasks with NLTK

# NLP Tasks 
# Counting words, counting frequency of words
# Finding sentence boundaries
# Part of speech tagging
# Parsing the sentence structure
# Identifying semantic roles (semantic role labeling)
# Named entity recognition
# Co-reference and pronoun resolution

### Part-of-Speech (POS) tagging

In [69]:
# A selection of the POS tags used below and the word class each one stands for:
# Tag  Wordclass    Tag  Wordclass   Tag  Wordclass
# CC   Conjunction  JJ   Adjective   PRP  Pronoun
# CD   Cardinal     MD   Modal       RB   Adverb
# DT   Determiner   NN   Noun        SYM  Symbol 
# IN   Preposition  POS  Possessive  VB   Verb

# nltk is also imported in the notebook's first cell; importing it again is harmless.
import nltk 
# Print the description and example words for the 'MD' tag.
nltk.help.upenn_tagset('MD')
# MD: modal auxiliary
# can cannot could couldn't dare may might must need ought shall should
# shouldn't will would

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [34]:
# Tokenize text11 (the example sentence from the tokenization cell), then tag
# every token with its part of speech.
text13 = nltk.word_tokenize(text11)
nltk.pos_tag(text13) # list of (token, tag) pairs

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [35]:
# Ambiguity in POS tagging
text14 = nltk.word_tokenize("Visiting aunts can be a nuisance")
nltk.pos_tag(text14)
# Tagging returned by pos_tag:
# [('Visiting', 'VBG'),
# ('aunts', 'NNS'),
# ('can', 'MD'),
# ('be', 'VB'),
# ('a', 'DT'),
# ('nuisance', 'NN')]

# Another possible tagging, which pos_tag does not show
# ('Visiting' read as an adjective, JJ):
# [('Visiting', 'JJ'),
# ('aunts', 'NNS'),
# ('can', 'MD'),
# ('be', 'VB'),
# ('a', 'DT'),
# ('nuisance', 'NN')]

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

In [71]:
# Parsing sentence structure
# Making sense of sentences is easy if they follow a well-defined grammatical structure

text15 = nltk.word_tokenize("Alice loves Bob")
# Toy context-free grammar: a sentence (S) is a noun phrase followed by a verb
# phrase, and the quoted terminals are the only words the grammar knows.
grammar = nltk.CFG.fromstring("""
S -> NP VP  
VP -> V NP 
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

parser = nltk.ChartParser(grammar)
trees = parser.parse_all(text15) # all parse trees the grammar yields for the token list
for tree in trees:
    print(tree) # each tree is printed in bracketed form

(S (NP Alice) (VP (V loves) (NP Bob)))


In [76]:
# Ambiguity in parsing
# Ambiguity may exist even if sentences are grammatically correct!

text16 = nltk.word_tokenize("I saw the man with a telescope")
# The grammar lets a prepositional phrase (PP) attach in two places:
#   VP -> VP PP    -> the seeing was done with a telescope
#   NP -> DT N PP  -> the man is the one with a telescope
# so the sentence has two parse trees.
grammar1b = nltk.CFG.fromstring("""
S -> NP VP  
VP -> V NP | VP PP
PP -> P NP
NP -> DT N | DT N PP | 'I'
DT -> 'a' | 'the'
N -> 'man' | 'telescope'
V -> 'saw'
P -> 'with' 
""")

# `parser` and `trees` are also assigned in the previous parsing cell; this cell rebinds them.
parser = nltk.ChartParser(grammar1b)
trees = parser.parse_all(text16) # all parse trees for the token list
for tree in trees:
    print(tree) # one bracketed tree per reading

(S
  (NP I)
  (VP
    (VP (V saw) (NP (DT the) (N man)))
    (PP (P with) (NP (DT a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (DT the) (N man) (PP (P with) (NP (DT a) (N telescope))))))


In [42]:
# NLTK and its parse tree collection (treebank corpus)
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0] # first parsed sentence of the file 'wsj_0001.mrg'
print(text17) # print the tree in bracketed form

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [43]:
## POS tagging and parsing ambiguity
# Uncommon usages of words
# The sentence is grammatical when 'old' is read as a noun and 'man' as a verb,
# but the recorded pos_tag output labels 'old' as JJ and 'man' as NN.
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [44]:
# Well-formed sentences may still be meaningless!
# The tagger assigns a tag to every token regardless of whether the sentence makes sense.
text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
nltk.pos_tag(text19)

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]