In [1]:
import nltk

In [2]:
import sys
import sklearn

In [29]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [13]:
# Tokenizing
from nltk.tokenize import word_tokenize, sent_tokenize

text = 'Charles ate the french fries knowing they would be his last meal. We have never been to Asia, nor have we visited Africa. All they could see was the blue water surrounding their sailboat.'

print(sent_tokenize(text))
print(word_tokenize(text))

['Charles ate the french fries knowing they would be his last meal.', 'We have never been to Asia, nor have we visited Africa.', 'All they could see was the blue water surrounding their sailboat.']
['Charles', 'ate', 'the', 'french', 'fries', 'knowing', 'they', 'would', 'be', 'his', 'last', 'meal', '.', 'We', 'have', 'never', 'been', 'to', 'Asia', ',', 'nor', 'have', 'we', 'visited', 'Africa', '.', 'All', 'they', 'could', 'see', 'was', 'the', 'blue', 'water', 'surrounding', 'their', 'sailboat', '.']


In [17]:
# Removing stop words
from nltk.corpus import stopwords

print(stopwords.words('english'))

stop_words = set(stopwords.words('english'))
w_tokens = word_tokenize(text)

filtered = [w for w in w_tokens if not w in stop_words]

print('\n', filtered)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [18]:
# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

print([ps.stem(w) for w in filtered])

['charl', 'ate', 'french', 'fri', 'know', 'would', 'last', 'meal', '.', 'We', 'never', 'asia', ',', 'visit', 'africa', '.', 'all', 'could', 'see', 'blue', 'water', 'surround', 'sailboat', '.']


In [19]:
ex = ['ride', 'rode', 'rider', 'riding', 'rides', 'pride', 'bride']

print([ps.stem(w) for w in ex])

['ride', 'rode', 'rider', 'ride', 'ride', 'pride', 'bride']


In [21]:
print([ps.stem(w) for w in w_tokens])

['charl', 'ate', 'the', 'french', 'fri', 'know', 'they', 'would', 'be', 'hi', 'last', 'meal', '.', 'We', 'have', 'never', 'been', 'to', 'asia', ',', 'nor', 'have', 'we', 'visit', 'africa', '.', 'all', 'they', 'could', 'see', 'wa', 'the', 'blue', 'water', 'surround', 'their', 'sailboat', '.']


In [36]:
from nltk.corpus import udhr
print(udhr.raw('English-Latin1')[:500])

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as 


In [24]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

In [34]:
print(train_text[:500])

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territo


In [31]:
# Train and test the tokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(test_text)

print(tokenized[:5])

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.']


In [28]:
# Tagging words with their types
def process_content():
    for s in tokenized[:2]:
        tokens = word_tokenize(s)
        tagged = nltk.pos_tag(tokens) # Part of speech tagging
        print(tagged)

process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [35]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Regular Expressions
\+ = match 1 or more  
? = match 0 or 1 repetitions  
\* = match 0 or MORE repetitions  
. = Any character except a new line

## Chunk Type
<RB.?>* = 0 or more of any tense of adverb, followed by  
<VB.?>* = 0 or more of any tense of verb, followed by  
\<NNP>+ = One or more proper nouns, followed by  
\<NN>?  = zero or one singular noun.

In [38]:
# Chunking: Grouping similar/words we want together
def process_contents():
    for s in tokenized[:50]:
        tokens = word_tokenize(s)
        tagged = nltk.pos_tag(tokens)
        
        # Combine part of speech tag with Regular expressions
        chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        
        # Print the nltk tree
        for sub in chunked.subtrees(filter = lambda t: t.label() == 'Chunk'):
            print(sub)
        
        # Draw chunks
        #chunked.draw()

process_contents()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)
(Chunk White/NNP House/NNP photo/NN)
(Chunk Eric/NNP DraperEvery/NNP time/NN)
(Chunk Capitol/NNP dome/NN)
(Chunk have/VBP served/VBN America/NNP)
(Chunk Tonight/NNP)
(Chunk Union/NNP)
(Chunk Applause/NNP)
(Chunk United/NNP)
(Chunk America/NNP)
(Chunk Applause/NNP)
(Chunk America/NNP)
(Chunk September/NNP)
(Chunk Dictatorships/NNP shelter/NN)
(Chunk Applause/NNP)
(Chunk Afghanistan/NNP)
(

In [41]:
# Chinking: Removing words we do not want
def process_contents():
    for s in tokenized[:5]:
        tokens = word_tokenize(s)
        tagged = nltk.pos_tag(tokens)
        
        # Use {} to include required words/types, and use }{ to remove/eliminate words/types we do not want in chunk.
        
        # Combine part of speech tag with Regular expressions
        chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""
        chunkParser = nltk.RegexpParser(chunkGram)
        chunked = chunkParser.parse(tagged)
        
        # Print the nltk tree
        for sub in chunked.subtrees(filter = lambda t: t.label() == 'Chunk'):
            print(sub)
        
        # Draw chunks
        #chunked.draw()

process_contents()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(Chunk
  Mr./NNP
  Speaker/NNP
  ,/,
  Vice/NNP
  President/NNP
  Cheney/NNP
  ,/,
  members/NNS)
(Chunk Congress/NNP ,/, members/NNS)
(Chunk
  Supreme/NNP
  Court/NNP
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:)
(Chunk our/PRP$ nation/NN)
(Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
(Chunk America/NNP)
(Chunk its/PRP$ founding/NN ideals/NNS and/CC)
(Chunk noble/JJ dream/NN ./.)
(Chunk Tonight/NN we/PRP)
(Chunk hope/NN)
(Chunk glad/JJ reunion/NN)
(Chunk husband/NN who/WP)
(Chunk so/RB long/RB ago/RB ,/, and/CC we/PRP)
(Chunk grateful/JJ)
(Chunk good/JJ life/NN)
(Chunk Coretta/NNP Scott/NNP King/NNP ./.)
(Chunk

In [43]:
# Named Entity Recognition
def process_contents():
    for s in tokenized[:5]:
        tokens = word_tokenize(s)
        tagged = nltk.pos_tag(tokens)
        
        # Either recognize all named entities(bin = True) or recognize named entities as their respective types i.e. names, location, address etc.(bin = False)
        namedEnt = nltk.ne_chunk(tagged, binary = False)
        
        # Draw chunks
        namedEnt.draw()

process_contents()