# NLTK Tutorial

In [2]:
# Take a look at the preface here: http://www.nltk.org/book/ch00.html
# This tutorial is based on Python 2.7, but it shouldn't be an issue to write the same code for Python 3 as the differences
# are minimal so long as the tutorial is concerned
import nltk
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [253]:
# Let's look at the text4: Inaugural Address Corpus
# NLTK can show a word in context, called a concordance (with a given text window size)
# width: a parameter forthe window size of surrounding character context
# lines: a parameter for the number of lines returned 
print(text4.concordance("Americans", width=100, lines=10))

Displaying 10 of 66 matches:
r occupations may come . The Negroes are now Americans . Their ancestors came here years ago agains
e it so or not . And yet we are not the less Americans on that account . We shall be the more Ameri
we find them now secure ; and there comes to Americans the profound assurance that our representati
have called me . I am certain that my fellow Americans expect that on my induction into the Preside
 and the hurricanes of disaster . In this we Americans were discovering no wholly new truth ; we we
 and that freedom is an ebbing tide . But we Americans know that this is not true . Eight years ago
eat . We are not content to stand still . As Americans , we go forward , in the service of our coun
uguration be simple and its words brief . We Americans of today , together with our allies , are pa
in the discharge of this responsibility , we Americans know and we observe the difference between w
cked bargain of trading honor for security . Americans , indeed all fre

In [7]:
# Other words that appear in a similar range of contexts as a given word
print(text4.similar("patriotic"))

the free power opportunity fellow opinions colleges peace gangs
judgments consent noblest ideas colors fidelity unquestionable worship
discipline industrious just
None


In [3]:
# Let's look at common contexts of two words:
print(text4.common_contexts(["patriotic", "free"])) 

every_citizen our_citizens
None


In [31]:
# Lexical diversity shows the richness of a text's vocabulary:
from __future__ import division # in Python 3 you don't need to do the import
def lexical_diversity(text):
    return len(set(text))/len(text)

lex_div=lexical_diversity(text4)
print(round(lex_div, 4))

# What interesting ways can you use "lexical_diversity" for?
# Can you play with some texts, say from presidential candidates and tell us what you find?

0.0669


In [3]:
text="What interesting ways can you you use".split()
len(text)
len(set(text))

6

In [35]:
# sorted set of words
print(sorted(set(text4))[100:120])

[u'AS', u'Abandonment', u'Abhorring', u'About', u'Above', u'Abraham', u'Abroad', u'Accept', u'Across', u'Act', u'Acting', u'Action', u'Actual', u'Adams', u'Additional', u'Address', u'Administered', u'Administration', u'Administrations', u'Advance']


In [9]:
#P ay attenstion to the difference of these!
tokens=["man", "woman", "father", "mother"]
x= tokens.sort() # Returns "None", but sorts the list in place
print x
print tokens
print sorted(tokens) # Returns the sorted list


None
['father', 'man', 'mother', 'woman']
['father', 'man', 'mother', 'woman']


In [81]:
# Counting word frequencies:
words=["man", "woman", "father", "mother"]
for w in words:
    print(w, text4.count(w))

('man', 102)
('woman', 3)
('father', 4)
('mother', 4)


In [None]:
 # We Stopped here:

In [61]:
# Frequency distribution
freq_dist = FreqDist(text4) 
print("*"*100)
print(freq_dist.most_common(1000))[200:300]
print("*"*100)
print(freq_dist["European"])
print(freq_dist["world"])
print("*"*100)
#------------------------------------------
# Vocabulary
V=set(text4)
words=[w for w in V if freq_dist[w] > 200][:10]
print(words)

****************************************************************************************************
[(u'institutions', 76), (u'come', 75), (u'party', 75), (u'better', 75), (u'always', 74), (u'today', 74), (u'office', 73), (u'still', 73), (u'need', 73), (u'others', 73), (u'strength', 72), (u'Let', 72), (u'nor', 72), (u'itself', 72), (u'means', 70), (u'believe', 70), (u'themselves', 70), (u'place', 70), (u'land', 69), (u'could', 69), (u'then', 69), (u'."', 69), (u'home', 69), (u'equal', 69), (u'together', 68), (u'might', 68), (u'things', 67), (u'secure', 67), (u'Nation', 67), (u'whose', 66), (u'find', 66), (u'given', 66), (u'prosperity', 66), (u'Americans', 66), (u'old', 65), (u'am', 65), (u'full', 65), (u'give', 65), (u'here', 64), (u'Federal', 64), (u'action', 64), (u'order', 64), (u'yet', 64), (u'proper', 64), (u'found', 63), (u'up', 63), (u'important', 63), (u'responsibility', 63), (u'take', 62), (u'where', 62), (u'being', 62), (u'change', 62), (u'Executive', 62), (u'even', 62), (u'

In [10]:
l=["a", "b"]
l[0]

d= {"Hi": 44, "Hello": 2}
d["Hello"]

'a'

In [62]:
# Collocations
print(text4.collocations())

United States; fellow citizens; four years; years ago; Federal
Government; General Government; American people; Vice President; Old
World; Almighty God; Fellow citizens; Chief Magistrate; Chief Justice;
God bless; every citizen; Indian tribes; public debt; one another;
foreign nations; political parties
None


In [67]:
# Could you tell the difference?
print(sorted(w.lower() for w in set(text1))[-10:])
print(sorted(set(w.lower() for w in text1))[-10:])

[u'zodiac', u'zodiac', u'zogranda', u'zone', u'zoned', u'zoned', u'zones', u'zoology', u'zoology', u'zoroaster']
[u'zephyr', u'zeuglodon', u'zig', u'zodiac', u'zogranda', u'zone', u'zoned', u'zones', u'zoology', u'zoroaster']


In [4]:
# Fetching and cleaning a webpage:
from urllib import urlopen
from bs4 import BeautifulSoup
url="http://shakespeare.mit.edu/hamlet/full.html"
page = urlopen(url)
soup = BeautifulSoup(page.read())   
raw = BeautifulSoup.get_text(soup)  
print(raw[:300])
tokens=nltk.word_tokenize(raw)



Hamlet: Entire Play
 





The Tragedy of Hamlet, Prince of Denmark

Shakespeare homepage 
    | Hamlet 
    | Entire play

ACT I
SCENE I. Elsinore. A platform before the castle.

FRANCISCO at his post. Enter to him BERNARDO

BERNARDO

Who's there?

FRANCISCO

Nay, answer me: stand, and unfold you


In [11]:
# Word tokenization with NLTK:
import nltk
raw="I am happy"
tokens=nltk.word_tokenize(raw)
print tokens

['I', 'am', 'happy']


# More on files, this time with NLTK:

In [230]:
import codecs
from nltk import word_tokenize, Text
text_string=codecs.open("hamlet.txt", "r", "utf-8").read() # Opens for reading and gets you the file content as a list
tokens = word_tokenize(text_string)
print(type(tokens))
print(tokens[:20])
text = Text(tokens)
print("*"*50)
print(text[:50])
print(text.collocations())


<type 'list'>
[u'The', u'Project', u'Gutenberg', u'EBook', u'of', u'Hamlet', u',', u'by', u'William', u'Shakespeare', u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'at']
**************************************************
[u'The', u'Project', u'Gutenberg', u'EBook', u'of', u'Hamlet', u',', u'by', u'William', u'Shakespeare', u'This', u'eBook', u'is', u'for', u'the', u'use', u'of', u'anyone', u'anywhere', u'at', u'no', u'cost', u'and', u'with', u'almost', u'no', u'restrictions', u'whatsoever', u'.', u'You', u'may', u'copy', u'it', u',', u'give', u'it', u'away', u'or', u're-use', u'it', u'under', u'the', u'terms', u'of', u'the', u'Project', u'Gutenberg', u'License', u'included', u'with']
Project Gutenberg-tm; _1st Clo._; Project Gutenberg; _Crosses to_;
Literary Archive; Gutenberg-tm electronic; Archive Foundation;
electronic works; Gutenberg Literary; United States; _2nd Clo._;
ROSENCRANTZ _and_; public domain; _and_ GUILDENSTERN; Dr. Johnson;
_1st Play

In [90]:
# Sentence splitting
from nltk.corpus import gutenberg
# This will return each sentence as a list of words
hamlet_sent=gutenberg.sents('shakespeare-hamlet.txt')
for sent in hamlet_sent[:5]:
    print(sent)


[u'[', u'The', u'Tragedie', u'of', u'Hamlet', u'by', u'William', u'Shakespeare', u'1599', u']']
[u'Actus', u'Primus', u'.']
[u'Scoena', u'Prima', u'.']
[u'Enter', u'Barnardo', u'and', u'Francisco', u'two', u'Centinels', u'.']
[u'Barnardo', u'.']


In [93]:
# NLTK fileids:
from nltk.corpus import inaugural
print(inaugural.fileids()[:5])
print("*"*50)
#print([fileid[:4] for fileid in inaugural.fileids()])

[u'1789-Washington.txt', u'1793-Washington.txt', u'1797-Adams.txt', u'1801-Jefferson.txt', u'1805-Jefferson.txt']
**************************************************
[u'1789', u'1793', u'1797', u'1801', u'1805', u'1809', u'1813', u'1817', u'1821', u'1825', u'1829', u'1833', u'1837', u'1841', u'1845', u'1849', u'1853', u'1857', u'1861', u'1865', u'1869', u'1873', u'1877', u'1881', u'1885', u'1889', u'1893', u'1897', u'1901', u'1905', u'1909', u'1913', u'1917', u'1921', u'1925', u'1929', u'1933', u'1937', u'1941', u'1945', u'1949', u'1953', u'1957', u'1961', u'1965', u'1969', u'1973', u'1977', u'1981', u'1985', u'1989', u'1993', u'1997', u'2001', u'2005', u'2009']


# Generate text

In [122]:
# A function from the NLTK book: http://www.nltk.org/book/ch02.html
def generate_model(cfdist, word, num=15):
    for i in range(num):
        print(word),
        word = cfdist[word].max()

text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)
print(generate_model(cfd, 'living', num=30))
#print(cfd["living"].max())

living creature that he said , and the land of the land of the land of the land of the land of the land of the land of the land None


# WordNet

In [133]:
# WordNet is a very useful resource.
# You should get familiar with its structure, and with ways to navigate it.
# NLTK provides many off-the-shelf useful functions
from nltk.corpus import wordnet as wn
print(wn.synsets('nice'))
print("*"*50)
print(wn.synset('nice.s.03').definition())
print(wn.synset('nice.s.03').lemma_names())
print("*"*50)
print(wn.synset('courteous.s.01').definition())
print(wn.synset('courteous.s.01').lemma_names())
print("*"*50)
print(wn.synset('dainty.s.04').definition())
print(wn.synset('dainty.s.04').lemma_names())
print("*"*50)

[Synset('nice.n.01'), Synset('nice.a.01'), Synset('decent.s.01'), Synset('nice.s.03'), Synset('dainty.s.04'), Synset('courteous.s.01')]
**************************************************
done with delicacy and skill
[u'nice', u'skillful']
**************************************************
exhibiting courtesy and politeness
[u'courteous', u'gracious', u'nice']
**************************************************
excessively fastidious and easily disgusted
[u'dainty', u'nice', u'overnice', u'prissy', u'squeamish']


In [138]:
# Printing the definition and lemma names/lemmas of a given word is easily done in a "for" loop
for synset in wn.synsets('boring'):
    print("*"*50)
    print(synset.definition())
    print(synset.lemma_names())
    

**************************************************
the act of drilling
[u'drilling', u'boring']
**************************************************
the act of drilling a hole in the earth in the hope of producing petroleum
[u'boring', u'drilling', u'oil_production']
**************************************************
cause to be bored
[u'bore', u'tire']
**************************************************
make a hole, especially with a pointed power or hand tool
[u'bore', u'drill']
**************************************************
so lacking in interest as to cause mental weariness
[u'boring', u'deadening', u'dull', u'ho-hum', u'irksome', u'slow', u'tedious', u'tiresome', u'wearisome']


In [145]:
# You can access lemmas of a word directly, using the "lemmas" function:
print(wn.lemmas('boring'))
print(wn.lemmas('wonderful'))
print(wn.lemmas('dazzling'))

[Lemma('drilling.n.01.boring'), Lemma('boring.n.02.boring'), Lemma('boring.s.01.boring')]
[Lemma('fantastic.s.02.wonderful')]
[Lemma('dazzling.s.01.dazzling'), Lemma('blazing.s.01.dazzling')]


In [153]:
# Play with the word "dish"
print(wn.lemmas('dish'))
print("= "*50)
for synset in wn.synsets('dish'):
    print("*"*50)
    print(synset.definition())
    print(synset.lemma_names())

[Lemma('dish.n.01.dish'), Lemma('dish.n.02.dish'), Lemma('dish.n.03.dish'), Lemma('smasher.n.02.dish'), Lemma('dish.n.05.dish'), Lemma('cup_of_tea.n.01.dish'), Lemma('serve.v.06.dish'), Lemma('dish.v.02.dish')]
= = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
**************************************************
a piece of dishware normally used as a container for holding or serving food
[u'dish']
**************************************************
a particular item of prepared food
[u'dish']
**************************************************
the quantity that a dish will hold
[u'dish', u'dishful']
**************************************************
a very attractive or seductive looking woman
[u'smasher', u'stunner', u'knockout', u'beauty', u'ravisher', u'sweetheart', u'peach', u'lulu', u'looker', u'mantrap', u'dish']
**************************************************
directional antenna consisting of a parabolic reflector for microwave o

In [198]:
# A function that prints the synsets and definitions of a given word:
def get_definitions(word):
    for synset in wn.synsets(word):
        try:
            print synset.name().capitalize() + ':', synset.definition() # capitalizing to give the feel of a dict entry
        except:
            continue
            
happy_words=["happy", "glad", "joyful", "joyous", "exhuberant"]
for w in happy_words:
    get_definitions(w)

# You can condition by a part of speech (POS), see the book!
#for synset in wn.synsets('mint', wn.NOUN):
#...     print(synset.name() + ':', synset.definition())


Happy.a.01: enjoying or showing or marked by joy or pleasure
Felicitous.s.02: marked by good fortune
Glad.s.02: eagerly disposed to act or to be of service
Happy.s.04: well expressed and to the point
Gladiolus.n.01: any of numerous plants of the genus Gladiolus native chiefly to tropical and South Africa having sword-shaped leaves and one-sided spikes of brightly colored funnel-shaped flowers; widely cultivated
Glad.a.01: showing or causing joy and pleasure; especially made happy
Glad.s.02: eagerly disposed to act or to be of service
Glad.s.03: feeling happy appreciation
Beaming.s.01: cheerful and bright
Joyful.a.01: full of or producing joy
Elated.s.02: full of high-spirited delight
Joyous.a.01: full of or characterized by joy


In [157]:
# A function to print the lemma names of a passed word
def get_lemma_names(word):
    for synset in wn.synsets(word):
        try:
            print(synset.lemma_names())
        except:
            continue
            
happy_words=["happy", "glad", "joyful", "joyous", "exhuberant"]
for w in happy_words:
    get_lemma_names(w)

[u'happy']
[u'felicitous', u'happy']
[u'glad', u'happy']
[u'happy', u'well-chosen']
[u'gladiolus', u'gladiola', u'glad', u'sword_lily']
[u'glad']
[u'glad', u'happy']
[u'glad']
[u'beaming', u'glad']
[u'joyful']
[u'elated', u'gleeful', u'joyful', u'jubilant']
[u'joyous']


In [185]:
# As above, but we uniqify using a set.
def get_unique_lemma_names(word):
    l=[]
    for synset in wn.synsets(word):
        try:
            l.extend(synset.lemma_names())
        except:
            continue
    l=set(l)
    return l

happy_words=["happy", "glad", "joyful", "joyous", "exhuberant"]
for w in happy_words:
    l=get_unique_lemma_names(w)
    print(l)

# To get a set
print("*"*50)
print("\nHere's a single unique list/set:\n")
uniq_list=[]
for w in happy_words:
    l=get_unique_lemma_names(w)
    uniq_list.extend(l)
print(set(uniq_list))

set([u'felicitous', u'well-chosen', u'glad', u'happy'])
set([u'gladiolus', u'beaming', u'sword_lily', u'gladiola', u'glad', u'happy'])
set([u'elated', u'jubilant', u'joyful', u'gleeful'])
set([u'joyous'])
set([])
**************************************************

Here's a single unique list/set:

set([u'elated', u'gladiolus', u'beaming', u'joyous', u'sword_lily', u'well-chosen', u'felicitous', u'jubilant', u'gleeful', u'gladiola', u'joyful', u'glad', u'happy'])


In [190]:
#Nice example from the book (http://www.nltk.org/book/ch02.html)
motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print(types_of_motorcar[0]) # prints: Synset('ambulance.n.01')
print("*"*50)
print(sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()))
print("*"*50)

# Remember, the tuple coprehension can be broken down as follows (with no sorting):
for synset in types_of_motorcar:
    for lemma in synset.lemmas():
        print(lemma.name()),

Synset('ambulance.n.01')
**************************************************
[u'Model_T', u'S.U.V.', u'SUV', u'Stanley_Steamer', u'ambulance', u'beach_waggon', u'beach_wagon', u'bus', u'cab', u'compact', u'compact_car', u'convertible', u'coupe', u'cruiser', u'electric', u'electric_automobile', u'electric_car', u'estate_car', u'gas_guzzler', u'hack', u'hardtop', u'hatchback', u'heap', u'horseless_carriage', u'hot-rod', u'hot_rod', u'jalopy', u'jeep', u'landrover', u'limo', u'limousine', u'loaner', u'minicar', u'minivan', u'pace_car', u'patrol_car', u'phaeton', u'police_car', u'police_cruiser', u'prowl_car', u'race_car', u'racer', u'racing_car', u'roadster', u'runabout', u'saloon', u'secondhand_car', u'sedan', u'sport_car', u'sport_utility', u'sport_utility_vehicle', u'sports_car', u'squad_car', u'station_waggon', u'station_wagon', u'stock_car', u'subcompact', u'subcompact_car', u'taxi', u'taxicab', u'tourer', u'touring_car', u'two-seater', u'used-car', u'waggon', u'wagon']
**************

In [193]:
# Another useful example, this time on hypernyms:
motorcar = wn.synset('car.n.01')
print(motorcar.hypernyms()) # prints: [Synset('motor_vehicle.n.01')]

paths = motorcar.hypernym_paths()
print(len(paths)) # prints 2 as there are two paths, as the book states, between car.n.01 and entity.n.01 
                  # because wheeled_vehicle.n.01 can be classified as both a vehicle and a container.
                  # Take a look at the output below

print("\nPath 1 between car.n.01 and entity.n.01")
print([synset.name() for synset in paths[0]])
print("\nPath 2 between car.n.01 and entity.n.01")
print([synset.name() for synset in paths[1]])

[Synset('motor_vehicle.n.01')]
2

Path 1
[u'entity.n.01', u'physical_entity.n.01', u'object.n.01', u'whole.n.02', u'artifact.n.01', u'instrumentality.n.03', u'container.n.01', u'wheeled_vehicle.n.01', u'self-propelled_vehicle.n.01', u'motor_vehicle.n.01', u'car.n.01']

Path 2
[u'entity.n.01', u'physical_entity.n.01', u'object.n.01', u'whole.n.02', u'artifact.n.01', u'instrumentality.n.03', u'conveyance.n.03', u'vehicle.n.01', u'wheeled_vehicle.n.01', u'self-propelled_vehicle.n.01', u'motor_vehicle.n.01', u'car.n.01']


In [None]:
# Try the graphical WordNet browser from your command line:
nltk.app.wordnet()

In [227]:
# Similarity
from nltk.corpus import wordnet as wn
print(wn.synsets('lilac'))
print(wn.synsets('tulip'))
print(wn.synsets('flower'))
print(wn.synsets('tree'))
print(wn.synsets('daffodil'))
#--------------------------
print("*"*50)
african = wn.synset('african_daisy.n.01')
orchid = wn.synset('orchid.n.01')
scarlet = wn.synset('scarlet_musk_flower.n.01')
aster = wn.synset('white-topped_aster.n.01')
tree = wn.synset('tree.n.01')
daffodil = wn.synset('daffodil.n.01')
#--------------------------
print("*"*50)
print(african.lowest_common_hypernyms(orchid))
print(orchid.lowest_common_hypernyms(orchid))
print(scarlet.lowest_common_hypernyms(tree))
print(aster.lowest_common_hypernyms(daffodil))
#print(wn.synset('flower.n.01').hypernyms())
#print(wn.synset('flower.n.01').hyponyms())

[Synset('lilac.n.01'), Synset('lavender.s.01')]
[Synset('tulip.n.01')]
[Synset('flower.n.01'), Synset('flower.n.02'), Synset('flower.n.03'), Synset('bloom.v.01')]
[Synset('tree.n.01'), Synset('tree.n.02'), Synset('tree.n.03'), Synset('corner.v.02'), Synset('tree.v.02'), Synset('tree.v.03'), Synset('tree.v.04')]
[Synset('daffodil.n.01')]
**************************************************
[Synset('flower.n.01')]
[Synset('orchid.n.01')]
[Synset('vascular_plant.n.01')]
[Synset('vascular_plant.n.01')]
