In [10]:
import nltk
import re

In [11]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the text has been read, instead of leaving it open for the
# lifetime of the kernel.
with open('TheLittlePrince.txt', encoding='utf-8') as f:
    prince = f.read()
# Sanity check: preview the first 50 characters and report the total length.
print(prince[:50])
print(len(prince))

The Little Prince
written and illustrated by
Antoi
90305


In [12]:
## TOKENIZE
# Break the raw text into word and punctuation tokens. 'english' is the
# tokenizer's default language; it is spelled out here so the assumption is visible.
princetokens = nltk.word_tokenize(prince, language='english')
# Total number of tokens produced
len(princetokens)

21198

In [14]:
## LOWER
# Lower-case every token so counts do not depend on capitalisation.
princewords = list(map(str.lower, princetokens))
# Same length as princetokens: lower-casing never drops a token
len(princewords)

21198

In [15]:
## FREQUENCY
from nltk import FreqDist
# Count every lower-cased token (punctuation tokens are still included here)
ndist = FreqDist(princewords)
# The ten most frequent tokens as (token, count) pairs
nitems = ndist.most_common(10)
for token, count in nitems:
    print(token, '\t', count)

. 	 1567
, 	 1036
the 	 971
'' 	 715
`` 	 549
i 	 544
to 	 462
a 	 409
and 	 348
of 	 336


In [45]:
# Ten most frequent tokens as (token, count) tuples, printed one per line
topkeys = ndist.most_common(10)
for token_count in topkeys:
    print(token_count)

('.', 1567)
(',', 1036)
('the', 971)
("''", 715)
('``', 549)
('i', 544)
('to', 462)
('a', 409)
('and', 348)
('of', 336)


In [46]:
#normalized frequency
# Relative frequency: each raw count divided by the total number of tokens
numwords = len(princewords)
topkeysnorm = [(token, count / numwords) for token, count in topkeys]
for token_share in topkeysnorm:
    print(token_share)

('.', 0.07392206811963392)
(',', 0.04887253514482498)
('the', 0.045806208132842724)
("''", 0.033729597131804887)
('``', 0.025898669685819418)
('i', 0.025662798377205396)
('to', 0.021794508915935467)
('a', 0.01929427304462685)
('and', 0.016416643079535807)
('of', 0.015850551938862156)


In [16]:
## NON-ALPHA FILTER
# Pattern for a token made up entirely of characters outside a-z
pattern = re.compile('^[^a-z]+$')
# Try the pattern on a sample token that contains only symbols
nonAlphaMatch = pattern.match('**')
# match() gives back None on failure, so anything else means the sample matched
if nonAlphaMatch is not None:
    print('matched non-alphabetical')

matched non-alphabetical


In [19]:
# function
# Matches tokens made up solely of characters outside a-z; compiled once here
# rather than on every call.
_NON_ALPHA_PATTERN = re.compile('^[^a-z]+$')
def alpha_filter(w):
  """Return True if `w` is non-empty and contains no lowercase ASCII letter (a-z).
  Intended as a rejection predicate: a True result means the token is made up
  only of punctuation, digits, symbols or letters outside a-z (uppercase or
  accented characters count as "outside"), so callers drop it.
  Args:
    w: the token string to test.
  Returns:
    bool: True when the whole token matches '^[^a-z]+$', otherwise False
    (the empty string gives False).
  """
  return bool(_NON_ALPHA_PATTERN.match(w))

In [21]:
# keep only the tokens of princewords that alpha_filter does not reject
alphaprincewords = [token for token in princewords if not alpha_filter(token)]
# preview the first 100 surviving tokens, then report how many remain
print(alphaprincewords[:100])
print(len(alphaprincewords))

['the', 'little', 'prince', 'written', 'and', 'illustrated', 'by', 'antoine', 'de', 'saint', 'exupéry', 'translated', 'from', 'the', 'french', 'by', 'katherine', 'woods', 'once', 'when', 'i', 'was', 'six', 'years', 'old', 'i', 'saw', 'a', 'magnificent', 'picture', 'in', 'a', 'book', 'called', 'true', 'stories', 'from', 'nature', 'about', 'the', 'primeval', 'forest', 'it', 'was', 'a', 'picture', 'of', 'a', 'boa', 'constrictor', 'in', 'the', 'act', 'of', 'swallowing', 'an', 'animal', 'here', 'is', 'a', 'copy', 'of', 'the', 'drawing', 'in', 'the', 'book', 'it', 'said', 'boa', 'constrictors', 'swallow', 'their', 'prey', 'whole', 'without', 'chewing', 'it', 'after', 'that', 'they', 'are', 'not', 'able', 'to', 'move', 'and', 'they', 'sleep', 'through', 'the', 'six', 'months', 'that', 'they', 'need', 'for', 'digestion', 'i', 'pondered']
16753


In [23]:
## STOPWORDS
# NLTK's English stop list, extended with extra modal verbs and contraction
# fragments (e.g. "'s", "n't") that show up as separate tokens.
nltkstopwords = nltk.corpus.stopwords.words('english')
morestopwords = ['could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't"]
# Plain concatenation: entries present in both lists are kept twice
stopwords = [*nltkstopwords, *morestopwords]
len(stopwords)

195

In [25]:
# Drop stop words from the alphabetic tokens. `stopwords` is a list, so every
# `in` test against it is a linear scan; a set built once gives constant-time
# lookups and yields exactly the same result.
stopwordset = set(stopwords)
stoppedprincewords = [w for w in alphaprincewords if w not in stopwordset]
print(len(stoppedprincewords))

7085


In [26]:
## FREQUENCY #2
# Recount using only the tokens that survived the non-alpha and stop-word filters
princedist = FreqDist(stoppedprincewords)
# Ten most frequent remaining words as (word, count) pairs
princeitems = princedist.most_common(10)
for word_count in princeitems:
  print(word_count)

('little', 258)
('said', 195)
('prince', 185)
('one', 133)
('planet', 69)
('like', 58)
('flower', 55)
('good', 49)
('time', 45)
('sheep', 43)


In [54]:
#normalized frequency
# Relative frequency among the filtered tokens: count / number of filtered tokens
numwords2 = len(stoppedprincewords)
topkeys2norm = [(token, count / numwords2) for token, count in princeitems]
for token_share in topkeys2norm:
    print(token_share)

('little', 0.03641496118560339)
('said', 0.027522935779816515)
('prince', 0.02611150317572336)
('one', 0.018772053634438957)
('planet', 0.009738884968242767)
('like', 0.008186309103740297)
('flower', 0.0077628793225123505)
('good', 0.006916019760056458)
('time', 0.006351446718419196)
('sheep', 0.006069160197600565)


In [27]:
## BIGRAMS
# Work from the full lower-cased token list: stop words and symbols are
# deliberately left in at this stage.
bigram_iter = nltk.bigrams(princewords)
# Materialise the pairs of adjacent tokens so they can be sliced
princebigrams = list(bigram_iter)
print(princebigrams[:20])

[('the', 'little'), ('little', 'prince'), ('prince', 'written'), ('written', 'and'), ('and', 'illustrated'), ('illustrated', 'by'), ('by', 'antoine'), ('antoine', 'de'), ('de', 'saint'), ('saint', 'exupéry'), ('exupéry', 'translated'), ('translated', 'from'), ('from', 'the'), ('the', 'french'), ('french', 'by'), ('by', 'katherine'), ('katherine', 'woods'), ('woods', 'once'), ('once', 'when'), ('when', 'i')]


In [36]:
# NOTE(review): this wildcard import is what brings BigramCollocationFinder into
# scope for the cells below; an explicit import would make that dependency visible.
from nltk.collocations import *
# Scoring functions for bigrams; the cells below use its raw_freq and pmi measures.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [37]:
# Build a bigram finder over all lower-cased tokens (punctuation included).
# NOTE: later cells filter this same `finder` object in place, so its results
# depend on which of those cells have already been run.
finder = BigramCollocationFinder.from_words(princewords)
# Score each bigram with the raw_freq measure
scored = finder.score_ngrams(bigram_measures.raw_freq)

In [38]:
# Show the first ten (bigram, score) entries
for bigram_score in scored[:10]:
    print(bigram_score)

(('.', '.'), 0.013303141805830738)
(('.', '``'), 0.012878573450325502)
(('.', "''"), 0.010708557411076517)
((',', "''"), 0.0093405038211152)
(("''", '``'), 0.009293329559392395)
(('little', 'prince'), 0.008680064156995944)
(('the', 'little'), 0.008491367110104727)
(('?', "''"), 0.006038305500518917)
((',', 'and'), 0.005000471742617228)
(('.', 'i'), 0.0046230776488347955)


In [39]:
# filter the bigram finder with alpha_filter so that bigrams containing a
# non-alphabetical token are dropped (this modifies `finder` in place)
finder.apply_word_filter(alpha_filter)
# re-score what is left and list the first 30 entries
scored = finder.score_ngrams(bigram_measures.raw_freq)
for bigram_score in scored[:30]:
    print(bigram_score)

(('little', 'prince'), 0.008680064156995944)
(('the', 'little'), 0.008491367110104727)
(('it', 'is'), 0.004528729125389187)
(('said', 'the'), 0.004387206340220775)
(('of', 'the'), 0.0032550240588734786)
(('in', 'the'), 0.0029248042268138503)
(('i', 'have'), 0.0027832814416454384)
(('i', 'am'), 0.002500235871308614)
(('that', 'is'), 0.002170016039248986)
(('said', 'to'), 0.0020284932540805736)
(('and', 'i'), 0.0018397962071893576)
(('he', 'said'), 0.0018397962071893576)
(('i', 'was'), 0.0018397962071893576)
(('to', 'me'), 0.0017926219454665535)
(('a', 'little'), 0.0016982734220209455)
(('to', 'the'), 0.0016039248985753372)
(('it', 'was'), 0.0015567506368525334)
(('that', 'i'), 0.0015567506368525334)
(('and', 'the'), 0.0015095763751297292)
(('did', 'not'), 0.0015095763751297292)
(('the', 'stars'), 0.0015095763751297292)
(('of', 'a'), 0.0014624021134069251)
(('i', 'shall'), 0.001415227851684121)
(('is', 'a'), 0.0013680535899613171)
(('you', 'are'), 0.0013680535899613171)
(('all', 'the'), 

In [40]:
# filter the same finder again, this time on stop words; the earlier
# alpha_filter stays in effect because `finder` is modified in place
finder.apply_word_filter(lambda token: token in stopwords)
# re-score and list the first 20 entries
scored = finder.score_ngrams(bigram_measures.raw_freq)
for bigram_score in scored[:20]:
    print(bigram_score)

(('little', 'prince'), 0.008680064156995944)
(('good', 'morning'), 0.0009906594961788847)
(('conceited', 'man'), 0.00047174261722804036)
(('long', 'time'), 0.00042456835550523637)
(('boa', 'constrictor'), 0.0003773940937824323)
(('one', 'day'), 0.0003773940937824323)
(('prince', 'said'), 0.0003773940937824323)
(('drawing', 'number'), 0.00033021983205962826)
(('glass', 'globe'), 0.00033021983205962826)
(('little', 'man'), 0.00033021983205962826)
(('said', 'nothing'), 0.00033021983205962826)
(('far', 'away'), 0.0002830455703368242)
(('fresh', 'water'), 0.0002830455703368242)
(('one', 'never'), 0.0002830455703368242)
(('prince', 'went'), 0.0002830455703368242)
(('went', 'away'), 0.0002830455703368242)
(('boa', 'constrictors'), 0.00023587130861402018)
(('little', 'bushes'), 0.00023587130861402018)
(('never', 'knows'), 0.00023587130861402018)
(('shall', 'look'), 0.00023587130861402018)


In [43]:
### PMI
# Start from a fresh, unfiltered finder so PMI is scored over every bigram
finder3 = BigramCollocationFinder.from_words(princewords)
# Score bigrams with the pmi measure and list the first 20 entries
scored = finder3.score_ngrams(bigram_measures.pmi)
for bigram_score in scored[:20]:
    print(bigram_score)

(('$', '20,000'), 14.371640534611808)
(("'ve", 'finished'), 14.371640534611808)
(('7,500,000', 'tipplers'), 14.371640534611808)
(('7000', 'geographers'), 14.371640534611808)
(('900,000', 'businessmen'), 14.371640534611808)
(('antoine', 'de'), 14.371640534611808)
(('astronomical', 'congress'), 14.371640534611808)
(('billion', 'inhabitants'), 14.371640534611808)
(('bulky', 'almanac'), 14.371640534611808)
(('caravan', 'passing'), 14.371640534611808)
(('considerable', 'risks'), 14.371640534611808)
(('de', 'saint'), 14.371640534611808)
(('earliest', 'youth'), 14.371640534611808)
(('exact', 'spot'), 14.371640534611808)
(('extra', 'task'), 14.371640534611808)
(('exupéry', 'translated'), 14.371640534611808)
(('faded', 'peacefully'), 14.371640534611808)
(('fairly', 'starting'), 14.371640534611808)
(('familiar', 'tasks'), 14.371640534611808)
(('field', 'poppies'), 14.371640534611808)


In [44]:
# PMI is only informative once rare bigrams are removed, so apply a frequency
# filter first. This acts on `finder` (the one the earlier cells already
# filtered for non-alphabetical tokens and stop words), not on finder3.
finder.apply_freq_filter(5)
# Score the remaining bigrams by PMI and list up to 30 of them
scored = finder.score_ngrams(bigram_measures.pmi)
for bigram_score in scored[:30]:
    print(bigram_score)

(('glass', 'globe'), 11.564285612554205)
(('boa', 'constrictor'), 10.671200816470717)
(('boa', 'constrictors'), 10.671200816470717)
(('thousand', 'miles'), 10.301251206720412)
(('fresh', 'water'), 10.201715533169496)
(('six', 'years'), 9.649174510140718)
(('far', 'away'), 9.111112984388589)
(('drawing', 'number'), 9.019124119891023)
(('conceited', 'man'), 8.863845894413114)
(('went', 'away'), 8.433041079275952)
(('good', 'morning'), 8.195051802888486)
(('never', 'knows'), 8.166091623438776)
(('long', 'time'), 8.049712439724448)
(('shall', 'look'), 6.706304617426634)
(('little', 'bushes'), 6.360413279188556)
(('little', 'prince'), 6.352593774729256)
(('prince', 'went'), 5.425221574816652)
(('one', 'day'), 5.3621617887237445)
(('said', 'nothing'), 4.571665142919802)
(('one', 'never'), 4.543768595213692)
(('little', 'man'), 4.038485184301191)
(('prince', 'said'), 2.232928760345887)
