In [1]:
import nltk
# Sample sentence used by the tokenizer demos.
text = "Monticello wasn't designated as UNESCO World Heritage Site until 1987"

import regex
# Naive tokenizer: split on any single whitespace character, period or comma.
# Raw string so that "\s" reaches the regex engine untouched instead of being
# an invalid Python escape sequence; "." and "," need no escaping inside [...].
regex.split(r"[\s.,]", text)

['Monticello',
 "wasn't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987']

In [2]:
### Better word tokenizers

In [3]:
# Tokenize the same sentence with NLTK's word tokenizer; in the output below
# it splits the contraction "wasn't" into 'was' and "n't".
nltk.word_tokenize(text)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 'Site',
 'until',
 '1987']

In [4]:
### Stemming
# NLTK provides several stemmers; let's investigate them.

In [5]:
### Porter Stemmer

from nltk.stem import PorterStemmer

# Sample words to run through the stemmer. Despite the variable name, the list
# mixes plurals with past-tense, "-ing" and derived forms.
plurals = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed',
    'owned', 'humbled', 'sized', 'meeting', 'starting', 'siezing',
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    'plotted',
]

stemmer = PorterStemmer()

# Print each word next to the stem the Porter stemmer produces for it.
for word in plurals:
    stemmed = stemmer.stem(word)
    print(f"{word} >>> {stemmed}")

caresses >>> caress
flies >>> fli
dies >>> die
mules >>> mule
denied >>> deni
died >>> die
agreed >>> agre
owned >>> own
humbled >>> humbl
sized >>> size
meeting >>> meet
starting >>> start
siezing >>> siez
itemization >>> item
sensational >>> sensat
traditional >>> tradit
reference >>> refer
colonizer >>> colon
plotted >>> plot


In [6]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [7]:
# text1 is one of the texts loaded by `from nltk.book import *`; evaluating it
# as a bare expression makes the notebook show its repr.
text1

<Text: Moby Dick by Herman Melville 1851>

In [8]:
# Print occurrences of "affection" in text2, each shown inside a window of
# the surrounding text.
text2.concordance("affection")

Displaying 25 of 79 matches:
, however , and , as a mark of his affection for the three girls , he left them
t . It was very well known that no affection was ever supposed to exist between
deration of politeness or maternal affection on the side of the former , the tw
d the suspicion -- the hope of his affection for me may warrant , without impru
hich forbade the indulgence of his affection . She knew that his mother neither
rd she gave one with still greater affection . Though her late conversation wit
 can never hope to feel or inspire affection again , and if her home be uncomfo
m of the sense , elegance , mutual affection , and domestic comfort of the fami
, and which recommended him to her affection beyond every thing else . His soci
ween the parties might forward the affection of Mr . Willoughby , an equally st
 the most pointed assurance of her affection . Elinor could not be surprised at
he natural consequence of a strong affection in a young and ardent mind . This 
 opinion . 

In [9]:
#import matplotlib 
#text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [10]:
### top 50 tokens from the Reuters corpus

# Tally every token of the Reuters corpus, then list the 50 most frequent
# together with their counts.
reuters_tokens = nltk.corpus.reuters.words()
r1 = FreqDist(reuters_tokens)
r1.most_common(50)

[('.', 94687),
 (',', 72360),
 ('the', 58251),
 ('of', 35979),
 ('to', 34035),
 ('in', 26478),
 ('said', 25224),
 ('and', 25043),
 ('a', 23492),
 ('mln', 18037),
 ('vs', 14120),
 ('-', 13705),
 ('for', 12785),
 ('dlrs', 11730),
 ("'", 11272),
 ('The', 10968),
 ('000', 10277),
 ('1', 9977),
 ('s', 9298),
 ('pct', 9093),
 ('it', 8842),
 (';', 8762),
 ('&', 8698),
 ('lt', 8694),
 ('on', 8556),
 ('from', 7986),
 ('cts', 7953),
 ('is', 7580),
 ('>', 7449),
 ('that', 7377),
 ('its', 7265),
 ('by', 6872),
 ('"', 6816),
 ('at', 6537),
 ('2', 6528),
 ('U', 6388),
 ('S', 6382),
 ('year', 6310),
 ('be', 6288),
 ('with', 5945),
 ('will', 5856),
 ('was', 5787),
 ('billion', 5652),
 ('3', 5091),
 ('5', 4683),
 ('has', 4679),
 ('would', 4634),
 ('loss', 4528),
 ('/', 4495),
 ('company', 4399)]

In [11]:
### top 50 bigrams from text1

# FreqDist consumes any iterable of samples, so the bigrams are passed straight
# in instead of first being copied into a throwaway list.
fr_dist1 = FreqDist(bigrams(text1))
fr_dist1.most_common(50)

[((',', 'and'), 2607),
 (('of', 'the'), 1847),
 (("'", 's'), 1737),
 (('in', 'the'), 1120),
 ((',', 'the'), 908),
 ((';', 'and'), 853),
 (('to', 'the'), 712),
 (('.', 'But'), 596),
 ((',', 'that'), 584),
 (('.', '"'), 557),
 ((',', 'as'), 523),
 ((',', 'I'), 461),
 ((',', 'he'), 446),
 (('from', 'the'), 428),
 ((',', 'in'), 402),
 (('of', 'his'), 371),
 (('the', 'whale'), 369),
 (('.', 'The'), 369),
 (('and', 'the'), 357),
 ((';', 'but'), 340),
 ((',', 'but'), 339),
 (('of', 'a'), 327),
 (('on', 'the'), 326),
 (('to', 'be'), 320),
 (('at', 'the'), 316),
 (('with', 'the'), 308),
 (('by', 'the'), 301),
 (('for', 'the'), 285),
 (("'", 't'), 277),
 (('.', 'And'), 261),
 ((',', 'with'), 260),
 (('.', 'I'), 260),
 ((',', 'or'), 257),
 (('in', 'his'), 253),
 (('."', '"'), 246),
 (('into', 'the'), 246),
 (('in', 'a'), 240),
 ((',', 'when'), 238),
 (('.', 'It'), 238),
 ((',', 'it'), 238),
 (('the', 'ship'), 235),
 (('it', ','), 234),
 (('with', 'a'), 233),
 (('him', ','), 232),
 ((';', 'the'), 

In [17]:
# Lowercase every token first and deduplicate afterwards, so each lowercased
# form appears exactly once in the sorted result.
sorted({token.lower() for token in text1})



['!',
 '!"',
 '!"--',
 "!'",
 '!\'"',
 '!)',
 '!)"',
 '!*',
 '!--',
 '!--"',
 "!--'",
 '"',
 '"\'',
 '"--',
 '"...',
 '";',
 '$',
 '&',
 "'",
 "',",
 "',--",
 "'-",
 "'--",
 "';",
 '(',
 ')',
 '),',
 ')--',
 ').',
 ').--',
 '):',
 ');',
 ');--',
 '*',
 ',',
 ',"',
 ',"--',
 ",'",
 ",'--",
 ',)',
 ',*',
 ',--',
 ',--"',
 ",--'",
 '-',
 '--',
 '--"',
 "--'",
 '--\'"',
 '--(',
 '---"',
 '---,',
 '.',
 '."',
 '."*',
 '."--',
 ".'",
 '.\'"',
 '.)',
 '.*',
 '.*--',
 '.,',
 '.--',
 '.--"',
 '...',
 '....',
 '.]',
 '000',
 '1',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '13',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '14',
 '144',
 '1492',
 '15',
 '150',
 '15th',
 '16',
 '1652',
 '1668',
 '1671',
 '1690',
 '1695',
 '16th',
 '17',
 '1726',
 '1729',
 '1750',
 '1772',
 '1775

In [16]:
# Deduplicate the case-sensitive tokens first and lowercase afterwards: tokens
# that differ only in case each contribute an entry, so the sorted result can
# contain repeated lowercase forms.
distinct_tokens = set(text1)
sorted(token.lower() for token in distinct_tokens)

['!',
 '!"',
 '!"--',
 "!'",
 '!\'"',
 '!)',
 '!)"',
 '!*',
 '!--',
 '!--"',
 "!--'",
 '"',
 '"\'',
 '"--',
 '"...',
 '";',
 '$',
 '&',
 "'",
 "',",
 "',--",
 "'-",
 "'--",
 "';",
 '(',
 ')',
 '),',
 ')--',
 ').',
 ').--',
 '):',
 ');',
 ');--',
 '*',
 ',',
 ',"',
 ',"--',
 ",'",
 ",'--",
 ',)',
 ',*',
 ',--',
 ',--"',
 ",--'",
 '-',
 '--',
 '--"',
 "--'",
 '--\'"',
 '--(',
 '---"',
 '---,',
 '.',
 '."',
 '."*',
 '."--',
 ".'",
 '.\'"',
 '.)',
 '.*',
 '.*--',
 '.,',
 '.--',
 '.--"',
 '...',
 '....',
 '.]',
 '000',
 '1',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '12',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '13',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '14',
 '144',
 '1492',
 '15',
 '150',
 '15th',
 '16',
 '1652',
 '1668',
 '1671',
 '1690',
 '1695',
 '16th',
 '17',
 '1726',
 '1729',
 '1750',
 '1772',
 '1775

In [18]:
import nltk
gutenberg = nltk.corpus.gutenberg

# NOTE(review): this value is discarded -- only the last expression of a cell
# is displayed. Wrap it in print() if the list of file ids should be shown.
gutenberg.fileids()

# Tokens of Jane Austen's "Emma"; the cell's displayed value is the token count.
emma = gutenberg.words('austen-emma.txt')
len(emma)

192427

In [21]:
# Wrap the Emma token list in an nltk.Text, which provides concordance().
emma_words = nltk.corpus.gutenberg.words('austen-emma.txt')
emma = nltk.Text(emma_words)

# Print occurrences of "surprize", each with its surrounding context.
emma.concordance("surprize")

Displaying 25 of 37 matches:
er father , was sometimes taken by surprize at his being still able to pity ` 
hem do the other any good ." " You surprize me ! Emma must do Harriet good : a
Knightley actually looked red with surprize and displeasure , as he stood up ,
r . Elton , and found to his great surprize , that Mr . Elton was actually on 
d aid ." Emma saw Mrs . Weston ' s surprize , and felt that it must be great ,
father was quite taken up with the surprize of so sudden a journey , and his f
y , in all the favouring warmth of surprize and conjecture . She was , moreove
he appeared , to have her share of surprize , introduction , and pleasure . Th
ir plans ; and it was an agreeable surprize to her , therefore , to perceive t
talking aunt had taken me quite by surprize , it must have been the death of m
f all the dialogue which ensued of surprize , and inquiry , and congratulation
 the present . They might chuse to surprize her ." Mrs . Cole had many to agre
the mode of it , the my

In [28]:
from nltk.corpus import brown
print(brown.categories())

# Conditional frequency distribution: one word-frequency table per Brown
# genre, built from (genre, word) pairs over every category.
cfd = nltk.ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# tabulate() prints the table itself and returns None; call it directly rather
# than inside print(), which would add a stray "None" line after the table.
cfd.tabulate(conditions=genres, samples=modals)

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 
None


In [36]:
#from nltk.corpus import PlaintextCorpusReader

#corpus_root = r"C:\Users\paulc\Downloads\text"  # raw string: "\U" and "\t" would otherwise be parsed as escapes
#wordlists = PlaintextCorpusReader(corpus_root, '.*')
#wordlists.fileids()

In [40]:
# Word puzzle: list the words that can be built from the puzzle letters.
puzzle_letters = nltk.FreqDist('egivrvonl')  # letter -> number of times available
obligatory = 'r'                             # letter every answer must contain
wordlist = nltk.corpus.words.words()

for w in wordlist:
    # Cheap filters first: at least six letters and the obligatory letter.
    if len(w) < 6 or obligatory not in w:
        continue
    # FreqDist comparison: w must not use any letter more often than the
    # puzzle supplies it.
    if nltk.FreqDist(w) <= puzzle_letters:
        print(w)

glover
gorlin
govern
grovel
ignore
involver
lienor
linger
longer
lovering
noiler
overling
region
renvoi
revolving
ringle
roving
violer
virole
