In [20]:
import nltk
from nltk.tokenize import word_tokenize
from pathlib import Path
from pprint import pprint

## A world of Corpora

### Books

In [6]:
from nltk.corpus import gutenberg as gb
gb.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [32]:
emma = nltk.Text(gb.words('austen-emma.txt'))
emma.concordance("remarkably")

Displaying 11 of 11 matches:
. She was not struck by any thing remarkably clever in Miss Smith ' s conversa
He is very plain , undoubtedly -- remarkably plain :-- but that is nothing com
!" " Mr . Knightley ' s air is so remarkably good that it is not fair to compa
 before of Mr . Elton ' s being a remarkably handsome man , with most agreeabl
lever boy , indeed . They are all remarkably clever ; and they have so many pr
quiet manners , and a disposition remarkably amiable and affectionate ; wrapt 
 my children in : but _we_ are so remarkably airy !-- Mr . Wingfield thinks th
ut for her private perplexities , remarkably comfortable , as such seclusion e
. Jane Fairfax was very elegant , remarkably elegant ; and she had herself the
 Very nicely dressed , indeed ; a remarkably elegant gown ." " I am not at all
 am --.' Thank you , my mother is remarkably well . Gone to Mr . Woodhouse ' s


Okay, I knew about that. Let's try importing a text to perform concordance search on it!

### Lyrics (importing text files)

In [16]:
# use pathlib to easily import a file
raw_lyrics = Path("resources/lottalyrics.txt").read_text()

In [21]:
# the text needs to be word-tokenized
tok_lyrics = word_tokenize(raw_lyrics)
# for creating a nltk.Text object
lyrics = nltk.Text(tok_lyrics)

In [34]:
lyrics.concordance("trasero")

Displaying 2 of 2 matches:
on nalga sonando aplaudiendo con el trasero parece q me están hablando Y me gus
e panadero Barriendo el piso con el trasero Toda la grasa se desplaza por la te


### Web and Chat Text

In [35]:
from nltk.corpus import webtext

In [37]:
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:50], "...")

firefox.txt Cookie Manager: "Don't allow sites that set remove ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Who ...
overheard.txt White guy: So, do you have any plans for this even ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for  ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished lea ...


In [38]:
nyc = webtext.raw("overheard.txt")
len(nyc)

830118

In [52]:
print(nyc[120:200])

Guy #1: So this Jack guy is basically the luckiest man in the world.
Guy #2: Why


In [53]:
from nltk.corpus import nps_chat

In [54]:
nps_chat.fileids()

['10-19-20s_706posts.xml',
 '10-19-30s_705posts.xml',
 '10-19-40s_686posts.xml',
 '10-19-adults_706posts.xml',
 '10-24-40s_706posts.xml',
 '10-26-teens_706posts.xml',
 '11-06-adults_706posts.xml',
 '11-08-20s_705posts.xml',
 '11-08-40s_706posts.xml',
 '11-08-adults_705posts.xml',
 '11-08-teens_706posts.xml',
 '11-09-20s_706posts.xml',
 '11-09-40s_706posts.xml',
 '11-09-adults_706posts.xml',
 '11-09-teens_706posts.xml']

In [57]:
chatroom = nps_chat.posts(nps_chat.fileids()[0])
print(len(chatroom))
print(chatroom[123])

706
['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']


### Brown Corpus

In [59]:
# made in 1961 at Brown university (near Boston)
from nltk.corpus import brown
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [64]:
# access by words or sents, optionally with a specific ID
#text = brown.words(categories='government')
text = brown.sents(categories='government')
print(text[:5])

[['The', 'Office', 'of', 'Business', 'Economics', '(', 'OBE', ')', 'of', 'the', 'U.S.', 'Department', 'of', 'Commerce', 'provides', 'basic', 'measures', 'of', 'the', 'national', 'economy', 'and', 'current', 'analysis', 'of', 'short-run', 'changes', 'in', 'the', 'economic', 'situation', 'and', 'business', 'outlook', '.'], ['It', 'develops', 'and', 'analyzes', 'the', 'national', 'income', ',', 'balance', 'of', 'international', 'payments', ',', 'and', 'many', 'other', 'business', 'indicators', '.'], ['Such', 'measures', 'are', 'essential', 'to', 'its', 'job', 'of', 'presenting', 'business', 'and', 'Government', 'with', 'the', 'facts', 'required', 'to', 'meet', 'the', 'objective', 'of', 'expanding', 'business', 'and', 'improving', 'the', 'operation', 'of', 'the', 'economy', '.'], ['Contact'], ['For', 'further', 'information', 'contact', 'Director', ',', 'Office', 'of', 'Business', 'Economics', ',', 'U.S.', 'Department', 'of', 'Commerce', ',', 'Washington', '25', ',', 'D.C.', '.']]


In [67]:
print(brown.sents()[:1])

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']]


Get the "*ing" words!

In [99]:
def get_ing(genre_list):
    """lists all high-frequency words ending with -ing in the various genres of the Brown corpus."""
    frequency = 20
    for genre in genre_list:
        genre_words = brown.words(categories=genre)
        fdist = nltk.FreqDist(w for w in genre_words)
        th_ings = set([w for w in genre_words if w.endswith("ing") and fdist[w] >= frequency])
        print(genre, "->")
        for ing in th_ings:
            print("{}: {}".format(ing, fdist[ing]), end=", ")
        print()
        print()

In [98]:
# show high-frequency -ing words in the different genres of the Brown corpus
get_ing(brown.categories())

adventure ->
nothing: 38, running: 21, morning: 27, thinking: 20, thing: 28, anything: 28, going: 50, something: 50, 

belles_lettres ->
being: 136, bring: 22, feeling: 35, according: 24, During: 21, understanding: 42, evening: 22, during: 74, living: 37, writing: 57, going: 33, reading: 28, something: 49, nothing: 77, thinking: 23, thing: 44, beginning: 35, having: 59, anything: 38, doing: 23, following: 29, looking: 21, morning: 27, trying: 24, including: 22, coming: 25, making: 39, 

editorial ->
being: 45, during: 25, going: 25, 

fiction ->
being: 31, nothing: 26, morning: 32, getting: 21, thing: 30, anything: 26, going: 45, coming: 21, something: 45, 

government ->
operating: 20, being: 42, during: 51, financing: 21, During: 29, hearing: 27, including: 25, planning: 36, making: 25, 

hobbies ->
working: 24, being: 36, cooling: 30, during: 31, marketing: 30, training: 25, locking: 25, building: 26, using: 22, shooting: 29, 

humor ->


learned ->
being: 119, coating: 33, feeling:

Neat stuff! NLTK's **Conditional Frequency Distributions**:

In [100]:
cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))

In [102]:
#genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
genres = brown.categories()
words = ['what', 'who', 'why', 'when', 'where']
cfd.tabulate(conditions=genres, samples=words)

                 what   who   why  when where 
      adventure   110    91    13   126    53 
 belles_lettres   244   452    36   252   107 
      editorial    84   172    10   103    40 
        fiction   128   103    18   133    76 
     government    43    74     6    56    46 
        hobbies    78   103    10   119    72 
          humor    36    48     9    52    15 
        learned   141   212    20   227   118 
           lore   130   259    25   182    97 
        mystery   109    80    25   114    59 
           news    76   268     9   128    58 
       religion    64   100    14    53    20 
        reviews    44   128     9    54    25 
        romance   121    89    34   126    54 
science_fiction    27    13     4    21    10 


Interesting in which genres which of these words are more common. Whys are _so rare_! Only belle_lettres, romance, and to a lesser degree mystery, lore and learned seem to be concerned about finding out the reason for things...

---

lots of stuff in between

---

<img src="img/target.png">

- Lover
- Giver
- Girl
- _Vergil_
- rove
- _love_
- _give_
- _go_
- _gin_
- ren
- _live_
- liver
- virgo
- Norge
- lore
- over
- goner
- _vive_
- _legion_
- region

(enough for now. not a very good perormance!!)
