 # From the previous notebook, "Phrase Sampling (Part 1)" we assemble a collection of functions.

main
    select docs for corpus from a list
    execute the pull
    execute each step in the preprocessing

In [1]:
from pathlib import Path
import re, os, math
from operator import itemgetter

In [2]:
def loadDocs(author1, *authors2):
    #load a selection of texts by selected authors
    auths = [author1]
    re1 = re.compile('(\w+)')
    for other_author in authors2:
        a1 = str(other_author)
        match = re1.search(a1)
        if match:
            auths.append(match.group())
    docs = {}
    for author in auths:
        #print(author)
        data_folder = Path("data/" + author)
        idx = 0
        for file in data_folder.iterdir():
            if str(file).endswith(".txt"):
                file_to_open = file
                o = open(file_to_open, 'r')
                documentName = idx
                idx += 1
                document = list(o)
                docs[documentName] = document
    return docs

In [3]:
def trimHeaders(first_document):
    # Determine whether a Project Gutenberg Text
    first_header_index = 0
    second_header_index = 0
    footer_index = 0
    if any("GUTENBERG" in s for s in first_document):
        for first_header_index in range( len(first_document) ):
            if ( ( first_document[first_header_index].find('*END*THE SMALL PRINT!') ) != -1 ) :
                break
            else:
                for first_header_index in range( len(first_document) ):
                    if ( ( first_document[first_header_index].find('START OF THIS PROJECT GUTENBERG') ) != -1 ) :
                        break        
        second_document = list(first_document[first_header_index + 1 :])
        for second_header_index in range( len(second_document) ):
            if ( ( second_document[second_header_index].find('www.gutenberg.org') ) != -1 ) :
                break            
        for footer_index in range( len(first_document) ):
            if ( ( first_document[footer_index].find('End of Project') ) != -1 ) :
                break
            else:
                for footer_index in range( len(first_document) ):
                    if ( ( first_document[footer_index].find('End of the Project') ) != -1 ) :
                        break    
        script = list()
        if (second_header_index < (first_header_index + 100)):
            manuscript = list(first_document[first_header_index +1 + second_header_index +1 : footer_index-1])
        else:
            manuscript = list(first_document[first_header_index +1 : footer_index-1])
    else:
        manuscript = first_document
    return manuscript

In [4]:
def collectLines(script):
    #Compile a list of speakers
    r = re.compile("[A-Z0-9][A-Z0-9]+")
    speakers = []
    for line in script:
        mtch = r.match(line)
        if mtch:
            speakers.append(mtch.group())
    #Omit speakers from the list of text
    s = re.compile(r"\b[A-Z{3}\.]+\b")
    spoken = list(filter(lambda i: not s.search(i), script))
    return speakers, spoken

In [5]:
def sentencer(spoken):
    #Concatenate lines into list entries for future sentence splitting
    newLines = []
    singleLine = ''
    singleLines = []

    #Remove all line returns(ok)
    for j in range(0, len(spoken)):
        spoken[j] = spoken[j].replace('\n', '')

    #Split 5 lines at a time into new list
    for k in range( 0, len(spoken), 3):
        newLines = []
        for line in range( 0, 3 ):
            try:
                newLines.append(' '+spoken[line+k])
            except:
                #print("Index Error at", k, line)
                break
        #Join 5-line groups into one line and append to a list
        singleLine = ''.join(newLines)
        singleLines.append(singleLine)
    
    #Create list of sentences
    sentences = []
    for m in range(0, len(singleLines)):
        mtch = re.findall("[A-Z][^\.!?]*[\.!?]", singleLines[m], re.M|re.I)
        if mtch:
            sentences.append(mtch)
    return sentences

In [6]:
def readStopList():
    #Clean the stopword list
    stoplist = []
    clean_line = []
    data_folder = Path("data/")
    file_to_open = data_folder / "snowball_stop.txt"
    f = open(file_to_open, 'r')
    full_stop = list(f)

    for n in range( 0, len(full_stop), 1 ):
        clean_line = full_stop[n].split('|')
        stoplist.append(clean_line[0])

    for p in range(len(stoplist)):
        stoplist[p] = stoplist[p].replace('\n', '')

    #print(stoplist)
    return stoplist

In [7]:
def collectPhrases(sentences, stoplist):
    # Create list of phrases using stopwords
    phrases = []
    candidate_phrases = []

    for q in range(len(sentences)):
        for r in sentences[q]:
            words = re.split("\\s+", r)
            previous_stop = False

            # Examine each word to determine if it is a phrase boundary marker or part of a phrase or alone
            for w in words:

                if w in stoplist and not previous_stop:
                    # phrase boundary encountered, so put a hard indicator
                    candidate_phrases.append(";")
                    previous_stop = True
                elif w not in stoplist and len(w) > 3:
                    # keep adding words to list until a phrase boundary is detected
                    candidate_phrases.append(w.strip())
                    previous_stop = False

        # Create a list of candidate phrases without boundary demarcation
        phrases = re.split(";+", ' '.join(candidate_phrases))

    # Clean up phrases    
    re2 = re.compile('[^\.!?,"(){}\*:]*[\.!?,"(){}\*:]')
    for s in range(len(phrases)):
        phrases[s] = re.sub(re2, '', phrases[s])
        phrases[s] = phrases[s].strip(' ')
        phrases[s] = phrases[s].replace(' ', '_')
        phrases[s] = phrases[s].replace('__', '_')
        phrases[s] = phrases[s].strip('_')

    for s in range(len(phrases)):
        try:
            phrases.remove('')
            phrases.remove(' ')
            phrases.remove('/n')
        except:
            pass

    #for t in range(len(phrases)):
        #print(phrases[t])
    
    return phrases

In [8]:
def phraseFreq(phrases):
    # Phrase frequency count
    wordfreq = []
    for u in range(len(phrases)):
        utterance = phrases[u]
        uttcnt = 0
        uttcnt = phrases.count(utterance)
        if uttcnt > 1:
            wordfreq.append(uttcnt)
    zipped = list(zip(phrases, wordfreq))
    sortzip = sorted(zipped, key=itemgetter(1), reverse=True)
    #for v in range(len(sortzip)):
        #print(sortzip[v])  
    return sortzip

In [9]:
def collectWords(sentences):
    #Establish wordList
    wordList = []
    for u in range(len(sentences)):
        for v in sentences[u]:
            words = re.split("\\s+", v)
            wordList.extend(words)
    #Establish wordDict
    wordDict = {}
    for w in range(len(wordList)):
        newWord = wordList[w]
        newWord = newWord.lower()
        newWord = newWord.replace('.', '')
        wordDict[w] = newWord
    return wordDict

In [10]:
def wordFreq(wordDict):
    #Perform word counts on dict
    countDict = {}
    for x in range(len(wordDict)):
        term = wordDict[x]
        #print(wordDict)
        count = 1
        for y in range(len(wordDict)):
            try:
                if wordDict[y].find(term) > 0:
                    count += 1
            except:
                pass
            countDict[term] = count #MAJOR ERROR HERE "TypeError: unhashable type: 'dict'"

    for k, v in countDict.items():
        print(k, v)
    return countDict

In [11]:
def computeIDF(docList):
    # Calculates the weight of rare words across all docs
    idfDict = {}
    N = len(docList)
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                idfDict[word] += 1

    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / float(val))

    return idfDict

In [12]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return tfidf

In [18]:
#Main
documents = loadDocs('poe')
#print(documents)
# wordsPulled = []
tf = []
for d in range(len(documents)):
    trimmedDoc = trimHeaders(documents[d])
    heads, linesCollected = collectLines(trimmedDoc)
    sentencesPulled = sentencer(linesCollected)
    stoplist = readStopList()
    phrasesPulled = collectPhrases(sentencesPulled, stoplist)
    phraseFreq(phrasesPulled)
    a = collectWords(sentencesPulled)
#     wordsPulled.append(a) 
    b = wordFreq(a)
    tf.append(b)
idf = computeIDF(documents)
tfidf = []
for doc in documents:
    c = computeTFIDF(wordsPulled(doc), idf)
    tfidf.append(c)
print(tfidf)

extraordinary 1
series 1
of 38
adventure 1
in 2495
the 393
south 3
seas 4
and 264
elsewhere, 1
which 5
an 1287
account 2
is 1057
given 1
following 1
pages, 1
accident 1
threw 2
me 808
into 1
society 1
several 1
gentlemen 1
richmond, 1
va 91
were 5
constantly 1
urging 1
it 1289
upon 4
me, 26
as 1086
a 8732
duty, 1
to 361
give 1
my 10
narrative 1
nature 1
altogether 1
private, 1
concern 1
no 177
person 1
but 24
myself; 1
others 2
not 40
so 202
much 1
unavoidable 1
exaggeration 1
all 275
us 507
are 117
prone 1
when 2
detailing 1
events 1
have 3
had 20
powerful 1
influence 1
exciting 1
imaginative 1
faculties 1
veracity--the 1
probability 1
being 4
that 13
public 1
at 1662
large 1
would 3
one 144
principal 1
causes 1
prevented 1
from 4
complying 1
with 8
suggestions 1
advisers 1
related 1
antarctic 1
ocean, 1
was 3
mr 2
poe, 1
lately 1
editor 1
“southern 1
literary 1
messenger,” 1
monthly 1
magazine, 1
published 1
by 8
truth 1
myself, 1
publishing 1
messenger” 1
_under 1
garb 1
only 1
real

delivered 1
supposed, 1
hiding-place 1
make 1
comforts 1
cabin; 1
father, 1
laugh 1
heartily 1
joke 1
sent 55
home 1
explaining 1
parents 1
middle 1
june 1
arrived, 1
thing 52
matured 1
waiting 2
corner 1
street 1
slip 1
brig; 1
but, 1
thick 1
fog 1
favor, 1
agreed 1
lose 29
secreting 1
seaman’s 1
cloak, 1
brought 1
him, 1
recognized 1
turned 16
second 1
corner, 1
passing 1
front 3
looking 7
face, 6
peterson, 1
grandfather 1
tones--“sir! 1
sum’mat 1
mistaken--my 1
name, 1
place, 1
blackguard, 1
call 5
obercoat 1
darty 1
received 1
handsome 1
rebuke 1
steps, 1
pale 1
excessively 1
red, 15
spectacles, 1
then, 1
ran 99
tilt 1
umbrella 1
uplifted 1
good-for-nothing 1
salt 1
tom 31
destination 1
safety 1
board, 5
busy 1
doing 1
forecastle 1
combings 1
vredenburgh’s, 1
remain 5
evening, 1
apprehend 1
noticed 2
work 3
found 9
fitted 1
comfortable 2
style--a 1
unusual 1
whaling-vessel 1
excellent 1
staterooms, 1
wide 1
convenient 2
berths 1
carpet 1
covering 6
floor 1
staterooms 1
necessity 1


seafaring 1
anecdotes 1
prodigious 1
excitement, 1
rise 11
sanity 1
progressing 1
science 2
verify 1
important 2
improbable 1
statements 1
clerk) 1
smallest 1
whaleboats 1
appearance, 1
recovered 1
adrift, 1
wherever 1
chose, 1
steps 2
bringing 1
justice 1
spoken 1
winds 1
hurled 1
sail, 2
oar, 1
compass 4
towed 6
astern 2
consultation--it 1
moon 1
stars 1
visible--and 1
running, 1
deal 2
event 20
happened, 1
latitude 2
35 1
degrees 1
30’ 1
north, 1
longitude 1
61 1
20’ 1
west, 3
consequently 1
distance 1
bermuda 1
islands 1
console 1
succeed 1
reaching 1
land, 15
expedition, 1
which, 2
understood, 1
intercepted 1
cape 23
verd 1
porto 2
rico 1
anywhere 1
companion-way 1
treated 5
kindness, 1
occasion 2
saved 1
brutality 1
precarious, 1
continually 1
relying 1
good-humor 1
carelessness 1
regard 1
sincerity 1
friendship 1
beheld, 1
partly 1
relief 1
occurred 1
length, 1
third 2
day, 1
eastward, 1
ensued, 1
unobserved, 1
ship-furniture, 1
fathoms 1
chain-cable, 1
stowed 2
companion-ladder

assistance 1
we, 4
depended 1
growl, 1
bounded 3
critical 1
pinned 1
tremendous 1
stool, 1
contact 1
hicks, 1
dint 1
sheer 1
strength, 1
strangled 1
instantaneously 1
parker 2
pump-handle 1
commencement 1
shattered 1
stateroom; 1
touching 1
foot, 1
spoke, 1
entreated 1
mercy 1
inflicted, 1
sharp 1
tremendously 1
easing 4
leeward 1
caboose, 1
jollyboat 1
counter 9
creaking 1
working 1
mainmast, 1
indication 1
sprung 1
heel 1
stepped 1
decks 2
reprehensible 1
practice, 1
ignorant 1
ship-builders), 1
plummed 1
well, 1
seven 8
bodies 1
pumps--parker, 1
liberty 1
assist 1
labour 1
could, 1
manage 2
keep 1
pump 2
severe 3
labour; 1
endeavoured 1
spirits, 1
anxiously 1
daybreak, 1
hoped 1
lighten 1
cutting 1
mainmast 1
signs 4
care 1
(having 1
cabin), 1
stays 2
lanyards 1
mass 1
wood 4
rigging 1
plunged 1
clear 1
injury 1
utmost 1
importance 1
capsizing 1
excessive 1
undergone, 1
bleeding 1
wreck 1
bowsprit, 1
complete 1
hulk 1
rejoice 1
longboat, 1
windlass 1
redoubled 1
afternoon 1
stand 7


towards 1
us! 3
miles 5
distant 2
clearly 1
people 1
appear, 1
perpetuated 1
similar, 1
beings 1
hauling 1
bore 3
eighty 1
tons 1
burden 1
sea-boat, 1
good, 1
trade 2
destined 1
service, 1
larger 1
proportionate 1
draught, 1
desirable--say 1
ships 3
armed 8
brass 1
blunderbusses, 1
water-tight 1
arm-chests 1
sixty 1
able-bodied 1
guy 1
urbanity 1
southern 3
traffic, 1
devoted 1
spirit 2
enterprise 1
here 160
rasps, 1
hammers, 1
nails, 1
scissors, 1
razors, 1
needles, 1
thread, 1
crockery-ware, 1
calico, 1
trinkets, 1
necessaries 1
meridians 1
twenty-eight 1
thirty 1
europe 2
route 1
east 44
indies 1
guy’s 1
st 1060
north 2
south, 1
_not 1
five-and-twenty 1
degrees! 1
distressed 1
demanded 1
transition, 1
joy--the 1
forgetfulness 1
proportioned 1
incidents 1
spermaceti 1
twenty-fifth 1
parallel 1
liverpool 1
rage 14
change, 1
squall, 1
qualities 1
seaboat, 1
pitching 1
wave 2
buried 1
headsail 1
flapping 1
listlessly 1
preparation, 1
beam-ends, 1
magic, 1
wilderness 1
breach 1
anchor 1


forked 1
stood, 2
through, 1
bend 1
lower, 1
thicker 1
shallow 1
caverns, 1
scratched 1
stone, 3
fuller’s 1
primitive 1
caverns 1
tenant 2
ascertain, 1
stone 8
brawling 1
magical-looking 1
described 1
domesticated 1
creatures 1
hog 2
structure 1
snout; 1
tail, 2
bushy, 1
legs 1
slender 1
antelope 1
indecisive, 1
wool 1
tame 1
fowls 1
constitute 1
natives 1
home, 1
incubation 1
elephantfish, 1
mullets, 1
soles, 1
parrotfish, 1
leather-jackets, 1
gurnards, 1
hake, 1
flounders, 1
paracutas, 1
varieties 1
auckland 1
animals, 1
familiar 1
serpents 1
formidable 1
path, 1
paid 2
attention, 1
venomous 1
everlasting 1
weapons 1
latter, 1
villagers 1
wanting 1
termed 1
personal 1
straight, 1
tall, 1
grace 1
clumsy, 1
laughing, 1
disclosed 1
finer 1
texture 1
males 1
clothed, 1
too-wit, 1
skin, 1
lances 2
clubs 1
addressed 1
wampoo 1
tenants 1
palaces 1
constructed 1
covering, 2
wooden 1
skewers, 1
pegs 1
strewed 1
leaves, 1
example 1
risen 1
trampling 1
incessant 1
vociferations 1
extricating 1


stifled 1
dictates 1
compassion, 1
hoisted 1
cruelly 1
san 26
miguel, 1
1769; 1
aurora, 1
1774; 1
pearl, 1
1779; 1
dolores, 1
1790 1
agree 3
mean 9
narrative, 1
relied 1
marl 1
black; 1
colored 4
substances 1
dates 1
perspicity 1
pencil 1
memorandum 1
therein 1
lieth, 1
dieth 1
vigor? 1
pervading 1
intentness 1
joseph 1
glanvill 1
stealthily 1
progressive 1
unnoticed 1
remotely 1
ancient 1
ligeia! 1
impressions 1
world, 1
alone--by 1
betrothed, 1
studies, 1
wife 1
playful 1
charge 4
ligeia? 1
inquiries 1
point? 1
own--a 1
wildly 1
romantic 1
offering 1
shrine 1
devotion? 1
originated 1
it? 1
she, 1
wan 6
misty-winged 1
ashtophet 1
idolatrous 1
egypt, 1
presided, 1
tell, 1
marriages 1
ill-omened, 1
presided 1
mine 28
dear 3
topic, 1
fails 1
ligeia 1
majesty, 1
demeanor, 1
incomprehensible 1
lightness 1
elasticity 1
footfall 1
maiden 1
equalled 1
slumbering 1
souls 1
daughters 1
delos 1
features 1
falsely 1
taught 1
worship 1
classical 1
labors 1
heathen 1
faultless--how 1
majesty 1
divi

moissart 2
mother, 1
madame 1
moissart, 1
fourteen 1
altar 1
france 1
croissart, 1
direct 1
hesitated 1
accepting 1
legacy 1
annoying 1
proviso 1
curling 2
nose 1
gray; 1
weak 1
inconvenient 1
defect 1
suspected 1
remedy--short 1
glasses 2
youthful 1
impresses 1
feature 1
demureness, 1
sanctimoniousness 1
talbot 2
rare 1
attraction, 1
house 2
elbowed 1
write 1
resist, 1
personified, 1
incarnate, 1
beau 1
ideal 1
wildest 1
reaching, 1
fullness 1
tournure 1
psyche, 1
displayed 1
elegant 1
gaze 1
aerienne, 1
ventum 1
textilem 1
apuleius 1
nerve 1
exquisite 1
symmetry 1
draperied 1
sleeves 1
fashion 1
elbow 1
material, 1
close-fitting, 1
cuff 5
lace, 6
gracefully 1
revealing 1
fastidious 1
wearer 1
irrevocably 1
love--and 1
anomalous 1
love--of 1
sight--and 1
dependent 2
conditions 1
create 1
control 1
quieted 1
exalted 1
partook 1
transport 1
enthusiasm 1
not--and 1
opera-glass? 1
impatiently 1
stage--box? 1
there! 1
no, 1
lovely 1
beautiful, 1
doubt,” 1
angelic, 1
don’t 2
is? 1
argues 1


toddy, 1
he, 6
disappeared 1
whirlpool 1
effervescent 1
tamely, 1
seaman 1
discomfiture 1
slammed 1
oath, 1
strode 1
glimpses 1
apartment, 1
brains 1
hugh 1
overturned--the 1
tressels 1
backs--the 1
fire-place--and 1
ladies 1
hysterics 1
piles 1
death-furniture 1
floundered 1
promiscuously 1
melee, 1
wicker 1
sneezed 1
times, 1
panted 1
puffed 1
rumgudgeon--shaking 1
fist 1
strenuous 1
ditty 1
remplis 1
ton 45
verre 1
vide! 1
vide 27
plein! 1
blandest 1
“you 1
considerate, 1
benevolence 1
many--so 1
acquiescence 1
hem! 2
“good 1
boy! 1
on! 10
really, 1
seriously, 1
oppose 1
union 3
kate 1
“curse 1
you! 1
yes! 3
advice 1
as--as 1
time--you 1
know, 1
uncle--in 1
yourself, 1
wedding 1
shall--shall 1
know? 1
scoundrel! 1
goes 2
he! 1
hi! 1
ho! 1
hu! 1
good! 1
oh 5
capital--such 1
wit! 1
uncle, 1
indicate 1
precisely? 1
uncle--that 1
please, 1
uncle--precisely 1
bobby, 1
boy--you’re 1
aren’t 1
“dear 1
uncle! 1
consent--and 1
plum, 1
mus’n’t 1
forget 1
plum--let 1
to-day’s 1
sunday--isn’t 1


AttributeError: 'list' object has no attribute 'keys'

In [15]:
len(a)

41682

In [16]:
wordsPulled?

In [17]:
len(wordsPulled)

1