In [1]:
import spacy
import json
from collections import Counter
import pandas as pd
import numpy as np
from spacy.pipeline import Pipe
import itertools

## Load Spacy and Documents
I'm loading from the actual location of the model, used this as a workaround to symbolic link problems.

In [2]:
nlp = spacy.load('C:/Users/mj514/Anaconda3/lib/site-packages/en_core_web_lg/en_core_web_lg-2.0.0')

In [3]:
with open('childrensAuthorContent.json') as infile:
    pages = json.load(infile)
pages[10]

'After the First Death (1979) is a suspense novel for young adults by American author Robert Cormier. The focus is on the complex relationships that develop between the various characters. The novel takes the name from the poem, "A Refusal to Mourn the Death, by Fire, of a Child in London" by Dylan Thomas. It originates from the last line: "After the first death, there is no other."\n\n\n== Synopsis ==\nAfter the First Death describes the terrorist hijacking of a summer camp bus full of children. The main characters include Kate, a high school student driving the bus, Miro, one of the terrorists, and Ben, the son of a general holding a senior position in "Inner Delta"; a government anti-terrorism organisation. The story is mostly written from the points-of-view of Kate, Miro, and Ben, switching back and forth, and brief sections are told from the point of view of some other characters.\nKate is driving the bus when it is hijacked by four terrorists, Miro, Artkin, Antibbe and Stroll. Th

In [4]:
texts  = pages#[0:50] #Use this to test on fewer pages

## Test Out Spacy's Pipe Feature
This dataset isn't a great example of the pipe feature because it only has 700 documents. I'll run some more tests on more documents. Roughly, you can see that there isn't always a speed up with more threads (fastest in this test was 4 threads). I suspect this could change with much larger documents, but see the tests and the time below.

In [39]:
%%time

toks = []
for doc in nlp.pipe(texts, n_threads=16, batch_size=50):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 47s


In [40]:
%%time
toks = []
for doc in nlp.pipe(texts, n_threads=16, batch_size=100):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 20s


In [41]:
%%time
toks = []
for doc in nlp.pipe(texts, n_threads=16, batch_size=1000):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 26s


In [42]:
%%time
toks = []
for doc in nlp.pipe(texts, n_threads=16, batch_size=10):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 38s


In [43]:
%%time
toks = []
for doc in nlp.pipe(texts, n_threads=4, batch_size=1000):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 17s


In [44]:
%%time
toks = []
for doc in nlp.pipe(texts, n_threads=2, batch_size=1000):
    toks.append([tok.lemma_ for tok in doc if tok.pos_ == 'NOUN'])

Wall time: 8min 32s


## Parse All Documents
This step gathers a list of lists corresponding to the nouns and people that are in each document. Here I used the tag to find our nouns instead of the POS because the tag is slightly more specific. In some cases 'what' and 'who' are considered nouns and this eliminates them

In [5]:
%%time
nounLists = []
peopleLists = []
for doc in nlp.pipe(texts, n_threads=16, batch_size=1000):
    nounLists.append([tok.lemma_.lower() for tok in doc if tok.tag_ in ['NN','NNS']])
    peopleLists.append([ent.merge().text.lower() for ent in doc.ents if ent.label_ == 'PERSON'])

Wall time: 9min 1s


Collapse list of lists into one list:

In [6]:
%%time
#Apparently this is the fastest way to flatten a list of lists. https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
nounList = list(itertools.chain.from_iterable(nounLists))
peopleList = sum(peopleLists,[])

Wall time: 248 ms


## Remove Stop Words
In this case, there are a bunch of nouns that we probably don't want to consider, and many of them are the most popular. We'll use a Counter to count the words and then peek at the top 10 to see if they should be removed.

In [7]:

count = Counter(nounList)
series = pd.Series(count, name='count')
stopList = list(series.sort_values(ascending=False).head(10).index)
print(stopList)

['book', 'child', 'year', 'story', 'work', 'time', '–', '=', 'century', 'series']


In [8]:
stoppedNouns = [word for word in nounList if word not in stopList]
print(f"before stopping: {len(nounList)}, after: {len(stoppedNouns)}")

before stopping: 477525, after: 435139


## Convert to Dataframes For 'tidy' Simulation
This allows us to leverage the parralelization built into pandas to quickly do our simulation.

In [9]:
thingsDF  = pd.DataFrame({'thing': stoppedNouns})

In [10]:
thingsDF.head()

Unnamed: 0,thing
0,author
1,teddy
2,bear
3,poem
4,writer


In [11]:
peopleDF = pd.DataFrame({'name': peopleList})

For this excercise we only want to consider english words and names. The following removes punctuation, then identifies the rows that contain non engish characters and removes them

In [12]:
thingsDF = thingsDF[~(thingsDF.thing.str.replace('/p{P}+','',regex = True).str.contains('[^a-zA-Z]', regex = True))]
peopleDF = peopleDF[~(peopleDF.name.str.replace('/p{P}+','',regex = True).str.contains('[^a-zA-Z]', regex = True))]

In [65]:
thingsDF[~(thingsDF.thing.str.replace('/p{P}+','',regex = True).str.contains('[^a-zA-Z]', regex = True))]

Unnamed: 0,thing
0,author
1,teddy
2,bear
3,poem
4,writer
5,playwright
6,success
7,captain
8,parent
9,school


## Simulate
First draw randomly with replacement from people and things. First draw from people to figure out friends name, then from things twice.

In [24]:
numSamples = 1000000
sampledPeople = peopleDF.sample(n= numSamples, replace = True).reset_index(drop=True)
sampledThings      = thingsDF.sample(n = numSamples, replace = True,).rename(columns ={'thing':'thing1'}).reset_index(drop=True)
sampledThingsAgain = thingsDF.sample(n = numSamples, replace = True).rename(columns ={'thing':'thing2'}).reset_index(drop=True)
fullSamples = pd.concat([sampledPeople,sampledThings,sampledThingsAgain], axis = 1)

In [25]:
fullSamples[['name','thing1','thing2']].head(5)

Unnamed: 0,name,thing1,thing2
0,sonderbundskrieg,interplay,family
1,schelde,matter,mother
2,joshua,man,classic
3,gandhi,edge,wimp
4,lewis,character,family


In [27]:
fullSamples.shape

(1000000, 3)

Get starting letter, check if its the same accross all 3 columns

In [30]:
fullSamples['first_name'] = fullSamples.name.str[:1].str.lower()
fullSamples['first_thing1'] = fullSamples.thing1.str[:1].str.lower()
fullSamples['first_thing2'] = fullSamples.thing2.str[:1].str.lower()
fullSamples['isMagicMatch'] = ((fullSamples.first_name == fullSamples.first_thing1) & (fullSamples.first_name == fullSamples.first_thing2))
fullSamples['isPartialMatch'] = ((fullSamples.first_name == fullSamples.first_thing1))

Take the mean of the isMagicMatch column, change it to percentage

In [34]:
magicPercentage  = np.mean(fullSamples.isMagicMatch)*100
print(f"Our friend and our two objects start witht the same letter {magicPercentage.round(3)}% of the time, which is {magicPercentage.round(3)*10} in 1000 simulations...")
magicPercentage  = np.mean(fullSamples[fullSamples['first_name']=='m'].isMagicMatch)*100
print(f"Our friend is named Morgan and our two objects start with an M {magicPercentage.round(3)}% of the time, which is {magicPercentage.round(3)*10} in 1000 simulations...")
magicPercentage  = np.mean(fullSamples.isPartialMatch)*100
print(f"Our friend and our first object starts with the same letter {magicPercentage.round(3)}% of the time, which is {magicPercentage.round(3)*10} in 1000 simulations...")
1/26**3*100

Our friend and our two objects start witht the same letter 0.392% of the time, which is 3.92 in 1000 simulations...
Our friend is named Morgan and our two objects start with an M 0.321% of the time, which is 3.21 in 1000 simulations...
Our friend and our first object starts with the same letter 5.269% of the time, which is 52.69 in 1000 simulations...


0.005689576695493855

In [35]:
fullSamples[fullSamples.isMagicMatch][['name','thing1','thing2']].head()

Unnamed: 0,name,thing1,thing2
526,penguin,party,person
1367,page,production,protagonist
1374,clay,chorister,capacity
1669,pinocchio,printing,principle
1773,elementary,education,ed


# End