# Measures and Windows

Madhu Sivaraj

For a given word:
- Calculate Measures (Occurrences, Word Frequency, Contextual Diversity, Burstingness)
- Obtain Windows

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import string
import collections
from collections import Counter
import matplotlib.pyplot as plt
import statistics
import math
import re
import os
from os import listdir
from os.path import isfile, join
from scipy.optimize import curve_fit

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package punkt to /Users/madhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/madhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Load Harry Potter Books

In [2]:
hp_texts = pd.DataFrame(columns = ['Book No.', 'Chapter No.', 'Text', 'Word Count'])
book_dir = "../Preprocessing/"

In [3]:
def read_chapter(path):  
    with open(path, "r", encoding ="utf8") as current_file:
        text = current_file.read()
        text = text.replace("\n", " ").replace("\r", "")
    return text

for i in range(1,8):
    filepath = book_dir + "HPBook" + str(i) +"/"
    chapters = sorted([f for f in os.listdir(filepath)])
    for c in range(len(chapters)):
        text = read_chapter(filepath+chapters[c])
        wc = len(text.split())
        new_row = pd.DataFrame([[i, c+1, text, wc]], columns = ['Book No.', 'Chapter No.', 'Text', 'Word Count'])
        hp_texts = pd.concat([hp_texts, new_row], ignore_index=True)

In [4]:
hp_texts

Unnamed: 0,Book No.,Chapter No.,Text,Word Count
0,1,1,THE BOY WHO LIVED Mr. and Mrs. Dursley o...,4723
1,1,2,THE VANISHING GLASS Nearly ten years ...,3502
2,1,3,THE LETTERS FROM NO ONE The escape of...,3921
3,1,4,THE KEEPER OF THE KEYS BOOM. They kno...,3809
4,1,5,DIAGON ALLEY Harry woke early the nex...,6713
...,...,...,...,...
193,7,32,THE ELDER WAND The world had ended so...,5677
194,7,33,THE PRINCES TALE Harry remained kneel...,8460
195,7,34,THE FOREST AGAIN Finally the truth. L...,3950
196,7,35,KINGS CROSS He lay facedown listening...,5098


## 3. Load Harry Potter Domain Words

In [5]:
def load_domain_words(sheetname):
    hp_domain_file = "../../Data/HP_word_information/HP_critical_specific_words.xlsx"
    df = pd.read_excel(hp_domain_file, sheet_name=sheetname)
    return df

In [6]:
domain_words = []
exp1 = load_domain_words('HP1-Criticalwords')
for ind, row in exp1.iterrows():
    word0 = row["Supported"]
    word1 = row["Unsupported"]
    if word0 not in domain_words:
        domain_words.append(word0)
    if word1 not in domain_words:
        domain_words.append(word1)
exp2 = load_domain_words('HP2-Criticalwords')
for ind, row in exp2.iterrows():
    word0 = row["Critical word GREEN=search on singular"]
    if word0 not in domain_words:
        domain_words.append(word0)
exp3 = load_domain_words('HP3-Criticalwords')
for ind, row in exp3.iterrows():
    word0 = row["Supported"]
    word1 = row["Related"]
    word2 = row["Unrelated"]
    if word0 not in domain_words:
        domain_words.append(word0)
    if word1 not in domain_words:
        domain_words.append(word1)
    if word2 not in domain_words:
        domain_words.append(word2)
exp3x = load_domain_words('HP3_HPworld-Criticalwords-only')
for ind, row in exp3x.iterrows():
    word0 = row["Word"]
    word1 = row["Alternate form"]
    if word0 not in domain_words:
        domain_words.append(word0)
    if word1 not in domain_words:
        domain_words.append(word1)
exp4 = load_domain_words('Additional HP words')
for ind, row in exp4.iterrows():
    word0 = row["Word"]
    if word0 not in domain_words:
        domain_words.append(word0)

## 4. Measures 

Occurrences, Word Frequency, Contextual Diversity, Burstingness

### 4.1 Obtain Domain Word Occurrences

Dictionary of all the books and chapters in which a word appears

In [7]:
def getWordOccurrence(word):
    occurrences = collections.defaultdict(list)
    
    def find_word(w, s):
        return Counter(w.lower() for w in re.findall(r"\w+", s))

    for index, row in hp_texts.iterrows():
        book, chapter, text = row["Book No."], row["Chapter No."], row["Text"]
        wf = dict(find_word(word, row["Text"]))
        lcw = str(word).lower().replace('.','')
        if lcw in wf:
            occurrences[book].append((chapter, wf[lcw]))
    return dict(occurrences)

def getWordCountByBook(dictionary):
    word_count_book = {}

    for book, value in dictionary.items():
        wc_book = 0
        for chapter in value:
            wc_book += chapter[1]
        word_count_book[book] = wc_book
    return word_count_book

In [8]:
occurrences = []
for word in domain_words:
    occurrences.append(getWordOccurrence(word))
    
hp_domain_info = pd.DataFrame(domain_words, columns=['Word'])
hp_domain_info['Occurrences'] = occurrences

In [9]:
hp_domain_info = hp_domain_info[hp_domain_info["Occurrences"] != {}].reset_index()

In [10]:
word_count_by_book = []
for index, row in hp_domain_info.iterrows():
    word, occurrence = row["Word"], row["Occurrences"]
    word_count_by_book.append(getWordCountByBook(occurrence))

hp_domain_info['Word Count By Book'] = word_count_by_book

In [11]:
hp_domain_info

Unnamed: 0,index,Word,Occurrences,Word Count By Book
0,0,rat,"{1: [(2, 1), (6, 5)], 2: [(3, 1), (5, 1), (8, ...","{1: 6, 2: 6, 3: 49, 4: 3, 5: 4}"
1,1,dog,"{1: [(1, 1), (3, 1), (5, 2), (9, 4), (10, 3), ...","{1: 22, 2: 4, 3: 41, 4: 20, 5: 18, 6: 4, 7: 2}"
2,2,frog,"{1: [(4, 1), (6, 2), (7, 1), (12, 2), (13, 2),...","{1: 9, 2: 3, 3: 3, 4: 9, 5: 20, 6: 1, 7: 2}"
3,3,monkey,"{5: [(35, 1)], 6: [(10, 1)], 7: [(23, 1)]}","{5: 1, 6: 1, 7: 1}"
4,4,Bus,"{1: [(2, 1)], 2: [(4, 1)], 3: [(3, 22), (4, 1)...","{1: 1, 2: 1, 3: 26, 4: 1, 5: 20, 6: 3, 7: 2}"
...,...,...,...,...
495,536,jinxes,"{3: [(11, 1), (12, 1)], 4: [(37, 1)], 5: [(5, ...","{3: 2, 4: 1, 5: 13, 6: 11, 7: 3}"
496,542,boarhound,"{1: [(8, 1), (14, 1)], 2: [(7, 1), (14, 2), (1...","{1: 2, 2: 4, 3: 4, 4: 1, 6: 1, 7: 1}"
497,543,spider,"{1: [(2, 1)], 2: [(9, 1), (15, 9)], 3: [(1, 1)...","{1: 1, 2: 10, 3: 5, 4: 30, 5: 2, 6: 9, 7: 9}"
498,544,N.E.W.T.s,"{3: [(4, 1)], 5: [(19, 1)]}","{3: 1, 5: 1}"


### 4.2 Identify "Book of Acquisition"

The first book in which a given book appears

Like "age of acquisition"

In [12]:
def getBookOfAcquisition(word):
    # input: word (string)
    # output: first_appearance - book #, chapter #)
    first_appearance = (0,0)
    occurrences = getWordOccurrence(word)
    if len(occurrences.keys()) > 0:
        first_appearance = (list(occurrences.items())[0][0], 
                            list(occurrences.items())[0][1][0][0])
    return first_appearance

In [13]:
hp_domain_info['Book of Acquisition'] = None

for index, row in hp_domain_info.iterrows():
    word = row["Word"]
    first_appearance = getBookOfAcquisition(word)
    if first_appearance != (0,0):
        first_book, first_chapter = first_appearance[0], first_appearance[1] 
        hp_domain_info.at[index, 'Book of Acquisition'] = first_book

In [14]:
hp_domain_info

Unnamed: 0,index,Word,Occurrences,Word Count By Book,Book of Acquisition
0,0,rat,"{1: [(2, 1), (6, 5)], 2: [(3, 1), (5, 1), (8, ...","{1: 6, 2: 6, 3: 49, 4: 3, 5: 4}",1
1,1,dog,"{1: [(1, 1), (3, 1), (5, 2), (9, 4), (10, 3), ...","{1: 22, 2: 4, 3: 41, 4: 20, 5: 18, 6: 4, 7: 2}",1
2,2,frog,"{1: [(4, 1), (6, 2), (7, 1), (12, 2), (13, 2),...","{1: 9, 2: 3, 3: 3, 4: 9, 5: 20, 6: 1, 7: 2}",1
3,3,monkey,"{5: [(35, 1)], 6: [(10, 1)], 7: [(23, 1)]}","{5: 1, 6: 1, 7: 1}",5
4,4,Bus,"{1: [(2, 1)], 2: [(4, 1)], 3: [(3, 22), (4, 1)...","{1: 1, 2: 1, 3: 26, 4: 1, 5: 20, 6: 3, 7: 2}",1
...,...,...,...,...,...
495,536,jinxes,"{3: [(11, 1), (12, 1)], 4: [(37, 1)], 5: [(5, ...","{3: 2, 4: 1, 5: 13, 6: 11, 7: 3}",3
496,542,boarhound,"{1: [(8, 1), (14, 1)], 2: [(7, 1), (14, 2), (1...","{1: 2, 2: 4, 3: 4, 4: 1, 6: 1, 7: 1}",1
497,543,spider,"{1: [(2, 1)], 2: [(9, 1), (15, 9)], 3: [(1, 1)...","{1: 1, 2: 10, 3: 5, 4: 30, 5: 2, 6: 9, 7: 9}",1
498,544,N.E.W.T.s,"{3: [(4, 1)], 5: [(19, 1)]}","{3: 1, 5: 1}",3


### 4.3 Measure Word Frequency and Log10 WF

The number of times a word appears in a corpus over the total number of words in the corpus

“Word frequency” (WF) effect: Higher frequency words are recognized more quickly than lower frequency words.

In [15]:
word_count = []
for index, row in hp_texts.iterrows():
    chapter = list(filter(None,hp_texts.iloc[index][2].strip().split(" ")))
    word_count.append(len(chapter))
    
total_word_count = sum(word_count)
total_word_count

1118639

In [16]:
def calculateWF(unigram_occurrences, all_word_occurrences):
    return unigram_occurrences/all_word_occurrences

def calculateLog10WF(unigram_occurrences, all_word_occurrences):
    return math.log10(unigram_occurrences/all_word_occurrences)

In [17]:
hp_domain_info['WF'] = None
hp_domain_info['Lg10WF'] = None

for index, row in hp_domain_info.iterrows():
    word, occurrence = row["Word"], row["Word Count By Book"]
    word_count = sum(occurrence.values())
    wf = calculateWF(word_count, total_word_count)
    hp_domain_info.loc[index, 'WF'] = wf
    if word_count != 0:
        log10wf = calculateLog10WF(sum(occurrence.values()), total_word_count)
        hp_domain_info.loc[index, 'Lg10WF'] = log10wf
    else:
        hp_domain_info.loc[index, 'Lg10WF'] = np.nan

#### SUBTLEX WF

In [29]:
df_subtlex = pd.read_csv("../../Data/WF_databases/SUBTLEXusfrequencyabove1.csv")
df_subtlex.rename(columns={"Lg10WF": "Lg10WF (SUBTLEX)"}, inplace=True)
df_subtlex

Unnamed: 0,Word,FREQcount,CDcount,FREQlow,Cdlow,SUBTLWF,Lg10WF (SUBTLEX),SUBTLCD,Lg10CD
0,the,1501908,8388,1339811,8388,29449.18,6.1766,100.00,3.9237
1,to,1156570,8383,1138435,8380,22677.84,6.0632,99.94,3.9235
2,a,1041179,8382,976941,8380,20415.27,6.0175,99.93,3.9234
3,you,2134713,8381,1595028,8376,41857.12,6.3293,99.92,3.9233
4,and,682780,8379,515365,8374,13387.84,5.8343,99.89,3.9232
...,...,...,...,...,...,...,...,...,...
60379,zionism,2,1,2,1,0.04,0.4771,0.01,0.3010
60380,Zionists,2,1,0,0,0.04,0.4771,0.01,0.3010
60381,zloty,2,1,2,1,0.04,0.4771,0.01,0.3010
60382,zoon,6,1,6,1,0.12,0.8451,0.01,0.3010


In [30]:
hp_domain_info = pd.merge(hp_domain_info, 
                     df_subtlex[['Word', 'Lg10WF (SUBTLEX)']], 
                     on="Word")

In [31]:
hp_domain_info

Unnamed: 0,index,Word,Occurrences,Word Count By Book,Book of Acquisition,WF,Lg10WF,Lg10WF (SUBTLEX)
0,0,rat,"{1: [(2, 1), (6, 5)], 2: [(3, 1), (5, 1), (8, ...","{1: 6, 2: 6, 3: 49, 4: 3, 5: 4}",1,0.000061,-4.216181,3.2212
1,1,dog,"{1: [(1, 1), (3, 1), (5, 2), (9, 4), (10, 3), ...","{1: 22, 2: 4, 3: 41, 4: 20, 5: 18, 6: 4, 7: 2}",1,0.000099,-4.003367,3.9928
2,2,frog,"{1: [(4, 1), (6, 2), (7, 1), (12, 2), (13, 2),...","{1: 9, 2: 3, 3: 3, 4: 9, 5: 20, 6: 1, 7: 2}",1,0.000042,-4.376592,2.7810
3,3,monkey,"{5: [(35, 1)], 6: [(10, 1)], 7: [(23, 1)]}","{5: 1, 6: 1, 7: 1}",5,0.000003,-5.571569,3.2330
4,6,motorcycle,"{1: [(1, 5), (2, 3)], 5: [(7, 1)], 7: [(4, 1),...","{1: 8, 5: 1, 7: 2}",1,0.00001,-5.007297,2.6590
...,...,...,...,...,...,...,...,...
162,465,Harry,"{1: [(1, 19), (2, 73), (3, 64), (4, 45), (5, 1...","{1: 1213, 2: 1513, 3: 1868, 4: 2929, 5: 3731, ...",1,0.014981,-1.824468,3.6226
163,475,Avis,"{4: [(18, 1)]}",{4: 1},4,0.000001,-6.04869,1.2553
164,535,hexes,"{4: [(20, 1), (26, 1), (29, 3), (31, 4)], 5: [...","{4: 9, 5: 5, 6: 5}",4,0.000017,-4.769936,0.7782
165,536,jinxes,"{3: [(11, 1), (12, 1)], 4: [(37, 1)], 5: [(5, ...","{3: 2, 4: 1, 5: 13, 6: 11, 7: 3}",3,0.000027,-4.571569,0.7782


### 4.4 Measure Contextual Diversity

Contextual diversity: The number of passages (chapters) in a corpus that contain a word over the number of total passages in a corpus

A word’s contextual diversity is the number of different contexts in which it appears.

In [32]:
# Find total number of passages (chapters) in HP series
total_passage_count = len(hp_texts["Chapter No."].tolist())

In [33]:
def calculateCD(occurrence_count, total_passage_count):
    # occurrence_count = number of passages (chapters) a given word appears in
    # total_passage_count = number of total passages (chapters)
    cd = float(occurrence_count/total_passage_count)
    return cd

def calculateLog10CD(occurrence_count, total_passage_count):
    cd = float(occurrence_count/total_passage_count)
    return math.log10(cd)

In [34]:
hp_domain_info['CD'] = None
hp_domain_info['Lg10CD'] = None

for index, row in hp_domain_info.iterrows():
    occurrence = row["Occurrences"]
    passage_count = 0
    for key, val in occurrence.items():
        passage_count += len(val)
    cd = calculateCD(passage_count, total_passage_count)
    hp_domain_info.at[index, 'CD'] = cd
    if cd != 0:
        log10cd = calculateLog10CD(passage_count, total_passage_count)
        hp_domain_info.at[index, 'Lg10CD'] = log10cd
    else:
        hp_domain_info.at[index, 'Lg10CD'] = np.nan

In [35]:
hp_domain_info

Unnamed: 0,index,Word,Occurrences,Word Count By Book,Book of Acquisition,WF,Lg10WF,Lg10WF (SUBTLEX),CD,Lg10CD
0,0,rat,"{1: [(2, 1), (6, 5)], 2: [(3, 1), (5, 1), (8, ...","{1: 6, 2: 6, 3: 49, 4: 3, 5: 4}",1,0.000061,-4.216181,3.2212,0.121212,-0.916454
1,1,dog,"{1: [(1, 1), (3, 1), (5, 2), (9, 4), (10, 3), ...","{1: 22, 2: 4, 3: 41, 4: 20, 5: 18, 6: 4, 7: 2}",1,0.000099,-4.003367,3.9928,0.242424,-0.615424
2,2,frog,"{1: [(4, 1), (6, 2), (7, 1), (12, 2), (13, 2),...","{1: 9, 2: 3, 3: 3, 4: 9, 5: 20, 6: 1, 7: 2}",1,0.000042,-4.376592,2.7810,0.146465,-0.834267
3,3,monkey,"{5: [(35, 1)], 6: [(10, 1)], 7: [(23, 1)]}","{5: 1, 6: 1, 7: 1}",5,0.000003,-5.571569,3.2330,0.015152,-1.819544
4,6,motorcycle,"{1: [(1, 5), (2, 3)], 5: [(7, 1)], 7: [(4, 1),...","{1: 8, 5: 1, 7: 2}",1,0.00001,-5.007297,2.6590,0.025253,-1.597695
...,...,...,...,...,...,...,...,...,...,...
162,465,Harry,"{1: [(1, 19), (2, 73), (3, 64), (4, 45), (5, 1...","{1: 1213, 2: 1513, 3: 1868, 4: 2929, 5: 3731, ...",1,0.014981,-1.824468,3.6226,1.0,0.0
163,475,Avis,"{4: [(18, 1)]}",{4: 1},4,0.000001,-6.04869,1.2553,0.005051,-2.296665
164,535,hexes,"{4: [(20, 1), (26, 1), (29, 3), (31, 4)], 5: [...","{4: 9, 5: 5, 6: 5}",4,0.000017,-4.769936,0.7782,0.070707,-1.150537
165,536,jinxes,"{3: [(11, 1), (12, 1)], 4: [(37, 1)], 5: [(5, ...","{3: 2, 4: 1, 5: 13, 6: 11, 7: 3}",3,0.000027,-4.571569,0.7782,0.116162,-0.934937


### 4.5 Measure Burstingness

Burstingness: average number of words between two occurrences

In [36]:
def get_chapter_word_count(book, chapter):
    chapter_wc = hp_texts[(hp_texts["Book No."]==book) & (hp_texts["Chapter No."]==chapter)]["Word Count"]
    return int(chapter_wc)

def get_book_word_count(book):
    book_wc = 0
    chapters_in_book = [i for i in hp_texts[(hp_texts["Book No."]==book)]["Chapter No."]]
    for chapter in chapters_in_book:
        book_wc += int(hp_texts[(hp_texts["Book No."]==book) & (hp_texts["Chapter No."]==chapter)]["Word Count"])
    return book_wc

In [37]:
def find_locations(word, text):
    text_dict = text.split()
    indices = [i for i, x in enumerate(text_dict) if x==word or x==word.lower() or x==word.capitalize()]
    return indices

def find_all_word_locations(word, hp_texts=hp_texts):
    word_locations = []
    for index, row in hp_texts.iterrows():
        chapter_text = hp_texts.at[index, "Text"]
        locations = find_locations(word, chapter_text)
        if locations != []:
            word_locations.append((hp_texts.at[index, "Book No."], 
                                   hp_texts.at[index, "Chapter No."],
                                   locations))
    return word_locations

In [38]:
def findSpacingBetweenWords(word):
    def spaceBetweenWords(prev, curr):
        prev_book, prev_chapter, prev_idx = prev[0], prev[1], prev[2]
        curr_book, curr_chapter, curr_idx = curr[0], curr[1], curr[2]
        space = 0
        if prev_book == curr_book and prev_chapter == curr_chapter:
            return curr_idx - prev_idx - 1
        elif prev_book == curr_book and prev_chapter < curr_chapter:
            space += get_chapter_word_count(curr_book, prev_chapter) - prev_idx # words left over in prev chapter
            for chapter in range(prev_chapter+1, curr_chapter):
                space += get_chapter_word_count(curr_book, chapter)
            return space + curr_idx - 1
        elif prev_book < curr_book:
            space += get_chapter_word_count(prev_book, prev_chapter) - prev_idx  # finish prev chapter of prev book
            # words left over in prev book (prev_book+1 to end of prev_book)
            chapters_left_prev = [i for i in hp_texts[(hp_texts["Book No."]==prev_book)]["Chapter No."]]
            for chapter in range(prev_chapter+1, chapters_left_prev[-1]+1):
                space += get_chapter_word_count(prev_book, chapter)
            # words in books in between
            books_inbetween = list(range(prev_book+1,curr_book))
            for book in books_inbetween:
                space += get_book_word_count(book)
            # words in curr_book, up to curr_chapter
            for chap in range(1, curr_chapter):
                space += get_chapter_word_count(curr_book, chap)
            # words in curr_chapter, up to curr_idx
            return space + curr_idx - 1
        return space

    locations = find_all_word_locations(word)
    expanded_locations = []
    for book, chapter, loc in locations:
        for l in loc:
            expanded_locations.append((book, chapter, l))
    if len(expanded_locations) == 0:
        print(f"The word '{word}' never appears in the HP Series (Books 1-7).")
        return float(np.NaN)
    elif len(expanded_locations) == 1:
        print(f"The word '{word}' appears once in the HP Series in Book {expanded_locations[0][0]}, Chapter {expanded_locations[0][1]}, Position {expanded_locations[0][2]}.")
        return float(np.NaN)
    spacing = []
    prev = expanded_locations[0][0], expanded_locations[0][1], expanded_locations[0][2]
    for book, chapter, loc in expanded_locations[1:]:
        curr = book, chapter, loc
        spacing.append(spaceBetweenWords(prev, curr))
        prev = curr
    return spacing

In [39]:
def calculateAverageBurstingness(word):
    spacing = findSpacingBetweenWords(word)
    if np.isnan(spacing).all():
        return float(spacing), float(spacing)
    burstingness = sum(spacing)/len(spacing)
    log10burstingness = math.log10(burstingness)
    return burstingness, log10burstingness

In [40]:
hp_domain_info['Burstingness'] = None
hp_domain_info['Lg10B'] = None

for index, row in hp_domain_info.iterrows():
    word = row["Word"]
    burstingness, log10burstingness = calculateAverageBurstingness(word)
    hp_domain_info.at[index, 'Burstingness'] = burstingness
    hp_domain_info.at[index, 'Lg10B'] = log10burstingness

The word 'tiger' appears once in the HP Series in Book 3, Chapter 4, Position 2929.
The word 'squirrel' appears once in the HP Series in Book 7, Chapter 18, Position 1400.
The word 'ogre' appears once in the HP Series in Book 3, Chapter 8, Position 4314.
The word 'Avis' appears once in the HP Series in Book 4, Chapter 18, Position 6021.


### Export Domain Info DF to CSV

In [41]:
hp_domain_info

Unnamed: 0,index,Word,Occurrences,Word Count By Book,Book of Acquisition,WF,Lg10WF,Lg10WF (SUBTLEX),CD,Lg10CD,Burstingness,Lg10B
0,0,rat,"{1: [(2, 1), (6, 5)], 2: [(3, 1), (5, 1), (8, ...","{1: 6, 2: 6, 3: 49, 4: 3, 5: 4}",1,0.000061,-4.216181,3.2212,0.121212,-0.916454,11119.966667,4.046103
1,1,dog,"{1: [(1, 1), (3, 1), (5, 2), (9, 4), (10, 3), ...","{1: 22, 2: 4, 3: 41, 4: 20, 5: 18, 6: 4, 7: 2}",1,0.000099,-4.003367,3.9928,0.242424,-0.615424,11430.11828,4.058051
2,2,frog,"{1: [(4, 1), (6, 2), (7, 1), (12, 2), (13, 2),...","{1: 9, 2: 3, 3: 3, 4: 9, 5: 20, 6: 1, 7: 2}",1,0.000042,-4.376592,2.7810,0.146465,-0.834267,25834.361111,4.412198
3,3,monkey,"{5: [(35, 1)], 6: [(10, 1)], 7: [(23, 1)]}","{5: 1, 6: 1, 7: 1}",5,0.000003,-5.571569,3.2330,0.015152,-1.819544,321452.0,5.507116
4,6,motorcycle,"{1: [(1, 5), (2, 3)], 5: [(7, 1)], 7: [(4, 1),...","{1: 8, 5: 1, 7: 2}",1,0.00001,-5.007297,2.6590,0.025253,-1.597695,106621.888889,5.027846
...,...,...,...,...,...,...,...,...,...,...,...,...
162,465,Harry,"{1: [(1, 19), (2, 73), (3, 64), (4, 45), (5, 1...","{1: 1213, 2: 1513, 3: 1868, 4: 2929, 5: 3731, ...",1,0.014981,-1.824468,3.6226,1.0,0.0,73.7552,1.867793
163,475,Avis,"{4: [(18, 1)]}",{4: 1},4,0.000001,-6.04869,1.2553,0.005051,-2.296665,,
164,535,hexes,"{4: [(20, 1), (26, 1), (29, 3), (31, 4)], 5: [...","{4: 9, 5: 5, 6: 5}",4,0.000017,-4.769936,0.7782,0.070707,-1.150537,29811.611111,4.474385
165,536,jinxes,"{3: [(11, 1), (12, 1)], 4: [(37, 1)], 5: [(5, ...","{3: 2, 4: 1, 5: 13, 6: 11, 7: 3}",3,0.000027,-4.571569,0.7782,0.116162,-0.934937,32860.111111,4.516669


In [56]:
hp_domain_info.to_csv(r'../Results/20220319_DomainWords_Measures.csv', index = False)

# 5. Find Windows

In [44]:
def get_windows(word_location, text, position, window_size):
    # position (window): ["before", "after", "before_and_after"]
    text_dict = text.split()
    window_words = []
    if position == 'before':
        window_words = text_dict[word_location-window_size:word_location+1]
    elif position == 'after':
        window_words = text_dict[word_location:word_location+window_size+1]
    elif position == 'both':
        window_words = text_dict[word_location-window_size:word_location+window_size+1]
    window = ' '.join(window_words)
    return window

In [45]:
def print_windows(word, position, window_size):
    word_locations = find_all_word_locations(word, hp_texts)
    
    for item in word_locations:
        book, chapter, locations = item[0], item[1], item[2]
        for loc in locations:
            hp_idx = hp_texts[(hp_texts["Book No."]==book) & (hp_texts["Chapter No."]==chapter)].index[0]
            text = hp_texts.at[hp_idx, "Text"]
            print("Book", book,
                  "Chapter", str(chapter).zfill(2),
                  "Location", str(loc).zfill(4),
                  " | Window:", get_windows(loc, text, position, window_size))

In [46]:
print_windows(word="Wingardium", position="after", window_size=5)

Book 1 Chapter 10 Location 2226  | Window: Wingardium Leviosa he shouted waving his
Book 1 Chapter 10 Location 2277  | Window: Wingardium Leviosa Their feather rose off
Book 1 Chapter 10 Location 3524  | Window: Wingardium Leviosa The club flew suddenly
Book 5 Chapter 36 Location 0651  | Window: Wingardium Leviosa and they flew into
Book 7 Chapter 04 Location 4303  | Window: Wingardium Leviosa The sidecar rose like
Book 7 Chapter 32 Location 3621  | Window: Wingardium Leviosa The twig flew up


In [47]:
print_windows(word="avada", position="both", window_size=4)

Book 4 Chapter 14 Location 1698  | Window: Moody looking at her. Avada Kedavra Hermione whispered. Several
Book 4 Chapter 14 Location 1726  | Window: the last and worst. Avada Kedavra . . .
Book 4 Chapter 14 Location 1798  | Window: sudden thrill of foreboding. Avada Kedavra Moody roared. There
Book 4 Chapter 14 Location 2210  | Window: what Moody was saying. Avada Kedavras a curse that
Book 4 Chapter 14 Location 2319  | Window: those three curses — Avada Kedavra Imperius and Cruciatus
Book 4 Chapter 14 Location 2870  | Window: he When he did Avada Kedavra the way that
Book 4 Chapter 32 Location 0474  | Window: words to the night: Avada Kedavra A blast of
Book 4 Chapter 34 Location 0264  | Window: . . the unblockable Avada Kedavra curse — and
Book 4 Chapter 34 Location 1175  | Window: shouted Expelliarmus Voldemort cried Avada Kedavra A jet of
Book 5 Chapter 05 Location 5254  | Window: Something worse than the Avada Kedavra — Thats enough.
Book 5 Chapter 06 Location 0574  | Window: any

In [48]:
print_windows(word="Expecto", position="after", window_size=7)

Book 3 Chapter 12 Location 1309  | Window: Expecto Patronum Expecto Patronum Harry repeated under his
Book 3 Chapter 12 Location 1311  | Window: Expecto Patronum Harry repeated under his breath Expecto
Book 3 Chapter 12 Location 1318  | Window: Expecto Patronum. Concentrating hard on your happy memory
Book 3 Chapter 12 Location 1342  | Window: Expecto Patrono — no Patronum — sorry —
Book 3 Chapter 12 Location 1350  | Window: Expecto Patronum Expecto Patronum — Something whooshed suddenly
Book 3 Chapter 12 Location 1352  | Window: Expecto Patronum — Something whooshed suddenly out of
Book 3 Chapter 12 Location 1530  | Window: Expecto Patronum Harry yelled. Expecto Patronum Expecto —
Book 3 Chapter 12 Location 1534  | Window: Expecto Patronum Expecto — But the classroom and
Book 3 Chapter 12 Location 1536  | Window: Expecto — But the classroom and the dementor
Book 3 Chapter 12 Location 1917  | Window: Expecto Patronum Harry yelled. Expecto Patronum Expecto Pat
Book 3 Chapter 12 Location

In [49]:
print_windows(word="reparo", position="both", window_size=5)

Book 4 Chapter 11 Location 3044  | Window: pulled out her wand muttered Reparo and the glass shards flew
Book 5 Chapter 15 Location 6917  | Window: floor. Ron nodded and left. Reparo Harry muttered pointing his wand
Book 5 Chapter 26 Location 6954  | Window: swirling in its draining potion. Reparo hissed Snape and the jar
Book 5 Chapter 30 Location 0982  | Window: cup to crack into two. Reparo said Hermione quickly mending Rons
Book 6 Chapter 11 Location 0922  | Window: covers tapped each and said Reparo There sat the Princes copy
Book 6 Chapter 14 Location 0987  | Window: the bowl and shattered it. Reparo he said hastily poking the


In [50]:
print_windows(word="manor", position="before", window_size=5)

Book 2 Chapter 03 Location 1428  | Window: Malfoy strutting around a large manor
Book 2 Chapter 04 Location 2957  | Window: Ill expect you at the manor
Book 2 Chapter 12 Location 5010  | Window: Ministry of Magic raided our manor
Book 4 Chapter 01 Location 0056  | Window: its face. Once a fine-looking manor
Book 6 Chapter 10 Location 1636  | Window: opposite hillside was a handsome manor
Book 6 Chapter 10 Location 5461  | Window: Tom Riddle reappeared at the manor
Book 7 Chapter 01 Location 0287  | Window: with a snort. A handsome manor
Book 7 Chapter 23 Location 5363  | Window: in the cellar of Malfoy Manor
Book 7 Chapter 26 Location 2264  | Window: that the inhabitants of Malfoy Manor


In [51]:
print_windows(word="He-Who-Must-Not-Be-Named", position="after", window_size=5)

Book 1 Chapter 05 Location 6275  | Window: He-Who-Must-Not-Be-Named did great things — terrible
Book 2 Chapter 02 Location 0853  | Window: He-Who-Must-Not-Be-Named — Voldemort said Harry. Dobby
Book 2 Chapter 02 Location 1274  | Window: He-Who-Must-Not-Be-Named sir — But Dobbys eyes
Book 2 Chapter 02 Location 1375  | Window: He-Who-Must-Not-Be-Named at the height of his
Book 2 Chapter 06 Location 1431  | Window: He-Who-Must-Not-Be-Named He glanced at the lightning
Book 2 Chapter 10 Location 4576  | Window: He-Who-Must-Not-Be-Named was at the height of
Book 2 Chapter 18 Location 3147  | Window: He-Who-Must-Not-Be-Named remember Well — It was
Book 3 Chapter 16 Location 2792  | Window: He-Who-Must-Not-Be-Named My dear boy thats hardly
Book 3 Chapter 19 Location 2835  | Window: He-Who-Must-Not-Be-Named taught him a few tricks
Book 3 Chapter 19 Location 4818  | Window: He-Who-Must-Not-Be-Named forced me — DONT LIE
Book 4 Chapter 24 Location 1535  | Window: He-Who-Must-Not-Be-Named and were 

In [52]:
print_windows(word="Gryffindor", position="both", window_size=5)

Book 1 Chapter 06 Location 4668  | Window: and I hope Im in Gryffindor it sounds by far the
Book 1 Chapter 06 Location 4767  | Window: your brothers in asked Harry. Gryffindor said Ron. Gloom seemed to
Book 1 Chapter 07 Location 0254  | Window: The four Houses are called Gryffindor Hufflepuff Ravenclaw and Slytherin. Each
Book 1 Chapter 07 Location 1211  | Window: be. You might belong in Gryffindor Where dwell the brave at
Book 1 Chapter 07 Location 1589  | Window: Lavender became the first new Gryffindor and the table on the
Book 1 Chapter 07 Location 2139  | Window: and walked shakily toward the Gryffindor table. He was so relieved
Book 1 Chapter 07 Location 2347  | Window: Ron joined Harry at the Gryffindor table. Turpin Lisa became a
Book 1 Chapter 07 Location 2732  | Window: your service. Resident ghost of Gryffindor Tower. I know who you
Book 1 Chapter 07 Location 3937  | Window: bedtime. Off you trot The Gryffindor first years followed Percy through
Book 1 Chapter 07 Location 42

Book 2 Chapter 06 Location 0060  | Window: Ron sat down at the Gryffindor table next to Hermione who
Book 2 Chapter 06 Location 0760  | Window: McGonagall was moving along the Gryffindor table handing out course schedules.
Book 2 Chapter 06 Location 1608  | Window: state. Excellent. Ten points to Gryffindor said Professor Sprout. The Mandrake
Book 2 Chapter 06 Location 2773  | Window: tentative step forward. Im in Gryffindor too. Dyou think — would
Book 2 Chapter 06 Location 4026  | Window: excellent Take ten points for Gryffindor And so — to business
Book 2 Chapter 07 Location 0168  | Window: Oliver Wood Captain of the Gryffindor Quidditch team. Whassamatter said Harry
Book 2 Chapter 07 Location 0678  | Window: and George Weasley are the Gryffindor Beaters. And what are the
Book 2 Chapter 07 Location 0815  | Window: fifty points. And youre the Gryffindor Seeker arent you said Colin
Book 2 Chapter 07 Location 0904  | Window: stands. The rest of the Gryffindor team were already in the
B

Book 3 Chapter 15 Location 3587  | Window: waving scarlet flags with the Gryffindor lion upon them or brandishing
Book 3 Chapter 15 Location 3857  | Window: of the Snitch. And its Gryffindor in possession Alicia Spinnet of
Book 3 Chapter 15 Location 3863  | Window: in possession Alicia Spinnet of Gryffindor with the Quaffle heading straight
Book 3 Chapter 15 Location 3909  | Window: its caught by — Johnson Gryffindor back in possession come on
Book 3 Chapter 15 Location 4023  | Window: between them. Penalty shot to Gryffindor for an unprovoked attack on
Book 3 Chapter 15 Location 4108  | Window: hovering in front of the Gryffindor goalposts his jaw clenched. Course
Book 3 Chapter 15 Location 4179  | Window: Malfoy off the Snitch until Gryffindor was more than fifty points
Book 3 Chapter 15 Location 4187  | Window: than fifty points up — Gryffindor in possession no Slytherin in
Book 3 Chapter 15 Location 4197  | Window: in possession — no — Gryffindor back in possession and its
Book 3 C

Book 5 Chapter 14 Location 4556  | Window: set up a chant of Gryffindor are losers Gryffindor are losers
Book 5 Chapter 14 Location 4559  | Window: chant of Gryffindor are losers Gryffindor are losers but there was
Book 5 Chapter 14 Location 4757  | Window: the portrait hole into the Gryffindor common room. It was —
Book 5 Chapter 14 Location 4954  | Window: in getting the chant of Gryffindor are losers out of his
Book 5 Chapter 14 Location 5411  | Window: the scroll: To Ronald Weasley Gryffindor House Hogwarts. He looked up
Book 5 Chapter 15 Location 1237  | Window: sat down together at the Gryffindor table. Obviously Id have been
Book 5 Chapter 15 Location 3395  | Window: to take five points from Gryffindor House. There was an outbreak
Book 5 Chapter 15 Location 3652  | Window: as he arrived at the Gryffindor table for breakfast on Tuesday
Book 5 Chapter 15 Location 3693  | Window: Great Hall Five points from Gryffindor But Professor — hes gone
Book 5 Chapter 15 Location 3795  | Wind

Book 6 Chapter 18 Location 1581  | Window: an extra ten points to Gryffindor for sheer cheek Still chuckling
Book 6 Chapter 18 Location 3730  | Window: followed Harry back to the Gryffindor Tower at a run. They
Book 6 Chapter 19 Location 0983  | Window: have a grudge against the Gryffindor Quidditch team could they said
Book 6 Chapter 19 Location 2096  | Window: Peeves near the turning into Gryffindor Tower but he was streaking
Book 6 Chapter 19 Location 3979  | Window: was being quite rude about Gryffindor I expect he regrets that
Book 6 Chapter 19 Location 4365  | Window: Luna vaguely. Oh look The Gryffindor Keepers got hold of one
Book 6 Chapter 21 Location 3830  | Window: class. Another ten points from Gryffindor said Snape. I would expect
Book 6 Chapter 23 Location 3799  | Window: track down objects owned by Gryffindor or Ravenclaw. Four objects from
Book 6 Chapter 23 Location 3843  | Window: the only known relic of Gryffindor remains safe. Dumbledore pointed his
Book 6 Chapter 24

In [53]:
print_windows(word="Slytherin", position="both", window_size=5)

Book 1 Chapter 05 Location 4131  | Window: I know Ill be in Slytherin all our family have been
Book 1 Chapter 05 Location 4679  | Window: the rules. And what are Slytherin and Hufflepuff School Houses. Theres
Book 1 Chapter 05 Location 4707  | Window: Harry gloomily. Better Hufflepuff than Slytherin said Hagrid darkly. Theres not
Book 1 Chapter 07 Location 1271  | Window: their kind; Or perhaps in Slytherin Youll make your real friends
Book 1 Chapter 07 Location 1623  | Window: after all hed heard about Slytherin but he thought they looked
Book 1 Chapter 07 Location 2068  | Window: the stool and thought Not Slytherin not Slytherin. Not Slytherin eh
Book 1 Chapter 07 Location 2072  | Window: Not Slytherin not Slytherin. Not Slytherin eh said the small voice.
Book 1 Chapter 07 Location 2094  | Window: here in your head and Slytherin will help you on the
Book 1 Chapter 07 Location 2153  | Window: chosen and not put in Slytherin he hardly noticed that he
Book 1 Chapter 07 Location 2909  | 

Book 2 Chapter 17 Location 1117  | Window: She set the serpent of Slytherin on four Mudbloods and the
Book 2 Chapter 17 Location 2230  | Window: runs the blood of Salazar Slytherin himself through my mothers side
Book 2 Chapter 17 Location 3085  | Window: to Hogwarts since the great Slytherin himself. We even look something
Book 2 Chapter 17 Location 3152  | Window: Lord Voldemort Heir of Salazar Slytherin against famous Harry Potter and
Book 2 Chapter 17 Location 3204  | Window: into the stone face of Slytherin high above him in the
Book 2 Chapter 17 Location 3232  | Window: . . Speak to me Slytherin greatest of the Hogwarts Four.
Book 2 Chapter 18 Location 1504  | Window: last remaining descendant of Salazar Slytherin — can speak Parseltongue. Unless
Book 2 Chapter 18 Location 1560  | Window: So I should be in Slytherin Harry said looking desperately into
Book 2 Chapter 18 Location 1598  | Window: to have many qualities Salazar Slytherin prized in his hand-picked students.
Book 3 Cha

Book 6 Chapter 29 Location 3983  | Window: Houses — Slughorn can represent Slytherin — that I want to
Book 6 Chapter 30 Location 2002  | Window: unwonted venom. Over at the Slytherin table Crabbe and Goyle were
Book 6 Chapter 30 Location 2193  | Window: at the head of the Slytherin column wearing magnificent long emerald
Book 7 Chapter 07 Location 4969  | Window: could defeat the Heir of Slytherin Did he wish to give
Book 7 Chapter 10 Location 0737  | Window: all the rest of the Slytherin family. There were many pictures
Book 7 Chapter 10 Location 3069  | Window: to emphasize the opposite. The Slytherin colors of emerald and silver
Book 7 Chapter 16 Location 0942  | Window: He venerated Snape the first Slytherin headmaster since he himself had
Book 7 Chapter 23 Location 0952  | Window: were you in at Hogwarts Slytherin said Harry automatically. Funny ow
Book 7 Chapter 23 Location 1027  | Window: really ave caught a little Slytherin said Scabior. Good for you
Book 7 Chapter 29 Location 

In [54]:
print_windows(word="Ravenclaw", position="both", window_size=5)

Book 1 Chapter 06 Location 4685  | Window: in it but I suppose Ravenclaw wouldnt be too bad. .
Book 1 Chapter 06 Location 4797  | Window: Im not. I dont suppose Ravenclaw would be too bad but
Book 1 Chapter 07 Location 0256  | Window: Houses are called Gryffindor Hufflepuff Ravenclaw and Slytherin. Each House has
Book 1 Chapter 07 Location 1251  | Window: Or yet in wise old Ravenclaw If youve a ready mind
Book 1 Chapter 07 Location 1580  | Window: them. Brocklehurst Mandy went to Ravenclaw too but Brown Lavender became
Book 1 Chapter 07 Location 2353  | Window: table. Turpin Lisa became a Ravenclaw and then it was Rons
Book 1 Chapter 17 Location 3954  | Window: match we were steamrollered by Ravenclaw without you — but the
Book 1 Chapter 17 Location 4605  | Window: with three hundred and fifty-two; Ravenclaw has four hundred and twenty-six
Book 1 Chapter 17 Location 5064  | Window: storm of applause for even Ravenclaw and Hufflepuff were celebrating the
Book 2 Chapter 05 Location 3147 

In [55]:
print_windows(word="Hufflepuff", position="both", window_size=5)

Book 1 Chapter 05 Location 4141  | Window: been — imagine being in Hufflepuff I think Id leave wouldnt
Book 1 Chapter 05 Location 4681  | Window: And what are Slytherin and Hufflepuff School Houses. Theres four. Everyone
Book 1 Chapter 05 Location 4688  | Window: Houses. Theres four. Everyone says Hufflepuff are a lot o duffers
Book 1 Chapter 05 Location 4700  | Window: — I bet Im in Hufflepuff said Harry gloomily. Better Hufflepuff
Book 1 Chapter 05 Location 4705  | Window: Hufflepuff said Harry gloomily. Better Hufflepuff than Slytherin said Hagrid darkly.
Book 1 Chapter 07 Location 0255  | Window: four Houses are called Gryffindor Hufflepuff Ravenclaw and Slytherin. Each House
Book 1 Chapter 07 Location 0741  | Window: Hope to see you in Hufflepuff said the Friar. My old
Book 1 Chapter 07 Location 1230  | Window: apart; You might belong in Hufflepuff Where they are just and
Book 1 Chapter 07 Location 1521  | Window: to sit down at the Hufflepuff table. Harry saw the ghost
Book 1 Cha