In [None]:
import csv
import urllib.request
import json
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import quote
import os
import re

In [None]:
# Load the characters from our .csv file into a list of
# (name, parentage, house, occupation) tuples.
characters = []

# newline="" is the mode the csv module documents for files handed to csv.reader.
with open("HP_characters.csv", "r", encoding="utf8", newline="") as sent_file:
    # Keep the reader under its own name: rebinding `csv` would shadow the
    # imported module and make this cell fail when it is run a second time.
    reader = csv.reader(sent_file, delimiter=",")
    for row in reader:
        # Blank lines come back as empty lists; there is nothing to index in them.
        if not row:
            continue
        # Spaces become underscores so the name can be used as a wiki title / file name
        name = row[0].replace(' ', '_')
        # Parentage, House, and Occupation will be our attributes in the nodes
        parentage = row[1]
        house = row[2]
        occupation = row[3]
        characters.append((name, parentage, house, occupation))

In [None]:
### Optional cell: downloads the wikitext of every character into ./characters.
### It does not need to be re-run when that folder is already populated.
lengths = []

#https://www.reddit.com/r/learnpython/comments/muwu7v/scraping_fandomwiki_pages/
baseurl = "https://harrypotter.fandom.com/api.php?"
action = "action=query"
content = "prop=revisions&rvprop=content"
dataformat = "format=json"

# The files are written into this folder, so make sure it exists first.
os.makedirs("characters", exist_ok=True)

for character in characters:
    # Set up the query for the character. The title is percent-encoded so that
    # apostrophes, ampersands and non-ASCII letters cannot corrupt the URL.
    title = "titles=" + quote(character[0])
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    # The context manager closes the HTTP connection once the body is read.
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    lengths.append((character, len(wikitext)))

    try:
        jsonobj = json.loads(wikitext)
    except ValueError:
        print(f'Decoding JSON has failed for {character}, moving on...')
        # Without this, the code below would reuse the previous character's
        # JSON (or hit a NameError on the very first iteration).
        continue

    # Get the number for the article
    pages = jsonobj['query']['pages']
    num = next(iter(pages))
    if 'revisions' not in pages[num]:
        # The API returned a page entry without content for this title.
        print(f'No article content found for {character}, moving on...')
        continue
    # Get the wikitext
    wikitext = pages[num]['revisions'][0]['*']
    f_name = character[0] + '.txt'

    # Remove <ref> tags including content.
    # Self-closing tags go first and may not cross a '>', otherwise a
    # '<ref name="x" />' would be paired with a later '</ref>' and the article
    # text in between would be deleted. DOTALL lets a reference span several lines.
    clean_text = re.sub(r'<ref[^>]*/>', "", wikitext)
    clean_text = re.sub(r'<ref[^>]*>.*?</ref>', "", clean_text, flags=re.DOTALL)

    # Write to a file with that name
    with open("characters/" + f_name, 'w') as f:
        f.write(clean_text)

In [None]:
def find_aliases(text, alias_dict, name):
    """Register the aliases listed in a character's wiki infobox.

    The aliases are read from the first ``|alias = ...`` field in ``text``
    (everything up to the next ``|``, possibly spanning several lines, one
    alias per line). Each kept alias is cleaned, lower-cased and stored in
    ``alias_dict`` as ``alias -> name``.

    Parameters:
        text: wikitext of the character's article.
        alias_dict: dictionary updated in place; keys are lower-case aliases.
        name: character name the aliases belong to.

    Returns:
        None. ``alias_dict`` is left untouched when no alias field is found.

    An alias that is already a key of ``alias_dict`` is never overwritten;
    if it belongs to a different character the clash is printed.
    """
    # Same span as r'\|alias = (.(.|\s)*?)\|' (first character is not a
    # newline, then anything up to the next '|'), but DOTALL avoids the
    # backtracking-heavy (.|\s) alternation.
    match = re.search(r'\|alias = ([^\n].*?)\|', text, re.DOTALL)
    if match is None:
        return

    # Lines containing one of these are not real aliases: disguises, the
    # slur "Mudblood", and false names given to other people.
    skip_markers = ("disguise", "Mudblood", "the name he told", "the name she told")

    for alias in match.group(1).split("\n"):
        if any(marker in alias for marker in skip_markers):
            continue

        # Cut off "(by ...)" attributions and template calls, then strip wiki markup
        processed_alias = alias.split(" (by")[0].split("{{")[0]
        for markup in ('*', '[[', ']]', '"'):
            processed_alias = processed_alias.replace(markup, '')

        # Normalise before the lookup: keys are stored lower-case, so the
        # duplicate check has to use the lower-case form as well.
        key = processed_alias.strip().lower()

        # An empty key must never be stored: str.replace('', name) would
        # insert the name between every character of a chapter.
        if not key:
            continue

        owner = alias_dict.get(key)
        if owner is None:
            alias_dict[key] = name
        elif owner != name:
            print("Processed alias: " + processed_alias + ", belonging to: " + name)

In [None]:
# Build the alias -> character lookup from the downloaded character pages.
alias_dictionary = {}
# (first name, character) pairs whose first name was already taken by another character
not_added = []
# os.listdir returns entries in arbitrary order; sorting makes the winner of a
# first-name collision the same on every run and every machine.
for file_ in sorted(os.listdir("./characters")):
    # Only the '<name>.txt' character pages are relevant; anything else
    # would be mis-named by the [:-4] slice below.
    if not file_.endswith('.txt'):
        continue
    name = file_[:-4]
    with open("./characters/"+file_, 'r') as f:
        text = f.read()
    first_name = name.split("_")[0].lower()
    full_name = name.replace('_', ' ').lower()
    # The first character to claim a first name keeps it; later ones are recorded
    if first_name in alias_dictionary:
        not_added.append((first_name, name))
    else:
        alias_dictionary[first_name] = name
    alias_dictionary[full_name] = name
    # Add the aliases listed in the character's infobox
    find_aliases(text, alias_dictionary, name)
    


In [None]:
# Show, for every first-name collision, who currently owns the key and who lost it
for first_name, skipped_name in not_added:
    print("Dict has: " + alias_dictionary[first_name] + " instead of: " + skipped_name)
    

### Notes on aliases / first names that collide:
**The following characters only appear in the last chapter of B7:**
- James Potter II 
- Edward Lupin
- Rose Granger-Weasley
- Lily L. Potter

- Frank Bryce only appears in the first chapter of B4

For the above characters we will change the dictionary when processing the corresponding chapters.

**The following characters share a first name and appear in different books:**
- Marcus Flint (appears in B1, B2, B3) and Marcus Belby (appears in B6), equally important
- Graham Montague (in B3, B5) more important than Graham Pritchard (in B4)
- Frank Longbottom (in B5) more important than Frank Bryce (in B4)
- Avery II (in B4, B5) more important than Avery I (in B6)

**The following characters share a first name and appear in the same books:**
- Graham Montague (in B3, B5) more important than Graham Montague's father (in B5) and mother (in B5)
- Hermione Granger (in all books) more important than Hermione Granger's Father(in B2, B5) and Hermione Granger's Mother (in B2, B5)
- Dennis Creevey (in B4-B6) more important than Dennis (in B5)
- Ernest Macmillan (in B2, B4-B7) more important than Ernest Prang (in B3, B6)
- Fat Friar (in B1, B2, B5) and Fat Lady (in all books) equally important
- Mary Cattermole (in B7) equally important as Mary Macdonald (in B7)
- Evans sister's father (in B7) equally important as Evans sister's mother (in B7)

For those not in the same books, we will adjust the dictionary to fit each book. 
For those in the same books, we will have to find another solution and prioritize the most important characters. 

Mr. and Mrs. Dursley often appear in the text in exactly this form; it needs to be replaced with Vernon_Dursley and Petunia_Dursley.
"The Potters" should be replaced with James_Potter_I and Lily_J._Potter.



In [None]:
# Point ambiguous keys at the character that is most important or appears in most books
alias_dictionary.update({
    'james': 'James_Potter_I',
    'frank': 'Frank_Longbottom',
    'marcus': 'Marcus_Flint',
    'hermione': 'Hermione_Granger',
    'mr. dursley': 'Vernon_Dursley',
    'mrs. dursley': 'Petunia_Dursley',
    'mcgonagall': 'Minerva_McGonagall',
    'dumbledore': 'Albus_Dumbledore',
    'ernest': 'Ernest_Macmillan',
    'dennis': 'Dennis_Creevey',
})

# Drop first names that clash with other characters or ordinary words and for
# which we have no solution, e.g. 'rose' also occurs in 'the sun rose' and
# 'evans' also occurs in "evans sister's father".
# pop() with a default is a no-op for keys that are not present.
for clashing_key in ('rose', 'graham', 'avery', 'mary', 'evans'):
    alias_dictionary.pop(clashing_key, None)

In [None]:
# Remove generic words used as keys (no-op for keys that are not present)
for generic_word in ('mr', 'mrs', 'fat'):
    alias_dictionary.pop(generic_word, None)

# If a key is short, put white space around it to make sure it won't be found
# as a substring of another word, i.e. 'al' is a key but also appears in 'normal'.
# Iterate over a snapshot of the keys: popping and inserting entries while
# looping over alias_dictionary.items() itself can raise RuntimeError or
# skip / revisit keys.
for alias in list(alias_dictionary):
    # Keys that are already wrapped in spaces were padded by an earlier run of this cell
    already_padded = alias.startswith(' ') and alias.endswith(' ')
    if len(alias) < 4 and not already_padded:
        alias_dictionary[' ' + alias + ' '] = alias_dictionary.pop(alias)

In [None]:
# Inspect the final alias -> character mapping, ordered alphabetically by alias
sorted_aliases = sorted(alias_dictionary.items())
print(sorted_aliases)

In [None]:
# Replace names in chapters so they are called our character names and not aliases

# Aliases that must not be replaced in the main pass of a given book because
# they collide with other characters appearing in that book.
_DEFERRED_ALIASES = {
    'B2': ('hermione', 'hermione granger', 'dennis', 'graham montague'),
    'B5': ('hermione', 'hermione granger', 'dennis', 'graham montague'),
    'B6': ('ernest',),
}

# Replacements applied, in this order, after the main pass of a given book.
_DEFERRED_REPLACEMENTS = {
    'B2': (
        ('hermione granger', 'Hermione_Granger'),
        ('hermione', 'Hermione_Granger'),
    ),
    'B5': (
        ('hermione granger', 'Hermione_Granger'),
        ('hermione', 'Hermione_Granger'),
        ('dennis creevey', 'Dennis_Creevey'),
        ('dennis', 'Dennis'),
        ('graham montague', 'Graham_Montague'),
    ),
    'B6': (
        ('ernest', 'Ernest_Macmillan'),
    ),
}

# Characters that only appear in the last chapter of book 7 and whose first
# names collide with other characters.
_EPILOGUE_CHAPTER = 'B7_Ch37.txt'
_EPILOGUE_ALIASES = {
    'james': 'James_Potter_II',
    'edward': 'Edward_Lupin',
    'rose': 'Rose_Granger-Weasley',
    'lily': 'Lily_L._Potter',
}


def replace_aliases(directory, aliases=None):
    """Rewrite every chapter in ``directory`` with aliases replaced by character names.

    Each chapter file ``<book>_<chapter>`` is read, flattened to one lower-case
    line, and written back as ``replaced_<chapter>`` (lower-cased) in the same
    directory. Files whose name contains 'replace' are skipped, so the
    function can be run again on the same folder.

    Parameters:
        directory: folder of one book, e.g. './B1'. The book is identified by
            the last path component, so 'B1' and './B1/' work as well.
        aliases: alias -> character name mapping to apply. Defaults to the
            notebook-level ``alias_dictionary``. The mapping is never modified.

    Returns:
        None. The result is the set of ``replaced_*`` files on disk.
    """
    base_aliases = alias_dictionary if aliases is None else aliases
    book = os.path.basename(os.path.normpath(directory))
    deferred = _DEFERRED_ALIASES.get(book, ())

    # Sorted for a reproducible processing order (os.listdir order is arbitrary)
    for chap in sorted(os.listdir(directory)):
        # do not look in files where aliases already have been replaced
        if 'replace' in chap:
            continue
        # The output name is derived from the part after the first '_';
        # skip sub-folders and stray files that do not follow that scheme.
        if '_' not in chap or not os.path.isfile(directory + '/' + chap):
            continue

        with open(directory + '/' + chap, 'r') as f:
            clean_text = f.read()

        # remove all tabs and new lines, lower all text
        clean_text = clean_text.replace('\t', ' ').replace('\n', ' ').replace('  ', ' ')
        clean_text = clean_text.lower()
        # replace a contraction of 2 names with both their names
        clean_text = clean_text.replace('mr. and mrs. dursley', 'Vernon_Dursley and Petunia_Dursley')
        clean_text = clean_text.replace('the potters', 'James_Potter_I and Lily_J._Potter')

        # Work on a per-chapter copy so the epilogue overrides cannot leak
        # into other chapters or into later calls.
        chapter_aliases = dict(base_aliases)
        if book == 'B7' and chap == _EPILOGUE_CHAPTER:
            chapter_aliases.update(_EPILOGUE_ALIASES)

        # Longest alias first: otherwise 'harry' would be replaced inside
        # 'harry potter' and leave 'Harry_Potter potter' behind. The sort is
        # stable, so equally long aliases keep their dictionary order.
        ordered = sorted(chapter_aliases.items(),
                         key=lambda item: len(item[0]), reverse=True)
        for alias, name in ordered:
            # wait with replacing those aliases as they collide with other
            # characters in this book
            if alias in deferred:
                continue
            clean_text = clean_text.replace(alias, name)

        # now those skipped aliases can be found and replaced
        for alias, name in _DEFERRED_REPLACEMENTS.get(book, ()):
            clean_text = clean_text.replace(alias, name)

        # Write to 'replaced_<chapter>' next to the original chapter file
        filename = chap.split('_')[1].lower()
        dst_filename = directory + '/replaced_' + filename

        with open(dst_filename, 'w') as f:
            f.write(clean_text)

In [None]:
# Process the chapters book by book, because some books need a temporary
# change of the alias dictionary.

# Books 1 and 2 use the dictionary as it is
for book_dir in ('./B1', './B2'):
    replace_aliases(book_dir)

# Book 3: 'ernest' refers to Ernest Prang
alias_dictionary.update(ernest='Ernest_Prang')
replace_aliases('./B3')
alias_dictionary.update(ernest='Ernest_Macmillan')

# Book 4: 'frank' refers to Frank Bryce
alias_dictionary.update(frank='Frank_Bryce')
replace_aliases('./B4')
alias_dictionary.update(frank='Frank_Longbottom')

# Book 5: 'rose' refers to Rose Zeller; the key is removed again afterwards
alias_dictionary.update(rose="Rose_Zeller")
replace_aliases('./B5')
alias_dictionary.pop('rose')

# Books 6 and 7 use the dictionary as it is
for book_dir in ('./B6', './B7'):
    replace_aliases(book_dir)

In [None]:
# Spot check: print one processed chapter to inspect the replacements by eye
with open('./B1/replaced_ch1.txt', 'r') as replaced_file:
    text = replaced_file.read()

print(text)