In [None]:
import csv
import urllib.request
import json
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import quote
import os
import re

In [None]:
# Load the characters from our .csv file into a list of tuples:
# (name, parentage, house, occupation)
characters = []

# newline="" is what the csv module expects, so that line breaks inside quoted
# fields are handled by the parser rather than by the file object.
with open("HP_characters.csv", "r", encoding="utf8", newline="") as sent_file:
    # Use a separate name for the reader: rebinding `csv` would shadow the
    # module and make this cell fail when it is run a second time.
    reader = csv.reader(sent_file, delimiter=",")
    for row in reader:
        # Blank lines come back as empty lists; there is nothing to record.
        if not row:
            continue
        # Spaces become underscores so the name can be used as a wiki page title
        name = row[0].replace(' ', '_')
        # Parentage, House, and Occupation will be our attributes in the nodes
        parentage = row[1]
        house = row[2]
        occupation = row[3]
        characters.append((name, parentage, house, occupation))

In [None]:
# (character tuple, length of the raw API response) for every fetched page
lengths = []

#https://www.reddit.com/r/learnpython/comments/muwu7v/scraping_fandomwiki_pages/
baseurl = "https://harrypotter.fandom.com/api.php?"
action = "action=query"
# These query parts are identical for every character, so build them once.
content = "prop=revisions&rvprop=content"
dataformat = "format=json"

# Make sure the output folder exists before the first file is written.
os.makedirs("characters", exist_ok=True)

for character in characters:
    # Percent-encode the title so names containing apostrophes, accents, '&',
    # etc. still produce a valid URL.
    title = "titles=" + quote(character[0])
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    # Since we have checked the articles, we expect urlopen to succeed.
    # The `with` block closes the connection as soon as the body is read.
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    raw_json = wikidata.decode('utf-8')
    lengths.append((character, len(raw_json)))

    try:
        jsonobj = json.loads(raw_json)
    except ValueError:
        print(f'Decoding JSON has failed for {character}, moving on...')
        # Skip this character; otherwise the previous iteration's data would
        # be written under this character's name.
        continue

    try:
        # Get the number for the article
        num = list(jsonobj['query']['pages'].keys())[0]
        # Get the wikitext
        wikitext = jsonobj['query']['pages'][num]['revisions'][0]['*']
    except (KeyError, IndexError):
        # The response did not contain a page with revision content.
        print(f'No article content returned for {character}, moving on...')
        continue
    f_name = character[0] + '.txt'

    # Remove <ref> tags including content.
    # Self-closing tags go first: otherwise the paired pattern would start at
    # a `<ref ... />` and delete real text up to the next `</ref>`.
    clean_text = re.sub(r'<ref\b[^>]*/>', "", wikitext)
    # DOTALL lets a reference that spans several lines be removed as well.
    clean_text = re.sub(r'<ref\b[^>]*>.*?</ref>', "", clean_text, flags=re.DOTALL)

    # Write to a file with that name; explicit UTF-8 so non-ASCII characters
    # in the article do not depend on the platform's default encoding.
    with open("characters/" + f_name, 'w', encoding="utf-8") as f:
        f.write(clean_text)

In [None]:
def find_aliases(text, alias_dict, name):
    """Register the aliases from a character's alias field in ``alias_dict``.

    The alias field is whatever follows ``|alias = `` up to the next ``|``.
    It may span several lines; each line is treated as one alias.

    Args:
        text: Wiki markup of the character's article.
        alias_dict: Dict mapping alias -> character name. It is updated in
            place. An alias that is already present keeps its current owner,
            and the clash is printed instead.
        name: Character name stored for every newly added alias.

    Returns:
        None. The result is the mutation of ``alias_dict``.
    """
    # Only the first alias field is used. The leading `.` requires at least one
    # character on the same line as `|alias = `, so an empty field is ignored.
    match = re.search(r'\|alias = (.[^|]*)\|', text)
    if match is None:
        return

    # Lines containing any of these are not kept: disguises, the slur
    # "Mudblood", and names that were only told to mislead someone.
    skipped_markers = ("disguise", "Mudblood", "the name he told", "the name she told")

    # One alias per line
    for alias in match.group(1).split("\n"):
        if any(marker in alias for marker in skipped_markers):
            continue
        # Keep only the part before " (by ...)", drop the bullet asterisks and
        # trim surrounding whitespace so lookups are not thrown off by it.
        processed_alias = alias.split(" (by")[0].replace('*', '').strip()
        # Nothing left (blank line or a bare bullet): not an alias.
        if not processed_alias:
            continue
        if processed_alias in alias_dict:
            print("Processed alias: " + processed_alias + ", belonging to: " + name)
        else:
            alias_dict[processed_alias] = name

In [None]:
# Maps a first name or alias to the full character name (the file name stem)
alias_dictionary = {}

# os.listdir returns entries in arbitrary order, and the first character to
# claim a key keeps it, so sort to make the result reproducible.
for file_ in sorted(os.listdir("./characters")):
    # Skip anything that is not a character text file
    # (e.g. .DS_Store or an .ipynb_checkpoints folder).
    if not file_.endswith(".txt"):
        continue
    # Strip the ".txt" extension to get the character name
    name = file_[:-4]
    # Decode as UTF-8; errors="replace" keeps the loop going if a file was
    # saved under a different platform encoding.
    with open("./characters/"+file_, 'r', encoding="utf-8", errors="replace") as f:
        text = f.read()
    first_name = name.split("_")[0]
    if first_name in alias_dictionary:
        # The first name is already taken; report the clash and keep the
        # existing owner.
        print("First name: " + first_name + ", belonging to: " + name)
    else:
        alias_dictionary[first_name] = name
    find_aliases(text, alias_dictionary, name)
    


In [None]:
# Number of distinct lookup keys (first names plus aliases) in the dictionary
print(f"{len(alias_dictionary)}")