# Data Collection
## Imports

In [None]:
import csv
import urllib.request
import json
import numpy as np
import matplotlib.pyplot as plt
from urllib.parse import quote

## Choosing an online source
>We have decided to use the [fandom](https://harrypotter.fandom.com/wiki/Main_Page) wiki instead of the regular [Wikipedia](https://www.wikipedia.org/) for finding articles for our characters. The main reason is that the regular Wikipedia often covers multiple characters in one article, e.g. [Ministry of Magic](https://en.wikipedia.org/wiki/Ministry_of_Magic) contains 23 characters, whereas the fandom wiki has a dedicated article for each character.

## Creating a list of characters
>To create a list of characters we combined [Wikipedia's list of Harry Potter characters](https://en.wikipedia.org/wiki/List_of_Harry_Potter_characters) with the [half- and full-blood lists](https://harrypotter.fandom.com/wiki/Category:Individuals_by_parentage) from the fandom wiki and the characters from [Buzzfeed's Harry Potter Character Quiz](https://www.buzzfeed.com/sarahaspler/there-are-over-700-harry-potter-characters-and-i). The reason for combining these was to include as many characters as possible without having to go through all of the articles on the fandom wiki. However, we found that we did have to check all of the articles from the combined list manually, because some of the names in the list were not unique, had incorrect spelling, did not match the article name, etc. Hence we decided on the following criteria for the final list:

- Characters are represented by the article name.
- Characters must be from the actual books.
- Characters must have an appearance in at least one book.


>To clarify, a character has an appearance in a book if they are represented in the book by some interaction with other characters. This is in contrast to characters who are only mentioned, for example when another character merely says their name in conversation with a third character. These criteria also weed out characters that appear only in video games, the Fantastic Beasts franchise, etc. The reason for this initial filtering is that we want to use the books, so we remove a lot of noise by excluding characters that have no text related to them in the books.

In [None]:
# Load the characters from our .csv file into a list of tuples.
# Each row is read as: name, parentage, house, occupation.
characters = []

# newline="" is what the csv module expects, so that line breaks inside
# quoted fields are parsed correctly.
with open("HP_characters.csv", "r", encoding="utf8", newline="") as sent_file:
    # Use a distinct name for the reader so the `csv` module itself is not
    # shadowed; rebinding `csv` would break this cell when it is re-run.
    reader = csv.reader(sent_file, delimiter=",")
    for row in reader:
        # The csv reader yields an empty list for blank lines; skip them.
        if not row:
            continue
        # Spaces become underscores so the name can be used as an article
        # title and as a file name later on.
        name = row[0].replace(' ', '_')
        # Parentage, House, and Occupation will be our attributes in the nodes
        parentage = row[1]
        house = row[2]
        occupation = row[3]
        characters.append((name, parentage, house, occupation))

## Downloading the files
>We are using the API from the course to download the articles:

In [None]:
# Download the wikitext of every character's article through the fandom
# api.php endpoint and store it as characters/<name>.txt.
# `lengths` collects (character tuple, length of the raw JSON response) pairs,
# which later cells use to sanity-check the downloads.
lengths = []

#https://www.reddit.com/r/learnpython/comments/muwu7v/scraping_fandomwiki_pages/
baseurl = "https://harrypotter.fandom.com/api.php?"
action = "action=query"
# These query parts are the same for every character, so build them once.
content = "prop=revisions&rvprop=content"
dataformat = "format=json"

for character in characters:
    # Set up the query for the character. The title is percent-encoded so
    # that names containing non-ASCII characters, '&', '?', etc. still form
    # a valid URL and are not misread as extra query parameters.
    title = "titles=" + quote(character[0], safe="")
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)

    # Since we have checked the articles, we know that urlopen will succeed.
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(query) as wikiresponse:
        wikidata = wikiresponse.read()
    wikitext = wikidata.decode('utf-8')
    # Note: this is the length of the whole JSON response, not only of the
    # article text extracted below.
    lengths.append((character, len(wikitext)))

    try:
        jsonobj = json.loads(wikitext)
    except ValueError:
        print(f'Decoding JSON has failed for {character}, moving on...')
        # Actually move on: without this, the code below would run with a
        # stale `jsonobj` from the previous character (or an undefined one).
        continue

    # Get the number for the article (the single key of the 'pages' mapping)
    pages = jsonobj['query']['pages']
    num = next(iter(pages))
    # Get the wikitext
    wikitext = pages[num]['revisions'][0]['*']
    f_name = character[0] + '.txt'

    # Write to a file with that name. The encoding is explicit so the result
    # does not depend on the platform's default locale encoding.
    with open("characters/" + f_name, 'w', encoding='utf-8') as f:
        f.write(wikitext)

## Checking the files

In [None]:
# Sanity check: print the ten smallest and the ten largest entries of
# `lengths` (ordered by the recorded length) to see if they look reasonable.
by_length = sorted(lengths, key=lambda entry: entry[1])
print(by_length[:10])
print(by_length[-10:])

In [None]:
# Keep only the length component of every (character, length) entry.
lengths_s = [length for _, length in lengths]

# Bin the lengths and plot the count per bin against the left edge of
# each bin (np.histogram returns one more edge than there are bins).
number_of_bins = 10
histogram = np.histogram(lengths_s, bins=number_of_bins)
y_values, bin_edges = histogram
x_values = bin_edges[:-1]

plt.plot(x_values, y_values)
plt.xlabel('Length')
plt.ylabel('Number of articles')
plt.title('Plot of length of articles')
plt.figtext(0.5, -0.05, "text", ha='center')
plt.show()

shortest = np.min(lengths_s)
print(f"Minimum length = {shortest}")

In [None]:
# Same histogram of `lengths_s` as before, but drawn with logarithmic
# scaling on both axes. Counts are plotted against the left bin edges.
number_of_bins = 10
histogram = np.histogram(lengths_s, bins=number_of_bins)
y_values, bin_edges = histogram
x_values = bin_edges[:-1]

plt.plot(x_values, y_values)
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Length')
plt.ylabel('Number of articles')
plt.title('Log-Log plot of length of articles')
plt.figtext(0.5, -0.05, "text", ha='center')
plt.show()

shortest = np.min(lengths_s)
print(f"Minimum length = {shortest}")

In [None]:
# Arithmetic mean of the recorded lengths: total divided by the count.
total_length = sum(lengths_s)
article_count = len(lengths_s)
print(f"Average length of an article: {total_length/article_count}")