In [2]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import os

Get Sitemap

In [3]:
# Base URL of the statistics section; every page fetched later is ROOT + a relative path.
ROOT = 'https://openpsychometrics.org/tests/characters/stats/'

# read_html returns one DataFrame per <table> found on the page; the first one is used.
# The first two rows and the last row are dropped (presumably header / navigation
# entries of the listing rather than real pages -- TODO confirm against the live page).
sitemap_tables = pd.read_html(ROOT)
sitemap_df = sitemap_tables[0].iloc[2:-1]

Get the labels of all the shows/works

In [4]:
# Keep only the sitemap entries whose name starts with a non-digit character
# (e.g. 'A/', 'GOT/'); these are the per-work folders.
# The pattern is a raw string because '\d' inside a plain literal is an invalid
# escape sequence, and na=False makes a missing name count as "no match" instead
# of producing an NaN mask that boolean indexing rejects.
is_work_page = sitemap_df['Name'].str.match(r'[^\d]', na=False)
work_pages = np.array(sitemap_df[is_work_page]['Name'])
work_pages[:4]

array(['A/', 'AD/', 'AHX/', 'AK/'], dtype=object)

Get all of the trait code names, using Joffrey's page as a source

In [5]:
# One character page is enough to learn the trait labels used as the row index.
joffrey = 'GOT/23/'
URL = ROOT + joffrey
mean_df = pd.read_html(URL)[0]

# Turn labels such as "psychopath (not empath)" into "psychopath,empath".
# These are plain substring replacements, so regex=False is passed explicitly:
# the default of `regex` differs between pandas versions, and with regex=False
# an escaped pattern like ' \(' would be searched for literally and never match.
mean_df['Trait'] = (
    mean_df['Trait']
    .str.replace(' (', ',', regex=False)
    .str.replace('not ', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Three empty frames sharing the same trait index: mean rating, number of raters
# and rating standard deviation. Character columns are added by the scraping cell.
mean_df = pd.DataFrame(index=mean_df['Trait'])
n_df = mean_df.copy()
std_df = mean_df.copy()
mean_df.head()

"psychopath,empath"
"punchable,loveable"
"arrogant,humble"
"😈,😇"
"selfish,altruistic"


Create URLs for all character pages

In [27]:
# Collect the relative URL ("<work>/<entry>") of every entry listed on each work page.
character_urls = []
for work in work_pages:
    URL = ROOT + work
    # Same trimming as for the sitemap: drop the first two rows and the last row.
    # The explicit .copy() makes the column assignment below operate on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    work_df = pd.read_html(URL)[0].iloc[2:-1].copy()
    work_df['Name'] = work + work_df['Name']
    character_urls.extend(work_df['Name'].tolist())
# A few links point to image pages. Remove them
character_urls = [char for char in character_urls if not char.endswith('.jpg')]

Scrape the personality ratings, image, character name, and show name from each character's page

In [36]:
# One row per character page, indexed by its relative URL: character name and show.
character_map = pd.DataFrame(columns=['Name', 'Show'])

# Create the image folder once, up front, instead of re-checking on every iteration.
os.makedirs('images', exist_ok=True)

for char in character_urls:
    # Every character starts with an empty column in each of the three result frames.
    mean_df[char] = None
    std_df[char] = None
    n_df[char] = None

    URL = ROOT + char
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, 'html.parser')

    # The first paragraph is expected to read
    # "<name> is a character from <show>. This page summarizes ...".
    # Fail with the offending URL rather than an AttributeError on None.
    paragraph = soup.find('p')
    description = paragraph.text if paragraph is not None else ''
    x = re.match(r"(.+)is a character from (.+)\. This page summarizes.+", description)
    if x is None:
        raise ValueError('Could not parse the character description on ' + URL)
    # The greedy first group also captures the space before "is a character",
    # so surrounding whitespace is stripped from both fields.
    character_map.loc[char] = x.group(1).strip(), x.group(2).strip()

    # Download the first image on the page; the file name is the relative URL
    # with the slashes removed (e.g. 'GOT/23/' -> 'images/GOT23.jpg').
    img_url = soup.find('img').attrs.get('src')
    response = requests.get(img_url)
    filename = 'images/' + char.replace('/', '') + '.jpg'
    with open(filename, 'wb') as out_file:
        out_file.write(response.content)

    # Split "a (not b)" into trait1 = "a" and trait2 = "b".
    trait_df = pd.read_html(URL)[0]
    trait_df['trait1'] = trait_df['Trait'].str.extract(r'(.+)\s+?\(')
    trait_df['trait2'] = trait_df['Trait'].str.extract(r'.\(not\s+?(.+)\)')

    # Rows whose label could not be split have NaN traits and are skipped.
    for index, row in trait_df.dropna(subset=['trait1', 'trait2']).iterrows():
        same_order = row['trait1'] + ',' + row['trait2']
        flipped_order = row['trait2'] + ',' + row['trait1']

        # A page may list a pair in either order. When the order equals the index
        # label, 100 - rating is stored as the mean; when it is flipped, the rating
        # is stored unchanged. Rater count and standard deviation do not depend on
        # the order. Labels are compared for exact equality, so a label that merely
        # starts with the key is never written to; an absent label is a no-op.
        for frame, same_value, flipped_value in (
            (mean_df, 100 - row['Average rating'], row['Average rating']),
            (n_df, row['Number of raters'], row['Number of raters']),
            (std_df, row['Rating standard deviation'], row['Rating standard deviation']),
        ):
            frame.loc[frame.index == same_order, char] = same_value
            frame.loc[frame.index == flipped_order, char] = flipped_value

Save results to CSVs

In [38]:
# Make sure the output folder exists, then write each collected table to its own CSV.
os.makedirs('data', exist_ok=True)

for csv_path, frame in (
    ('data/character_map.csv', character_map),
    ('data/mean.csv', mean_df),
    ('data/n.csv', n_df),
    ('data/std.csv', std_df),
):
    frame.to_csv(csv_path)