In [1]:
import json
import logging
import os
import pandas as pd
import re
import requests as r
import time
import tqdm

from collections import defaultdict

## settings

In [2]:
# Fetch the logger used throughout this notebook and let every level through.
logger = logging.getLogger('pdb_scraper')
logger.setLevel(logging.DEBUG)
# Console handler that emits records from DEBUG upwards.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
# Timestamped record format, attached to the console handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell used to stack one more handler each time and print
# every message repeatedly. Drop handlers left by earlier runs first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
logger.addHandler(ch)


In [3]:
# Root of the project workspace, taken from the WORKSPACE_PATH environment
# variable; the lookup raises KeyError when the variable is not set.
PWD = os.environ["WORKSPACE_PATH"]

## util functions

In [4]:
# Folder holding the personality-data files. The trailing slash matters:
# paths below are built by plain string concatenation.
PERS_DATA_FOLDER = f"{PWD}/data/personality_data/"


def in_pers_data(data_path):
    """Return `data_path` prefixed with the personality-data folder."""
    return "{}{}".format(PERS_DATA_FOLDER, data_path)

In [5]:
def safe_request(url, timeout=30):
    """GET `url` and return the response body parsed as JSON.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to `requests.get`. Without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The decoded JSON payload.

    Raises:
        Exception: If the request itself fails or the body is not valid
            JSON. The original error is chained as the cause.
    """
    # The two failure modes are handled separately: when the request itself
    # fails there is no response object to report on (the previous single
    # bare `except` referenced `resp` there and died with UnboundLocalError).
    try:
        resp = r.get(url, timeout=timeout)
    except r.exceptions.RequestException as err:
        raise Exception(f"Error in HTTP request: {err}") from err

    try:
        return resp.json()
    except ValueError as err:
        # requests signals an undecodable body with a ValueError subclass.
        raise Exception(f"Error in HTTP request: {resp.content}") from err

In [6]:
def open_and_load_json(path, lines=False):
    """Load JSON from `path`, or return an empty container if it is absent.

    Args:
        path: Location of the file to read.
        lines: When true, treat the file as JSON Lines (one document per
            line) and return a list. Otherwise parse the whole file as one
            JSON object and return a dict.

    Returns:
        A list (``lines=True``) or a dict (``lines=False``); empty when
        `path` does not exist.
    """
    obj = [] if lines else {}

    if os.path.exists(path):
        # Read-only mode: nothing is written here, so the file does not
        # need to be writable.
        with open(path, "r") as fp:
            if lines:
                # Blank lines carry no document; skip them rather than let
                # json.loads fail on an empty string.
                obj += [json.loads(line) for line in fp if line.strip()]
            else:
                obj.update(json.load(fp))
    return obj

In [7]:
def save_json(path, obj, lines=False):
    """Write `obj` to `path` as JSON, replacing any existing file.

    Args:
        path: Destination file.
        obj: Data to serialise. With ``lines=True`` it is iterated and each
            item is written as its own JSON document.
        lines: When true, write JSON Lines (one document per line);
            otherwise dump `obj` as a single JSON document.
    """
    with open(path, "w") as fp:
        if lines:
            # writelines adds no separators of its own; without the explicit
            # newline all documents end up glued together on one line.
            fp.writelines(json.dumps(item) + "\n" for item in obj)
        else:
            json.dump(obj, fp)

## loading speakers

In [8]:
# Load the speaker records. Later cells read speakers[<id>]["meta"] for the
# "movie_name" and "character_name" fields. The file is only read, so it is
# opened read-only instead of "r+", which would demand write permission.
with open(f"{PWD}/data/cornell_movies/speakers.json", "r") as fp:
    speakers = json.load(fp)

## getting movie pages

Load the movie list and the cached movie-to-id mapping.

In [12]:
# Distinct movie titles from the speaker records, trimmed, lower-cased and
# sorted alphabetically.
movies = sorted({info["meta"]["movie_name"].strip().lower() for info in speakers.values()})

# Locations of the JSON files backing the movie lookup.
movie_to_id_path = in_pers_data("movie_to_id.json")
movie_name_clarify_path = in_pers_data("movie_name_clarifications.json")
movie_cat_excepts_path = in_pers_data("movie_category_exceptions.json")

# Title -> id mapping found so far (an empty dict if the file does not exist).
movie_to_id = open_and_load_json(movie_to_id_path)

In [41]:
def find_movie(movie: str, cat_id=3) -> int:
    """Look up `movie` through the site's subcategory search and return its id.

    Each search result is accepted only if its name matches `movie`, its
    category equals `cat_id`, and it is flagged as fictional. The first
    accepted result wins.

    Args:
        movie: Title to search for. Compared against result names that have
            been lower-cased, so it is expected in lower case.
        cat_id: Category id a result must have to be accepted.

    Returns:
        The id of the first accepted result, as an int.

    Raises:
        Exception: "Movie not found!" when no result is accepted; errors
            from the request itself propagate as well.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import quote

    # Percent-encode the title so characters such as '&', '#' or '+' cannot
    # cut the query string short; whitespace runs still collapse to one %20.
    query = quote(" ".join(movie.split()), safe="")
    # NOTE(review): "limt" looks like a typo for "limit". It is left as-is
    # because how the API treats either spelling cannot be confirmed here.
    req = f"https://api.personality-database.com/api/v2/search/subcategories?query={query}&limt=1000&nextCursor=0"
    logger.debug(req)
    subs = safe_request(req)["data"]["results"]

    wanted_category = str(cat_id)
    # The title is also tried with " - " tightened to "-".
    compact_movie = movie.replace(" - ", "-")

    for sub in subs:
        site_full_name = sub["name"].lower().strip().replace(" & ", " ")
        # Drop a "(YYYY)" year marker from the result name.
        site_name = re.sub(r"\(\d{4}\)", "", site_full_name).strip()
        name_ok = (
            site_name == movie
            or site_full_name == compact_movie
            or site_name == compact_movie
        )
        if not name_ok:
            logger.debug(f"\nSkipped result for '{movie}' because name did not match ({site_name} or {site_full_name})")
            continue
        # Keep the result's category in its own variable. The previous walrus
        # expression overwrote `cat_id` before the comparison ran, so the
        # result was effectively compared with itself.
        sub_category = str(sub["categoryID"])
        if sub_category != wanted_category:
            logger.debug(f"\nSkipped result for '{movie}' because was not in movie category ({sub_category})")
            continue
        if not sub["isFictional"]:
            logger.debug(f"\nSkipped result for '{movie}' because was not fictional")
            continue
        return int(sub["id"])
    raise Exception("Movie not found!")

In [380]:
# Manual overrides: alternative search names and per-movie category ids.
movie_name_clarify = open_and_load_json(movie_name_clarify_path)
movie_cat_excepts = open_and_load_json(movie_cat_excepts_path)

found = 0
missing = 0
for mov in tqdm.tqdm(movies):
    # Nothing to do for titles that already have an id, nor for titles whose
    # clarification entry is present but falsy.
    if mov in movie_to_id or (mov in movie_name_clarify and not movie_name_clarify[mov]):
        continue
    try:
        movie_to_id[mov] = find_movie(movie_name_clarify.get(mov, mov), cat_id=movie_cat_excepts.get(mov, 3))
    except Exception as e:
        logger.debug(e)
        missing += 1
        logger.info(f"Could not find '{mov}'")
        # Stop at the first failure. `mov` stays bound to the failing title,
        # which the correction cells below use as their key.
        break
    else:
        found += 1
    # time.sleep(1)

100%|██████████| 617/617 [00:00<00:00, 1415692.32it/s]


### correcting name

In [379]:
# Manual step: record a search name for `mov`, the title the lookup loop
# stopped on. Leaving None stores a falsy entry, which makes the lookup loop
# skip that title.
correct_name = None
if type(correct_name) is str:
    movie_name_clarify[mov] = correct_name.lower()
else:
    movie_name_clarify[mov] = correct_name
save_json(movie_name_clarify_path, movie_name_clarify)

### correcting category

In [313]:
# Manual step: override the category id used when searching for `mov`, the
# title the lookup loop stopped on, and persist the override right away.
new_cat = 2
movie_cat_excepts[mov] = new_cat
save_json(movie_cat_excepts_path, movie_cat_excepts)

Save the movie-to-id mapping.

In [381]:
# Persist the title -> id mapping so later runs skip titles already resolved.
save_json(movie_to_id_path, movie_to_id)

## getting characters from movie pages

Load the cached movie-to-characters mapping.

In [9]:
# Cached mapping: movie title -> {profile name: profile id}. It is an empty
# dict when the file does not exist yet.
movie_to_chars_path = in_pers_data("movie_to_chars.json")
movie_to_chars = open_and_load_json(movie_to_chars_path)

In [383]:
def get_profiles(film_id: int, cat_id: int = 3) -> dict:
    """Fetch the profiles listed under a subcategory and index them by name.

    Args:
        film_id: Subcategory id, sent as `sub_cat_id`.
        cat_id: Category id sent with the request. It defaults to 3, the
            value that used to be hard-coded in the URL, so existing calls
            behave as before.

    Returns:
        Dict mapping each profile's lower-cased "mbti_profile" name to its
        "id". Profiles sharing a lower-cased name collapse to the last one.
    """
    # Only one request is made (offset 0, limit 100); any further profiles
    # the API might hold for this subcategory are not fetched.
    out = safe_request(f"https://api.personality-database.com/api/v1/profiles?offset=0&limit=100&sub_cat_id={film_id}&cat_id={cat_id}&property_id=2")
    return {prof["mbti_profile"].lower(): prof["id"] for prof in out["profiles"]}

In [384]:
# Fetch character profiles for every movie not cached yet, pausing one
# second after each request.
for movie, film_id in tqdm.tqdm(movie_to_id.items()):
    if movie not in movie_to_chars:
        movie_to_chars[movie] = get_profiles(film_id)
        time.sleep(1)

100%|██████████| 464/464 [01:15<00:00,  6.15it/s] 


Save the movie-to-characters mapping.

In [385]:
# Persist the fetched profiles so later runs skip movies already covered.
save_json(movie_to_chars_path, movie_to_chars)

## matching speakers to characters

In [386]:
# Speaker id -> profile id, filled in by the matching loop below.
char_id_to_pdb_id = {}

In [387]:
def name_match(name1, name2):
    """Compare two character names after normalising them.

    Normalising removes every character other than ASCII letters, digits
    and spaces, lower-cases the rest and collapses runs of whitespace, so
    that stripped punctuation ("Tom & Jerry") leaves no double spaces.

    Args:
        name1: First name.
        name2: Second name.

    Returns:
        A ``(strong, weak)`` tuple of bools. `strong` is true when one
        normalised name is a substring of the other. `weak` is true when at
        least one word of one name occurs as a substring of the other name.
        Both are False when either name normalises to the empty string.
    """
    def clean(name):
        kept = re.sub("[^A-Za-z0-9 ]", "", name).lower()
        return " ".join(kept.split())

    cleaned_name1 = clean(name1)
    cleaned_name2 = clean(name2)

    # An empty string is a substring of every string, so an empty name would
    # otherwise "strongly" match any candidate.
    if not cleaned_name1 or not cleaned_name2:
        return (False, False)

    def strong_match(n1, n2):
        return n1 in n2

    def weak_match(n1, n2):
        return any(n1_part in n2 for n1_part in n1.split())

    def both_ways(arg1, arg2, func):
        return func(arg1, arg2) or func(arg2, arg1)

    return (both_ways(cleaned_name1, cleaned_name2, strong_match), both_ways(cleaned_name1, cleaned_name2, weak_match))

In [388]:
# Outcome lists for the matching pass:
#   all_matches - (movie, speaker name, profile name) for unambiguous matches
#   all_misses  - every speaker without exactly one strong match
#   near_misses - the subset of misses that still have weak candidates
all_matches = []
all_misses = []
near_misses = []

# Manual name overrides keyed by speaker id. The file is only read, so it is
# opened read-only.
with open(in_pers_data("char_name_corrections.json"), "r") as fp:
    char_corrections = json.load(fp)

for char_id in speakers:
    char_dict = speakers[char_id]["meta"]
    char_name = char_dict["character_name"].lower()
    movie = char_dict["movie_name"]
    # The titles that key movie_to_chars were stripped and lower-cased when
    # the movie list was built; normalise the same way so a title differing
    # only in case or padding is not silently skipped.
    movie_key = movie.strip().lower()

    if movie_key not in movie_to_chars:
        continue

    candidates = movie_to_chars[movie_key]
    lookup_name = char_corrections.get(char_id, char_name)
    # (profile name, profile id) -> (strong, weak) verdict
    verdicts = {cand: name_match(lookup_name, cand[0]) for cand in candidates.items()}
    strong = [cand for cand, verdict in verdicts.items() if verdict[0]]
    if len(strong) == 1:
        pdb_name, pdb_id = strong[0]
        all_matches.append((movie, char_name, pdb_name))
        char_id_to_pdb_id[char_id] = pdb_id
    else:
        # Zero or several strong matches: keep weak candidates for manual
        # review and record the speaker as a miss either way.
        weak = [cand for cand, verdict in verdicts.items() if verdict[1]]
        if weak:
            near_misses.append((movie, char_id, char_name, weak))

        all_misses.append((movie, char_id, char_name, candidates.keys()))

## getting speaker profiles

Load the cached personality votes per character.

In [390]:
# Cached mapping: speaker id -> personality votes. It is an empty dict when
# the file does not exist yet.
char_pers_path = in_pers_data("char_to_pers_votes.json")

char_pers = open_and_load_json(char_pers_path)

In [391]:
# Keys looked up in a profile's "breakdown_systems", mapped to the label
# under which their votes are stored.
pers_systems = {"1": "Myers Briggs", "9": "SLOAN"}

def get_pers_votes(prof_id: str) -> dict:
    """Fetch one profile and collect its vote counts per personality system.

    Args:
        prof_id: Profile id placed in the request URL.

    Returns:
        A defaultdict mapping each system label from `pers_systems` to a
        dict of ``{personality_type: theCount}`` built from the profile's
        "breakdown_systems" entries.
    """
    profile = safe_request(f"https://api.personality-database.com/api/v1/profile/{prof_id}")
    breakdown = profile["breakdown_systems"]

    votes = defaultdict(dict)
    for sys_code, sys_label in pers_systems.items():
        for entry in breakdown[sys_code]:
            votes[sys_label][entry["personality_type"]] = entry["theCount"]

    return votes

In [392]:
# Fetch votes for every matched speaker not cached yet, pausing three
# seconds after each request. Ctrl-C ends the loop cleanly, so what has been
# collected so far can still be saved.
for char_id, pdb_id in tqdm.tqdm(char_id_to_pdb_id.items()):
    try:
        if char_id not in char_pers:
            char_pers[char_id] = get_pers_votes(pdb_id)

            time.sleep(3)
    except KeyboardInterrupt:
        break

100%|██████████| 2477/2477 [20:19<00:00,  2.03it/s] 


Save the personality votes.

In [24]:
# Persist the collected votes so later runs skip speakers already fetched.
save_json(char_pers_path, char_pers)