# In [4]:
from bs4 import BeautifulSoup
import re
import requests
from retrying import retry
import json

# The first page we're scraping: an IMDB list, requested page by page
TOP_PAGE = "https://www.imdb.com/list/ls058011111/"

# The second page we're scraping: the biography page for one actor.
# The `{}` placeholder is filled with an actor code via `str.format`.
BIO_PAGE = "https://imdb.com/name/{}/bio?ref_=nmls_hd"

# Patterns to find on the pages we're scraping.
# Raw strings are used so that regex escapes such as `\d` reach the `re`
# module untouched instead of being treated as (invalid) string escapes.
# PATTERN selects anchor hrefs that point at an actor page.
PATTERN = re.compile(r'/name/nm.*ref_=nmls_hd')
# REGEX captures the actor code (`nm` followed by digits) from such an href.
# The group is optional, so a non-conforming href yields an empty string.
REGEX = re.compile(r'/name/(nm\d+)?.*ref_=nmls_hd.*')

# In [2]:
@retry(stop_max_attempt_number=7, wait_fixed=2000)
def persistent_request_to_soup(*args, **kwargs):
    """Make a GET request and convert the response to a HTML soup.

    Requests sometimes get blocked, so the `retry` decorator re-runs the
    call when it raises: at most 7 attempts, waiting 2 seconds between them.

    Args:
        *args: Positional arguments passed directly to `requests.get`.
        **kwargs: Keyword arguments passed directly to `requests.get`.
            If `timeout` is not supplied, a default of 30 seconds is used.

    Returns:
        BeautifulSoup: The response text parsed with the "lxml" parser.

    Raises:
        requests.HTTPError: Raised by `raise_for_status` on an error status
            code (and therefore also a trigger for a retry).
    """
    # Without a timeout `requests.get` can block forever on a stalled
    # connection, in which case the retry logic would never get a chance
    # to run. Callers can still override this by passing `timeout=...`.
    kwargs.setdefault("timeout", 30)
    r = requests.get(*args, **kwargs)
    # Turn 4xx/5xx responses into exceptions so that they are retried
    r.raise_for_status()
    return BeautifulSoup(r.text, "lxml")


def imdb_code_iter():
    """Yield IMDB actor codes one at a time, walking the list page by page.

    Pages are requested in order starting from page 1. Iteration ends
    after the first page on which no actor codes are found.
    """
    page_number = 1
    while True:
        found_any = False
        for actor_code in _code_iter(page_number):
            found_any = True
            yield actor_code
        # An empty page means we have run past the end of the list
        if not found_any:
            return
        page_number += 1


def _code_iter(ipage):
    """Yield every IMDB actor code found on one page of the list, if any.

    Args:
        ipage (int): The page number, sent as the `page` query parameter.
    Yields:
        code (str) an IMDB actor code.
    """
    # Download and parse the requested page of the list
    query = {"page": ipage}
    page_soup = persistent_request_to_soup(TOP_PAGE, params=query)
    # Every anchor whose href matches PATTERN links to an actor
    actor_links = page_soup.find_all('a', href=PATTERN)
    for link in actor_links:
        # The first match holds the captured actor code for this href
        matches = REGEX.findall(link['href'])
        yield matches[0]


def fetch_bio(imdb_code):
    """Fetch the name and the opening biography text for one actor.

    Args:
        imdb_code (str): The IMDB code of the actor; it is substituted
            into `BIO_PAGE` to build the URL to request.
    Returns:
        name, biography (str, str): The content of the page's "og:title"
            meta tag, and the first line of the stripped biography text.
    """
    bio_url = BIO_PAGE.format(imdb_code)
    page = persistent_request_to_soup(bio_url)
    # The actor name is read from the "og:title" meta tag
    actor_name = page.find("meta", property="og:title")["content"]
    # The biography lives in the first div matching the "soda"/"odd" classes
    bio_div = page.find("div", class_=["soda", "odd"])
    # Keep only the text up to the first line break
    first_paragraph = bio_div.text.strip().partition("\n")[0]
    return actor_name, first_paragraph

# In [5]:
# Scrape the biography for every actor code, collect the results into a
# {name: biography} mapping, and save that mapping as JSON.
code_iter = imdb_code_iter()
bios = dict(map(fetch_bio, code_iter))
with open("data/out-bios.json", "w") as f:
    json.dump(bios, f)