In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from collections import defaultdict

In [None]:
# Display up to 100 characters of text per cell when frames are rendered.
pd.options.display.max_colwidth = 100

# Site root; hrefs scraped from the pages are appended to this to form full URLs.
base_provider = "https://www.comedy.co.uk"

In [None]:
# only TV panel shows
# Build the URL from base_provider so the site root is defined in one place,
# bound the wait with a timeout, and fail fast on an HTTP error so later cells
# never parse an error page as if it were the show list.
r = requests.get(base_provider + "/tv/list/panel_show/", timeout=30)
r.raise_for_status()

In [None]:
# Parse the body of the listing response with the lxml parser.
html_tree = BeautifulSoup(r.text, "lxml")

In [None]:
# Collect [title, episode-list URL] for every link matched on the listing page;
# the URL is the site root + the link's href + the "episodes/all/" suffix.
list_shows = [
    [link.text, base_provider + link["href"] + "episodes/all/"]
    for link in html_tree.select(".m- a")
]

In [None]:
# Tabulate the scraped shows: one row per show with its title and episode-list URL.
data = pd.DataFrame(list_shows, columns=["title", "url"])

In [None]:
def build_cast(idx, show, timeout=30, delay=1):
    """Count cast appearances across all episodes of one show.

    Downloads the show's episode listing, visits every episode page linked
    from it and tallies how often each person linked from the cast tables
    appears. Prints a one-line progress summary when done.

    Parameters
    ----------
    idx : hashable
        Index label used for the single row of the returned frame.
    show : str
        URL of the show's episode-list page.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.
    delay : float, optional
        Seconds to sleep after each episode request (politeness delay).

    Returns
    -------
    pandas.DataFrame
        One row labelled ``idx`` with one column per person, holding the
        number of appearances. The frame has no columns when no cast links
        were found.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out or the connection fails.
    """
    # CSS requires attribute values that are not plain identifiers (here the
    # value starts with "/") to be quoted; the selector engine used by current
    # BeautifulSoup releases rejects the unquoted form.
    people_link = 'a[href^="/people"]'

    cast = defaultdict(int)

    time_start = time.time()

    listing = BeautifulSoup(requests.get(show, timeout=timeout).text, "lxml")
    episode_list = [base_provider + a["href"]
                    for a in listing.select("ol.list-unstyled > li > a[href]")]

    for episode in episode_list:
        ep = BeautifulSoup(requests.get(episode, timeout=timeout).text, "lxml")
        cast_list = [x.text for x in
                     ep.select("h3 + table " + people_link)  # regulars
                     + ep.select("h3 + table + table " + people_link)]  # guests
        # for complete cast (incl. writers): 'table a[href^="/people"]'

        for person in cast_list:
            cast[person] += 1

        # Pause between episode requests so the site is not hammered.
        time.sleep(delay)

    time_end = time.time() - time_start
    print("Got", len(cast), "columns for", show, "in", int(time_end), "s")

    return pd.DataFrame(cast, index=[idx], columns=list(cast))

In [None]:
# Scrape every show: one single-row cast frame per (index label, episode-list URL) pair.
cast_data = [build_cast(idx, url) for idx, url in data["url"].items()]

In [None]:
# make it one DataFrame: stack the per-show rows; a person absent from a show becomes NaN there
tmp = pd.concat(cast_data)

In [None]:
# join with original data, remove shows with no information and fill the sparse matrix with zeros
# Start from the two base columns so this cell can be re-run safely: joining
# onto an already-joined `data` would raise because the cast columns overlap.
data = data[["title", "url"]].join(tmp)
# Columns from position 2 onward are the per-person counts; a show whose counts
# are all missing carries no information and is dropped.
data = data.dropna(how="all", subset=data.columns[2:]).fillna(0)

In [None]:
# save the final table to an HDF5 file under the key "data"
data.to_hdf("all_shows.h5", key="data")