# Studs Terkel Radio Archive Scraper

Downloads program information from [WFMT's archive](https://studsterkel.wfmt.com/explore#t=date) of radio programs hosted by Studs Terkel.

### Import Python tools

In [1]:
import os
import time
import boto3
import requests
import pathlib
import pandas as pd
import urllib.parse
import urllib.request
from rich.progress import track
from datetime import date
from bs4 import BeautifulSoup
from boto3.s3.transfer import S3Transfer

### Download program list

Fetch the URL

In [2]:
# Bound the wait and fail loudly on an error response, so an error page is
# never parsed as if it were the program list.
r = requests.get("https://studsterkel.wfmt.com/explore#t=date", timeout=30)
r.raise_for_status()

Parse it

In [3]:
html = r.text

In [4]:
soup = BeautifulSoup(html)

Pull all the segmented blocks on the page

In [5]:
year_list = soup.find_all(class_="prog_year_block")

Loop through them and grab all the URLs, which lead to program pages.

In [6]:
link_list = []

In [7]:
for year in year_list:
    # href=True keeps only anchors that carry an href attribute, so a bare
    # <a> tag cannot raise KeyError below.
    a_list = year.find_all("a", href=True)
    link_list.extend(a['href'] for a in a_list)

Make sure that list is unique

In [8]:
unique_links = set(link_list)

Print the total number of URLs

In [9]:
len(unique_links)

5023

### Scrape program pages

Prepare functions to parse pages

In [10]:
def parse_meta(e):
    """
    Parse a grid cell of metadata from the bottom of a program page.

    Every <p> tag inside the cell is read as a bolded (<strong>) label
    followed by the value text.

    Args:
        e: A parsed element exposing ``find_all`` (e.g. a BeautifulSoup tag).

    Returns:
        dict: Label text mapped to the stripped value text. Paragraphs that
        have no <strong> label are skipped. Note that the <strong> tags are
        removed from the parsed tree as a side effect.
    """
    d = {}
    for p in e.find_all("p"):
        strong = p.strong
        # A paragraph without a bolded label has no key to store under;
        # skip it instead of crashing on None.
        if strong is None:
            continue
        label = strong.text
        # Pull the label out of the tree so only the value text remains
        strong.extract()
        d[label] = p.text.strip()
    # Return every label/value pair found in this cell
    return d

In [11]:
def url2name(url):
    """
    Build the local HTML file name for a program URL.

    Takes the last segment of the URL's path, drops any file extension
    and appends ".html". Query strings and fragments are ignored.
    """
    url_path = urllib.parse.urlparse(url).path
    slug = pathlib.Path(url_path).stem
    return f"{slug}.html"

In [17]:
def download_html(url):
    """
    Download a program page into ./html, reusing a cached copy if present.

    Args:
        url: Site-relative program URL; it is appended to
            https://studsterkel.wfmt.com to build the request.

    Returns:
        pathlib.Path of the local HTML file, or None when the server
        answered with anything other than status 200.
    """
    name = url2name(url)
    path = pathlib.Path("./html") / name

    # Reuse the cached copy so reruns do not hit the server again
    if path.exists():
        return path

    headers = {
        'User-Agent': 'Studs Terkel Radio Archive Scraper (github.com/palewire/studs-terkel-radio-feed/)',
    }
    # The timeout keeps one stalled connection from hanging the whole crawl
    r = requests.get(
        f"https://studsterkel.wfmt.com{url}",
        headers=headers,
        timeout=30,
    )
    if r.status_code != 200:
        print(f"Failed with status code {r.status_code}")
        return None

    # Create the cache directory on first use so a fresh checkout works
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as fh:
        fh.write(r.text)

    # Pause briefly after each real download to go easy on the server
    time.sleep(0.15)
    return path

Loop through all URLs and scrape each individual page.

In [66]:
# Sort the set so every run walks the URLs in the same order; the return
# value is not needed here because download_html caches pages on disk.
for url in track(sorted(unique_links)):
    download_html(url)

In [73]:
def scrape_program(path):
    """
    Scrape a saved program page and return its data.

    Args:
        path: pathlib.Path of a downloaded program page; its stem is used
            to rebuild the archive URL.

    Returns:
        dict with the keys ``title``, ``mp3_url``, ``archive_url`` and
        ``synopsis``, plus one key per metadata label found on the page.
        ``title``, ``mp3_url`` and ``synopsis`` are None when the matching
        element is missing from the page.
    """
    # Open the page
    with open(path, "r") as fh:
        html = fh.read()

    # Parse the HTML
    soup = BeautifulSoup(html)

    # Pull out the title, tolerating pages that have no <h1>
    heading = soup.find("h1")
    title = heading.text if heading is not None else None

    # Parse out all metadata; a page without the section yields no extra keys
    meta = {}
    section = soup.find(class_="meta_data__section")
    if section is not None:
        for cell in section.find_all(class_="col-4"):
            meta.update(parse_meta(cell))

    # Grab the MP3 URL, if it exists. .get() covers a trigger element
    # that lacks the data attribute.
    media = soup.find(class_="audio_trigger")
    mp3_url = media.get('data-track-url') if media is not None else None

    # Grab the synopsis, if it exists. The text lives in an <h2> inside
    # the synopsis body, and either level may be absent.
    summary = soup.find(class_="program_synopsis__body")
    if summary is not None and summary.h2 is not None:
        synopsis = summary.h2.text
    else:
        synopsis = None

    # Return the scraped data
    return dict(
        title=title,
        mp3_url=mp3_url,
        archive_url="/programs/" + path.stem,
        synopsis=synopsis,
        **meta
    )

In [67]:
html_list = list(pathlib.Path("./html").glob("*.html"))

In [74]:
# Scrape every saved page into a list of record dicts, with a progress bar
program_list = [scrape_program(page) for page in track(html_list)]

Convert to a dataframe

In [75]:
# Rename the page's human-readable metadata labels to snake_case columns
df = pd.DataFrame(program_list).rename(
    columns={
        label: label.lower().replace(" ", "_")
        for label in (
            "Broadcast Date",
            "Physical Format",
            "Digital Format",
            "Ownership",
            "Language",
            "Program Sponsor",
            "Duration",
        )
    }
)

Calculate extra columns

In [76]:
def parse_date(s):
    """
    Convert a broadcast date string into a pandas Timestamp.

    Args:
        s: The raw date string, or a null value.

    Returns:
        The value of ``pd.to_datetime(s)``, or None when ``s`` is null, is
        not made of exactly three space-separated tokens, or cannot be
        parsed as a date.
    """
    if pd.isnull(s):
        return None
    # Only complete dates are accepted: exactly three space-separated tokens
    # (expected to be month, day and year). Partial dates are skipped.
    if len(s.split(" ")) != 3:
        return None
    try:
        return pd.to_datetime(s)
    except ValueError:
        # One malformed value should not abort a whole DataFrame.apply
        return None

In [90]:
df.archive_url = df.archive_url.str.strip()

In [91]:
df['broadcast_datetime'] = df.broadcast_date.apply(parse_date)

In [92]:
df['broadcast_year'] = df.broadcast_datetime.dt.year

In [93]:
df['broadcast_month'] = df.broadcast_datetime.dt.month

In [94]:
df['broadcast_monthday'] = df.broadcast_datetime.dt.day

In [95]:
df['has_mp3_url'] = ~pd.isnull(df.mp3_url)

Calculate statistics

In [96]:
df.has_mp3_url.value_counts()

True     2872
False    2149
Name: has_mp3_url, dtype: int64

### Export

In [97]:
df.sort_values(["broadcast_month", "broadcast_monthday", "broadcast_year"]).to_csv("../data/programs.csv", index=False)

### Download mp3 files

In [118]:
sked = pd.read_csv("../data/schedule.csv")

In [119]:
has_url = sked[~pd.isnull(sked.archive_url)]

In [122]:
assert not has_url.archive_url.duplicated().any()

In [123]:
programs = pd.read_csv("../data/programs.csv")

In [124]:
# Keep only scheduled programs that also have a scraped program record
feed_df = pd.merge(has_url, programs, how="inner", on="archive_url")

In [125]:
this_dir = os.path.abspath("")

In [126]:
parent_dir = os.path.dirname(this_dir)

In [127]:
def download_mp3(mp3_url):
    """
    Download the mp3 URL and return the local file path.

    The file is stored under ./mp3, named after the last segment of the URL
    with any "published%2F" prefix removed. A file that already exists is
    not downloaded again.

    Args:
        mp3_url: URL of the audio file; may be None, NaN or empty.

    Returns:
        pathlib.Path of the local file, or None when there is no URL.

    Side effects:
        Installs a process-wide urllib opener that sends a Referer header.
    """
    # If there's no URL, there's no path. The null check runs first because
    # truth-testing some null markers (e.g. pd.NA) raises.
    if pd.isnull(mp3_url) or not mp3_url:
        return None

    # Build the local path using the URL
    filename = mp3_url.split("/")[-1].replace("published%2F", "")
    filepath = pathlib.Path("./mp3") / filename

    # If the file is already downloaded, we're good to go
    if filepath.exists():
        return filepath

    print(f"Downloading {mp3_url} to {filepath}")

    # Create the target directory on first use so a fresh checkout works
    filepath.parent.mkdir(parents=True, exist_ok=True)

    opener = urllib.request.build_opener()
    opener.addheaders = [("Referer", "https://studsterkel.wfmt.com/")]
    urllib.request.install_opener(opener)

    # Download to a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that looks complete next run.
    partial = filepath.with_name(filepath.name + ".part")
    try:
        urllib.request.urlretrieve(mp3_url, partial)
    except BaseException:
        if partial.exists():
            partial.unlink()
        raise
    partial.replace(filepath)

    # Pause briefly between downloads to go easy on the server
    time.sleep(0.125)

    # Return the path
    return filepath

In [128]:
feed_df['mp3_path'] = feed_df.mp3_url.apply(download_mp3)

Downloading https://s3.amazonaws.com/wfmt-studs-terkel/published/13405.mp3 to mp3/13405.mp3


URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 's3.amazonaws.com'. (_ssl.c:1123)>

### Upload mp3 files

In [None]:
def _get_s3_client(bucket, region="us-east-1"):
    """
    Returns a transfer client ready to upload files to an s3 bucket.

    Args:
        bucket: Name of the target bucket. It is not used when building the
            client; callers pass it again to ``upload_file``.
        region: AWS region name handed to ``boto3.client``.

    Returns:
        boto3.s3.transfer.S3Transfer wrapping an S3 client.

    The access key is read from AWS_ACCESS_KEY_ID. The secret is read from
    AWS_ACCESS_KEY_SECRET, falling back to the standard
    AWS_SECRET_ACCESS_KEY variable. Explicit credentials are passed only
    when both halves are present; otherwise boto3 resolves credentials
    through its own default chain.
    """
    key_id = os.getenv('AWS_ACCESS_KEY_ID')
    secret = os.getenv('AWS_ACCESS_KEY_SECRET') or os.getenv('AWS_SECRET_ACCESS_KEY')

    credentials = {}
    # Handing boto3 only one half of a key pair is an error, so pass the
    # pair explicitly only when it is complete.
    if key_id and secret:
        credentials = {
            'aws_access_key_id': key_id,
            'aws_secret_access_key': secret,
        }
    client = boto3.client('s3', region, **credentials)
    return S3Transfer(client)

In [None]:
feed_df[pd.isnull(feed_df.mp3_url)]

In [None]:
bucket = 'studs-terkel-radio-archive-feed'
client = _get_s3_client(bucket)
# Programs without an mp3 have a null path; skip them instead of attempting
# to upload nothing.
for path in feed_df.mp3_path.dropna():
    print(f"Uploading {path} to {bucket}")
    client.upload_file(
        # The transfer client expects plain strings, not pathlib objects
        str(path),
        bucket,
        # Object keys always use forward slashes, whatever the local OS
        pathlib.PurePath(path).as_posix(),
        extra_args={'ACL': 'public-read', 'ContentType': 'audio/mpeg'}
    )

In [None]:
# Only build a public URL for rows that have a local mp3; otherwise the
# column would hold a bogus link ending in "/None".
feed_df['feed_url'] = feed_df.mp3_path.apply(
    lambda x: f'https://studs-terkel-radio-archive-feed.s3.amazonaws.com/{x}' if pd.notnull(x) else None
)

In [None]:
feed_df.sort_values(["feed_date"]).to_csv("../data/feed.csv", index=False)