# Studs Terkel Radio Archive Scraper

Downloads program information from [WFMT's archive](https://studsterkel.wfmt.com/explore#t=date) of radio programs hosted by Studs Terkel.

### Import Python tools

In [40]:
import os
import time
import boto3
import requests
import pathlib
import pandas as pd
import urllib.request
from datetime import date
from bs4 import BeautifulSoup
from boto3.s3.transfer import S3Transfer

### Download program list

Fetch the URL

In [None]:
# Fetch the index page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status stops us from parsing an error page.
r = requests.get("https://studsterkel.wfmt.com/explore#t=date", timeout=30)
r.raise_for_status()

Parse it

In [None]:
html = r.text

In [None]:
# Name the parser explicitly so the parse doesn't depend on which optional
# parser libraries happen to be installed.
soup = BeautifulSoup(html, "html.parser")

Pull all the segmented blocks on the page

In [None]:
year_list = soup.find_all(class_="prog_year_block")

Loop through them and grab all the URLs, which lead to program pages.

In [None]:
link_list = []

In [None]:
for year in year_list:
    # Only match anchors that carry an href, so a bare <a> tag can't raise a KeyError
    a_list = year.find_all("a", href=True)
    link_list.extend(a['href'] for a in a_list)

Make sure that list is unique

In [None]:
unique_links = set(link_list)

Print the total number of URLs

In [None]:
len(unique_links)

### Scrape program pages

Prepare functions to parse pages

In [None]:
def parse_meta(e):
    """
    Parse a grid cell of metadata from the bottom of a program page.

    Each <p> tag inside the cell is read as a bolded (<strong>) label followed
    by its value. Paragraphs without a <strong> tag are skipped rather than
    raising an AttributeError.

    Returns a dictionary that maps each label to its whitespace-stripped value.
    As a side effect, the <strong> tags are removed from the parsed tree.
    """
    d = {}
    for p in e.find_all("p"):
        strong = p.strong
        # A paragraph with no bolded label has no key to file it under
        if strong is None:
            continue
        # Split out the bolded text as the label
        label = strong.text
        strong.extract()
        # Keep the rest as the value
        d[label] = p.text.strip()
    # Return everything found in this block
    return d

In [None]:
def scrape_program(url):
    """
    Scrape a program page and return data.

    `url` is the site-relative path of the program page. It is appended to
    https://studsterkel.wfmt.com to build the address that is requested.

    Returns a dictionary with the title, mp3_url, archive_url and synopsis,
    plus whatever labeled metadata fields the page carries. Any of the first
    four values other than archive_url may be None when the page lacks the
    matching element. Returns None when the request errors out or comes back
    with a status code other than 200, so the caller can retry it later.
    """
    # Request the page
    full_url = f"https://studsterkel.wfmt.com{url}"
    print(f"Scraping {full_url}")
    headers = {
        'User-Agent': 'Studs Terkel Radio Archive Scraper (github.com/pastpages/studs-terkel-radio-feed/)',
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole crawl
        r = requests.get(full_url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        # Treat network errors like a bad status code so the URL gets retried
        print(f"Failed with error {e}")
        return None
    if r.status_code != 200:
        print(f"Failed with status code {r.status_code}")
        return None

    # Parse the HTML, naming the parser so results don't vary by installation
    soup = BeautifulSoup(r.text, "html.parser")

    # Pull out the title, if it exists
    heading = soup.find("h1")
    title = heading.text if heading is not None else None

    # Parse out all metadata, if the section exists
    meta = {}
    meta_section = soup.find(class_="meta_data__section")
    if meta_section is not None:
        for e in meta_section.find_all(class_="col-4"):
            meta.update(parse_meta(e))

    # Grab the MP3 URL, if it exists
    media = soup.find(class_="audio_trigger")
    mp3_url = media['data-track-url'] if media else None

    # Grab the synopsis, if it exists
    summary = soup.find(class_="program_synopsis__body")
    if summary and summary.h2 is not None:
        synopsis = summary.h2.text
    else:
        synopsis = None

    # Return the scraped data
    return dict(
        title=title,
        mp3_url=mp3_url,
        archive_url=url,
        synopsis=synopsis,
        **meta
    )

Loop through all URLs and scrape each individual page.

In [None]:
program_list, dud_list = [], []

In [None]:
# Visit every program page, filing each URL as a success or a dud
for url in unique_links:
    d = scrape_program(url)
    if not d:
        # Nothing came back, so queue the URL for a second attempt
        dud_list.append(url)
    else:
        program_list.append(d)
    # Pause between requests
    time.sleep(0.33)

Repeat with the duds to give them a second chance

In [None]:
# Keep track of URLs that fail a second time so they aren't silently lost
failed_twice = []
for url in dud_list:
    d = scrape_program(url)
    if d:
        program_list.append(d)
    else:
        failed_twice.append(url)
    time.sleep(0.33)

# Report anything that never scraped so the gap in the data is visible
if failed_twice:
    print(f"{len(failed_twice)} URLs could not be scraped: {failed_twice}")

Convert to a dataframe

In [None]:
# Map the metadata labels found on program pages onto snake_case column names
column_names = {
    "Broadcast Date": "broadcast_date",
    "Physical Format": "physical_format",
    "Digital Format": "digital_format",
    "Ownership": "ownership",
    "Language": "language",
    "Program Sponsor": "program_sponsor",
    "Duration": "duration"
}
df = pd.DataFrame(program_list)
df = df.rename(columns=column_names)

Calculate extra columns

In [None]:
def parse_date(s):
    """
    Convert a broadcast date string into a pandas Timestamp.

    Only values made up of exactly three space-separated parts (month, day
    and year) are parsed. Null values, values with any other number of parts
    and values pandas can't interpret as a date all return None.
    """
    if pd.isnull(s):
        return None
    # Anything other than three parts is too incomplete to treat as a full date
    if len(str(s).split(" ")) != 3:
        return None
    # Coerce unparseable text to NaT instead of raising in the middle of an apply
    parsed = pd.to_datetime(s, errors="coerce")
    return None if pd.isnull(parsed) else parsed

In [None]:
df['broadcast_datetime'] = df.broadcast_date.apply(parse_date)

In [None]:
df['broadcast_year'] = df.broadcast_datetime.dt.year

In [None]:
df['broadcast_month'] = df.broadcast_datetime.dt.month

In [None]:
df['broadcast_monthday'] = df.broadcast_datetime.dt.day

In [None]:
df['has_mp3_url'] = ~pd.isnull(df.mp3_url)

Validate results

In [None]:
assert len(df) == len(unique_links)

Calculate statistics

In [None]:
df.has_mp3_url.value_counts()

### Export

In [None]:
# Write to ../data so the file lands where the mp3 download step reads programs.csv from
df.sort_values(["broadcast_month", "broadcast_monthday", "broadcast_year"]).to_csv("../data/programs.csv", index=False)

### Download mp3 files

In [80]:
sked = pd.read_csv("../data/schedule.csv")

In [81]:
programs = pd.read_csv("../data/programs.csv")

In [82]:
# Keep only the schedule rows that point at an archive page, then attach
# the scraped program details to each of them
feed_df = sked[sked.archive_url.notnull()].merge(programs, how="inner", on="archive_url")

In [83]:
this_dir = os.path.abspath("")

In [84]:
parent_dir = os.path.dirname(this_dir)

In [91]:
def download_mp3(mp3_url):
    """
    Download the mp3 URL and return the local file path.

    The file is saved in the mp3 folder under parent_dir and named after the
    last segment of the URL, with any "published%2F" prefix removed. If a file
    already exists at that path, the download is skipped.

    Returns the local path, or None when there is no URL to fetch.
    """
    # If there's no URL, there's no path
    if not mp3_url or pd.isnull(mp3_url):
        return None

    # Build the local path using the URL
    print(mp3_url)
    filename = mp3_url.split("/")[-1].replace("published%2F", "")
    filepath = os.path.join(parent_dir, f"mp3/{filename}")

    # If the file is already downloaded, we're good to go
    if os.path.exists(filepath):
        return filepath

    # Make sure the download folder exists before writing into it
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # Send the archive site as the referer, without installing a global opener
    opener = urllib.request.build_opener()
    opener.addheaders = [("Referer", "https://studsterkel.wfmt.com/")]

    # Write under a temporary name first so an interrupted download is never
    # mistaken for a finished file on the next run
    partial_path = f"{filepath}.part"
    with opener.open(mp3_url) as response, open(partial_path, "wb") as f:
        for chunk in iter(lambda: response.read(64 * 1024), b""):
            f.write(chunk)
    os.replace(partial_path, filepath)

    # Pause between downloads
    time.sleep(0.33)

    # Return the path
    return filepath

In [92]:
feed_df['mp3_path'] = feed_df.mp3_url.apply(download_mp3)

https://wfmt-studs-terkel.s3.amazonaws.com/published%2F40605.mp3
https://wfmt-studs-terkel.s3.amazonaws.com/published%2F33489.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/6537.mp3
https://wfmt-studs-terkel.s3.amazonaws.com/published/28275.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/10904.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/2158.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/5942.mp3
https://wfmt-studs-terkel.s3.amazonaws.com/published/33494.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/18777.mp3
https://wfmt-studs-terkel.s3.amazonaws.com/published%2F32660.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/11142.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/10766.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/10125.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/published/6826.mp3
https://wfmt-studs-terkel.s3.amazonaws.com/published%2F37043.mp3
https://s3.amazonaws.com/wfmt-studs-terkel/publishe

In [85]:
def _get_s3_client(bucket, region="us-east-1"):
    """
    Build an S3Transfer object for uploading files.

    The key pair is read from the AWS_ACCESS_KEY_ID and AWS_ACCESS_KEY_SECRET
    environment variables, and `region` is handed to boto3 when the client is
    created. The `bucket` argument is accepted but is not used here.
    """
    s3 = boto3.client(
        's3',
        region,
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_ACCESS_KEY_SECRET'),
    )
    return S3Transfer(s3)

In [93]:
feed_df[pd.isnull(feed_df.mp3_url)]

Unnamed: 0,feed_date,archive_url,title,mp3_url,synopsis,broadcast_date,physical_format,Duration,digital_format,ownership,language,program_sponsor,broadcast_datetime,broadcast_year,broadcast_month,broadcast_monthday,has_mp3_url,mp3_path


In [94]:
bucket = 'studs-terkel-radio-archive-feed'
client = _get_s3_client(bucket)
# Rows without a downloaded file have a null path, so there is nothing to upload for them
for path in feed_df.mp3_path.dropna():
    print(f"Uploading {path} to {bucket}")
    # NOTE(review): the local file path is also passed as the object key, so the
    # key includes this machine's directory layout — confirm that's intended
    client.upload_file(
        path,
        bucket,
        path,
        extra_args={'ACL': 'public-read', 'ContentType': 'audio/mpeg'}
    )

Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/40605.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/33489.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/6537.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/28275.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/10904.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/2158.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/5942.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/33494.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp3/18777.mp3 to studs-terkel-radio-archive-feed
Uploading /home/palewire/Code/studs-terkel-podcast-feed/mp

In [95]:
# Build the public URL from the path; leave it empty when there is no file,
# instead of producing a link that ends in "None"
feed_df['feed_url'] = feed_df.mp3_path.apply(
    lambda x: f'https://studs-terkel-radio-archive-feed.s3.amazonaws.com/{x}' if pd.notnull(x) else None
)

In [97]:
feed_df.sort_values(["feed_date"]).to_csv("../data/feed.csv", index=False)