## Scraping speaker biographies from the Kent public-i TV website

This scraper collects information about councillors from public-i TV websites. TV recordings stay on the website for a limited period only (several months). The module below scrapes the speakers' biographies to populate people.jsonl.

In [5]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import os
import time
from tqdm import tqdm

# Root of the Kent public-i.tv portal; webcast URLs are built from it and
# speaker links are resolved against it.
BASE_URL = "https://kent.public-i.tv"
# First and last webcast ID probed by main() (both ends are included).
START_ID = 953662
END_ID = 966039
# Text file that receives one valid webcast URL per line (opened in append mode).
OUTPUT_WEBCAST_FILE = "../data/jsons/valid_webcasts_with_speakers.txt"
# JSONL file that receives one speaker record per line (opened in append mode).
OUTPUT_JSONL_FILE = "../data/jsons/speaker_biographies.jsonl"

def load_existing_jsonl(path):
    """Return the set of biography URLs already recorded in a JSONL file.

    Args:
        path: Path to a JSONL file whose records may carry a
            ``"biography_url"`` key. A missing file is treated as empty.

    Returns:
        A set with every non-empty string ``biography_url`` found. Blank
        lines, lines that are not valid JSON, records that are not JSON
        objects and records without a biography URL are skipped.
    """
    seen = set()
    if not os.path.exists(path):
        return seen

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # A corrupt or truncated line must not abort loading; only
                # decode errors are swallowed so Ctrl-C still works.
                continue
            if not isinstance(entry, dict):
                continue
            bio_url = entry.get("biography_url")
            # Keep only real URLs: a missing key would otherwise put None
            # into the set.
            if isinstance(bio_url, str) and bio_url:
                seen.add(bio_url)
    return seen

def save_jsonl_entry(path, data):
    """Append ``data`` to the file at ``path`` as one JSON line.

    The file is opened in append mode as UTF-8 and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        record = json.dumps(data, ensure_ascii=False)
        sink.write(f"{record}\n")

def is_valid_webcast(meeting_id):
    """Check whether a webcast ID resolves to a page that lists speakers.

    Returns ``(True, url, soup)`` when the page answers with HTTP 200 and
    contains at least one speaker-profile link, otherwise
    ``(False, None, None)``. Any exception is printed and reported as an
    invalid webcast.
    """
    rejected = (False, None, None)
    page_url = f"{BASE_URL}/core/portal/webcast_interactive/{meeting_id}"
    try:
        page = requests.get(page_url, timeout=10)
        if page.status_code == 200:
            parsed = BeautifulSoup(page.text, "html.parser")
            if parsed.select('a[href*="/core/portal/speaker_profile/"]'):
                return True, page_url, parsed
        return rejected
    except Exception as err:
        print(f"⚠️ Error on {meeting_id}: {err}")
        return rejected

def extract_speaker_links_from_soup(soup):
    """Return the absolute speaker-profile URLs linked from a parsed page.

    Args:
        soup: Parsed page to search for speaker-profile anchors.

    Returns:
        A list of unique absolute URLs, in the order they first appear in
        the page.
    """
    anchors = soup.select('a[href*="/core/portal/speaker_profile/"]')
    # dict.fromkeys de-duplicates while keeping insertion order, so repeated
    # runs visit speakers in the same order (a set's order can vary per run).
    return list(dict.fromkeys(urljoin(BASE_URL, a['href']) for a in anchors))

def scrape_speaker_profile(url):
    """Scrape one speaker-profile page into a dictionary.

    Args:
        url: Absolute URL of the speaker-profile page.

    Returns:
        A dict with the keys ``profile_url``, ``name``, ``photo_url``,
        ``organisation``, ``biography_url``, ``email`` and
        ``recent_activity`` (a list of dicts with ``committee``,
        ``datetime`` and ``webcast_url``). Any field that cannot be found
        on the page is ``None``.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the request exceeds the 10-second timeout.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    profile = {'profile_url': url}
    name_tag = soup.select_one('h2.cs_heading_font_family')
    profile['name'] = name_tag.text.strip() if name_tag else None

    img_tag = soup.select_one('div.col-md-4 img')
    img_src = img_tag.get('src') if img_tag else None
    profile['photo_url'] = urljoin(url, img_src) if img_src else None

    # Read the value from the <dd> that follows the "Organisation" label.
    # NOTE(review): the previous code looked up a sibling <dt>, which yields
    # the next label (or crashes when there is none) - confirm against the
    # live markup.
    org_dt = soup.find('dt', string='Organisation')
    org_dd = org_dt.find_next_sibling('dd') if org_dt else None
    profile['organisation'] = org_dd.text.strip() if org_dd else None

    bio_dt = soup.find('dt', string='Biography')
    bio_anchor = bio_dt.find_next('a') if bio_dt else None
    bio_href = bio_anchor.get('href') if bio_anchor else None
    # Resolve against the page URL so a relative link can still be fetched;
    # absolute links pass through urljoin unchanged.
    profile['biography_url'] = urljoin(url, bio_href) if bio_href else None

    email_dt = soup.find('dt', string='Email')
    email_anchor = email_dt.find_next('a') if email_dt else None
    profile['email'] = email_anchor.text.strip() if email_anchor else None

    activities = []
    for activity in soup.select('div.recent_activity_webcast'):
        committee = activity.select_one('div.font-weight-bold')
        # The element right after the committee name holds the date/time text.
        when = committee.find_next_sibling('div') if committee else None
        watch_link = activity.select_one('a.btn-primary')
        watch_href = watch_link.get('href') if watch_link else None
        activities.append({
            'committee': committee.text.strip() if committee else None,
            'datetime': when.text.strip() if when else None,
            'webcast_url': urljoin(url, watch_href) if watch_href else None,
        })
    profile['recent_activity'] = activities
    return profile

def scrape_biography_page(url, retries=3, delay=3):
    """Fetch a biography page and extract its details, retrying on failure.

    Args:
        url: URL of the biography page.
        retries: Maximum number of attempts.
        delay: Seconds to wait between failed attempts.

    Returns:
        A dict mapping each ``<dt>`` label inside the page's
        ``dl.dl-horizontal`` list to the text of its ``<dd>`` (empty string
        when there is none), plus ``"FullText"`` with the text of the first
        ``div.content`` when present. If every attempt fails, or no attempt
        is made because ``retries`` is not positive, the result is
        ``{"bio_scrape_error": <message>}``. The function never returns
        ``None``, so callers can always ``dict.update`` with the result.
    """
    last_error = "no attempt made (retries must be >= 1)"
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            # Treat HTTP error statuses as failures so they are retried
            # instead of an error page being stored as a biography.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            profile_info = {}
            dl = soup.find('dl', class_='dl-horizontal')
            if dl:
                for dt in dl.find_all('dt'):
                    dd = dt.find_next_sibling('dd')
                    profile_info[dt.get_text(strip=True)] = (
                        dd.get_text(strip=True) if dd else ''
                    )

            bio_section = soup.find('div', class_='content')
            if bio_section:
                profile_info['FullText'] = bio_section.get_text(separator=' ', strip=True)

            return profile_info
        except Exception as e:
            # Best-effort boundary: remember the failure and retry; the
            # caller gets an error record rather than an exception.
            last_error = str(e)
            if attempt < retries - 1:
                time.sleep(delay)
    return {"bio_scrape_error": last_error}

def main():
    """Scan the webcast ID range and save biographies of newly seen speakers.

    Every ID from START_ID to END_ID (inclusive) is probed. Each valid
    webcast URL is appended to OUTPUT_WEBCAST_FILE and flushed straight
    away; each linked speaker whose biography URL has not been recorded yet
    is scraped and appended to OUTPUT_JSONL_FILE. Speakers without a
    biography URL are skipped, and a failure on one speaker is printed
    without stopping the scan.
    """
    known_bios = load_existing_jsonl(OUTPUT_JSONL_FILE)

    with open(OUTPUT_WEBCAST_FILE, mode="a", encoding="utf-8") as webcast_log:
        for meeting_id in tqdm(range(START_ID, END_ID + 1)):
            ok, page_url, page_soup = is_valid_webcast(meeting_id)
            if not ok:
                continue

            print(f"✔️ Valid webcast: {page_url}")
            webcast_log.write(page_url + "\n")
            webcast_log.flush()

            for speaker_url in extract_speaker_links_from_soup(page_soup):
                try:
                    record = scrape_speaker_profile(speaker_url)
                    bio_url = record.get("biography_url")

                    if bio_url and bio_url not in known_bios:
                        record.update(scrape_biography_page(bio_url))
                        save_jsonl_entry(OUTPUT_JSONL_FILE, record)
                        known_bios.add(bio_url)

                        print(f"   → Saved {record['name']}")
                        # Pause only after a biography was actually saved.
                        time.sleep(3)
                except Exception as err:
                    print(f"❌ Failed speaker {speaker_url}: {err}")

if __name__ == "__main__":
    main()

  0%|          | 0/12378 [00:00<?, ?it/s]

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/953662


  5%|▍         | 575/12378 [01:37<32:35,  6.03it/s] 

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/954237
   → Saved Trevor Bond


 10%|█         | 1245/12378 [05:20<56:58,  3.26it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/954907


 19%|█▉        | 2387/12378 [11:45<50:37,  3.29it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/956049


 19%|█▉        | 2388/12378 [11:46<1:56:14,  1.43it/s]

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/956050


 25%|██▌       | 3101/12378 [15:59<49:59,  3.09it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/956763


 31%|███▏      | 3879/12378 [20:16<45:09,  3.14it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/957541


 50%|████▉     | 6145/12378 [33:19<34:31,  3.01it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/959807


100%|█████████▉| 12377/12378 [1:09:02<00:00,  3.07it/s]  

✔️ Valid webcast: https://kent.public-i.tv/core/portal/webcast_interactive/966039


100%|██████████| 12378/12378 [1:09:05<00:00,  2.99it/s]


### Individual meetings scraping

In [41]:
# Webcast pages whose speakers should be scraped; main() iterates this list.
WEBCAST_URLS = [
    "https://kent.public-i.tv/core/portal/webcast_interactive/953662",
    # more...
]

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import csv
import time
import os

from tqdm import tqdm
import pandas as pd

# Root of the Kent public-i.tv portal; speaker links are resolved against it.
BASE_URL = "https://kent.public-i.tv"
# JSONL file that receives one speaker record per line (opened in append mode).
BIO_OUTPUT_JSONL = "../data/jsons/speaker_biographies.jsonl"


# -------------- UTILS --------------

def load_existing_jsonl(path):
    """Return the set of biography URLs already recorded in a JSONL file.

    Args:
        path: Path to a JSONL file whose records may carry a
            ``"biography_url"`` key. A missing file is treated as empty.

    Returns:
        A set with every non-empty string ``biography_url`` found. Blank
        lines, lines that are not valid JSON, records that are not JSON
        objects and records without a biography URL are skipped.
    """
    seen = set()
    if not os.path.exists(path):
        return seen

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                # A corrupt or truncated line must not abort loading; only
                # decode errors are swallowed so Ctrl-C still works.
                continue
            if not isinstance(entry, dict):
                continue
            bio_url = entry.get("biography_url")
            # Keep only real URLs: a missing key would otherwise put None
            # into the set.
            if isinstance(bio_url, str) and bio_url:
                seen.add(bio_url)
    return seen

def save_jsonl_entry(path, data):
    """Append ``data`` to the file at ``path`` as one JSON line.

    The file is opened in append mode as UTF-8 and non-ASCII characters are
    written as-is rather than escaped.
    """
    with open(path, mode="a", encoding="utf-8") as sink:
        record = json.dumps(data, ensure_ascii=False)
        sink.write(f"{record}\n")

# -------------- STEP 1: SCRAPE SPEAKER LINKS --------------

def extract_speaker_links(webcast_url):
    """Fetch a webcast page and return the speaker-profile URLs it links to.

    Args:
        webcast_url: URL of the webcast page to scan.

    Returns:
        A list of unique absolute speaker-profile URLs, in the order they
        first appear in the page. If the page cannot be fetched or parsed,
        the error is printed and an empty list is returned.
    """
    try:
        response = requests.get(webcast_url, timeout=10)
        # Report HTTP errors instead of silently parsing an error page and
        # claiming the webcast has zero speakers.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.select('a[href*="/core/portal/speaker_profile/"]')
        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # repeated runs visit speakers in the same order.
        return list(dict.fromkeys(urljoin(BASE_URL, link['href']) for link in links))
    except Exception as e:
        print(f"❌ Failed to load {webcast_url}: {e}")
        return []

# -------------- STEP 2: SCRAPE SPEAKER PROFILE --------------

def scrape_speaker_profile(url):
    """Scrape one speaker-profile page into a dictionary.

    Args:
        url: Absolute URL of the speaker-profile page.

    Returns:
        A dict with the keys ``profile_url``, ``name``, ``photo_url``,
        ``organisation``, ``biography_url``, ``email`` and
        ``recent_activity`` (a list of dicts with ``committee``,
        ``datetime`` and ``webcast_url``). Any field that cannot be found
        on the page is ``None``.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the request exceeds the 10-second timeout.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    profile = {'profile_url': url}
    name_tag = soup.select_one('h2.cs_heading_font_family')
    profile['name'] = name_tag.text.strip() if name_tag else None

    img_tag = soup.select_one('div.col-md-4 img')
    img_src = img_tag.get('src') if img_tag else None
    profile['photo_url'] = urljoin(url, img_src) if img_src else None

    # Read the value from the <dd> that follows the "Organisation" label.
    # NOTE(review): the previous code looked up a sibling <dt>, which yields
    # the next label (or crashes when there is none) - confirm against the
    # live markup.
    org_dt = soup.find('dt', string='Organisation')
    org_dd = org_dt.find_next_sibling('dd') if org_dt else None
    profile['organisation'] = org_dd.text.strip() if org_dd else None

    bio_dt = soup.find('dt', string='Biography')
    bio_anchor = bio_dt.find_next('a') if bio_dt else None
    bio_href = bio_anchor.get('href') if bio_anchor else None
    # Resolve against the page URL so a relative link can still be fetched;
    # absolute links pass through urljoin unchanged.
    profile['biography_url'] = urljoin(url, bio_href) if bio_href else None

    email_dt = soup.find('dt', string='Email')
    email_anchor = email_dt.find_next('a') if email_dt else None
    profile['email'] = email_anchor.text.strip() if email_anchor else None

    activities = []
    for activity in soup.select('div.recent_activity_webcast'):
        committee = activity.select_one('div.font-weight-bold')
        # The element right after the committee name holds the date/time text.
        when = committee.find_next_sibling('div') if committee else None
        watch_link = activity.select_one('a.btn-primary')
        watch_href = watch_link.get('href') if watch_link else None
        activities.append({
            'committee': committee.text.strip() if committee else None,
            'datetime': when.text.strip() if when else None,
            'webcast_url': urljoin(url, watch_href) if watch_href else None,
        })
    profile['recent_activity'] = activities
    return profile

# -------------- STEP 3: SCRAPE BIOGRAPHY PAGE --------------

def scrape_biography_page(url, retries=3, delay=3):
    """Fetch a biography page and extract its details, retrying on failure.

    Args:
        url: URL of the biography page.
        retries: Maximum number of attempts.
        delay: Seconds to wait between failed attempts.

    Returns:
        A dict mapping each ``<dt>`` label inside the page's
        ``dl.dl-horizontal`` list to the text of its ``<dd>`` (empty string
        when there is none), plus ``"FullText"`` with the text of the first
        ``div.content`` when present. If every attempt fails, or no attempt
        is made because ``retries`` is not positive, the result is
        ``{"bio_scrape_error": <message>}``. The function never returns
        ``None``, so callers can always ``dict.update`` with the result.
    """
    last_error = "no attempt made (retries must be >= 1)"
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            # Treat HTTP error statuses as failures so they are retried
            # instead of an error page being stored as a biography.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            profile_info = {}
            dl = soup.find('dl', class_='dl-horizontal')
            if dl:
                for dt in dl.find_all('dt'):
                    dd = dt.find_next_sibling('dd')
                    profile_info[dt.get_text(strip=True)] = (
                        dd.get_text(strip=True) if dd else ''
                    )

            bio_section = soup.find('div', class_='content')
            if bio_section:
                profile_info['FullText'] = bio_section.get_text(separator=' ', strip=True)

            return profile_info
        except Exception as e:
            # Best-effort boundary: remember the failure and retry; the
            # caller gets an error record rather than an exception.
            last_error = str(e)
            if attempt < retries - 1:
                time.sleep(delay)
    return {"bio_scrape_error": last_error}

# -------------- MAIN DRIVER --------------

def main():
    """Scrape the speakers of every URL in WEBCAST_URLS into BIO_OUTPUT_JSONL.

    For each webcast the linked speaker profiles are fetched; a speaker is
    saved only when the profile has a biography URL that has not been
    recorded yet. A failure on one speaker is printed without stopping the
    run.
    """
    already_saved = load_existing_jsonl(BIO_OUTPUT_JSONL)

    for webcast_url in WEBCAST_URLS:
        print(f"\n🔍 Scanning: {webcast_url}")
        profile_links = extract_speaker_links(webcast_url)
        print(f"✅ Found {len(profile_links)} speaker profiles")

        for speaker_url in tqdm(profile_links):
            try:
                record = scrape_speaker_profile(speaker_url)
                bio_url = record.get("biography_url")

                if bio_url and bio_url not in already_saved:
                    record.update(scrape_biography_page(bio_url))
                    save_jsonl_entry(BIO_OUTPUT_JSONL, record)
                    already_saved.add(bio_url)

                    time.sleep(1)  # throttle
                    print(f"✔️ {record['name']}")

            except Exception as err:
                print(f"❌ Error processing {speaker_url}: {err}")

if __name__ == "__main__":
    main()



🔍 Scanning: https://kent.public-i.tv/core/portal/webcast_interactive/953662
✅ Found 10 speaker profiles


 10%|█         | 1/10 [00:01<00:13,  1.48s/it]

✔️ Robert Thomas


 10%|█         | 1/10 [00:02<00:26,  2.95s/it]


KeyboardInterrupt: 

### Identify all pages with valid meetings (from a range)

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm

# Root of the Kent public-i.tv portal; webcast URLs are built from it.
BASE_URL = "https://kent.public-i.tv"
# First and last webcast ID probed by main() (both ends are included).
START_ID = 953662
END_ID = 966039
# Text file that receives one valid webcast URL per line (opened in append mode).
OUTPUT_FILE = "../data/jsons/valid_webcasts_with_speakers_test.txt"

def is_valid_webcast(meeting_id):
    """Check whether a webcast ID resolves to a page that lists speakers.

    Returns ``(True, url)`` when the page answers with HTTP 200 and contains
    at least one speaker-profile link, otherwise ``(False, None)``. Any
    exception is printed and reported as an invalid webcast.
    """
    not_found = (False, None)
    page_url = f"{BASE_URL}/core/portal/webcast_interactive/{meeting_id}"
    try:
        page = requests.get(page_url, timeout=10)
        if page.status_code == 200:
            parsed = BeautifulSoup(page.text, "html.parser")
            speaker_anchors = parsed.select('a[href*="/core/portal/speaker_profile/"]')
            return (True, page_url) if speaker_anchors else not_found
        return not_found

    except Exception as err:
        print(f"⚠️ Error on {meeting_id}: {err}")
        return not_found

def main():
    """Probe every webcast ID in the range and log the valid ones.

    Each ID from START_ID to END_ID (inclusive) is checked; the URL of every
    valid webcast is printed and appended to OUTPUT_FILE, one per line.
    """
    print(f"🔍 Scanning webcast IDs {START_ID} to {END_ID}...")
    with open(OUTPUT_FILE, mode="a", encoding="utf-8") as log:
        for meeting_id in tqdm(range(START_ID, END_ID + 1)):
            found, page_url = is_valid_webcast(meeting_id)
            if not found:
                continue
            print(f"✔️ Found: {page_url}")
            log.write(page_url + "\n")
            # Flush per hit so an interrupted scan keeps what it found.
            log.flush()

    print(f"\n✅ Done. Valid webcasts written to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

🔍 Scanning webcast IDs 953662 to 966039...


  0%|          | 2/12378 [00:00<59:46,  3.45it/s]  

✔️ Found: https://kent.public-i.tv/core/portal/webcast_interactive/953662


  5%|▍         | 576/12378 [03:04<1:38:50,  1.99it/s]

✔️ Found: https://kent.public-i.tv/core/portal/webcast_interactive/954237


  5%|▌         | 651/12378 [03:29<1:02:52,  3.11it/s]


KeyboardInterrupt: 

### Mass Scraping Speaker Profiles - all related to Kent County 

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import time
from tqdm import tqdm

# URL prefix of a speaker-profile page; the numeric speaker ID is appended to it.
BASE_URL = "https://kent.public-i.tv/core/portal/speaker_profile/"
# JSONL file that receives one profile per line (opened in append mode).
# NOTE(review): the file name says 32000_55000 while the loop below scans
# range(32000, 33000) - confirm which range is intended.
OUTPUT_JSONL = "../data/jsons/new_speaker_profiles_32000_55000.jsonl"

def scrape_speaker_profile(url):
    """Return the URL and name of a speaker profile, or ``None`` if absent.

    ``None`` is returned when the page does not answer with HTTP 200, has no
    ``h2.cs_heading_font_family`` heading, or that heading contains the
    text "Error finding profile".
    """
    page = requests.get(url, timeout=10)
    if page.status_code != 200:
        return None

    heading = BeautifulSoup(page.text, 'html.parser').select_one("h2.cs_heading_font_family")
    if heading and "Error finding profile" not in heading.text:
        return {"profile_url": url, "name": heading.text.strip()}
    # Error pages and placeholders carry no usable profile.
    return None

# Probe a contiguous range of speaker IDs and append every profile that
# exists to OUTPUT_JSONL, one JSON object per line.
with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
    for speaker_id in tqdm(range(32000, 33000)):
        url = f"{BASE_URL}{speaker_id}"
        try:
            profile = scrape_speaker_profile(url)
        except Exception as e:
            # `except Exception` rather than a bare `except`, so Ctrl-C
            # (KeyboardInterrupt) can still stop the scan; the failure is
            # reported instead of being silently dropped.
            print(f"❌ Failed speaker {url}: {e}")
        else:
            if profile:
                f.write(json.dumps(profile, ensure_ascii=False) + "\n")
                # Flush per record so an interrupted scan keeps its results.
                f.flush()
        # Throttle after every request, failed ones included, so a run of
        # errors does not turn into rapid-fire requests against the server.
        time.sleep(3)