In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import time
import os
from tqdm import tqdm

# Election index page that is fetched to discover the links to each division page.
MOTHER_URL = "https://whocanivotefor.co.uk/elections/local.kent.2025-05-01/kent-local-election/"
# Site root; relative hrefs found in the scraped pages are joined onto this to build absolute URLs.
BASE_URL = "https://whocanivotefor.co.uk"
# JSON Lines output file (one profile per line). The path is relative, so it
# resolves against the current working directory.
OUTPUT_JSONL = "../data/jsons/kent_candidate_profiles.jsonl"

# Utilities
def load_existing_jsonl(path):
    """Return the set of profile URLs already stored in a JSON Lines file.

    Used to resume a previous run without re-scraping saved profiles.

    Args:
        path: Path of the JSON Lines file. A missing file is not an error.

    Returns:
        A set with the ``profile_url`` value of every parseable record.
        Blank lines, lines that are not valid JSON, records that are not
        JSON objects and records without a ``profile_url`` are skipped.
    """
    seen = set()
    if not os.path.exists(path):
        return seen

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Tolerate a corrupt/truncated line (e.g. from an interrupted
                # run) without hiding unrelated errors or Ctrl-C.
                continue
            if not isinstance(data, dict):
                continue
            profile_url = data.get("profile_url")
            if profile_url:
                seen.add(profile_url)
    return seen

def save_jsonl_entry(path, data):
    """Append one record to a JSON Lines file.

    Args:
        path: Destination file. It is opened in append mode, and its parent
            directory is created first if it does not exist yet.
        data: JSON-serialisable object, written as a single line.
    """
    parent = os.path.dirname(path)
    if parent:
        # Without this, a missing output directory raises FileNotFoundError
        # on the first write.
        os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

# Step 1: Get all child division links from the mother page
def get_child_division_urls(mother_url, timeout=30):
    """Collect the division page URLs linked from the election index page.

    Args:
        mother_url: URL of the index page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A sorted list of unique absolute URLs.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    res = requests.get(mother_url, timeout=timeout)
    # Fail loudly rather than parsing an error page and reporting zero divisions.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select("a[href^='/elections/local.kent']")
    # Only hrefs containing more than three "/" are kept.
    # NOTE(review): presumably this drops shallower, non-division links - confirm.
    division_urls = {urljoin(BASE_URL, a["href"]) for a in links if a["href"].count("/") > 3}
    return sorted(division_urls)

# Step 2: Extract all candidate profile URLs from a division page
def get_candidate_urls(division_url, timeout=30):
    """Collect the candidate profile URLs linked from one division page.

    Args:
        division_url: URL of the division page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute ``/person/`` URLs in page order, with duplicates
        removed.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    res = requests.get(division_url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select("a[href^='/person/']")
    # dict.fromkeys de-duplicates while keeping first-seen order, so a profile
    # linked several times on the page is only returned once.
    return list(dict.fromkeys(urljoin(BASE_URL, a["href"]) for a in links))

# Step 3: Parse the candidate profile page
def scrape_candidate_profile(url, timeout=30):
    """Fetch one candidate profile page and extract its fields into a dict.

    Args:
        url: Absolute URL of the candidate's profile page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict describing the candidate, or ``None`` when the response status
        is not 200 or the page has no ``h2.ds-candidate-name`` element.
        Every key is always present; fields that were not found stay ``None``
        (or empty for ``social_links`` / ``elections``).

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    res = requests.get(url, timeout=timeout)
    if res.status_code != 200:
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    name_tag = soup.select_one("h2.ds-candidate-name")
    if not name_tag:
        return None

    # NOTE: "party", "ward", "votes" and "position" are placeholders; nothing
    # in this function fills them in.
    profile = {
        "profile_url": url,
        "name": name_tag.text.strip(),
        "photo_url": None,
        "party": None,
        "ward": None,
        "votes": None,
        "position": None,
        "statement": None,
        "email": None,
        "social_links": {},
        "elections": [],
        # Declared up front so every record has the same set of keys.
        "summary": None,
    }

    # Image: resolve against the page URL so a relative src becomes absolute.
    img_tag = soup.select_one("img[alt*='profile photo']")
    photo_src = img_tag.get("src") if img_tag else None
    if photo_src:
        profile["photo_url"] = urljoin(url, photo_src)

    # Paragraph with election summary (first <p> inside section.ds-candidate)
    p_tag = soup.select_one("section.ds-candidate p")
    if p_tag:
        profile["summary"] = p_tag.text.strip()

    # Statement (first <blockquote> on the page)
    quote_block = soup.select_one("blockquote")
    if quote_block:
        profile["statement"] = quote_block.get_text(" ", strip=True)

    # Email: the visible text of the first mailto: link
    email_tag = soup.select_one("a[href^='mailto:']")
    if email_tag:
        profile["email"] = email_tag.text.strip()

    # Social and external links, keyed by the lower-cased <dt> label
    for dt in soup.select("dl.ds-descriptions dt"):
        label = dt.text.strip().lower()
        dd = dt.find_next_sibling("div") or dt.find_next_sibling("dd")
        link = dd.find("a") if dd else None
        if link:
            profile["social_links"][label] = link.get("href")

    # Election table: first <table>, skipping its first row
    table = soup.select_one("table")
    if table:
        for row in table.select("tr")[1:]:
            cols = row.find_all("td")
            # Rows with fewer than five cells are ignored.
            if len(cols) >= 5:
                profile["elections"].append({
                    "year": cols[0].text.strip(),
                    "election": cols[1].text.strip(),
                    "party": cols[2].text.strip(),
                    "result": cols[3].text.strip(),
                    "position": cols[4].text.strip()
                })

    return profile

# MAIN SCRIPT
def main():
    """Scrape every candidate profile and append new ones to OUTPUT_JSONL.

    Profiles already present in the output file are skipped, so the script
    can be re-run to resume an interrupted scrape. A failure on one division
    page or one candidate page is reported and does not stop the run.
    """
    seen_profiles = load_existing_jsonl(OUTPUT_JSONL)
    child_urls = get_child_division_urls(MOTHER_URL)
    print(f"🔍 Found {len(child_urls)} divisions")

    for division_url in tqdm(child_urls, desc="Scanning divisions"):
        try:
            candidate_urls = get_candidate_urls(division_url)
        except Exception as e:
            print(f"❌ Error in {division_url}: {e}")
            continue

        for candidate_url in candidate_urls:
            if candidate_url in seen_profiles:
                continue
            # Errors are handled per candidate so one bad profile does not
            # discard the remaining candidates of the same division.
            try:
                data = scrape_candidate_profile(candidate_url)
                if data:
                    save_jsonl_entry(OUTPUT_JSONL, data)
                    seen_profiles.add(candidate_url)
            except Exception as e:
                print(f"❌ Error in {candidate_url}: {e}")
            finally:
                time.sleep(2)  # Throttle, also after a failed request

    print(f"✅ Finished scraping. Saved to {OUTPUT_JSONL}")

# Run the scraper only when this code is executed as the main program, not when imported.
if __name__ == "__main__":
    main()

🔍 Found 72 divisions


Scanning divisions: 100%|██████████| 72/72 [17:43<00:00, 14.77s/it]

✅ Finished scraping. Saved to ../data/jsons/kent_candidate_profiles.jsonl



