### Scrape elections results from a landing page

The settings are at the bottom of the main function - untoggle as needed

In [16]:
import requests
from bs4 import BeautifulSoup
import json
import time

BASE_URL = "https://democracy.kent.gov.uk:9071"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ElectionScraper/1.0)"
}

def scrape_election(eid, election_date, rpid=None):
    print(f"\n📋 Scraping election {eid} on {election_date}...\n")

    # Construct the master page URL
    start_url = f"{BASE_URL}/mgElectionElectionAreaResults.aspx?Page=all&EID={eid}"
    if rpid:
        start_url += f"&RPID={rpid}"

    # --- Step 1: Get all division result links
    res = requests.get(start_url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    links = []
    seen = set()

    for a in soup.select("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True)
        if "mgElectionAreaResults.aspx" in href and "ID=" in href:
            full_url = BASE_URL + "/" + href.lstrip("/")
            if full_url not in seen:
                seen.add(full_url)
                links.append((text, full_url))

    print(f"🔗 Found {len(links)} division links...")

    # --- Step 2: Scrape each division
    results = []
    failed = []

    for i, (name, url) in enumerate(links):
        print(f"[{i+1}/{len(links)}] Scraping: {name}")
        try:
            division_data = parse_division_page(name, url)
            division_data["election_date"] = election_date
            results.append(division_data)
            time.sleep(0.4)
        except Exception as e:
            print(f"⚠️ Failed on {name}: {e}")
            failed.append({"division": name, "url": url, "error": str(e)})

    # --- Step 3: Save results
    out_file = f"../data/elections/kent_results_{election_date}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    if failed:
        with open(f"failed_{election_date}.json", "w", encoding="utf-8") as f:
            json.dump(failed, f, indent=2)

    print(f"\n✅ Saved {len(results)} results to '{out_file}'")
    if failed:
        print(f"⚠️ {len(failed)} divisions failed — see 'failed_{election_date}.json'")

# ------------------------------------------
# Helper: Parse individual division page
# ------------------------------------------

def parse_division_page(name, url):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    tables = soup.select("table.mgStatsTable")

    if len(tables) < 1:
        return {
            "division": name,
            "url": url,
            "status": "no_tables_found"
        }

    # Try to locate correct tables by caption
    candidate_table = next((t for t in tables if "Candidate" in t.get_text() or "results" in t.get_text().lower()), None)
    summary_table = next((t for t in tables if "Voting Summary" in t.get_text()), None)

    if not candidate_table or not summary_table:
        return {
            "division": name,
            "url": url,
            "status": "incomplete_data"
        }

    # --- Candidate table
    candidates = []
    candidate_rows = candidate_table.find_all("tr")[1:]
    for row in candidate_rows:
        cols = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cols) != 5:
            continue
        candidates.append({
            "name": cols[0],
            "party": cols[1],
            "votes": int(cols[2].replace(",", "")),
            "percentage": cols[3],
            "outcome": cols[4]
        })

    # --- Summary table
    summary = {}
    summary_rows = summary_table.find_all("tr")[1:]
    for row in summary_rows:
        cols = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cols) != 2 or not cols[0].strip():
            continue
        key = cols[0].lower().replace(" ", "_")
        val = cols[1].replace(",", "")
        summary[key] = int(val) if val.isdigit() else val

    return {
        "division": name,
        "url": url,
        "status": "ok",
        "candidates": candidates,
        "summary": summary
    }

# ------------------------------------------
# Example usage
# ------------------------------------------

if __name__ == "__main__":
    #scrape_election(eid=51, election_date="2025-05-01")
    #scrape_election(eid=32, election_date="2021-05-06")
    #scrape_election(eid=20, election_date="2017-05-04")
    # scrape_election(eid=12, election_date="2013-05-02")
    scrape_election(eid=3,  election_date="2009-06-04")



📋 Scraping election 3 on 2009-06-04...

🔗 Found 72 division links...
[1/72] Scraping: Ashford Central
[2/72] Scraping: Ashford East
[3/72] Scraping: Ashford Rural East
[4/72] Scraping: Ashford Rural South
[5/72] Scraping: Ashford Rural West
[6/72] Scraping: Ashford South
[7/72] Scraping: Birchington and Villages
[8/72] Scraping: Broadstairs & Sir M M'fiore
[9/72] Scraping: Canterbury City North East
[10/72] Scraping: Canterbury City South West
[11/72] Scraping: Canterbury South East
[12/72] Scraping: Canterbury West
[13/72] Scraping: Cranbrook
[14/72] Scraping: Darent Valley
[15/72] Scraping: Dartford East
[16/72] Scraping: Dartford North East
[17/72] Scraping: Dartford Rural
[18/72] Scraping: Dartford West
[19/72] Scraping: Deal
[20/72] Scraping: Dover North
[21/72] Scraping: Dover Town
[22/72] Scraping: Dover West
[23/72] Scraping: Elham Valley
[24/72] Scraping: Faversham
[25/72] Scraping: Folkestone North East
[26/72] Scraping: Folkestone South
[27/72] Scraping: Folkestone West
[28

### Consolidate all jsons into one

In [98]:
import json
import os
from glob import glob

def merge_election_files(folder_path=".", output_file="../data/elections/kent_results_all_years.json"):
    merged = []
    files = sorted(glob(os.path.join(folder_path, "../data/elections/kent_results_20*.json")))

    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            for record in data:
                record["council"] = "Kent County Council"
                merged.append(record)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"✅ Merged {len(files)} files into '{output_file}' ({len(merged)} total records)")

if __name__ == "__main__":
    merge_election_files()


✅ Merged 5 files into '../data/elections/kent_results_all_years.json' (360 total records)


### Cleaning 

In [99]:
import re

# Reload the full data again (before any other cleaning)
with open("../data/elections/kent_results_all_years.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Define title-stripping function
def strip_titles(name):
    return re.sub(r",?\s+(MBE|OBE|CBE|KBE|DBE|CH|QC|KC|JP|DL|FRSA|FRICS|BA|MA|PhD|BSc|MSc|LLB|LLM)\b", "", name)

# Apply it directly to the raw data, before any other cleaning
for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        cand["name"] = strip_titles(cand["name"]).strip()

# Now data is clean in-memory (titles stripped, nothing else done yet)


In [100]:
# Function to split full name into first, middle, last
def split_name(full_name):
    full_name = full_name.strip().replace(",", "")  # remove commas and trim
    parts = full_name.split()

    if len(parts) == 0:
        return "", "", ""

    first_name = parts[0]
    last_name = parts[-1]
    middle_names = " ".join(parts[1:-1]) if len(parts) > 2 else ""

    return first_name, middle_names, last_name

# Apply the split to all candidate entries
for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        first, middle, last = split_name(cand["name"])
        cand["first_name"] = first
        cand["middle_names"] = middle
        cand["last_name"] = last



In [101]:
# Step 1: Reload original full data before any cleaning
with open("../data/elections/kent_results_all_years.json", "r", encoding="utf-8") as f:
    original_data = json.load(f)

# Step 2: Strip honorifics (MBE, OBE, etc.)
def strip_titles(name):
    return re.sub(r",?\s+(MBE|OBE|CBE|KBE|DBE|CH|QC|KC|JP|DL|FRSA|FRICS|BA|MA|PhD|BSc|MSc|LLB|LLM)\b", "", name)

# Step 3: Split full name into components
def split_name(full_name):
    full_name = full_name.strip().replace(",", "")
    parts = full_name.split()
    if len(parts) == 0:
        return "", "", ""
    first_name = parts[0]
    last_name = parts[-1]
    middle_names = " ".join(parts[1:-1]) if len(parts) > 2 else ""
    return first_name, middle_names, last_name

# Step 4: Build mapping from last name → set of (first_name, middle_names, full_name)
from collections import defaultdict

last_name_map = defaultdict(set)

for record in original_data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        stripped_name = strip_titles(cand["name"])
        first, middle, last = split_name(stripped_name)
        if last and first:
            last_name_map[last].add((first, middle, stripped_name))

# Step 5: Identify ambiguous last names (same last name, different first initials)
ambiguous_last_names = {}

for last, entries in last_name_map.items():
    initials = {first[0] for first, _, _ in entries if first}
    if len(initials) > 1:
        ambiguous_last_names[last] = sorted(entries)

# Step 6: Display in dataframe
ambiguous_df = pd.DataFrame([
    {
        "last_name": last,
        "first_names": ", ".join(sorted({first for first, _, _ in entries})),
        "examples": "; ".join(sorted(full for _, _, full in entries))
    }
    for last, entries in ambiguous_last_names.items()
])

ambiguous_df


Unnamed: 0,last_name,first_names,examples
0,Griffiths,"David, Jackie, John, Sophie",David John Griffiths; Jackie Griffiths; John G...
1,Angell,"Christine, Mike",Christine Angell; Mike Angell
2,Stone,"Ian, Robert",Ian Stone; Robert Dalip Stone
3,King,"Alex, Nathan, Paul, Richard, Thomas",Alex King; Nathan King; Paul John King; Richar...
4,Jones,"Anne, Gillian, Hilary, Huw, John, Joshua, Mari...",Anne Elizabeth Jones; Gillian Jones; Hilary Jo...
...,...,...,...
164,Barrett,"Bill, Thea",Bill Barrett; Thea Barrett
165,Goldfinch,"Claire, Harry",Claire Goldfinch; Harry Goldfinch
166,Spence,"George, John",George Henry Mills Spence; John Russell Spence
167,Sweetman,"Allison, Callum",Allison Juliet Sweetman; Callum George Sweetman


In [102]:
# Let's re-apply the stricter filtering logic:
# - Include only last names that appear with *different first names*
# - But all those first names must share the same initial letter

rechecked_strict_ambiguous_last_names = {}

for last, entries in last_name_map.items():
    # Get first name initials
    first_names = {first for first, _, _ in entries if first}
    first_initials = {first[0] for first in first_names if first}

    # Keep only if there's more than one name, but all with same first initial
    if len(first_names) > 1 and len(first_initials) == 1:
        rechecked_strict_ambiguous_last_names[last] = sorted(entries)

# Convert to DataFrame
rechecked_strict_df = pd.DataFrame([
    {
        "last_name": last,
        "first_names": ", ".join(sorted({first for first, _, _ in entries})),
        "examples": "; ".join(sorted(full for _, _, full in entries))
    }
    for last, entries in rechecked_strict_ambiguous_last_names.items()
])

rechecked_strict_df


Unnamed: 0,last_name,first_names,examples
0,Campkin,"Steve, Steven",Steve Campkin; Steven R Campkin; Steven Robert...
1,Manion,"Stephen, Steve",Stephen Charles Manion; Steve Manion
2,Brivio,"Pam, Pamela",Pam Brivio; Pamela M Brivio; Pamela Mary Brivio
3,Jack,"Nick, Nicolas",Nick Jack; Nicolas S W Jack
4,McKenna,"Francis, Frank",Francis J McKenna; Frank McKenna
5,Cannon,"Teresa, Tom",Teresa Cannon; Tom Cannon
6,Parker,"Ray, Roz",Ray Parker; Roz Parker
7,London,"James, John",James Frederick Justin London; John London
8,Kift,"Penelope, Penny",Penelope M Kift; Penny Kift
9,Stewart,"Chris, Colin",Chris Stewart; Colin McCarthy Stewart


In [103]:
# Convert the set of indices to a list to fix the error
excluded_merge_indices = [4, 5, 6, 7, 9, 11, 18, 22, 23, 25]

# Get last names from excluded rows in the previously built ambiguous_df
excluded_last_names = ambiguous_df.loc[excluded_merge_indices, "last_name"].tolist()

# Rebuild the canonical_name_map with exclusions
final_canonical_name_map = {}

for last, entries in last_name_map.items():
    grouped_by_first = defaultdict(list)
    first_initials = set()

    for first, middle, full in entries:
        if first:
            grouped_by_first[first].append(full)
            first_initials.add(first[0])

    if last in excluded_last_names:
        # Do not merge — treat each name as its own canonical
        for first, variants in grouped_by_first.items():
            for variant in variants:
                final_canonical_name_map[(first.strip(), last.strip())] = variant
    elif len(first_initials) == 1:
        # Safe to merge — assign longest variant
        for first, variants in grouped_by_first.items():
            longest_variant = max(variants, key=len)
            final_canonical_name_map[(first.strip(), last.strip())] = longest_variant

# Apply updated canonical names to the dataset
for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        first = cand.get("first_name", "").strip()
        last = cand.get("last_name", "").strip()
        cand["canonical_name"] = final_canonical_name_map.get((first, last), cand["name"])

# Show a few examples where canonical_name != original name
diffs = sorted({
    (cand["name"], cand["canonical_name"])
    for record in data if record.get("status") == "ok"
    for cand in record["candidates"]
    if cand["name"] != cand["canonical_name"]
})

diffs[:100]


[('Alan J Bullion', 'Alan James Bullion'),
 ('Alex Ricketts', 'Alex James Ricketts'),
 ('Andrew C Waldie', 'Andrew Crawford  Waldie'),
 ('Andrew Malcolm Kennedy', 'Andrew Mark Stephen Kennedy'),
 ('Ashley Wise', 'Ashley Luke Wise'),
 ('Avtar Sandhu', 'Avtar Singh Sandhu'),
 ('Brian E MacDowall', 'Brian Eric MacDowall'),
 ('Brian Eugene Clark', 'Brian Clark'),
 ('Brian W Copping', 'Brian William Copping'),
 ('Bryan Sweetland', 'Bryan John Sweetland'),
 ('Christine J Marshall', 'Christine Jennifer Marshall'),
 ('Christopher Cornell', 'Christopher James Cornell'),
 ('Christopher Hoare', 'Christopher Pierce David Hoare'),
 ('Clive A English', 'Clive Andrew English'),
 ('Colin Caller', 'Colin William Caller'),
 ('David Brazier', 'David Lionel Brazier'),
 ('David Gary Beaney', 'David Garry Beaney'),
 ('David J Neve', 'David John Neve'),
 ('David Naghi', 'David Sandru Naghi'),
 ('David Robey', 'David Patrick John Robey'),
 ('David S Naghi', 'David Sandru Naghi'),
 ('David Waller', 'David Leon

In [104]:
# Save the fully cleaned and canonicalised dataset to a new JSON file
output_path = "../data/elections/kent_results_all_years_cleaned.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

output_path


'../data/elections/kent_results_all_years_cleaned.json'

### Crosstab of election results - by party

In [105]:
import json
import pandas as pd
from collections import defaultdict

# Load cleaned election dataset
with open("../data/elections/kent_results_all_years_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect all unique election dates
election_dates = sorted({entry["election_date"] for entry in data if entry.get("status") == "ok"})

# Step 1: Track latest party for each candidate by canonical name
latest_party_by_candidate = {}
for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name not in latest_party_by_candidate or election_date > latest_party_by_candidate[name]["election_date"]:
            latest_party_by_candidate[name] = {
                "election_date": election_date,
                "party": cand["party"]
            }

# Step 2: Build outcome timeline for candidates whose latest party is "Reform UK"
party_filter = "Reform UK"
candidate_results = defaultdict(lambda: {date: "NP" for date in election_dates})

for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if latest_party_by_candidate.get(name, {}).get("party") == party_filter:
            candidate_results[name][election_date] = cand["outcome"]

# Step 3: Convert to DataFrame
df = pd.DataFrame.from_dict(candidate_results, orient="index")
df.index.name = "Candidate"

# Step 4: Add each candidate's latest division
latest_division_by_candidate = {}
for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    division = record["division"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name not in latest_division_by_candidate or election_date > latest_division_by_candidate[name]["election_date"]:
            latest_division_by_candidate[name] = {
                "election_date": election_date,
                "division": division
            }

df["Latest Division"] = df.index.map(lambda name: latest_division_by_candidate.get(name, {}).get("division", ""))

# Reorder columns
df = df[["Latest Division"] + [col for col in df.columns if col != "Latest Division"]]

# View the output
df.head(20)


Unnamed: 0_level_0,Latest Division,2009-06-04,2013-05-02,2017-05-04,2021-05-06,2025-05-01
Candidate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Barry Taylor,Dartford North East,Not elected,NP,NP,Not elected,NP
Gary Rogers,Dartford West,Not elected,Not elected,NP,Not elected,NP
Garry Graham Sturley,Gravesend East,Not elected,NP,NP,NP,Elected
Trevor Leslie Shonk,Ramsgate,Not elected,Elected,Not elected,Elected,Elected
Eric Elliott,Romney Marsh,NP,Not elected,NP,Not elected,NP
Christopher Pierce David Hoare,Tunbridge Wells North,NP,Elected,NP,Not elected,Not elected
Ben Robert Fryer,Dartford North East,NP,NP,Not elected,NP,Elected
Mary Elizabeth Lawes,Folkestone East,NP,NP,Not elected,Not elected,Elected
Ryan Andrew Waters,Dartford East,NP,NP,Not elected,NP,Elected
Helen Jean Brown,Malling Rural East,NP,NP,Not elected,NP,Not elected


In [110]:
# Step 1: Get the most recent election date
latest_election_date = max(election_dates)

# Step 2: Find all candidates elected in the latest election
elected_councillors = set()
latest_division_by_candidate = {}
latest_party_by_candidate = {}

for record in data:
    if record.get("status") != "ok":
        continue
    if record["election_date"] != latest_election_date:
        continue

    division = record["division"]
    for cand in record["candidates"]:
        if cand["outcome"] == "Elected":
            name = cand["canonical_name"].strip()
            elected_councillors.add(name)
            latest_division_by_candidate[name] = division
            latest_party_by_candidate[name] = cand["party"]

# Step 3: Build the outcome history for each elected councillor
councillor_results = defaultdict(lambda: {date: "NP" for date in election_dates})

for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name in elected_councillors:
            councillor_results[name][election_date] = cand["outcome"]

# Step 4: Create DataFrame
councillor_df = pd.DataFrame.from_dict(councillor_results, orient="index")
councillor_df.index.name = "Councillor"

# Add division and latest party columns
councillor_df["Division"] = councillor_df.index.map(lambda name: latest_division_by_candidate.get(name, ""))
councillor_df["Latest Party"] = councillor_df.index.map(lambda name: latest_party_by_candidate.get(name, ""))

# Reorder columns
cols = ["Division", "Latest Party"] + [col for col in election_dates]
councillor_df = councillor_df[cols]

# Filter the DataFrame to include only councillors who were elected in the latest election
elected_df = councillor_df[councillor_df[latest_election_date] == "Elected"]
# Step 1: Format past parties (already done above)
councillor_df["Past Parties"] = councillor_df.index.map(
    lambda name: ", ".join(sorted(party_affiliations_by_candidate.get(name, [])))
)

# Step 2: Calculate experience (number of times elected)
councillor_df["Experience at KCC (terms)"] = councillor_df[election_dates].apply(
    lambda row: sum(1 for value in row if value == "Elected"), axis=1
)

# Step 3: Reorder columns
ordered_cols = ["Division", "Latest Party"] + election_dates + ["Experience at KCC (terms)", "Past Parties"]
councillor_df = councillor_df[ordered_cols]

councillor_df.head(30)

Unnamed: 0_level_0,Division,Latest Party,2009-06-04,2013-05-02,2017-05-04,2021-05-06,2025-05-01,Experience at KCC (terms),Past Parties
Councillor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Garry Graham Sturley,Gravesend East,Reform UK,Not elected,NP,NP,NP,Elected,1,Labour
Tim Prater,Cheriton Sandgate & Hythe East,Liberal Democrat,Elected,Not elected,Not elected,Not elected,Elected,2,
Stuart Robert Jeffery,Maidstone Central,Green Party,Not elected,Not elected,Not elected,Not elected,Elected,1,
Trudy Dean,Malling Central,Liberal Democrat,Elected,Elected,Elected,Elected,Elected,5,
Trevor Leslie Shonk,Ramsgate,Reform UK,Not elected,Elected,Not elected,Elected,Elected,3,"Conservative, UK Independence Party"
Geoffrey Richard Samme,Maidstone North East,Liberal Democrat,NP,Not elected,NP,NP,Elected,1,
Mark Strafford Ellis,Tunbridge Wells North,Liberal Democrat,NP,Not elected,NP,NP,Elected,1,Independent
Ben Robert Fryer,Dartford North East,Reform UK,NP,NP,Not elected,NP,Elected,1,UK Independence Party
Antony James Hook,Faversham,Liberal Democrat,NP,NP,Elected,Elected,Elected,3,
Mary Elizabeth Lawes,Folkestone East,Reform UK,NP,NP,Not elected,Not elected,Elected,1,"Foundation Party, UK Independence Party"


In [112]:
# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2025.csv"
councillor_df.to_csv(output_path)
output_path

'../data/elections/kent_councillors_elected_2025.csv'

### KCC 2021

In [126]:
# Re-run after code reset

import json
import pandas as pd

# Set target election date
target_date = "2021-05-06"

# Extract elected candidates from that date
elected_2021 = []
for record in data:
    if record.get("status") != "ok":
        continue
    if record["election_date"] != target_date:
        continue
    division = record["division"]
    for cand in record["candidates"]:
        if cand["outcome"] == "Elected":
            elected_2021.append({
                "name": cand["name"].strip(),
                "division": division,
                "first_name": cand.get("first_name", "").strip(),
                "last_name": cand.get("last_name", "").strip(),
                "middle_names": cand.get("middle_names", "").strip(),
                "party": cand["party"].strip()
            })

# Create DataFrame
elected_2021_df = pd.DataFrame(elected_2021)


# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2021_short.csv"
elected_2021_df.to_csv(output_path)
output_path

elected_2021_df.tail(20)

Unnamed: 0,name,division,first_name,last_name,middle_names,party
60,Nick Chard,Sevenoaks West,Nick,Chard,,Conservative
61,Cameron Andrew Beart,Sheppey,Cameron,Beart,Andrew,Conservative
62,Andy Booth,Sheppey,Andy,Booth,,Conservative
63,Mike Dendor,Sittingbourne North,Mike,Dendor,,Conservative
64,John Geoffrey Wright,Sittingbourne South,John,Wright,Geoffrey,Conservative
65,Rich Lehmann,Swale East,Rich,Lehmann,,Green Party
66,Mike Baldock,Swale West,Mike,Baldock,,Swale Independents
67,Perry Cole,Swanley,Perry,Cole,,Conservative
68,Peter Martin Harman,Swanscombe and Greenhithe,Peter,Harman,Martin,Swanscombe & Greenhithe Residents' Association
69,Mike Hill,Tenterden,Mike,Hill,,Conservative


In [127]:
import json
import pandas as pd

# Load the correct dataset
with open("../data/elections/kent_results_all_years_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Target election date
target_date = "2025-05-01"
elected_2021 = []

for record in data:
    if record.get("status") != "ok" or record.get("election_date") != target_date:
        continue
    division = record.get("division", "")
    for cand in record["candidates"]:
        if cand.get("outcome") == "Elected":
            elected_2021.append({
                "name": cand.get("name", "").strip(),
                "division": division,
                "first_name": cand.get("first_name", "").strip(),
                "last_name": cand.get("last_name", "").strip(),
                "middle_names": cand.get("middle_names", "").strip(),
                "party": cand.get("party", "").strip()
            })

# Convert to DataFrame
elected_2025_df = pd.DataFrame(elected_2021)

# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2025_short.csv"
elected_2025_df.to_csv(output_path)
output_path

elected_2025_df


Unnamed: 0,name,division,first_name,last_name,middle_names,party
0,Pamela Ann Williams,Ashford Central,Pamela,Williams,Ann,Reform UK
1,Dean Edward Burns,Ashford East,Dean,Burns,Edward,Reform UK
2,Brian Philip Collins,Ashford Rural East,Brian,Collins,Philip,Reform UK
3,Bill Barrett,Ashford Rural South,Bill,Barrett,,Reform UK
4,Jeremy Waring Eustace,Ashford Rural West,Jeremy,Eustace,Waring,Reform UK
...,...,...,...,...,...,...
76,Martin Dale Brice,Tunbridge Wells South,Martin,Brice,Dale,Liberal Democrat
77,John Joseph Moreland,Tunbridge Wells West,John,Moreland,Joseph,Liberal Democrat
78,Adrian John Kibble,Whitstable East & Herne Bay West,Adrian,Kibble,John,Reform UK
79,Stuart Heaver,Whitstable West,Stuart,Heaver,,Green Party


### Meetings mock up

In [138]:
import pandas as pd
import json
from pathlib import Path
from typing import List, Tuple, Dict
import re

# Configuration
MINUTES_PATH = Path("/Users/lgfolder/github/council-assistant/data/document_metadata/metadata_test.jsonl")
COUNCILLORS_2025_CSV = Path("/Users/lgfolder/github/council-assistant/data/elections/kent_councillors_elected_2025_short.csv")
COUNCILLORS_2021_CSV = Path("/Users/lgfolder/github/council-assistant/data/elections/kent_councillors_elected_2021_short.csv")
OUTPUT_PATH = Path("/Users/lgfolder/github/council-assistant/data/who_is_who.csv")

def load_and_standardize_councillors(current_csv: Path, previous_csv: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load both current and previous councillor data with standardized names"""
    def process_councillors(df: pd.DataFrame) -> pd.DataFrame:
        # Handle different column name variations
        first_name_col = next((col for col in df.columns if 'first' in col.lower()), None)
        last_name_col = next((col for col in df.columns if 'last' in col.lower()), None)
        
        if not first_name_col or not last_name_col:
            raise ValueError("Could not find first_name and last_name columns in councillor data")
        
        df['standard_first'] = (
            df[first_name_col]
            .astype(str)
            .str.lower()
            .str.strip()
            .str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('utf-8')
        )
        df['standard_last'] = (
            df[last_name_col]
            .astype(str)
            .str.lower()
            .str.strip()
            .str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('utf-8')
        )
        
        # Handle division/ward/department naming
        division_col = next((col for col in df.columns if any(x in col.lower() for x in ['division', 'ward', 'district'])), None)
        if division_col:
            df['division'] = df[division_col]
        else:
            df['division'] = ''
            
        # Handle party/group naming
        party_col = next((col for col in df.columns if any(x in col.lower() for x in ['party', 'group'])), None)
        if party_col:
            df['party'] = df[party_col]
        else:
            df['party'] = ''
            
        return df
    
    try:
        current = process_councillors(pd.read_csv(current_csv))
        previous = process_councillors(pd.read_csv(previous_csv))
        return current, previous
    except Exception as e:
        print(f"Error loading councillor data: {e}")
        raise

def parse_minute_names(minutes_path: Path) -> List[str]:
    """Extract all unique names from meeting minutes"""
    attendees = set()
    
    with open(minutes_path, 'r') as f:
        for line in f:
            try:
                meeting = json.loads(line)
                for status in ['present', 'absent', 'virtual']:
                    for name in meeting['attendance'].get(status, []):
                        attendees.add(name.strip())
            except json.JSONDecodeError:
                continue
    
    return list(attendees)

def standardize_minutes_name(name: str) -> Tuple[str, str]:
    """Convert council minutes names to standardized format"""
    # Remove honorifics and trailing periods
    clean_name = re.sub(
        r'^(Mr|Mrs|Ms|Miss|Sir|Dr|Cllr)\.?\s+', 
        '', 
        name, 
        flags=re.IGNORECASE
    ).strip()
    
    # Handle cases with multiple initials
    parts = [p.strip('. ') for p in clean_name.split() if p.strip()]
    
    if not parts:
        return ('', '')
    
    last_name = parts[-1]
    first_parts = parts[:-1]
    
    if not first_parts:
        return ('', last_name.lower())
    
    # Create first initial pattern (e.g., "R W" becomes "r.?w.?")
    first_initials = ''.join([f"{p[0].lower()}.*" for p in first_parts if p])
    
    return (first_initials, last_name.lower())

def find_councillor_match(first: str, last: str, councillors: pd.DataFrame) -> pd.DataFrame:
    """Find matching councillors using flexible matching"""
    # Exact last name match
    matches = councillors[councillors['standard_last'] == last]
    
    if not first:
        return matches
    
    # Flexible first initial matching (e.g., "r.?w.?" matches "Roger William")
    try:
        pattern = re.compile(f'^{first}')
        return matches[
            matches['standard_first'].str.contains(pattern, na=False)
        ]
    except:
        return matches

def classify_attendees(
    attendees: List[str], 
    current_councillors: pd.DataFrame,
    previous_councillors: pd.DataFrame
) -> pd.DataFrame:
    """Classify each attendee into categories"""
    records = []
    
    for raw_name in attendees:
        first, last = standardize_minutes_name(raw_name)
        
        # Check current councillors first
        current_matches = find_councillor_match(first, last, current_councillors)
        previous_matches = find_councillor_match(first, last, previous_councillors)
        
        if len(current_matches) == 1:
            # Current councillor match
            record = create_record(raw_name, current_matches.iloc[0], 'current')
        elif len(previous_matches) == 1:
            # Former councillor match
            record = create_record(raw_name, previous_matches.iloc[0], 'former')
        elif len(current_matches) > 1 or len(previous_matches) > 1:
            # Ambiguous match
            record = create_ambiguous_record(raw_name, first, last, current_matches, previous_matches)
        else:
            # No match - likely civil servant
            record = create_civil_servant_record(raw_name, first, last)
        
        records.append(record)
    
    return pd.DataFrame(records)

def create_record(raw_name: str, councillor: pd.Series, status: str) -> Dict:
    """Create standardized record for matched councillor"""
    return {
        'raw_name': raw_name,
        'first_name': councillor.get('first_name', ''),
        'last_name': councillor.get('last_name', ''),
        'position': 'Councillor',
        'division': councillor.get('division', ''),
        'party': councillor.get('party', ''),
        'status': status,
        'source': 'current' if status == 'current' else 'previous'
    }

def create_ambiguous_record(
    raw_name: str, 
    first: str, 
    last: str,
    current_matches: pd.DataFrame,
    previous_matches: pd.DataFrame
) -> Dict:
    """Create record for ambiguous matches"""
    all_matches = pd.concat([current_matches, previous_matches])
    return {
        'raw_name': raw_name,
        'first_name': '',
        'last_name': last.title(),
        'position': 'AMBIGUOUS',
        'division': '|'.join(all_matches.get('division', '').unique()),
        'party': '|'.join(all_matches.get('party', '').unique()),
        'status': 'needs_review',
        'source': 'multiple'
    }

def create_civil_servant_record(raw_name: str, first: str, last: str) -> Dict:
    """Create record for civil servants"""
    formatted_first = ' '.join([f"{c.upper()}." for c in first.split('.') if c]) if first else ''
    return {
        'raw_name': raw_name,
        'first_name': formatted_first,
        'last_name': last.title(),
        'position': 'Civil Servant',
        'division': '',
        'party': '',
        'status': 'civil_servant',
        'source': ''
    }

def main():
    try:
        # Load and standardize councillor data
        current_councillors, previous_councillors = load_and_standardize_councillors(
            COUNCILLORS_2025_CSV, 
            COUNCILLORS_2021_CSV
        )
        
        # Parse meeting attendees
        attendees = parse_minute_names(MINUTES_PATH)
        
        # Classify attendees
        who_is_who = classify_attendees(attendees, current_councillors, previous_councillors)
        
        # Save results
        who_is_who.to_csv(OUTPUT_PATH, index=False)
        print(f"Processed {len(who_is_who)} names. Saved to {OUTPUT_PATH}")
        print("\nSample output:")
        print(who_is_who.head().to_string())
        
    except Exception as e:
        print(f"Error in main execution: {e}")
        raise

if __name__ == "__main__":
    main()

Processed 77 names. Saved to /Users/lgfolder/github/council-assistant/data/who_is_who.csv

Sample output:
            raw_name first_name   last_name       position                     division                                           party         status    source
0  Mr I S Chittenden  I. *S. *.  Chittenden  Civil Servant                                                                               civil_servant          
1       Peter Harman      Peter      Harman     Councillor    Swanscombe and Greenhithe  Swanscombe & Greenhithe Residents' Association         former  previous
2       Mr R W Gough  R. *W. *.       Gough  Civil Servant                                                                               civil_servant          
3        Mr J Wright       John      Wright     Councillor          Sittingbourne South                                    Conservative         former  previous
4      Mr C Broadley     Conrad    Broadley     Councillor  Northfleet & Gravesend West  

In [139]:
import pandas as pd
from pathlib import Path

# Load the processed data
who_is_who = pd.read_csv("../data/who_is_who.csv")

# 1. Current Councillors Table
current_councillors = who_is_who[who_is_who['status'] == 'current'].copy()
current_councillors['full_name'] = current_councillors['first_name'] + ' ' + current_councillors['last_name']
councillors_table = current_councillors[['full_name', 'division', 'party', 'last_name']].sort_values('last_name')
councillors_table = councillors_table[['full_name', 'division', 'party']]  # Drop last_name after sorting

# 2. Civil Servants Directory
civil_servants = who_is_who[who_is_who['status'] == 'civil_servant'].copy()
civil_servants['formatted_name'] = civil_servants['first_name'] + ' ' + civil_servants['last_name']
civil_servants_table = civil_servants[['formatted_name', 'raw_name', 'last_name']].sort_values('last_name')
civil_servants_table = civil_servants_table[['formatted_name', 'raw_name']]  # Drop last_name after sorting

# 3. Meeting Participation Heatmap
# First we need to count meeting appearances (this would be better done during initial processing)
def count_meetings(minutes_path):
    meeting_counts = {}
    with open(minutes_path, 'r') as f:
        for line in f:
            try:
                meeting = json.loads(line)
                for status in ['present', 'absent', 'virtual']:
                    for name in meeting['attendance'].get(status, []):
                        meeting_counts[name.strip()] = meeting_counts.get(name.strip(), 0) + 1
            except json.JSONDecodeError:
                continue
    return meeting_counts

meeting_counts = count_meetings(MINUTES_PATH)
who_is_who['meetings_attended'] = who_is_who['raw_name'].map(meeting_counts).fillna(0)

participation_table = who_is_who[
    ['first_name', 'last_name', 'position', 'meetings_attended']
].sort_values('meetings_attended', ascending=False)

# 4. Department Affiliations
department_table = who_is_who.groupby(['division', 'position']).size().unstack(fill_value=0)
department_table['Total'] = department_table.sum(axis=1)

# 5. Ambiguity Resolution Table
ambiguity_table = who_is_who[who_is_who['status'] == 'needs_review'].copy()
ambiguity_table['possible_matches'] = ambiguity_table.apply(
    lambda x: f"{x['division']} ({x['party']})", axis=1
)
ambiguity_table = ambiguity_table[['raw_name', 'possible_matches']]

# Save all tables
tables_path = Path("/Users/lgfolder/github/council-assistant/data/who_is_who_tables/")
tables_path.mkdir(exist_ok=True)

councillors_table.to_csv(tables_path / "councillors.csv", index=False)
civil_servants_table.to_csv(tables_path / "civil_servants.csv", index=False)
participation_table.to_csv(tables_path / "participation.csv", index=False)
department_table.to_csv(tables_path / "divisions.csv")
ambiguity_table.to_csv(tables_path / "ambiguities.csv", index=False)

print("All tables generated successfully!")

All tables generated successfully!


### Generate elections json from the elections results

In [145]:
import json
from pathlib import Path
import pandas as pd

# Load the cleaned results
input_path = Path("../data/elections/kent_results_all_years_cleaned.json")
results_data = pd.read_json(input_path)

# Ensure output directory exists
output_path = Path("../data/references/elections.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Add election year column from date
results_data["election_year"] = pd.to_datetime(results_data["election_date"]).dt.year

# Define base fields
council_id = "kent_cc"
source_url = "https://www.kent.gov.uk/about-the-council/how-the-council-works/elections"
results_path = str(input_path)

# Generate one entry per unique election year
elections = []
for year in sorted(results_data["election_year"].unique(), reverse=True):
    election_date = pd.to_datetime(
        results_data.loc[results_data["election_year"] == year, "election_date"].iloc[0]
    ).date().isoformat()

    election = {
        "election_id": f"{council_id}_{int(year)}",
        "council_id": council_id,
        "election_date": election_date,
        "election_type": "local",
        "scope": "county-wide",
        "description": f"Kent County Council local elections {year}",
        "results_path": results_path,
        "results_filter": {"election_year": int(year)},
        "source_url": source_url
    }
    elections.append(election)

# Write to JSONL
with open(output_path, "w") as f:
    for entry in elections:
        f.write(json.dumps(entry) + "\n")

print(f"✅ elections.jsonl written to: {output_path}")

✅ elections.jsonl written to: ../data/references/elections.jsonl


### Populate people.json from existing civil servants json

The civil servant json was already available - generated by ChatGPT from a pdf I found on the council's website

In [159]:
import json
from pathlib import Path
import re

# === CONFIGURATION ===
INPUT_FILE = Path("../data/jsons/civil_servants_all.json")
OUTPUT_FILE = Path("../data/entities/people.jsonl")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# === UTILITY FUNCTIONS ===
def slugify(name):
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

def generate_person_id(slug, counter):
    return f"{slug}_{counter:03d}"

# === LOAD EXISTING PEOPLE ===
existing_people = {}
slug_counter = {}

if OUTPUT_FILE.exists():
    with open(OUTPUT_FILE) as f:
        for line in f:
            person = json.loads(line)
            slug = slugify(person["full_name"])
            existing_people[slug] = person
            # update counter
            id_suffix = person["person_id"].split("_")[-1]
            try:
                slug_counter[slug] = max(slug_counter.get(slug, 0), int(id_suffix))
            except ValueError:
                pass

# === LOAD CIVIL SERVANTS DATA ===
with open(INPUT_FILE) as f:
    civil_servants = json.load(f)

new_people = []
flagged_people = []

for entry in civil_servants:
    full_name = entry.get("name", "").strip()
    if not full_name:
        continue

    parts = full_name.split()
    first_name = parts[0] if parts else ""
    last_name = parts[-1] if len(parts) > 1 else ""
    slug = slugify(full_name)

    if slug in existing_people:
        flagged_people.append(full_name)
        continue  # Skip known person

    # Assign new person_id
    slug_counter[slug] = slug_counter.get(slug, 0) + 1
    person_id = generate_person_id(slug, slug_counter[slug])

    person = {
        "person_id": person_id,
        "full_name": full_name,
        "first_name": first_name,
        "last_name": last_name,
        "aliases": list({full_name, last_name}),
        "roles": ["civil_servant"],
        "civil_service_roles": [{
            "role": entry.get("role", ""),
            "department": entry.get("department", ""),
            "division": entry.get("Division", ""),
            "service_unit": entry.get("Service Unit", ""),
            "grade": entry.get("Grade", ""),
            "contract_title": entry.get("Contract Title", ""),
            "manager_name": entry.get("Manager Name", ""),
            "start_date": "",
            "end_date": ""
        }],
        "committees": entry.get("committees", []),
        "elections": [],
        "profiles": {
            "council_url": "",
            "linkedin": "",
            "twitter": ""
        }
    }

    new_people.append(person)
    existing_people[slug] = person

# === APPEND TO people.jsonl ===
mode = "a" if OUTPUT_FILE.exists() else "w"
with open(OUTPUT_FILE, mode) as f:
    for person in new_people:
        f.write(json.dumps(person) + "\n")

print(f"✅ Added {len(new_people)} new civil servants to: {OUTPUT_FILE}")
if flagged_people:
    print(f"⚠️  Skipped {len(flagged_people)} possible duplicates:")
    for name in flagged_people:
        print(" -", name)

✅ Added 33 new civil servants to: ../data/entities/people.jsonl


### Append Candidates to people.jsonl

In [158]:
import json
from pathlib import Path
import re

# === CONFIGURATION ===
ELECTION_FILE = Path("../data/elections/kent_results_all_years_cleaned.json")
PEOPLE_FILE = Path("../data/entities/people.jsonl")
PEOPLE_FILE.parent.mkdir(parents=True, exist_ok=True)

# === UTILITY FUNCTIONS ===
def slugify(name):
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

def generate_person_id(base_slug, counter):
    return f"{base_slug}_{counter:03d}"

# === LOAD EXISTING PEOPLE ===
existing_people = {}
slug_counter = {}

if PEOPLE_FILE.exists():
    with open(PEOPLE_FILE) as f:
        for line in f:
            person = json.loads(line)
            slug = slugify(person["full_name"])
            existing_people[slug] = person
            slug_counter[slug] = int(person["person_id"].split("_")[-1])

# === LOAD ELECTION RESULTS ===
with open(ELECTION_FILE) as f:
    election_data = json.load(f)

# === PROCESS NEW CANDIDATES ===
for record in election_data:
    year = int(record.get("election_date", "")[:4])
    division = record.get("division", "")
    
    for cand in record.get("candidates", []):
        canonical_name = cand.get("canonical_name", cand.get("name", "")).strip()
        if not canonical_name:
            continue

        slug = slugify(canonical_name)
        first, *rest = canonical_name.split()
        last = rest[-1] if rest else first

        # Prepare the election record
        election_info = {
            "year": year,
            "division": division,
            "party": cand.get("party", ""),
            "status": cand.get("status", "")
        }

        if slug in existing_people:
            # Append election to existing person if not a duplicate
            existing = existing_people[slug]
            if election_info not in existing["elections"]:
                existing["elections"].append(election_info)
        else:
            # New person: assign new ID
            slug_counter[slug] = slug_counter.get(slug, 0) + 1
            person_id = generate_person_id(slug, slug_counter[slug])

            new_person = {
                "person_id": person_id,
                "full_name": canonical_name,
                "first_name": first,
                "last_name": last,
                "aliases": list({canonical_name, last}),
                "roles": ["candidate"],
                "civil_service_roles": [],
                "committees": [],
                "elections": [election_info],
                "profiles": {
                    "council_url": "",
                    "linkedin": "",
                    "twitter": ""
                }
            }

            existing_people[slug] = new_person

# === WRITE UPDATED PEOPLE FILE ===
with open(PEOPLE_FILE, "w") as f:
    for person in existing_people.values():
        f.write(json.dumps(person) + "\n")

print(f"✅ people.jsonl updated with {len(existing_people)} unique individuals.")

✅ people.jsonl updated with 1414 unique individuals.
