### Scrape REGULAR elections results from a landing page

The settings are at the bottom of the main function - untoggle as needed

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

BASE_URL = "https://democracy.kent.gov.uk:9071"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ElectionScraper/1.0)"
}

def scrape_election(eid, election_date, rpid=None):
    print(f"\n📋 Scraping election {eid} on {election_date}...\n")

    # Construct the master page URL
    start_url = f"{BASE_URL}/mgElectionElectionAreaResults.aspx?Page=all&EID={eid}"
    if rpid:
        start_url += f"&RPID={rpid}"

    # --- Step 1: Get all division result links
    res = requests.get(start_url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    links = []
    seen = set()

    for a in soup.select("a"):
        href = a.get("href", "")
        text = a.get_text(strip=True)
        if "mgElectionAreaResults.aspx" in href and "ID=" in href:
            full_url = BASE_URL + "/" + href.lstrip("/")
            if full_url not in seen:
                seen.add(full_url)
                links.append((text, full_url))

    print(f"🔗 Found {len(links)} division links...")

    # --- Step 2: Scrape each division
    results = []
    failed = []

    for i, (name, url) in enumerate(links):
        print(f"[{i+1}/{len(links)}] Scraping: {name}")
        try:
            division_data = parse_division_page(name, url)
            division_data["election_date"] = election_date
            results.append(division_data)
            time.sleep(0.4)
        except Exception as e:
            print(f"⚠️ Failed on {name}: {e}")
            failed.append({"division": name, "url": url, "error": str(e)})

    # --- Step 3: Save results
    out_file = f"../data/elections/kent_results_{election_date}.json"
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    if failed:
        with open(f"failed_{election_date}.json", "w", encoding="utf-8") as f:
            json.dump(failed, f, indent=2)

    print(f"\n✅ Saved {len(results)} results to '{out_file}'")
    if failed:
        print(f"⚠️ {len(failed)} divisions failed — see 'failed_{election_date}.json'")

# ------------------------------------------
# Helper: Parse individual division page
# ------------------------------------------

def parse_division_page(name, url):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    tables = soup.select("table.mgStatsTable")

    if len(tables) < 1:
        return {
            "division": name,
            "url": url,
            "status": "no_tables_found"
        }

    # Try to locate correct tables by caption
    candidate_table = next((t for t in tables if "Candidate" in t.get_text() or "results" in t.get_text().lower()), None)
    summary_table = next((t for t in tables if "Voting Summary" in t.get_text()), None)

    if not candidate_table or not summary_table:
        return {
            "division": name,
            "url": url,
            "status": "incomplete_data"
        }

    # --- Candidate table
    candidates = []
    candidate_rows = candidate_table.find_all("tr")[1:]
    for row in candidate_rows:
        cols = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cols) != 5:
            continue
        candidates.append({
            "name": cols[0],
            "party": cols[1],
            "votes": int(cols[2].replace(",", "")),
            "percentage": cols[3],
            "outcome": cols[4]
        })

    # --- Summary table
    summary = {}
    summary_rows = summary_table.find_all("tr")[1:]
    for row in summary_rows:
        cols = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cols) != 2 or not cols[0].strip():
            continue
        key = cols[0].lower().replace(" ", "_")
        val = cols[1].replace(",", "")
        summary[key] = int(val) if val.isdigit() else val

    return {
        "division": name,
        "url": url,
        "status": "ok",
        "candidates": candidates,
        "summary": summary
    }

# ------------------------------------------
# Example usage
# ------------------------------------------

if __name__ == "__main__":
    #scrape_election(eid=51, election_date="2025-05-01")
    #scrape_election(eid=32, election_date="2021-05-06")
    #scrape_election(eid=20, election_date="2017-05-04")
    # scrape_election(eid=12, election_date="2013-05-02")
    scrape_election(eid=3,  election_date="2009-06-04")


### By-election results scraping

In [88]:
import requests
from bs4 import BeautifulSoup
import json
import time
import re
from dateutil.parser import parse as parse_date

BASE_URL = "https://democracy.kent.gov.uk"
FINAL_HOST = "https://democracy.kent.gov.uk:9071"
START_URL = f"{BASE_URL}/mgManageElectionResults.aspx?bcr=1"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ByElectionScraper/1.0)"
}

def try_parse_int(s):
    try:
        return int(s.replace(",", ""))
    except:
        return s

def parse_name_and_date(text):
    """
    Extracts clean division name and ISO election date from mixed link formats.
    Supports:
        - "Election results for Division Name, 6 July 2023"
        - "Division Name by-election, 21/11/2024"
    """
    # Try flexible match: split last comma-separated part as date
    match = re.match(r"^(.*?),\s*(\d{1,2}[/\s]\w+[/\s]\d{4})$", text)
    if match:
        name = match.group(1).strip()
        date_str = match.group(2).strip()
        try:
            election_date = parse_date(date_str, dayfirst=True).date().isoformat()
        except:
            election_date = None
        return name, election_date

    # Fallback: just return as-is
    return text.strip(), None

def parse_division_page(name, url, election_date):
    res = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")
    tables = soup.select("table.mgStatsTable")

    if not tables:
        return {"division": name, "url": url, "election_date": election_date, "status": "no_tables_found"}

    candidate_table = None
    for table in tables:
        headers = [th.get_text(strip=True).lower() for th in table.find_all("th")]
        if any("candidate" in h or "votes" in h for h in headers):
            candidate_table = table
            break

    summary_table = next((t for t in tables if "Voting Summary" in t.get_text()), None)

    if not candidate_table:
        return {"division": name, "url": url, "election_date": election_date, "status": "no_candidate_table"}

    candidates = []
    rows = candidate_table.find_all("tr")[1:]
    for row in rows:
        cols = [td.get_text(strip=True) for td in row.find_all("td")]
        if len(cols) >= 4:
            candidate = {
                "name": cols[0],
                "party": cols[1],
                "votes": try_parse_int(cols[2]),
                "percentage": cols[3],
            }
            if len(cols) > 4:
                candidate["outcome"] = cols[4]
            candidates.append(candidate)

    summary = {}
    if summary_table:
        rows = summary_table.find_all("tr")[1:]
        for row in rows:
            cols = [td.get_text(strip=True) for td in row.find_all("td")]
            if len(cols) == 2:
                key = cols[0].lower().replace(" ", "_")
                summary[key] = try_parse_int(cols[1])

    return {
        "division": name,
        "election_date": election_date,
        "url": url,
        "status": "ok",
        "candidates": candidates,
        "summary": summary
    }

def scrape_byelection_results():
    print(f"📋 Scraping from: {START_URL}")
    res = requests.get(START_URL, headers=HEADERS)
    soup = BeautifulSoup(res.text, "html.parser")

    links = []
    for a in soup.select("a[href*='mgElectionResults.aspx?ID=']"):
        href = a.get("href")
        if not href:
            continue
        full_url = FINAL_HOST + "/" + href.lstrip("/").replace("mgElectionResults", "mgElectionAreaResults")
        text = a.get_text(strip=True)
        name, election_date = parse_name_and_date(text)
        links.append((name, election_date, full_url))

    print(f"🔗 Found {len(links)} byelection result links")

    results = []
    failed = []

    for i, (name, election_date, url) in enumerate(links):
        print(f"[{i+1}/{len(links)}] Scraping: {name} ({election_date})")
        try:
            result = parse_division_page(name, url, election_date)
            results.append(result)
            time.sleep(0.4)
        except Exception as e:
            print(f"⚠️ Failed: {name} — {e}")
            failed.append({"name": name, "url": url, "error": str(e)})

    # 🚫 Step: Remove junk division entries like "County Council, 01/05/2025"
    initial_count = len(results)
    results = [r for r in results if not r.get("division", "").strip().lower().startswith("county council")]
    removed = initial_count - len(results)

    print(f"\n🧹 Removed {removed} invalid 'County Council' rows from results.")

    # ✅ Save cleaned results
    with open("../data/elections/kent_byelection_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # 🚨 Log failures if any
    if failed:
        with open("kent_byelection_failures.json", "w", encoding="utf-8") as f:
            json.dump(failed, f, indent=2)

    print(f"\n✅ Final saved: {len(results)} byelection results. {len(failed)} failed.")

    # Save successful results
    with open("../data/elections/kent_byelection_results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Save failed logs
    if failed:
        with open("kent_byelection_failures.json", "w", encoding="utf-8") as f:
            json.dump(failed, f, indent=2)

    print(f"\n✅ Scraped {len(results)} byelection results. {len(failed)} failed.")

if __name__ == "__main__":
    scrape_byelection_results()


📋 Scraping from: https://democracy.kent.gov.uk/mgManageElectionResults.aspx?bcr=1
🔗 Found 14 byelection result links
[1/14] Scraping: County Council (2025-05-01)
[2/14] Scraping: County Council (2025-05-01)
[3/14] Scraping: Swanscombe and Greenhithe by-election (2024-11-21)
[4/14] Scraping: Maidstone Central by-election (2023-07-06)
[5/14] Scraping: Sheppey By-Election (2023-05-04)
[6/14] Scraping: Hythe West By-Election (2023-03-02)
[7/14] Scraping: Wilmington by-election (2022-01-27)
[8/14] Scraping: Elham Valley (2021-06-17)
[9/14] Scraping: County Council (2021-05-06)
[10/14] Scraping: Northfleet and Gravesend West by-election (2019-05-02)
[11/14] Scraping: Sittingbourne North by-election (2019-05-02)
[12/14] Scraping: Canterbury North by-election (2018-11-15)
[13/14] Scraping: Birchington and Rural (2018-01-11)
[14/14] Scraping: County Council (2017-05-04)

🧹 Removed 4 invalid 'County Council' rows from results.

✅ Final saved: 10 byelection results. 0 failed.

✅ Scraped 10 byelec

In [89]:
import pandas as pd
import json
from pathlib import Path

# Load byelection results
byelection_path = Path("../data/elections/kent_byelection_results.json")
with open(byelection_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Flatten each result (one row per division)
df = pd.json_normalize(raw_data)

# Show basic shape and column names
print(f"📦 Shape: {df.shape[0]} rows × {df.shape[1]} columns\n")
print("🧾 Columns:", list(df.columns), "\n")

# Show missing value summary
print("🔍 Missing values per column:\n")
print(df.isna().sum())

# Optional: View sample of incomplete rows
print("\n🧩 Sample rows with missing values:")
display(df[df.isna().any(axis=1)].head(10))


📦 Shape: 10 rows × 12 columns

🧾 Columns: ['division', 'election_date', 'url', 'status', 'candidates', 'summary.seats', 'summary.total_votes', 'summary.electorate', 'summary.number_of_ballot_papers_issued', 'summary.number_of_ballot_papers_rejected', 'summary.turnout', 'summary.'] 

🔍 Missing values per column:

division                                    0
election_date                               0
url                                         0
status                                      0
candidates                                  0
summary.seats                               0
summary.total_votes                         0
summary.electorate                          0
summary.number_of_ballot_papers_issued      0
summary.number_of_ballot_papers_rejected    0
summary.turnout                             0
summary.                                    0
dtype: int64

🧩 Sample rows with missing values:


Unnamed: 0,division,election_date,url,status,candidates,summary.seats,summary.total_votes,summary.electorate,summary.number_of_ballot_papers_issued,summary.number_of_ballot_papers_rejected,summary.turnout,summary.


### Consolidate all jsons into one

In [90]:
import json
import os
from glob import glob

def merge_election_files(folder_path=".", output_file="../data/elections/kent_results_all_years.json"):
    merged = []
    files = sorted(glob(os.path.join(folder_path, "../data/elections/kent_results_20*.json")))

    # Add the byelection file
    byelection_file = os.path.join(folder_path, "../data/elections/kent_byelection_results.json")
    if os.path.exists(byelection_file):
        files.append(byelection_file)

    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)
            for record in data:
                record["council"] = "Kent County Council"
                merged.append(record)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=2, ensure_ascii=False)

    print(f"✅ Merged {len(files)} files into '{output_file}' ({len(merged)} total records)")

if __name__ == "__main__":
    merge_election_files()


✅ Merged 6 files into '../data/elections/kent_results_all_years.json' (370 total records)


### Open the consolidated file for cleaning

In [91]:
import json
import pandas as pd
from collections import defaultdict

# === Step 1: Load the merged JSON ===

# Load merged election data into 'data' (list of dicts)
with open("../data/elections/kent_results_all_years.json", "r", encoding="utf-8") as f:
    data = json.load(f)


# === Step 2: Assign 'election_type' based on number of candidates ===
records_by_date = defaultdict(list)
for record in data:
    if record.get("status") == "ok" and "election_date" in record:
        records_by_date[record["election_date"]].append(record)

for date, records_on_date in records_by_date.items():
    for record in records_on_date:
        num_candidates = len(record.get("candidates", []))
        if num_candidates <= 1:
            record["election_type"] = "by-election"
        else:
            record["election_type"] = "regular"

# === Step 3: Flatten to one row per candidate ===
rows = []
for record in data:
    base = {
        "Election Date": record.get("election_date"),
        "Division": record.get("division"),
        "Election Type": record.get("election_type", "unknown"),
        "Council": record.get("council"),
        "URL": record.get("url")
    }
    for cand in record.get("candidates", []):
        row = base.copy()
        row.update({
            "Candidate": cand.get("canonical_name", cand.get("name", "")),
            "Party": cand.get("party", ""),
            "Outcome": cand.get("outcome", ""),
            "Votes": cand.get("votes"),
            "Percentage": cand.get("percentage", "")
        })
        rows.append(row)

# === Step 4: Create DataFrame ===
df = pd.DataFrame(rows)

# Optional: preview result
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1967 entries, 0 to 1966
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Election Date  1967 non-null   object
 1   Division       1967 non-null   object
 2   Election Type  1967 non-null   object
 3   Council        1967 non-null   object
 4   URL            1967 non-null   object
 5   Candidate      1967 non-null   object
 6   Party          1967 non-null   object
 7   Outcome        1967 non-null   object
 8   Votes          1967 non-null   int64 
 9   Percentage     1967 non-null   object
dtypes: int64(1), object(9)
memory usage: 153.8+ KB


In [92]:
# Basic info
print("📊 DataFrame Overview:\n")
print(df.info())

# Summary of missing values per column
print("\n🔍 Missing Values Per Column:\n")
missing = df.isna().sum()
print(missing[missing > 0])

# Percentage of missing values
print("\n📉 Percentage Missing Per Column:\n")
percent_missing = (df.isna().mean() * 100).round(2)
print(percent_missing[percent_missing > 0])

# Optional: View a few incomplete rows
print("\n🧩 Sample Rows with Missing Data:\n")
display(df[df.isna().any(axis=1)].head(10))


📊 DataFrame Overview:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1967 entries, 0 to 1966
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Election Date  1967 non-null   object
 1   Division       1967 non-null   object
 2   Election Type  1967 non-null   object
 3   Council        1967 non-null   object
 4   URL            1967 non-null   object
 5   Candidate      1967 non-null   object
 6   Party          1967 non-null   object
 7   Outcome        1967 non-null   object
 8   Votes          1967 non-null   int64 
 9   Percentage     1967 non-null   object
dtypes: int64(1), object(9)
memory usage: 153.8+ KB
None

🔍 Missing Values Per Column:

Series([], dtype: int64)

📉 Percentage Missing Per Column:

Series([], dtype: float64)

🧩 Sample Rows with Missing Data:



Unnamed: 0,Election Date,Division,Election Type,Council,URL,Candidate,Party,Outcome,Votes,Percentage


### Cleaning 

identify those cases when the division was not scraped 

In [93]:
# Step 1: Identify suspicious divisions
suspicious_divisions = []

for record in data:
    if record.get("status") != "ok":
        continue
    division = record.get("division", "").strip()
    if division.lower().startswith("county council"):
        suspicious_divisions.append({
            "division": division,
            "election_date": record.get("election_date", "???"),
            "url": record.get("url", ""),
            "candidates": [
                cand.get("canonical_name", cand.get("name", "???"))
                for cand in record.get("candidates", [])
            ]
        })

# Step 2: Print summary of problems
print(f"⚠️ Found {len(suspicious_divisions)} records with invalid division names:\n")
for entry in suspicious_divisions:
    print(f"Division: {entry['division']}")
    print(f"Date:     {entry['election_date']}")
    print(f"URL:      {entry['url']}")
    print(f"Candidates: {', '.join(entry['candidates'])}")
    print("-" * 60)


⚠️ Found 0 records with invalid division names:



In [94]:
from collections import defaultdict

seen_keys = set()
deduplicated_data = []
removed_entries = []

for record in data:
    if record.get("status") != "ok":
        continue

    url = record.get("url", "")
    date = record.get("election_date", "???")
    division = record.get("division", "")
    unique_candidates = []
    
    for cand in record.get("candidates", []):
        name = cand.get("canonical_name", cand.get("name", "???"))
        key = (url, name)
        if key not in seen_keys:
            seen_keys.add(key)
            unique_candidates.append(cand)
        else:
            removed_entries.append({
                "name": name,
                "election_date": date,
                "division": division,
                "url": url
            })

    if unique_candidates:
        new_record = record.copy()
        new_record["candidates"] = unique_candidates
        deduplicated_data.append(new_record)

# Replace data with deduplicated version
data = deduplicated_data

# Summary
print(f"✅ Deduplicated dataset: {len(data)} records")
print(f"🗑️ Removed {len(removed_entries)} duplicate candidate entries:")

# Print details of removed entries
for entry in removed_entries:
    print(f"- {entry['name']} ({entry['election_date']} / {entry['division']})")


✅ Deduplicated dataset: 369 records
🗑️ Removed 0 duplicate candidate entries:


In [95]:
#Manually Correct election_date for Broken URLs

manual_date_fixes = {
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=51&RPID=286764034": "2009-06-04",  # estimated or actual date
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=32&RPID=286764034": "2009-06-04",
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=20&RPID=286764034": "2009-06-04"
}

for record in data:
    url = record.get("url", "")
    if url in manual_date_fixes:
        record["election_date"] = manual_date_fixes[url]


In [96]:
manual_division_fixes = {
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=51&RPID=286764034": {
        "John Kirby": "Ramsgate",
        "Elizabeth Green": "Ramsgate",
        "Michael J Taylor": "Ramsgate",
        "Trevor Leslie Shonk": "Ramsgate",
        "Alan R Poole": "Ramsgate",
        "Gerry O'Donnell": "Ramsgate",
        "Georgina R Maddox": "Ramsgate",
        "Dennis Whiting": "Ramsgate",
        "Richard G H Perry": "Ramsgate"
    },
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=32&RPID=286764034": {
        "Alan Marsh": "Herne and Sturry",
        "John D Moore": "Herne and Sturry",
        "Monica Eden-Green": "Herne and Sturry",
        "Cecile M Manning": "Herne and Sturry"
    },
    "https://democracy.kent.gov.uk:9071/mgElectionAreaResults.aspx?ID=20&RPID=286764034": {
        "Jan Michael Ozog": "Dartford West",
        "Jo Shippam": "Dartford West",
        "Joanne S Howard": "Dartford West"
    }
}


apply the manual division fix:

In [97]:
for record in data:
    if record.get("status") != "ok":
        continue
    url = record.get("url", "")
    if url in manual_division_fixes:
        fixes = manual_division_fixes[url]
        for cand in record.get("candidates", []):
            name = cand.get("canonical_name", "")
            correct_div = fixes.get(name)
            if correct_div:
                record["division"] = correct_div


In [98]:
print("\n✅ Manual division fixes applied to:")
for url, mapping in manual_division_fixes.items():
    for name, division in mapping.items():
        print(f" - {name:30s} → {division}")



✅ Manual division fixes applied to:
 - John Kirby                     → Ramsgate
 - Elizabeth Green                → Ramsgate
 - Michael J Taylor               → Ramsgate
 - Trevor Leslie Shonk            → Ramsgate
 - Alan R Poole                   → Ramsgate
 - Gerry O'Donnell                → Ramsgate
 - Georgina R Maddox              → Ramsgate
 - Dennis Whiting                 → Ramsgate
 - Richard G H Perry              → Ramsgate
 - Alan Marsh                     → Herne and Sturry
 - John D Moore                   → Herne and Sturry
 - Monica Eden-Green              → Herne and Sturry
 - Cecile M Manning               → Herne and Sturry
 - Jan Michael Ozog               → Dartford West
 - Jo Shippam                     → Dartford West
 - Joanne S Howard                → Dartford West


In [99]:
from collections import defaultdict

# Step 0: Record total before cleanup
total_before = len(data)

# Step 1: Count how many records share each (election_date, division)
division_counts = defaultdict(int)
for record in data:
    if record.get("status") != "ok":
        continue
    date = record.get("election_date")
    div = record.get("division", "")
    division_counts[(date, div)] += 1

# Step 2: Identify suspicious records
invalid_records = []
for record in data:
    if record.get("status") != "ok":
        continue
    division = record.get("division", "")
    election_date = record.get("election_date", "")
    if division.lower().startswith("county council") and division_counts[(election_date, division)] > 1:
        invalid_records.append(record)

# Diagnostic printout
print(f"\n⚠️ Found {len(invalid_records)} likely invalid summary records out of {total_before} total:\n")
for record in invalid_records:
    print(f"Division: {record.get('division')}")
    print(f"Date:     {record.get('election_date')}")
    print(f"URL:      {record.get('url')}")
    print(f"Candidates: {', '.join(c['canonical_name'] for c in record.get('candidates', []))}")
    print("-" * 60)

# Step 3: Filter them out
urls_to_remove = set(r["url"] for r in invalid_records)
data = [r for r in data if r.get("url") not in urls_to_remove]

# Step 4: Summary
total_after = len(data)
print(f"\n✅ Cleaned data: {total_after} records remain after removing {total_before - total_after} invalid entries.")



⚠️ Found 0 likely invalid summary records out of 369 total:


✅ Cleaned data: 369 records remain after removing 0 invalid entries.


### Titles stripped, first and last name separated

Strip titles and normalize names.

Extract and store first_name, middle_names, last_name, and canonical_name.

Identify potentially mergeable variants (e.g., “Steve Campkin” vs “Steven R Campkin”) — but only where first names differ but share the same initial.

In [100]:
import re
import pandas as pd
from collections import defaultdict

# === Step 1: Define helper functions ===

def strip_titles(name):
    """Remove titles like MBE, PhD etc."""
    return re.sub(r",?\s+(MBE|OBE|CBE|KBE|DBE|CH|QC|KC|JP|DL|FRSA|FRICS|BA|MA|PhD|BSc|MSc|LLB|LLM)\b", "", name, flags=re.IGNORECASE)

def split_name(full_name):
    """Split name into first, middle, last"""
    full_name = full_name.strip().replace(",", "")
    parts = full_name.split()
    if len(parts) == 0:
        return "", "", ""
    first_name = parts[0]
    last_name = parts[-1]
    middle_names = " ".join(parts[1:-1]) if len(parts) > 2 else ""
    return first_name.title(), middle_names.title(), last_name.title()

# === Step 2: Apply to every candidate and store structured names ===

for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        raw_name = cand.get("name", "")
        stripped = strip_titles(raw_name)
        first, middle, last = split_name(stripped)

        cand["first_name"] = first
        cand["middle_names"] = middle
        cand["last_name"] = last
        cand["canonical_name"] = f"{first} {last}".strip()

# === Step 3: Build last_name_map for ambiguity detection ===

last_name_map = defaultdict(set)

for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        full = strip_titles(cand.get("name", ""))
        first, middle, last = split_name(full)
        if first and last:
            last_name_map[last].add((first, middle, full))

# === Step 4: Detect ambiguous names with same first initial ===

rechecked_strict_ambiguous_last_names = {}

for last, entries in last_name_map.items():
    first_names = {first for first, _, _ in entries if first}
    first_initials = {first[0] for first in first_names}
    if len(first_names) > 1 and len(first_initials) == 1:
        rechecked_strict_ambiguous_last_names[last] = sorted(entries)

# === Step 5: Show result as DataFrame ===

rechecked_strict_df = pd.DataFrame([
    {
        "last_name": last,
        "first_names": ", ".join(sorted({first for first, _, _ in entries})),
        "examples": "; ".join(sorted(full for _, _, full in entries))
    }
    for last, entries in rechecked_strict_ambiguous_last_names.items()
])

rechecked_strict_df.head(20)


Unnamed: 0,last_name,first_names,examples
0,Campkin,"Steve, Steven",Steve Campkin; Steven R Campkin; Steven Robert...
1,Manion,"Stephen, Steve",Stephen Charles Manion; Steve Manion
2,Brivio,"Pam, Pamela",Pam Brivio; Pamela M Brivio; Pamela Mary Brivio
3,Jack,"Nick, Nicolas",Nick Jack; Nicolas S W Jack
4,Mckenna,"Francis, Frank",Francis J McKenna; Frank McKenna
5,Cannon,"Teresa, Tom",Teresa Cannon; Tom Cannon
6,Parker,"Ray, Roz",Ray Parker; Roz Parker
7,London,"James, John",James Frederick Justin London; John London
8,Kift,"Penelope, Penny",Penelope M Kift; Penny Kift
9,Stewart,"Chris, Colin",Chris Stewart; Colin McCarthy Stewart


### Canonical Name Mapping and Normalisation

This code below refines candidate name consistency across election records:

- **Exclusions**: A predefined list of merge-excluded indices (`excluded_merge_indices`) identifies ambiguous last names that should not be merged.
- **Group and Merge**: For each last name in `last_name_map`, candidate full names are grouped by first name. If all first names share the same initial, the longest variant is chosen as the canonical name.
- **Exception Handling**: If a last name is in the exclusion list, all its variants are kept separate.
- **Assignment**: The `canonical_name` field is assigned to each candidate in the dataset based on the mapping.
- **Diagnostics**: A list (`diffs`) shows where original names differ from canonical ones for inspection.

This helps unify inconsistent name formats (e.g. "J. Smith", "John Smith", "Jonathan Smith") while allowing manual override of ambiguous cases.


✅ Guidelines to Exclude

You should exclude any row where:

    The first names differ in gender (e.g., Teresa and Tom)

    The names are clearly distinct (e.g., James and John)

    There is no strong reason to assume they're variants of the same person

In [101]:
# Convert the set of indices to a list to fix the error = conflicting or overlapping first-name variants, same person rows
#  These are the rows I decided not to auto-merge under one canonical name
excluded_merge_indices = [
    4,   # Mckenna → Francis vs Frank (maybe OK, but play safe)
    5,   # Cannon → Teresa vs Tom (different gender)
    6,   # Parker → Ray vs Roz (gender + plausibly unrelated)
    7,   # London → James vs John (distinct names)
    9,   # Stewart → Chris vs Colin
    10,  # Young → Mad vs Mike (nickname ambiguity)
    11,  # Read → Jim vs Julie (different gender)
    18   # Nicholls → Darren vs David (distinct names)
]


This code block identifies ambiguous last names where multiple first names share the same initial (e.g. *Steve* and *Steven* Campkin), then applies a canonical naming rule:

* It **merges** those names only if not manually excluded,
* And assigns the **longest full version** of each name as canonical.

It ensures consistency in tracking people across elections without incorrectly combining distinct individuals.


In [102]:
# Get last names from excluded rows in the previously built ambiguous_df
excluded_last_names = rechecked_strict_df.loc[excluded_merge_indices, "last_name"].tolist()

# Rebuild the canonical_name_map with exclusions
final_canonical_name_map = {}

for last, entries in last_name_map.items():
    grouped_by_first = defaultdict(list)
    first_initials = set()

    for first, middle, full in entries:
        if first:
            grouped_by_first[first].append(full)
            first_initials.add(first[0])

    if last in excluded_last_names:
        # Do not merge — treat each name as its own canonical
        for first, variants in grouped_by_first.items():
            for variant in variants:
                final_canonical_name_map[(first.strip(), last.strip())] = variant
    elif len(first_initials) == 1:
        # Safe to merge — assign longest variant
        for first, variants in grouped_by_first.items():
            longest_variant = max(variants, key=len)
            final_canonical_name_map[(first.strip(), last.strip())] = longest_variant

# Apply updated canonical names to the dataset
for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        first = cand.get("first_name", "").strip()
        last = cand.get("last_name", "").strip()
        cand["canonical_name"] = final_canonical_name_map.get((first, last), cand["name"])

# Show a few examples where canonical_name != original name
diffs = sorted({
    (cand["name"], cand["canonical_name"])
    for record in data if record.get("status") == "ok"
    for cand in record["candidates"]
    if cand["name"] != cand["canonical_name"]
})

diffs[:100]


[('Alan J Bullion', 'Alan James Bullion'),
 ('Alex Ricketts', 'Alex James Ricketts'),
 ('Andrew C Waldie', 'Andrew Crawford  Waldie'),
 ('Andrew Malcolm Kennedy', 'Andrew Mark Stephen Kennedy'),
 ('Ashley Wise', 'Ashley Luke Wise'),
 ('Avtar Sandhu, MBE', 'Avtar Singh Sandhu'),
 ('Brian E MacDowall', 'Brian Eric MacDowall'),
 ('Brian W Copping', 'Brian William Copping'),
 ('Bryan Sweetland', 'Bryan John Sweetland'),
 ('Chris Capon, MBE', 'Chris Capon'),
 ('Christine J Marshall', 'Christine Jennifer Marshall'),
 ('Christopher Cornell', 'Christopher James Cornell'),
 ('Christopher Hoare', 'Christopher Pierce David Hoare'),
 ('Clive A English', 'Clive Andrew English'),
 ('Colin Caller', 'Colin William Caller'),
 ('David Brazier', 'David Lionel Brazier'),
 ('David Gary Beaney', 'David Garry Beaney'),
 ('David J Neve', 'David John Neve'),
 ('David Naghi', 'David Sandru Naghi'),
 ('David Robey', 'David Patrick John Robey'),
 ('David S Naghi', 'David Sandru Naghi'),
 ('David Waller', 'David L

This is a last-mile cleaning patch. It:

    Fixes known typos (e.g., "Joesph" → "Joseph")

    Removes double spaces and trailing whitespace

    Updates canonical_name fields if they differ from the cleaned version

In [103]:
# custom cleaning script for diffs 
 
import re

def clean_canonical_name(name):
    # Fix common typos
    name = name.replace("Joesph", "Joseph")

    # Collapse multiple spaces
    name = re.sub(r"\s{2,}", " ", name)

    # Strip and standardise spacing
    return name.strip()

# Apply cleanup
cleaned_diffs = sorted({
    (orig, clean_canonical_name(canon))
    for orig, canon in diffs
    if orig != clean_canonical_name(canon)  # Only include cleaned diffs
})

# Optional: update canonical names directly in your data
for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        orig = cand.get("name", "")
        canon = cand.get("canonical_name", "")
        cleaned = clean_canonical_name(canon)
        if orig != cleaned:
            cand["canonical_name"] = cleaned

# Preview cleaned diffs
cleaned_diffs[:100]


[('Alan J Bullion', 'Alan James Bullion'),
 ('Alex Ricketts', 'Alex James Ricketts'),
 ('Andrew C Waldie', 'Andrew Crawford Waldie'),
 ('Andrew Malcolm Kennedy', 'Andrew Mark Stephen Kennedy'),
 ('Ashley Wise', 'Ashley Luke Wise'),
 ('Avtar Sandhu, MBE', 'Avtar Singh Sandhu'),
 ('Brian E MacDowall', 'Brian Eric MacDowall'),
 ('Brian W Copping', 'Brian William Copping'),
 ('Bryan Sweetland', 'Bryan John Sweetland'),
 ('Chris Capon, MBE', 'Chris Capon'),
 ('Christine J Marshall', 'Christine Jennifer Marshall'),
 ('Christopher Cornell', 'Christopher James Cornell'),
 ('Christopher Hoare', 'Christopher Pierce David Hoare'),
 ('Clive A English', 'Clive Andrew English'),
 ('Colin Caller', 'Colin William Caller'),
 ('David Brazier', 'David Lionel Brazier'),
 ('David Gary Beaney', 'David Garry Beaney'),
 ('David J Neve', 'David John Neve'),
 ('David Naghi', 'David Sandru Naghi'),
 ('David Robey', 'David Patrick John Robey'),
 ('David S Naghi', 'David Sandru Naghi'),
 ('David Waller', 'David Le

### Code: Flag Shared Last Names Across Different People

In [104]:
from collections import defaultdict

# Build a map of last name → set of full canonical names
last_name_to_people = defaultdict(set)

for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        last = cand.get("last_name", "").strip()
        canon = cand.get("canonical_name", "").strip()
        if last and canon:
            last_name_to_people[last].add(canon)

# Filter to only those last names that have >1 distinct person
ambiguous_last_names = {
    last: sorted(list(people))
    for last, people in last_name_to_people.items()
    if len(people) > 1
}

# Display
for last, names in sorted(ambiguous_last_names.items()):
    print(f"⚠️  Shared last name: {last}")
    for name in names:
        print(f"   - {name}")
    print()


⚠️  Shared last name: Allen
   - Ann Allen
   - Ann Allen, MBE
   - Ann Dorothy Allen
   - Dan Allen
   - David John Allen
   - Shirley K Allen

⚠️  Shared last name: Angell
   - Christine Angell
   - Mike Angell

⚠️  Shared last name: Arnold
   - Gareth Dominic Edmund Arnold
   - Isla Finley Arnold

⚠️  Shared last name: Baker
   - David Baker
   - David John Baker
   - Georgina Baker
   - John Baker
   - Neil Stephen Baker

⚠️  Shared last name: Baldock
   - Alan Baldock
   - Mike Baldock

⚠️  Shared last name: Banks
   - Dave Banks
   - David Geoffrey Banks
   - Gail Anne Banks
   - Robin Banks
   - Sam Banks

⚠️  Shared last name: Barrett
   - Bill Barrett
   - Thea Barrett

⚠️  Shared last name: Bartlett
   - Alan Bartlett
   - Paul Bartlett
   - Paul William Bartlett
   - Tony Bartlett

⚠️  Shared last name: Bell
   - Clair Bell
   - Neil Bell

⚠️  Shared last name: Berry
   - Ann Elizabeth Berry
   - Chas Berry

⚠️  Shared last name: Betts
   - Martin Roland Betts
   - Robin Pat

### Create aliases commonly used in council documents

In [105]:
nicknames = {
    # Male names
    "Christopher": "Chris",
    "Jonathan": "Jon",
    "Stephen": "Steve",
    "Steven": "Steve",
    "Michael": "Mike",
    "Richard": "Rick",
    "Robert": "Rob",
    "Joseph": "Joe",
    "Timothy": "Tim",
    "Nicholas": "Nick",
    "James": "Jim",
    "Francis": "Frank",
    "Jeremy": "Jerry",

    # Female names
    "Margaret": "Maggie",
    "Elizabeth": "Liz",
    "Patricia": "Pat",
    "Penelope": "Penny",
    "Pamela": "Pam",
    "Karen": "Kate",  # or Katie
    "Katherine": "Kate",
    "Rebecca": "Becky",
    "Deborah": "Debbie",
    "Susan": "Sue",
    "Jacqueline": "Jackie",
    "Victoria": "Vicky"
}


In [106]:
import re

def extract_initials(full_name, last_name):
    """Extract initials from full name, skipping the last name."""
    parts = full_name.replace(last_name, "").strip().split()
    return [p[0] for p in parts if p and p[0].isalpha()]

def generate_aliases(first_name, last_name, canonical_name):
    aliases = set()

    # Title-based forms (all genders)
    for title in ["Mr", "Ms", "Mrs", "Miss", "Cllr", "Councillor"]:
        aliases.add(f"{title} {last_name}")

    # Add full first name + last name
    if first_name:
        aliases.add(f"{first_name} {last_name}")
        aliases.add(f"{first_name[0]} {last_name}")

    # Extract canonical first name (may be more formal)
    canonical_first = canonical_name.split()[0]
    if canonical_first.lower() != first_name.lower():
        aliases.add(f"{canonical_first} {last_name}")

    # Add nickname if available
    nick = nicknames.get(canonical_first)
    if nick and nick.lower() != first_name.lower():
        aliases.add(f"{nick} {last_name}")

    # Add initials
    initials = extract_initials(canonical_name, last_name)
    if initials:
        joined = ' '.join(initials)
        for title in ["Mr", "Ms", "Mrs", "Miss"]:
            aliases.add(f"{title} {joined} {last_name}")
            aliases.add(f"{title}\n{joined} {last_name}")
        aliases.add(f"{joined} {last_name}")

    return sorted(aliases)


# === Apply to your data ===

for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        first = cand.get("first_name", "").strip()
        last = cand.get("last_name", "").strip()
        canon = cand.get("canonical_name", cand.get("name", "")).strip()
        cand["aliases"] = generate_aliases(first, last, canon)


What code below does

    Keeps the preferred version of each (date, candidate) pair

    Drops any duplicate where the division name starts with "County"

    Leaves you with one clean record per candidate per date



In [107]:
from collections import defaultdict

# Keyed by (election_date, canonical_name)
deduped_records = {}
skipped_records = []

for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record.get("election_date")
    division = record.get("division", "").strip()
    url = record.get("url", "")

    for cand in record.get("candidates", []):
        name = cand.get("canonical_name", "").strip()
        key = (election_date, name)

        # Prefer division NOT starting with 'County'
        keep_current = not division.lower().startswith("county council")

        if key not in deduped_records:
            deduped_records[key] = {
                "record": record,
                "candidate": cand,
                "division": division
            }
        else:
            existing_div = deduped_records[key]["division"]
            if keep_current and existing_div.lower().startswith("county council"):
                deduped_records[key] = {
                    "record": record,
                    "candidate": cand,
                    "division": division
                }
            else:
                skipped_records.append((name, election_date, division))

# Build deduplicated version
deduplicated_data = []
for (date, name), info in deduped_records.items():
    record = info["record"].copy()
    record["candidates"] = [info["candidate"]]
    deduplicated_data.append(record)

# Replace your working data
print(f"✅ Deduplicated dataset: {len(data)} → {len(deduplicated_data)} records")
data = deduplicated_data

# Optional: View what was skipped
if skipped_records:
    print(f"🗑️ Skipped {len(skipped_records)} duplicate entries (less preferred divisions):")
    for name, date, div in skipped_records[:10]:
        print(f"- {name} on {date} in '{div}'")


✅ Deduplicated dataset: 369 → 1966 records
🗑️ Skipped 1 duplicate entries (less preferred divisions):
- Michael James Hogg on 2017-05-04 in 'Sevenoaks Town'


In [108]:
print(f"📊 Records remaining in data: {len(data)}")


📊 Records remaining in data: 1966


In [109]:
output_path = "../data/elections/kent_results_all_years_cleaned.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for record in data:
        json.dump(record, f, ensure_ascii=False)
        f.write("\n")

output_path

'../data/elections/kent_results_all_years_cleaned.jsonl'

### Examine one person- Candidate Participation Lookup

In [110]:
from IPython.display import display, HTML

def get_candidate_history(data, candidate_name):
    records = []

    for record in data:
        if record.get("status") != "ok":
            continue
        date = record.get("election_date")
        div = record.get("division", "")
        election_type = record.get("election_type", "unknown")
        url = record.get("url", "")
        for cand in record.get("candidates", []):
            if cand.get("canonical_name", "").strip() == candidate_name:
                records.append({
                    "Election Date": date,
                    "Division": div,
                    "Party": cand.get("party", ""),
                    "Outcome": cand.get("outcome", ""),
                    "Election Type": election_type,
                    "URL": f'<a href="{url}" target="_blank">Link</a>'
                })

    if not records:
        print(f"⚠️ No records found for: {candidate_name}")
        return pd.DataFrame()

    df = pd.DataFrame(sorted(records, key=lambda x: x["Election Date"]))
    return df.to_html(escape=False, index=False)


In [111]:
# View Trevor Shonk's election history as clickable table
html_table = get_candidate_history(data, "Trevor Leslie Shonk")
display(HTML(html_table))


Election Date,Division,Party,Outcome,Election Type,URL
2009-06-04,Ramsgate,UK Independence Party,Not elected,regular,Link
2013-05-02,Ramsgate,UK Independence Party,Elected,regular,Link
2017-05-04,Ramsgate,UK Independence Party,Not elected,regular,Link
2021-05-06,Ramsgate,Conservative,Elected,regular,Link
2025-05-01,Ramsgate,Reform UK,Elected,regular,Link


### Crosstab of election results - by party

In [112]:
import json
from datetime import datetime

# === Load and patch the data ===
data = []
with open("../data/elections/kent_results_all_years_cleaned.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)

        # Patch: If 'election_date' is missing, extract from 'division'
        if entry.get("status") == "ok" and "election_date" not in entry:
            division = entry.get("division", "")
            if "," in division:
                try:
                    possible_date = division.split(",")[-1].strip()
                    parsed_date = datetime.strptime(possible_date, "%d/%m/%Y").date()
                    entry["election_date"] = str(parsed_date)
                except ValueError:
                    pass  # Skip if date can't be parsed

        data.append(entry)

In [113]:
# Check which 'ok' entries are missing 'election_date'
missing_dates = [entry for entry in data if entry.get("status") == "ok" and "election_date" not in entry]
print(f"⚠️ Entries missing 'election_date': {len(missing_dates)}")


⚠️ Entries missing 'election_date': 0


### Reform candidates 

the table produced below:

The final df is a timeline matrix of candidates whose latest party is "Reform UK", showing:

| Row → each Reform UK candidate's canonical_name
| Columns → regular election dates (election_date)
| Cell value → election outcome at that date
| Plus one column: Latest Division

In [114]:

import json
import pandas as pd
from collections import defaultdict

# Step 0: Filter regular elections only for timeline columns
election_dates = sorted({
    entry["election_date"]
    for entry in data
    if entry.get("status") == "ok" and entry.get("election_type") == "regular"
})

# Step 1: Track latest party for each candidate by canonical name
latest_party_by_candidate = {}
for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name not in latest_party_by_candidate or election_date > latest_party_by_candidate[name]["election_date"]:
            latest_party_by_candidate[name] = {
                "election_date": election_date,
                "party": cand["party"]
            }

# Step 2: Build outcome timeline for candidates whose latest party is "Reform UK"
party_filter = "Reform UK"
candidate_results = defaultdict(lambda: {date: "NP" for date in election_dates})

for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    if election_date not in election_dates:
        continue  # skip byelections
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if latest_party_by_candidate.get(name, {}).get("party") == party_filter:
            candidate_results[name][election_date] = cand["outcome"]

# Step 3: Convert to DataFrame
df = pd.DataFrame.from_dict(candidate_results, orient="index")
df.index.name = "Candidate"

# Step 4: Add each candidate's latest division
latest_division_by_candidate = {}
for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    division = record["division"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name not in latest_division_by_candidate or election_date > latest_division_by_candidate[name]["election_date"]:
            latest_division_by_candidate[name] = {
                "election_date": election_date,
                "division": division
            }

df["Latest Division"] = df.index.map(lambda name: latest_division_by_candidate.get(name, {}).get("division", ""))

# Reorder columns
df = df[["Latest Division"] + [col for col in df.columns if col != "Latest Division"]]

# View the result
df.head(20)


Unnamed: 0_level_0,Latest Division,2009-06-04,2013-05-02,2017-05-04,2018-01-11,2018-11-15,2019-05-02,2021-05-06,2021-06-17,2022-01-27,2023-03-02,2023-05-04,2023-07-06,2024-11-21,2025-05-01
Candidate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Barry Taylor,Dartford North East,Not elected,NP,NP,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP
Garry Graham Sturley,Gravesend East,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected
Trevor Leslie Shonk,Ramsgate,Not elected,Elected,Not elected,NP,NP,NP,Elected,NP,NP,NP,NP,NP,NP,Elected
Eric Elliott,Romney Marsh,NP,Not elected,NP,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP
Christopher Pierce David Hoare,Tunbridge Wells North,NP,Elected,NP,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,Not elected
Gary Paul Rogers,Dartford West,NP,Not elected,NP,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP
Ben Robert Fryer,Dartford North East,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected
Mary Elizabeth Lawes,Folkestone East,NP,NP,Not elected,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,Elected
Ryan Andrew Waters,Dartford East,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected
Helen Jean Brown,Malling Rural East,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Not elected


### Councillors elected in 2025

The final councillor_df is a comprehensive performance and history profile for all councillors elected in the most recent regular election. It captures their past results, experience, party affiliations, and division in one clean table.

In [115]:
# Step 1: Get the most recent election date
latest_election_date = max(election_dates)

# Step 2: Find all candidates elected in the latest election
elected_councillors = set()
latest_division_by_candidate = {}
latest_party_by_candidate = {}

for record in data:
    if record.get("status") != "ok":
        continue
    if record["election_date"] != latest_election_date:
        continue

    division = record["division"]
    for cand in record["candidates"]:
        if cand["outcome"] == "Elected":
            name = cand["canonical_name"].strip()
            elected_councillors.add(name)
            latest_division_by_candidate[name] = division
            latest_party_by_candidate[name] = cand["party"]

# Build party history for each candidate
party_affiliations_by_candidate = defaultdict(set)

for record in data:
    if record.get("status") != "ok":
        continue
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        party = cand["party"].strip()
        if name and party:
            party_affiliations_by_candidate[name].add(party)


# Step 3: Build the outcome history for each elected councillor
councillor_results = defaultdict(lambda: {date: "NP" for date in election_dates})

for record in data:
    if record.get("status") != "ok":
        continue
    election_date = record["election_date"]
    for cand in record["candidates"]:
        name = cand["canonical_name"].strip()
        if name in elected_councillors:
            councillor_results[name][election_date] = cand["outcome"]

# Step 4: Create DataFrame
councillor_df = pd.DataFrame.from_dict(councillor_results, orient="index")
councillor_df.index.name = "Councillor"

# Add division and latest party columns
councillor_df["Division"] = councillor_df.index.map(lambda name: latest_division_by_candidate.get(name, ""))
councillor_df["Latest Party"] = councillor_df.index.map(lambda name: latest_party_by_candidate.get(name, ""))

# Reorder columns
cols = ["Division", "Latest Party"] + [col for col in election_dates]
councillor_df = councillor_df[cols]

# Filter the DataFrame to include only councillors who were elected in the latest election
elected_df = councillor_df[councillor_df[latest_election_date] == "Elected"]
# Step 1: Format past parties (already done above)
councillor_df["Past Parties"] = councillor_df.index.map(
    lambda name: ", ".join(sorted(party_affiliations_by_candidate.get(name, [])))
)

# Step 2: Calculate experience (number of times elected)
councillor_df["Experience at KCC (terms)"] = councillor_df[election_dates].apply(
    lambda row: sum(1 for value in row if value == "Elected"), axis=1
)

# Step 3: Reorder columns
ordered_cols = ["Division", "Latest Party"] + election_dates + ["Experience at KCC (terms)", "Past Parties"]
councillor_df = councillor_df[ordered_cols]

councillor_df.head(30)

Unnamed: 0_level_0,Division,Latest Party,2009-06-04,2013-05-02,2017-05-04,2018-01-11,2018-11-15,2019-05-02,2021-05-06,2021-06-17,2022-01-27,2023-03-02,2023-05-04,2023-07-06,2024-11-21,2025-05-01,Experience at KCC (terms),Past Parties
Councillor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Garry Graham Sturley,Gravesend East,Reform UK,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected,1,"Labour, Reform UK"
Tim Prater,Cheriton Sandgate & Hythe East,Liberal Democrat,Elected,Not elected,Not elected,NP,NP,Elected,Not elected,NP,NP,NP,NP,NP,NP,Elected,3,Liberal Democrat
Stuart Robert Jeffery,Maidstone Central,Green Party,Not elected,Not elected,Not elected,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,Elected,1,Green Party
Trudy Dean,Malling Central,Liberal Democrat,Elected,Elected,Elected,NP,NP,NP,Elected,NP,NP,Elected,NP,NP,NP,Elected,6,Liberal Democrat
Trevor Leslie Shonk,Ramsgate,Reform UK,Not elected,Elected,Not elected,NP,NP,NP,Elected,NP,NP,NP,NP,NP,NP,Elected,3,"Conservative, Reform UK, UK Independence Party"
Geoffrey Richard Samme,Maidstone North East,Liberal Democrat,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected,1,Liberal Democrat
Mark Strafford Ellis,Tunbridge Wells North,Liberal Democrat,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected,1,"Independent, Liberal Democrat"
Ben Robert Fryer,Dartford North East,Reform UK,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,NP,NP,NP,NP,Elected,1,"Reform UK, UK Independence Party"
Antony James Hook,Faversham,Liberal Democrat,NP,NP,Elected,NP,NP,NP,Elected,NP,NP,NP,NP,NP,NP,Elected,3,Liberal Democrat
Mary Elizabeth Lawes,Folkestone East,Reform UK,NP,NP,Not elected,NP,NP,NP,Not elected,NP,NP,NP,NP,NP,NP,Elected,1,"Foundation Party, Reform UK, UK Independence P..."


In [116]:
# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2025.csv"
councillor_df.to_csv(output_path)
output_path

'../data/elections/kent_councillors_elected_2025.csv'

### KCC 2021

In [117]:
# Re-run after code reset

import json
import pandas as pd

# Set target election date
target_date = "2021-05-06"

# Extract elected candidates from that date
elected_2021 = []
for record in data:
    if record.get("status") != "ok":
        continue
    if record["election_date"] != target_date:
        continue
    division = record["division"]
    for cand in record["candidates"]:
        if cand["outcome"] == "Elected":
            elected_2021.append({
                "name": cand["name"].strip(),
                "division": division,
                "first_name": cand.get("first_name", "").strip(),
                "last_name": cand.get("last_name", "").strip(),
                "middle_names": cand.get("middle_names", "").strip(),
                "party": cand["party"].strip()
            })

# Create DataFrame
elected_2021_df = pd.DataFrame(elected_2021)


# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2021_short.csv"
elected_2021_df.to_csv(output_path)
output_path

elected_2021_df.tail(20)

Unnamed: 0,name,division,first_name,last_name,middle_names,party
60,Nick Chard,Sevenoaks West,Nick,Chard,,Conservative
61,Cameron Andrew Beart,Sheppey,Cameron,Beart,Andrew,Conservative
62,Andy Booth,Sheppey,Andy,Booth,,Conservative
63,Mike Dendor,Sittingbourne North,Mike,Dendor,,Conservative
64,John Geoffrey Wright,Sittingbourne South,John,Wright,Geoffrey,Conservative
65,Rich Lehmann,Swale East,Rich,Lehmann,,Green Party
66,Mike Baldock,Swale West,Mike,Baldock,,Swale Independents
67,Perry Cole,Swanley,Perry,Cole,,Conservative
68,Peter Martin Harman,Swanscombe and Greenhithe,Peter,Harman,Martin,Swanscombe & Greenhithe Residents' Association
69,Mike Hill,Tenterden,Mike,Hill,,Conservative


In [118]:
import json
import pandas as pd

# Target election date
target_date = "2025-05-01"
elected_2021 = []

for record in data:
    if record.get("status") != "ok" or record.get("election_date") != target_date:
        continue
    division = record.get("division", "")
    for cand in record["candidates"]:
        if cand.get("outcome") == "Elected":
            elected_2021.append({
                "name": cand.get("name", "").strip(),
                "division": division,
                "first_name": cand.get("first_name", "").strip(),
                "last_name": cand.get("last_name", "").strip(),
                "middle_names": cand.get("middle_names", "").strip(),
                "party": cand.get("party", "").strip()
            })

# Convert to DataFrame
elected_2025_df = pd.DataFrame(elected_2021)

# Save the final dataframe as a CSV in the data/elections directory
output_path = "../data/elections/kent_councillors_elected_2025_short.csv"
elected_2025_df.to_csv(output_path)
output_path

elected_2025_df


Unnamed: 0,name,division,first_name,last_name,middle_names,party
0,Pamela Ann Williams,Ashford Central,Pamela,Williams,Ann,Reform UK
1,Dean Edward Burns,Ashford East,Dean,Burns,Edward,Reform UK
2,Brian Philip Collins,Ashford Rural East,Brian,Collins,Philip,Reform UK
3,Bill Barrett,Ashford Rural South,Bill,Barrett,,Reform UK
4,Jeremy Waring Eustace,Ashford Rural West,Jeremy,Eustace,Waring,Reform UK
...,...,...,...,...,...,...
76,Martin Dale Brice,Tunbridge Wells South,Martin,Brice,Dale,Liberal Democrat
77,John Joseph Moreland,Tunbridge Wells West,John,Moreland,Joseph,Liberal Democrat
78,Adrian John Kibble,Whitstable East & Herne Bay West,Adrian,Kibble,John,Reform UK
79,Stuart Heaver,Whitstable West,Stuart,Heaver,,Green Party


### Meetings mock up

## 🧠 What This Script Does: "Who Is Who" Councillor Classifier

This script builds a "Who Is Who" registry by linking **meeting attendance records** (from council minutes) to **elected councillor data** across two electoral terms (2021 and 2025). It classifies each name mentioned in meeting minutes into one of three categories:

### 🔍 Key Steps:

1. **Load and Standardize Councillor Data**
   - Reads two CSVs of elected councillors from 2021 and 2025.
   - Standardizes names (e.g., lowercasing, ASCII stripping) to enable fuzzy matching.

2. **Extract Attendee Names from JSONL Minutes**
   - Opens a `.jsonl` file of meeting metadata.
   - Pulls out unique names listed under 'present', 'absent', or 'virtual'.

3. **Standardize Attendee Names**
   - Removes titles like `Mr`, `Cllr`, `Dr`, etc.
   - Splits names into initials and last names for pattern matching.

4. **Flexible Matching Logic**
   - Matches attendees to current or former councillors using:
     - Exact last name
     - Fuzzy regex on first initials
   - Categorizes results into:
     - `current` councillor
     - `former` councillor
     - `civil_servant` (if no match found)
     - `needs_review` (if ambiguous matches found)

5. **Export Final Dataset**
   - Outputs a clean CSV (`who_is_who.csv`) with:
     - Raw name
     - Match info (first, last, division, party)
     - Status tag (`current`, `former`, `civil_servant`, `needs_review`)

### ✅ Result:
This enables downstream systems to recognize **who's who** in meeting minutes, distinguishing between elected representatives and council staff or visitors — essential for analytics, attendance stats, or knowledge graphs.

This is just an experiment.

In [None]:
import pandas as pd
import json
from pathlib import Path
from typing import List, Tuple, Dict
import re

# Configuration
MINUTES_PATH = Path("/Users/lgfolder/github/council-assistant/data/document_metadata/metadata_test.jsonl")
#MINUTES_PATH = Path("/Users/lgfolder/github/council-assistant/data/metadata/meetings.jsonl")
COUNCILLORS_2025_CSV = Path("/Users/lgfolder/github/council-assistant/data/elections/kent_councillors_elected_2025_short.csv")
COUNCILLORS_2021_CSV = Path("/Users/lgfolder/github/council-assistant/data/elections/kent_councillors_elected_2021_short.csv")
OUTPUT_PATH = Path("/Users/lgfolder/github/council-assistant/data/who_is_who.csv")

def load_and_standardize_councillors(current_csv: Path, previous_csv: Path) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load both current and previous councillor data with standardized names"""
    def process_councillors(df: pd.DataFrame) -> pd.DataFrame:
        # Handle different column name variations
        first_name_col = next((col for col in df.columns if 'first' in col.lower()), None)
        last_name_col = next((col for col in df.columns if 'last' in col.lower()), None)
        
        if not first_name_col or not last_name_col:
            raise ValueError("Could not find first_name and last_name columns in councillor data")
        
        df['standard_first'] = (
            df[first_name_col]
            .astype(str)
            .str.lower()
            .str.strip()
            .str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('utf-8')
        )
        df['standard_last'] = (
            df[last_name_col]
            .astype(str)
            .str.lower()
            .str.strip()
            .str.normalize('NFKD')
            .str.encode('ascii', errors='ignore')
            .str.decode('utf-8')
        )
        
        # Handle division/ward/department naming
        division_col = next((col for col in df.columns if any(x in col.lower() for x in ['division', 'ward', 'district'])), None)
        if division_col:
            df['division'] = df[division_col]
        else:
            df['division'] = ''
            
        # Handle party/group naming
        party_col = next((col for col in df.columns if any(x in col.lower() for x in ['party', 'group'])), None)
        if party_col:
            df['party'] = df[party_col]
        else:
            df['party'] = ''
            
        return df
    
    try:
        current = process_councillors(pd.read_csv(current_csv))
        previous = process_councillors(pd.read_csv(previous_csv))
        return current, previous
    except Exception as e:
        print(f"Error loading councillor data: {e}")
        raise

def parse_minute_names(minutes_path: Path) -> List[str]:
    """Extract all unique names from meeting minutes"""
    attendees = set()
    
    with open(minutes_path, 'r') as f:
        for line in f:
            try:
                meeting = json.loads(line)
                for status in ['present', 'absent', 'virtual']:
                    for name in meeting['attendance'].get(status, []):
                        attendees.add(name.strip())
            except json.JSONDecodeError:
                continue
    
    return list(attendees)

def standardize_minutes_name(name: str) -> Tuple[str, str]:
    """Convert council minutes names to standardized format"""
    # Remove honorifics and trailing periods
    clean_name = re.sub(
        r'^(Mr|Mrs|Ms|Miss|Sir|Dr|Cllr)\.?\s+', 
        '', 
        name, 
        flags=re.IGNORECASE
    ).strip()
    
    # Handle cases with multiple initials
    parts = [p.strip('. ') for p in clean_name.split() if p.strip()]
    
    if not parts:
        return ('', '')
    
    last_name = parts[-1]
    first_parts = parts[:-1]
    
    if not first_parts:
        return ('', last_name.lower())
    
    # Create first initial pattern (e.g., "R W" becomes "r.?w.?")
    first_initials = ''.join([f"{p[0].lower()}.*" for p in first_parts if p])
    
    return (first_initials, last_name.lower())

def find_councillor_match(first: str, last: str, councillors: pd.DataFrame) -> pd.DataFrame:
    """Find matching councillors using flexible matching"""
    # Exact last name match
    matches = councillors[councillors['standard_last'] == last]
    
    if not first:
        return matches
    
    # Flexible first initial matching (e.g., "r.?w.?" matches "Roger William")
    try:
        pattern = re.compile(f'^{first}')
        return matches[
            matches['standard_first'].str.contains(pattern, na=False)
        ]
    except:
        return matches

def classify_attendees(
    attendees: List[str], 
    current_councillors: pd.DataFrame,
    previous_councillors: pd.DataFrame
) -> pd.DataFrame:
    """Classify each attendee into categories"""
    records = []
    
    for raw_name in attendees:
        first, last = standardize_minutes_name(raw_name)
        
        # Check current councillors first
        current_matches = find_councillor_match(first, last, current_councillors)
        previous_matches = find_councillor_match(first, last, previous_councillors)
        
        if len(current_matches) == 1:
            # Current councillor match
            record = create_record(raw_name, current_matches.iloc[0], 'current')
        elif len(previous_matches) == 1:
            # Former councillor match
            record = create_record(raw_name, previous_matches.iloc[0], 'former')
        elif len(current_matches) > 1 or len(previous_matches) > 1:
            # Ambiguous match
            record = create_ambiguous_record(raw_name, first, last, current_matches, previous_matches)
        else:
            # No match - likely civil servant
            record = create_civil_servant_record(raw_name, first, last)
        
        records.append(record)
    
    return pd.DataFrame(records)

def create_record(raw_name: str, councillor: pd.Series, status: str) -> Dict:
    """Create standardized record for matched councillor"""
    return {
        'raw_name': raw_name,
        'first_name': councillor.get('first_name', ''),
        'last_name': councillor.get('last_name', ''),
        'position': 'Councillor',
        'division': councillor.get('division', ''),
        'party': councillor.get('party', ''),
        'status': status,
        'source': 'current' if status == 'current' else 'previous'
    }

def create_ambiguous_record(
    raw_name: str, 
    first: str, 
    last: str,
    current_matches: pd.DataFrame,
    previous_matches: pd.DataFrame
) -> Dict:
    """Create record for ambiguous matches"""
    all_matches = pd.concat([current_matches, previous_matches])
    return {
        'raw_name': raw_name,
        'first_name': '',
        'last_name': last.title(),
        'position': 'AMBIGUOUS',
        'division': '|'.join(all_matches.get('division', '').unique()),
        'party': '|'.join(all_matches.get('party', '').unique()),
        'status': 'needs_review',
        'source': 'multiple'
    }

def create_civil_servant_record(raw_name: str, first: str, last: str) -> Dict:
    """Create record for civil servants"""
    formatted_first = ' '.join([f"{c.upper()}." for c in first.split('.') if c]) if first else ''
    return {
        'raw_name': raw_name,
        'first_name': formatted_first,
        'last_name': last.title(),
        'position': 'Civil Servant',
        'division': '',
        'party': '',
        'status': 'civil_servant',
        'source': ''
    }

def main():
    try:
        # Load and standardize councillor data
        current_councillors, previous_councillors = load_and_standardize_councillors(
            COUNCILLORS_2025_CSV, 
            COUNCILLORS_2021_CSV
        )
        
        # Parse meeting attendees
        attendees = parse_minute_names(MINUTES_PATH)
        
        # Classify attendees
        who_is_who = classify_attendees(attendees, current_councillors, previous_councillors)
        
        # Save results
        who_is_who.to_csv(OUTPUT_PATH, index=False)
        print(f"Processed {len(who_is_who)} names. Saved to {OUTPUT_PATH}")
        print("\nSample output:")
        print(who_is_who.head().to_string())
        
    except Exception as e:
        print(f"Error in main execution: {e}")
        raise

if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
from pathlib import Path

# Load the processed data
who_is_who = pd.read_csv("../data/who_is_who.csv")

# 1. Current Councillors Table
current_councillors = who_is_who[who_is_who['status'] == 'current'].copy()
current_councillors['full_name'] = current_councillors['first_name'] + ' ' + current_councillors['last_name']
councillors_table = current_councillors[['full_name', 'division', 'party', 'last_name']].sort_values('last_name')
councillors_table = councillors_table[['full_name', 'division', 'party']]  # Drop last_name after sorting

# 2. Civil Servants Directory
civil_servants = who_is_who[who_is_who['status'] == 'civil_servant'].copy()
civil_servants['formatted_name'] = civil_servants['first_name'] + ' ' + civil_servants['last_name']
civil_servants_table = civil_servants[['formatted_name', 'raw_name', 'last_name']].sort_values('last_name')
civil_servants_table = civil_servants_table[['formatted_name', 'raw_name']]  # Drop last_name after sorting

# 3. Meeting Participation Heatmap
# First we need to count meeting appearances (this would be better done during initial processing)
def count_meetings(minutes_path):
    meeting_counts = {}
    with open(minutes_path, 'r') as f:
        for line in f:
            try:
                meeting = json.loads(line)
                for status in ['present', 'absent', 'virtual']:
                    for name in meeting['attendance'].get(status, []):
                        meeting_counts[name.strip()] = meeting_counts.get(name.strip(), 0) + 1
            except json.JSONDecodeError:
                continue
    return meeting_counts

meeting_counts = count_meetings(MINUTES_PATH)
who_is_who['meetings_attended'] = who_is_who['raw_name'].map(meeting_counts).fillna(0)

participation_table = who_is_who[
    ['first_name', 'last_name', 'position', 'meetings_attended']
].sort_values('meetings_attended', ascending=False)

# 4. Department Affiliations
department_table = who_is_who.groupby(['division', 'position']).size().unstack(fill_value=0)
department_table['Total'] = department_table.sum(axis=1)

# 5. Ambiguity Resolution Table
ambiguity_table = who_is_who[who_is_who['status'] == 'needs_review'].copy()
ambiguity_table['possible_matches'] = ambiguity_table.apply(
    lambda x: f"{x['division']} ({x['party']})", axis=1
)
ambiguity_table = ambiguity_table[['raw_name', 'possible_matches']]

# Save all tables
tables_path = Path("/Users/lgfolder/github/council-assistant/data/who_is_who_tables/")
tables_path.mkdir(exist_ok=True)

councillors_table.to_csv(tables_path / "councillors.csv", index=False)
civil_servants_table.to_csv(tables_path / "civil_servants.csv", index=False)
participation_table.to_csv(tables_path / "participation.csv", index=False)
department_table.to_csv(tables_path / "divisions.csv")
ambiguity_table.to_csv(tables_path / "ambiguities.csv", index=False)

print("All tables generated successfully!")

### Generate elections json from the elections results

## 🗳️ What This Script Does: `elections.jsonl` Metadata Generator

This script creates a machine-readable reference file (`elections.jsonl`) that describes each **election year** found in your cleaned Kent County Council results dataset.

### 📦 Input
- Loads the cleaned election results from:  
  `../data/elections/kent_results_all_years_cleaned.json`  
  This file contains detailed candidate-level results for all elections and by-elections.

### ⚙️ What It Builds
It generates one metadata entry per **election year**, each including:

| Field | Description |
|-------|-------------|
| `election_id` | Unique ID like `kent_cc_2025` |
| `council_id` | Fixed as `"kent_cc"` |
| `election_date` | ISO date of the election (from first result that year) |
| `election_type` | `"local"` |
| `scope` | `"county-wide"` (assumes all are full council elections) |
| `description` | Human-readable description like `"Kent County Council local elections 2025"` |
| `results_path` | File path to the full dataset |
| `results_filter` | A filter dictionary, e.g. `{"election_year": 2025}` to slice the dataset |
| `source_url` | Link to the council's official elections page |

### 🧾 Output
- Writes the data to a `.jsonl` file at:  
  `../data/references/elections.jsonl`  
  This can be used for indexing, display in UI menus, filtering APIs, or building dashboards.

### ✅ Example Use Cases
- Generating dropdowns like **"View Results for 2025"**
- Linking knowledge graph events to specific election cycles
- Running time-series analyses across elections

This metadata file acts as a lightweight index of all the **election cycles** your project has data for.


In [119]:
import json
from pathlib import Path
import pandas as pd

# Load the cleaned results
input_path = Path("../data/elections/kent_results_all_years_cleaned.jsonl")
results_data = pd.read_json(input_path, lines=True)

In [120]:
import pandas as pd

# Basic shape
print(f"📦 Shape: {results_data.shape[0]:,} rows × {results_data.shape[1]} columns")

# Basic info
print("\n🔍 Data Types & Non-Null Counts:")
print(results_data.info())

# Check for missing values
print("\n🚫 Missing Values per Column:")
missing_counts = results_data.isna().sum()
print(missing_counts[missing_counts > 0])

# Value counts of election types
if "election_type" in results_data.columns:
    print("\n🗳️ Election Type Breakdown:")
    print(results_data["election_type"].value_counts(dropna=False))

# Year coverage
if "election_date" in results_data.columns:
    results_data["election_date"] = pd.to_datetime(results_data["election_date"], errors="coerce")
    results_data["election_year"] = results_data["election_date"].dt.year
    print("\n📆 Election Years Available:")
    print(results_data["election_year"].value_counts().sort_index())

# Division name checks
if "division" in results_data.columns:
    print("\n🏷️ Top 10 Most Frequent Division Names:")
    print(results_data["division"].value_counts().head(10))

# Status field analysis
if "status" in results_data.columns:
    print("\n✅ Status Value Counts:")
    print(results_data["status"].value_counts())

# Candidate-level checks
candidate_counts = results_data["candidates"].apply(lambda x: len(x) if isinstance(x, list) else 0)
print(f"\n👥 Candidate counts per record:\n- Mean: {candidate_counts.mean():.2f}, Max: {candidate_counts.max()}, Min: {candidate_counts.min()}")
print(f"- Records with 0 candidates: {(candidate_counts == 0).sum()}")


📦 Shape: 1,966 rows × 8 columns

🔍 Data Types & Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   division       1966 non-null   object
 1   url            1966 non-null   object
 2   status         1966 non-null   object
 3   candidates     1966 non-null   object
 4   summary        1966 non-null   object
 5   election_date  1966 non-null   object
 6   council        1966 non-null   object
 7   election_type  1966 non-null   object
dtypes: object(8)
memory usage: 123.0+ KB
None

🚫 Missing Values per Column:
Series([], dtype: int64)

🗳️ Election Type Breakdown:
election_type
regular    1966
Name: count, dtype: int64

📆 Election Years Available:
election_year
2009    353
2013    400
2017    390
2018      9
2019     14
2021    347
2022      3
2023     16
2024      7
2025    427
Name: count, dtype: int64

🏷️ Top 10 Most Frequen

In [121]:
# Show rows with missing election_date
missing_election_date = results_data[results_data["election_date"].isna()]

print(f"⚠️ Found {missing_election_date.shape[0]} rows without an election date.")

# Display them
missing_election_date.head(20)  # You can change 20 to see more


⚠️ Found 0 rows without an election date.


Unnamed: 0,division,url,status,candidates,summary,election_date,council,election_type,election_year


In [122]:


# Ensure output directory exists
output_path = Path("../data/references/elections.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)

# Add election year column from date
results_data["election_year"] = pd.to_datetime(results_data["election_date"]).dt.year

# Define base fields
council_id = "kent_cc"
source_url = "https://www.kent.gov.uk/about-the-council/how-the-council-works/elections"
results_path = str(input_path)

# Generate one entry per unique election year
elections = []

for year in sorted(results_data["election_year"].unique(), reverse=True):
    # Filter records for this year that have a valid election_date
    subset = results_data[
        (results_data["election_year"] == year) & 
        (results_data["election_date"].notnull())
    ]

    if subset.empty:
        print(f"⚠️ Skipping year {year} — no valid election_date found.")
        continue

    # Use the first valid election_date
    election_date = pd.to_datetime(subset["election_date"].iloc[0]).date().isoformat()

    election = {
        "election_id": f"{council_id}_{int(year)}",
        "council_id": council_id,
        "election_date": election_date,
        "election_type": "local",
        "scope": "county-wide",
        "description": f"Kent County Council local elections {year}",
        "results_path": str(input_path),
        "results_filter": {"election_year": int(year)},
        "source_url": source_url
    }

    elections.append(election)


### Populate people.json from existing civil servants json

The civil servant json was already available - generated by ChatGPT from a pdf I found on the council's website

In [123]:
import json
from pathlib import Path
import re

# === CONFIGURATION ===
INPUT_FILE = Path("../data/jsons/civil_servants_all.json")
OUTPUT_FILE = Path("../data/entities/people.jsonl")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

# === UTILITY FUNCTIONS ===
def slugify(name):
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

def generate_person_id(slug, counter):
    return f"{slug}_{counter:03d}"

# === LOAD EXISTING PEOPLE ===
existing_people = {}
slug_counter = {}

if OUTPUT_FILE.exists():
    with open(OUTPUT_FILE) as f:
        for line in f:
            person = json.loads(line)
            slug = slugify(person["full_name"])
            existing_people[slug] = person
            # update counter
            id_suffix = person["person_id"].split("_")[-1]
            try:
                slug_counter[slug] = max(slug_counter.get(slug, 0), int(id_suffix))
            except ValueError:
                pass

# === LOAD CIVIL SERVANTS DATA ===
with open(INPUT_FILE) as f:
    civil_servants = json.load(f)

new_people = []
flagged_people = []

for entry in civil_servants:
    full_name = entry.get("name", "").strip()
    if not full_name:
        continue

    parts = full_name.split()
    first_name = parts[0] if parts else ""
    last_name = parts[-1] if len(parts) > 1 else ""
    slug = slugify(full_name)

    if slug in existing_people:
        flagged_people.append(full_name)
        continue  # Skip known person

    # Assign new person_id
    slug_counter[slug] = slug_counter.get(slug, 0) + 1
    person_id = generate_person_id(slug, slug_counter[slug])

    person = {
        "person_id": person_id,
        "full_name": full_name,
        "first_name": first_name,
        "last_name": last_name,
        "aliases": list({full_name, last_name}),
        "roles": ["civil_servant"],
        "civil_service_roles": [{
            "role": entry.get("role", ""),
            "department": entry.get("department", ""),
            "division": entry.get("Division", ""),
            "service_unit": entry.get("Service Unit", ""),
            "grade": entry.get("Grade", ""),
            "contract_title": entry.get("Contract Title", ""),
            "manager_name": entry.get("Manager Name", ""),
            "start_date": "",
            "end_date": ""
        }],
        "committees": entry.get("committees", []),
        "elections": [],
        "profiles": {
            "council_url": "",
            "linkedin": "",
            "twitter": ""
        }
    }

    new_people.append(person)
    existing_people[slug] = person

# === APPEND TO people.jsonl ===
mode = "a" if OUTPUT_FILE.exists() else "w"
with open(OUTPUT_FILE, mode) as f:
    for person in new_people:
        f.write(json.dumps(person) + "\n")

print(f"✅ Added {len(new_people)} new civil servants to: {OUTPUT_FILE}")
if flagged_people:
    print(f"⚠️  Skipped {len(flagged_people)} possible duplicates:")
    for name in flagged_people:
        print(" -", name)

✅ Added 0 new civil servants to: ../data/entities/people.jsonl
⚠️  Skipped 33 possible duplicates:
 - Amanda Beer
 - John Betts
 - Michael Thomas-Sam
 - Richard Smith
 - Sarah Hammond
 - Simon Jones
 - David Whittle
 - Kevin Kasaven
 - Christine McInnes
 - Mark Albiston
 - Richard Ellis
 - Sydney Hill
 - Matthew Smyth
 - Stephanie Holt
 - Haroona Chughtai
 - Andreea Crisan
 - Anjan Ghosh
 - Helen Gillivan
 - Joel Cook
 - Catherine Head
 - David Shipton
 - Jonathan Idle
 - Nicholas Buckland
 - Clare Maynard
 - Philip Lightowler
 - Mark Scrivener
 - Tim Woolmer
 - Caroline Dodge
 - Jennifer Maiden-Brooks
 - Elizabeth Adam
 - Iona Hunter-Whitehouse
 - Victoria Widden
 - Tristan Godfrey


### Append Candidates to people.jsonl

In [127]:
import json
from pathlib import Path
import re

# === CONFIGURATION ===
ELECTION_FILE = Path("../data/elections/kent_results_all_years_cleaned.jsonl")
PEOPLE_FILE = Path("../data/metadata/people.jsonl")
PEOPLE_FILE.parent.mkdir(parents=True, exist_ok=True)

# === UTILITY FUNCTIONS ===
def slugify(name):
    return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

def generate_person_id(base_slug, counter):
    return f"{base_slug}_{counter:03d}"

# === LOAD EXISTING PEOPLE ===
existing_people = {}
slug_counter = {}

if PEOPLE_FILE.exists():
    with open(PEOPLE_FILE) as f:
        for line in f:
            person = json.loads(line)
            slug = slugify(person["full_name"])
            existing_people[slug] = person
            slug_counter[slug] = int(person["person_id"].split("_")[-1])

# === LOAD ELECTION RESULTS ===
with open(ELECTION_FILE, "r", encoding="utf-8") as f:
    election_data = [json.loads(line) for line in f if line.strip()]

# === PROCESS NEW CANDIDATES ===
for record in election_data:
    year = int(record.get("election_date", "")[:4])
    division = record.get("division", "")
    
    for cand in record.get("candidates", []):
        canonical_name = cand.get("canonical_name", cand.get("name", "")).strip()
        if not canonical_name:
            continue

        slug = slugify(canonical_name)
        first, *rest = canonical_name.split()
        last = rest[-1] if rest else first

        # Prepare the election record
        election_info = {
            "year": year,
            "division": division,
            "party": cand.get("party", ""),
            "status": cand.get("status", "")
        }

        if slug in existing_people:
            # Append election to existing person if not a duplicate
            existing = existing_people[slug]
            if election_info not in existing["elections"]:
                existing["elections"].append(election_info)
        else:
            # New person: assign new ID
            slug_counter[slug] = slug_counter.get(slug, 0) + 1
            person_id = generate_person_id(slug, slug_counter[slug])

            new_person = {
                "person_id": person_id,
                "full_name": canonical_name,
                "first_name": first,
                "last_name": last,
                "aliases": list({canonical_name, last}),
                "roles": ["candidate"],
                "civil_service_roles": [],
                "committees": [],
                "elections": [election_info],
                "profiles": {
                    "council_url": "",
                    "linkedin": "",
                    "twitter": ""
                }
            }

            existing_people[slug] = new_person

# === WRITE UPDATED PEOPLE FILE ===
with open(PEOPLE_FILE, "w") as f:
    for person in existing_people.values():
        f.write(json.dumps(person) + "\n")

print(f"✅ people.jsonl updated with {len(existing_people)} unique individuals.")

✅ people.jsonl updated with 1421 unique individuals.
