In [21]:
from datetime import date
from pathlib import Path

import pandas as pd
from mysoc_validator import Popolo

# Input (interim parquet) and output (generated markdown) locations,
# both relative to the notebook's working directory.
data_dir = Path("..", "data", "interim")
docs_dir = Path("..", "docs", "_vol")

df = pd.read_parquet(
    data_dir / "register_of_members_financial_interests_one_year.parquet"
)

# Keep the original full ID (e.g. "uk.org.publicwhip/person/25892"); it is the
# form used later when looking people up.
df["old_person_id"] = df["person_id"]

# Reduce the ID to its final path segment. Anything that is not a non-empty
# string (None, NaN, "") becomes None so that it is dropped just below instead
# of raising AttributeError on `.split`.
df["person_id"] = df["person_id"].apply(
    lambda x: x.split("/")[-1] if isinstance(x, str) and x else None
)

# Drop rows without a usable person ID. `.copy()` makes the result an
# independent frame so the column assignments below never write into a view.
df = df.dropna(subset=["person_id"]).copy()

# speech_id looks like "uk.org.publicwhip/debate/2024-07-23d.605.0#...": the
# third "/"-separated segment starts with the ISO date (first 10 characters).
df["date"] = df["speech_id"].str.split("/").str[2].str[:10]

# Convert distance to a whole-number percentage. Scale to 0-100 *before*
# rounding: the previous `round(1 - x, 2) * 100` could yield values such as
# 56.99999999999999, which `.astype(int)` then truncated to 56 instead of 57.
df["score"] = ((1 - df["distance"]) * 100).round().astype(int)
df.head()

Unnamed: 0,search_query,date_range_start,date_range_end,distance,matched_text,speaker_name,person_id,chamber,transcript_type,speech_id,debate_url,old_person_id,date,score
0,register of members interests,2023-09-01,2024-09-26,0.044254,May I draw the House’s attention to my entry i...,David Simmonds,25892,house-of-commons,debates,uk.org.publicwhip/debate/2024-07-23d.605.0#d60...,https://www.theyworkforyou.com/debates/?id=202...,uk.org.publicwhip/person/25892,2024-07-23,96
1,register of members interests,2023-09-01,2024-09-26,0.044254,May I draw the House’s attention to my entry i...,David Simmonds,25892,house-of-commons,debates,uk.org.publicwhip/debate/2024-07-23d.605.0#d60...,https://www.theyworkforyou.com/debates/?id=202...,uk.org.publicwhip/person/25892,2024-07-23,96
2,register of members interests,2023-09-01,2024-09-26,0.064175,I draw Members’ attention to my entry in the R...,Robert Syms,10582,house-of-commons,debates,uk.org.publicwhip/debate/2023-10-23c.669.0#c66...,https://www.theyworkforyou.com/debates/?id=202...,uk.org.publicwhip/person/10582,2023-10-23,94
3,register of members interests,2023-09-01,2024-09-26,0.064296,May I start by drawing the Committee’s attenti...,David Simmonds,25892,house-of-commons,debates,uk.org.publicwhip/debate/2024-01-17c.907.0#c90...,https://www.theyworkforyou.com/debates/?id=202...,uk.org.publicwhip/person/25892,2024-01-17,94
4,register of members interests,2023-09-01,2024-09-26,0.064296,May I start by drawing the Committee’s attenti...,David Simmonds,25892,house-of-commons,debates,uk.org.publicwhip/debate/2024-01-17c.907.0#c90...,https://www.theyworkforyou.com/debates/?id=202...,uk.org.publicwhip/person/25892,2024-01-17,94


In [24]:
# Module-level people lookup; is_around_now() reads `popolo.persons` from it.
# NOTE(review): presumably this loads the parlparse people data (possibly over
# the network) — confirm, since that affects how repeatable a full re-run is.
popolo = Popolo.from_parlparse()


def is_around_now(person_id, on_date=date(2024, 9, 27)):
    """
    Report whether a person has a Commons membership on a given date.

    Args:
        person_id: Key used to look the person up in the module-level
            ``popolo.persons`` collection.
        on_date: Day to check membership for. Defaults to 27 September 2024,
            the date that was previously hard-coded in the body.

    Returns:
        True if ``membership_on_date`` returns a truthy value for the
        ``Popolo.Chamber.COMMONS`` chamber on ``on_date``, otherwise False.
    """
    person = popolo.persons[person_id]
    # bool() collapses "membership object or nothing" into a plain True/False.
    return bool(person.membership_on_date(on_date, chamber=Popolo.Chamber.COMMONS))


# Every distinct full-form person ID that appears in the matches.
person_ids = df["old_person_id"].unique()

# IDs for which is_around_now() reports False are the ones to discard.
to_remove = [pid for pid in person_ids if not is_around_now(pid)]

# Keep only the rows whose person passed the membership check.
keep_mask = ~df["old_person_id"].isin(to_remove)
df = df[keep_mask]

In [25]:
# Page header template for one person's markdown file: front matter, a title
# line, and a link to their register page. Filled in with str.format(), so the
# {speaker_name} and {person_id} placeholders must both be supplied.
doc_markdown = """---
page_title: {speaker_name}
---

# {speaker_name}  ({person_id})

[Register link](https://www.theyworkforyou.com/mp/{person_id}/register)

"""

# Template for a single matched speech, appended after the page header once per
# row: a dated heading with the score, the matched text as a blockquote, a link
# to the debate, and a horizontal rule. Also filled in with str.format().
row_markdown = """

## {date}: Match score {score}%

>{matched_text}

[Debate link]({debate_url}) 

---

"""

# parents=True so the output folder is created even when its parent directory
# does not exist yet (e.g. on a fresh checkout).
docs_dir.mkdir(parents=True, exist_ok=True)

# Write one markdown page per person, named after their short person ID.
for person_id, gdf in df.groupby("person_id"):
    # The header uses the first speaker name recorded for this person.
    header = doc_markdown.format(
        speaker_name=gdf["speaker_name"].iloc[0], person_id=person_id
    )

    # One section per matched speech, in the group's existing row order.
    sections = [
        row_markdown.format(
            date=row["date"],
            score=row["score"],
            matched_text=row["matched_text"],
            debate_url=row["debate_url"],
        )
        for _, row in gdf.iterrows()
    ]

    # Explicit UTF-8: the matched text contains non-ASCII characters (such as
    # typographic apostrophes), and the default encoding of open() depends on
    # the platform locale.
    with open(docs_dir / f"{person_id}.md", "w", encoding="utf-8") as f:
        f.write(header + "".join(sections))

In [27]:
# Front matter and title for the index page; the list of people is appended
# below.
index_markdown = """---
page_title: Possible matches 2023-24
---

# Possible matches 2023-24

"""

# One bullet per person, linking (relatively) to the page named after their
# person ID and labelled with the first speaker name recorded for them.
links = [
    f"- [{gdf['speaker_name'].iloc[0]}]({person_id})"
    for person_id, gdf in df.groupby("person_id")
]

index_markdown += "\n".join(links)

# Explicit UTF-8 so names with non-ASCII characters are written the same way
# regardless of the platform's default locale encoding.
with open(docs_dir / "index.md", "w", encoding="utf-8") as f:
    f.write(index_markdown)