In [1]:
import pandas as pd
from pathlib import Path

# Project root. NOTE(review): machine-specific absolute Windows path — it must be
# adjusted before this notebook can run on another machine.
BASE = r"C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis"

# Input read by the `consult` loading cell.
RAW_CONSULT = rf"{BASE}\data\raw\consultation_2020.csv"
# Input read by the `letters` loading cell.
LETTERS = rf"{BASE}\data\processed\letters_cleaned.csv"
# Destination of the merged master table written at the end of the notebook.
OUT = rf"{BASE}\data\processed\master_ai_act_consultations_clean.csv"


In [2]:
# Load the raw consultation export into `consult`.
#   - The file is read as cp1252 text with ";" as the field separator, using the
#     Python parsing engine.
#   - dtype=str keeps every column as text so nothing gets mis-parsed.
#   - on_bad_lines="warn": malformed rows are still skipped (same resulting frame
#     as "skip"), but each one is reported through a ParserWarning so dropped
#     rows are visible instead of disappearing silently.
consult = pd.read_csv(
    RAW_CONSULT,
    encoding="cp1252",
    sep=";",
    engine="python",
    on_bad_lines="warn",
    dtype=str,
)
print(consult.shape)


(1216, 73)


In [4]:
# Load the processed letters table into `letters`, every column as text.
# The separator is a comma: this file was saved by pandas with its default delimiter.
letters = pd.read_csv(LETTERS, sep=",", encoding="utf-8", dtype=str)
print(letters.shape)


(440, 6)


In [5]:
# Match each upload named in the consultation export to a row of `letters`
# by comparing lower-cased PDF file names, and print the share of merged rows
# that received a letter text.
upload_col = 'You can upload a document here:\n\n'

uploads = consult[["Reference", upload_col]].copy()
# astype(str) turns missing values into the text "nan"; the length filter on the
# next line drops those together with any other value of 4 characters or fewer.
uploads["upload"] = uploads[upload_col].astype(str).str.strip()
uploads = uploads[uploads["upload"].str.len() > 4].copy()
# Take the last path component ending in ".pdf". The inline (?i) flag makes the
# extension match case-insensitive, so "X.PDF" is captured too; previously such
# uploads got no key even though both sides are lower-cased before the join.
uploads["filename_csv"] = uploads["upload"].str.extract(r"(?i)([^/\\]+\.pdf)", expand=False)

# Keep the on-disk name, then drop everything up to and including the first "-"
# (names without a "-" are kept whole).
# NOTE(review): presumably the part before "-" is a prefix added when the files
# were saved to disk — confirm.
letters["filename_disk"] = letters["filename"]
letters["filename_stripped"] = letters["filename_disk"].str.split("-", n=1).str[-1]

# Case-insensitive join keys on both sides.
uploads["filename_csv_clean"] = uploads["filename_csv"].str.lower()
letters["filename_stripped_clean"] = letters["filename_stripped"].str.lower()

# pandas matches null keys to each other, so an upload without a PDF name (NaN key)
# would be paired with any letters row whose file name is missing. Excluding
# null-key letters rows from the right side prevents those spurious matches;
# the left join still keeps every upload row.
merged = pd.merge(
    uploads,
    letters.dropna(subset=["filename_stripped_clean"]),
    left_on="filename_csv_clean",
    right_on="filename_stripped_clean",
    how="left",
)
print(merged["text_clean"].notna().mean())


0.8700696055684455


In [6]:
# Keep only the first row seen for each Reference; later rows carrying an
# already-seen Reference are dropped.
merged = merged.loc[~merged.duplicated(subset=["Reference"], keep="first")]
print(merged.shape)


(423, 14)


In [7]:
# Give the letter-derived columns descriptive names before attaching them
# to the consultation table.
merged = merged.rename(
    columns={
        "text_clean": "text_letter_clean",
        "quality_flag": "text_letter_quality",
        "filename_csv": "upload_filename_from_csv",
        "filename_disk": "upload_filename_on_disk",
    }
)

# Left-join on Reference so every consultation row is kept; rows without a
# matching entry in `merged` get missing values in the four added columns.
master = consult.merge(
    merged.loc[
        :,
        [
            "Reference",
            "text_letter_clean",
            "text_letter_quality",
            "upload_filename_from_csv",
            "upload_filename_on_disk",
        ],
    ],
    how="left",
    on="Reference",
)
print(master.shape)


(1216, 77)


In [8]:
# Flatten the column headers: each carriage return / line feed becomes a single
# space, then leading and trailing whitespace is trimmed.
master.columns = [
    header.translate(str.maketrans({"\r": " ", "\n": " "})).strip()
    for header in master.columns
]


In [9]:
# Write the master table to OUT as a ";"-separated CSV without the index column.
# "utf-8-sig" writes a UTF-8 byte-order mark at the start of the file.
master.to_csv(OUT, index=False, sep=";", encoding="utf-8-sig")
print("saved to", OUT)


saved to C:\Users\hp EliteBook\Desktop\ai-act-consultation-analysis\data\processed\master_ai_act_consultations_clean.csv
