# **05 Merge EPC data**

# Merge PPD with EPC data based on address matching

## Objectives

* Merge the clean EPC data with the Price Paid, Geography and IMD (ppd_with_geography_and_imd) combined data to create and save the final dataset for analysis 

## Inputs
- ppd_with_geography_and_imd.csv
- epc_master.csv

## Outputs
- ppd_with_geography_and_imd_epc.csv

## Additional Comments

* EPC adds the floor area and EPC rating features.
* The EPC data is stored in a zip file because the uncompressed file would exceed GitHub's 100 MB file size limit.

Matches **PPD** to **EPC** using:
  - Exact join on a normalised key without SAON
  - Postcode and address using fuzzy matching
  
The fuzzy matching functions were written by Copilot.

In [None]:
# File names and parameters

# Inputs: the combined Price Paid / geography / IMD table and the EPC master table.
ppd_csv = "data/clean/ppd_with_geography_and_imd.csv"
epc_csv = "data/clean/epc_master.csv"
# Output: the PPD table with the matched EPC columns added.
out_csv = "data/clean/ppd_with_geography_and_imd_epc.csv"

# parameters for fuzzy matching
SCORE_CUTOFF = 90      # minimum fuzzy score for a match to be accepted; raise to 92-95 for stricter matches
SAMPLE_PPD = 0         # if > 0, only the first N PPD rows are used (e.g., 500 for a test)
SAMPLE_EPC = 0         # if > 0, only the first N EPC rows are used (e.g., 100_000 for a test)

In [1]:
import time
import pandas as pd
from rapidfuzz import fuzz, process

In [2]:
def normalise_series(s: pd.Series) -> pd.Series:
    """Return a cleaned, comparable text version of every value in ``s``.

    Missing values become empty strings and everything is cast to ``str``.
    The text is then lower-cased, every character other than ``a-z``,
    ``0-9`` or a space is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing spaces are removed.
    """
    text = s.fillna("").astype(str)
    lowered = text.str.lower()
    # Punctuation is replaced by a space (not deleted) so adjacent tokens
    # such as "3/4" stay separate words.
    alnum_only = lowered.str.replace(r"[^a-z0-9 ]", " ", regex=True)
    single_spaced = alnum_only.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

def build_ppd_keys(ppd: pd.DataFrame) -> pd.DataFrame:
    """Add the normalised address keys used for matching to the PPD frame.

    Two columns are added, both passed through ``normalise_series``:

    * ``addr_key``: PAON + SAON + Street + postcode.
    * ``exact_key_no_saon``: PAON + Street + postcode.

    The frame is modified in place (the four source columns are also
    null-filled and cast to ``str``) and the same object is returned.

    Raises:
        ValueError: if any of PAON, SAON, Street or postcode is missing.
    """
    required = {"PAON","SAON","Street","postcode"}
    missing = required.difference(ppd.columns)
    if missing:
        raise ValueError(f"PPD missing columns: {missing}")

    # Make every address part a plain string so "+" concatenation is safe.
    for col in required:
        ppd[col] = ppd[col].fillna("").astype(str)

    paon, saon, street, postcode = (
        ppd[name] for name in ("PAON", "SAON", "Street", "postcode")
    )
    with_saon = paon + " " + saon + " " + street + " " + postcode
    without_saon = paon + " " + street + " " + postcode

    ppd["addr_key"] = normalise_series(with_saon)
    ppd["exact_key_no_saon"] = normalise_series(without_saon)
    return ppd

def build_epc_keys(epc: pd.DataFrame) -> pd.DataFrame:
    """Build the normalised matching keys for the EPC frame.

    ``addr_key`` is the normalised ``address + " " + postcode`` and
    ``exact_key_no_saon`` is a copy of it. The input frame gets these
    columns added in place (and ``address``/``postcode`` null-filled and
    cast to ``str``). The returned frame is a new, narrower one holding
    only the key, postcode, rating and floor-area columns, with one row
    kept per ``addr_key`` (the first occurrence).

    Raises:
        ValueError: if address, postcode, current_energy_rating or
            total_floor_area is missing.
    """
    required = {"address","postcode","current_energy_rating","total_floor_area"}
    missing = required.difference(epc.columns)
    if missing:
        raise ValueError(f"EPC missing columns: {missing}")

    for col in ("address", "postcode"):
        epc[col] = epc[col].fillna("").astype(str)

    epc["addr_key"] = normalise_series(epc["address"] + " " + epc["postcode"])
    epc["exact_key_no_saon"] = epc["addr_key"]

    keep_cols = [
        "addr_key",
        "exact_key_no_saon",
        "postcode",
        "current_energy_rating",
        "total_floor_area",
    ]
    slim = epc[keep_cols]
    return slim.drop_duplicates(subset=["addr_key"], keep="first")

def build_postcode_index(epc_small: pd.DataFrame):
    """Group EPC rows by postcode for per-postcode candidate lookup.

    Returns a dict mapping each postcode to a ``(choices, payload)`` pair:
    ``choices`` is the list of ``addr_key`` values in that postcode and
    ``payload`` is a DataFrame indexed by ``addr_key`` holding the
    ``current_energy_rating`` and ``total_floor_area`` columns.
    Postcodes appear in order of first occurrence (``sort=False``).
    """
    value_cols = ["current_energy_rating", "total_floor_area"]
    return {
        postcode: (
            rows["addr_key"].tolist(),
            rows.set_index("addr_key")[value_cols],
        )
        for postcode, rows in epc_small.groupby("postcode", sort=False)
    }

def fuzzy_within_postcode(addr_key, postcode, epc_by_pc, score_cutoff):
    """Find the best fuzzy EPC match for one address within its postcode.

    Args:
        addr_key: Normalised address key to look up.
        postcode: Postcode used to select the candidate list in ``epc_by_pc``.
        epc_by_pc: Mapping of postcode -> ``(choices, payload)`` where
            ``choices`` is a list of EPC address keys and ``payload`` is a
            DataFrame indexed by those keys whose two columns are the
            energy rating and the floor area, in that order.
        score_cutoff: Minimum ``fuzz.token_set_ratio`` score a candidate
            must reach to be accepted.

    Returns:
        ``(matched_key, score, rating, area)``. All four are ``None`` when
        the address or postcode is empty, the postcode has no candidates,
        or no candidate reaches ``score_cutoff``. ``area`` alone is ``None``
        when the matched record's floor area cannot be parsed as a number.
    """
    no_match = (None, None, None, None)
    if not addr_key or not postcode or postcode not in epc_by_pc:
        return no_match

    choices, payload = epc_by_pc[postcode]
    res = process.extractOne(
        addr_key,
        choices,
        scorer=fuzz.token_set_ratio,
        processor=None,  # keys are already normalised
        score_cutoff=score_cutoff,
    )
    if res is None:
        return no_match
    name, score, _ = res

    row = payload.loc[name]
    # A duplicated key makes .loc return a DataFrame; iterating that would
    # yield column labels, so fall back to the first matching row.
    if isinstance(row, pd.DataFrame):
        row = row.iloc[0]
    rating, area = row

    # Floor areas can arrive as text (possibly blank); one unparsable value
    # must not abort the whole matching pass.
    try:
        area = float(area)
    except (TypeError, ValueError):
        area = None
    return (name, float(score), str(rating), area)


**Load input CSV files**

In [None]:
# Start the wall-clock timer that the later progress messages report against.
t0 = time.time()
print("Loading CSVs...")

# Read every column as text and do not convert any cell value to NaN.
ppd, epc = (
    pd.read_csv(path, dtype=str, keep_default_na=False, na_values=[])
    for path in (ppd_csv, epc_csv)
)

# Optional down-sampling for quick test runs (0 keeps every row).
if SAMPLE_PPD > 0:
    ppd = ppd.iloc[:SAMPLE_PPD].copy()
if SAMPLE_EPC > 0:
    epc = epc.iloc[:SAMPLE_EPC].copy()

print(f"PPD rows: {len(ppd):,} | EPC rows: {len(epc):,}")


In [None]:
# Add the normalised matching keys to both tables (PPD first, then EPC).
ppd, epc = build_ppd_keys(ppd), build_epc_keys(epc)

# t1 marks the end of key building; later timings are measured from it.
t1 = time.time()
print(f"Built keys in {t1 - t0:0.2f}s")


In [None]:
print("Exact joining on exact_key_no_saon...")

# Left join: every PPD row is kept and picks up the EPC rating and floor
# area where its SAON-less key equals an EPC key.
ppd = pd.merge(
    ppd,
    epc.loc[:, ["exact_key_no_saon", "current_energy_rating", "total_floor_area"]],
    how="left",
    on="exact_key_no_saon",
    suffixes=("", "_epc"),
)

# Rows that now carry a rating are the exact matches.
matched_exact = ppd["current_energy_rating"].notna().sum()
print(f"Exact matches: {matched_exact:,}")


In [None]:
print("Indexing EPC by postcode...")

# Only the key, postcode and the two payload columns are needed for the index.
epc_by_pc = build_postcode_index(
    epc.loc[:, ["addr_key", "postcode", "current_energy_rating", "total_floor_area"]]
)

t2 = time.time()
print(f"Built postcode index in {t2 - t1:0.2f}s (total {t2 - t0:0.2f}s)")


In [None]:
# Rows the exact join left without a rating are the fuzzy-match candidates.
mask_unmatched = ppd["current_energy_rating"].isna()
unmatched_count = int(mask_unmatched.sum())
print(f"Fuzzy matching within postcode for unmatched: {unmatched_count:,} rows...")

# Diagnostic columns; they stay None for rows that are not fuzzy-matched.
ppd["matched_key"] = None
ppd["match_score"] = None

def _row_match(row):
    """Return (matched_key, match_score, rating, floor_area) for one PPD row.

    Rows that already carry a rating are passed through unchanged; all
    others are fuzzy-matched against the EPC records in the same postcode.
    """
    if not pd.isna(row["current_energy_rating"]):
        return (row.get("matched_key"), row.get("match_score"), row["current_energy_rating"], row["total_floor_area"])
    return fuzzy_within_postcode(row["addr_key"], row["postcode"], epc_by_pc, SCORE_CUTOFF)

if unmatched_count:
    # With result_type=None the tuples come back as a Series of tuples,
    # one per unmatched row, which fills the four columns row by row.
    results = ppd.loc[mask_unmatched].apply(_row_match, axis=1, result_type=None)
    ppd.loc[mask_unmatched, ["matched_key","match_score","current_energy_rating","total_floor_area"]] = list(results)
else:
    # apply() on an empty selection returns an empty DataFrame rather than a
    # Series of tuples, so list(results) would be column labels; skip the
    # assignment entirely when there is nothing left to match.
    results = pd.Series([], dtype=object)


In [None]:
# Overall match counts: whatever is matched beyond the exact join came from
# the fuzzy pass.
total = len(ppd)
matched_total = ppd["current_energy_rating"].notna().sum()
fuzzy_matched = matched_total - matched_exact
if total:
    match_rate = 100.0 * matched_total / total
else:
    # Avoid dividing by zero on an empty PPD table.
    match_rate = 0.0

t3 = time.time()
print(f"Fuzzy matching done in {t3 - t2:0.2f}s (total {t3 - t0:0.2f}s)")
print("-" * 60)
print(f"Total PPD rows       : {total:,}")
print(f"Exact matches        : {matched_exact:,}")
print(f"Fuzzy matches        : {fuzzy_matched:,}")
print(f"All matches          : {matched_total:,} ({match_rate:0.1f}%)")

# Only rows with a recorded match score contribute to the score summary.
if not ppd["match_score"].notna().any():
    print("Match score stats    : (no fuzzy matches with score)")
else:
    stats = ppd["match_score"].dropna().astype(float).describe()
    print("Match score stats    :")
    print(stats.to_string())

# Persist the merged table without the row index.
ppd.to_csv(out_csv, index=False)
print(f"Wrote: {out_csv}")
