In [None]:
import os

import numpy as np
import pandas as pd

from asf_core_data import get_mcs_installations, load_preprocessed_epc_data

In [None]:
# Root directory of the local EPC data.
# Set the ASF_DATA_DIR environment variable to point at your own copy instead
# of editing this cell; the previous hard-coded location is kept as the
# fallback so existing setups keep working unchanged.
epc_path = os.environ.get(
    "ASF_DATA_DIR", "/Users/chris.williamson/Documents/ASF_data"
)

In [None]:
# Load MCS installation records. Later cells rely on EPC columns
# (INSPECTION_DATE, TRANSACTION_TYPE, UPRN) being present in the result, so
# "full" presumably returns installations joined to their EPC records - TODO confirm.
mcs = get_mcs_installations("full")

In [None]:
# parse both date columns into pandas datetimes so they can be sorted and subtracted
for date_column in ("commission_date", "INSPECTION_DATE"):
    mcs[date_column] = pd.to_datetime(mcs[date_column])

In [None]:
# fill gaps in installation_type from end_user_installation_type,
# then keep only the rows whose merged type is "Domestic"
mcs["installation_type"] = mcs["installation_type"].fillna(
    mcs["end_user_installation_type"]
)
is_domestic = mcs["installation_type"] == "Domestic"
mcs = mcs[is_domestic].reset_index(drop=True)

### What proportion of records in the MCS database relate to new-build installations?

In [None]:
# keep one row per MCS installation: the one with the earliest EPC inspection.
# If a property is a new build, its first EPC record should say so.
first_records = mcs.sort_values(by="INSPECTION_DATE")
first_records = first_records.groupby("original_mcs_index").head(1)
# restore a stable ordering by installation
first_records = first_records.sort_values(by="original_mcs_index")

In [None]:
# days from the first recorded EPC inspection to the heat pump commission date
# (negative when the heat pump was commissioned before the inspection)
first_records["diff_epc_to_mcs"] = (
    first_records["commission_date"].sub(first_records["INSPECTION_DATE"]).dt.days
)

# a dwelling is assumed to have been built with its heat pump when both hold:
# - its first EPC records it as a new dwelling
# - the heat pump was commissioned less than 365 days after that inspection
first_epc_is_new_dwelling = first_records["TRANSACTION_TYPE"].eq("new dwelling")
commissioned_within_a_year = first_records["diff_epc_to_mcs"].lt(365)
first_records["assumed_hp_when_built"] = (
    first_epc_is_new_dwelling & commissioned_within_a_year
)

Proportion of domestic MCS installations assumed to be new-build (True) versus retrofit (False):

In [None]:
# relative frequency of each value of the new-build flag
first_records["assumed_hp_when_built"].value_counts(normalize=True)

Top 5 installers by number of new-build installations:

In [None]:
# count new-build installations per installer and show the five most frequent
first_records.loc[
    first_records["assumed_hp_when_built"], "installer_name"
].value_counts().head()

Average installation cost for retrofits (False) and new builds (True):

In [None]:
# mean of the cost column within each value of the new-build flag
first_records.groupby("assumed_hp_when_built")["cost"].mean()

### What proportion of properties in the EPC database that were built with a HP appear in the MCS database?

In [None]:
# load only the EPC columns used in the comparison below
epc = load_preprocessed_epc_data(
    epc_path,
    version="preprocessed",
    usecols=["UPRN", "TRANSACTION_TYPE", "HP_INSTALLED"],
)

In [None]:
# filter to records of new builds with a heat pump
# .copy() gives new_hp its own data, so the columns written to it in later
# cells do not trigger pandas' SettingWithCopyWarning
new_hp = epc.loc[
    (epc["TRANSACTION_TYPE"] == "new dwelling") & (epc["HP_INSTALLED"])
].copy()

In [None]:
# Convert UPRNs to integers so the two datasets can be compared.
# Missing or "unknown" UPRNs get a different sentinel in each dataset
# (0 in the EPC subset, -1 in MCS) so that they can never match each other.
# - int64 is requested explicitly: plain "int" is platform-dependent and can be
#   32-bit (e.g. Windows with NumPy < 2), which UPRNs can exceed
# - assign() rebinds new_hp to a new frame instead of writing into a filtered
#   slice of epc, which avoids SettingWithCopyWarning
new_hp = new_hp.assign(
    UPRN=new_hp["UPRN"]
    .replace("unknown", np.nan)
    .fillna(0)
    .astype("float")
    .astype("int64")
)
mcs["UPRN"] = (
    mcs["UPRN"]
    .replace("unknown", np.nan)
    .fillna(-1)
    .astype("float")
    .astype("int64")
)

In [None]:
# flag each new-build heat pump record whose UPRN also occurs in the MCS data
uprn_found_in_mcs = new_hp["UPRN"].isin(mcs["UPRN"])
new_hp["in_mcs"] = uprn_found_in_mcs

Proportion of EPC new-build records with a heat pump whose UPRN appears in the MCS database:

In [None]:
# share of new-build heat pump EPC records whose UPRN was found in MCS (True) or not (False)
new_hp["in_mcs"].value_counts(normalize=True)