In [12]:
import zipfile
from pathlib import Path
import pandas as pd

# Build one combined DataFrame from the certificates file inside every EPC zip bundle.
zip_dir = Path("../data/raw/EPC")
frames = []

# Sort the archives: Path.glob yields entries in arbitrary (filesystem-dependent)
# order, which would make the row order of the combined frame vary between runs.
for zip_path in sorted(zip_dir.glob("*.zip")):
    with zipfile.ZipFile(zip_path) as zf:
        # find the certificates file regardless of case
        names = zf.namelist()
        cert_name = next((n for n in names if n.lower() == "certificates.csv"), None)
        if cert_name is None:
            # skip unexpected bundles, but report them instead of dropping them silently
            print(f"Skipping {zip_path.name}: no certificates.csv found")
            continue

        with zf.open(cert_name) as fh:
            frames.append(pd.read_csv(fh, sep=",", low_memory=False))

# pd.concat raises an opaque "No objects to concatenate" for an empty list;
# raise the same exception type with a message that names the real cause.
if not frames:
    raise ValueError(f"No certificates.csv found in any *.zip under {zip_dir.resolve()}")

epc_master = pd.concat(frames, ignore_index=True)

# Make sure the output directory exists before writing.
Path("../data/clean").mkdir(parents=True, exist_ok=True)
# NOTE(review): this writes the full, unreduced frame to the same path that the
# later "write out the reduced file" cell overwrites — confirm this write is needed.
epc_master.to_csv("../data/clean/epc_master.csv", index=False)
# check the result
epc_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3343753 entries, 0 to 3343752
Data columns (total 93 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   LMK_KEY                        object 
 1   ADDRESS1                       object 
 2   ADDRESS2                       object 
 3   ADDRESS3                       object 
 4   POSTCODE                       object 
 5   BUILDING_REFERENCE_NUMBER      int64  
 6   CURRENT_ENERGY_RATING          object 
 7   POTENTIAL_ENERGY_RATING        object 
 8   CURRENT_ENERGY_EFFICIENCY      int64  
 9   POTENTIAL_ENERGY_EFFICIENCY    int64  
 10  PROPERTY_TYPE                  object 
 11  BUILT_FORM                     object 
 12  INSPECTION_DATE                object 
 13  LOCAL_AUTHORITY                object 
 14  CONSTITUENCY                   object 
 15  COUNTY                         object 
 16  LODGEMENT_DATE                 object 
 17  TRANSACTION_TYPE               object 
 18  EN

In [13]:
"""
keep these fields:
POSTCODE
ADDRESS
CURRENT_ENERGY_RATING
TOTAL_FLOOR_AREA
ADDRESS1
ADDRESS2
ADDRESS3
LODGEMENT_DATE

Do we need to dedup? If so take the last date.
"""

# Normalise every header to lower case (this renames the columns of epc_master itself).
epc_master.columns = list(map(str.lower, epc_master.columns))

# Keep only the key fields listed in the notes above.
keep_fields = [
    'postcode',
    'address',
    'current_energy_rating',
    'total_floor_area',
    'address1',
    'address2',
    'address3',
    'lodgement_date',
]
epc_reduced = epc_master[keep_fields]

epc_reduced.head()

Unnamed: 0,postcode,address,current_energy_rating,total_floor_area,address1,address2,address3,lodgement_date
0,DE22 2XJ,"1, Quarndon View, Allestree",D,166.57,"1, Quarndon View",Allestree,,2011-10-10
1,DE23 8SR,"69, Violet Street",D,74.0,"69, Violet Street",,,2010-06-30
2,DE23 1NG,"15, Charnwood Avenue, Littleover",D,88.0,"15, Charnwood Avenue",Littleover,,2015-12-16
3,DE1 3FF,"PENTHOUSE 1, KINGS CRESCENT APARTMENTS, EDWARD...",B,105.0,PENTHOUSE 1,KINGS CRESCENT APARTMENTS,EDWARD STREET,2021-07-08
4,DE23 1LD,"19, Stenson Avenue, Sunnyhill",D,69.0,"19, Stenson Avenue",Sunnyhill,,2014-05-16


In [14]:
# summarise the columns, dtypes and memory usage of the reduced frame
epc_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3343753 entries, 0 to 3343752
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   postcode               object 
 1   address                object 
 2   current_energy_rating  object 
 3   total_floor_area       float64
 4   address1               object 
 5   address2               object 
 6   address3               object 
 7   lodgement_date         object 
dtypes: float64(1), object(7)
memory usage: 204.1+ MB


In [15]:
# Flag every row whose (postcode, address) pair occurs more than once.
# keep=False marks all members of a duplicated group, not just the repeats.
duplicates = epc_reduced[['postcode', 'address']].duplicated(keep=False)
print(f"Number of duplicate entries based on postcode and address: {duplicates.sum()}")

Number of duplicate entries based on postcode and address: 1027297


In [16]:
# Preview the flagged rows, ordered so records for the same property sit together.
(
    epc_reduced
    .loc[duplicates]
    .sort_values(by=['postcode', 'address'])
    .head(10)
)

Unnamed: 0,postcode,address,current_energy_rating,total_floor_area,address1,address2,address3,lodgement_date
2107984,B1 1BA,"Apartment 2003 Beetham Tower, 10, Holloway Cir...",C,40.0,Apartment 2003 Beetham Tower,"10, Holloway Circus Queensway",,2020-06-15
2349086,B1 1BA,"Apartment 2003 Beetham Tower, 10, Holloway Cir...",C,40.0,Apartment 2003 Beetham Tower,"10, Holloway Circus Queensway",,2020-06-15
2077857,B1 1BA,"Apartment 2105, 10, Holloway Circus Queensway",B,73.25,Apartment 2105,"10, Holloway Circus Queensway",,2011-10-07
2376708,B1 1BA,"Apartment 2105, 10, Holloway Circus Queensway",D,64.44,Apartment 2105,"10, Holloway Circus Queensway",,2010-11-28
2279905,B1 1BA,"Apartment 2108, 10, Holloway Circus Queensway",C,78.0,Apartment 2108,"10, Holloway Circus Queensway",,2018-08-26
2316681,B1 1BA,"Apartment 2108, 10, Holloway Circus Queensway",E,101.853,Apartment 2108,"10, Holloway Circus Queensway",,2010-11-25
2331876,B1 1BA,"Apartment 2205, 10, Holloway Circus Queensway",C,60.0,Apartment 2205,"10, Holloway Circus Queensway",,2019-01-21
2407818,B1 1BA,"Apartment 2205, 10, Holloway Circus Queensway",C,60.3,Apartment 2205,"10, Holloway Circus Queensway",,2008-12-01
2281363,B1 1BA,"Apartment 2304, 10, Holloway Circus Queensway",C,73.0,Apartment 2304,"10, Holloway Circus Queensway",,2018-08-26
2301849,B1 1BA,"Apartment 2304, 10, Holloway Circus Queensway",D,64.439,Apartment 2304,"10, Holloway Circus Queensway",,2010-11-25


In [17]:
# deduplicate by postcode and address, keeping the most recent lodgement date
#
# lodgement_date is an object (string) column, so this is a text sort.
# NOTE(review): assumes every value is ISO "YYYY-MM-DD", as in the rows shown
# above, so that text order equals chronological order — confirm.
#
# na_position='first': the default puts missing dates last, which would make
#   keep='last' prefer an undated certificate over a dated one.
# kind='stable': the default sort is not stable, so which of several
#   same-date certificates ended up "last" was arbitrary; a stable sort keeps
#   their original row order and makes the result reproducible.
epc_reduced = (
    epc_reduced
    .sort_values('lodgement_date', kind='stable', na_position='first')
    .drop_duplicates(subset=['postcode', 'address'], keep='last')
)
epc_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2776742 entries, 2102727 to 2874689
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   postcode               object 
 1   address                object 
 2   current_energy_rating  object 
 3   total_floor_area       float64
 4   address1               object 
 5   address2               object 
 6   address3               object 
 7   lodgement_date         object 
dtypes: float64(1), object(7)
memory usage: 190.7+ MB


In [19]:
# The individual address lines are no longer needed once the combined
# 'address' field has been used for deduplication, so remove them.
epc_reduced = epc_reduced.drop(
    ['address1', 'address2', 'address3'],
    axis="columns",
)

In [20]:
# Persist the reduced, deduplicated frame as CSV (row index omitted).
epc_reduced.to_csv(
    path_or_buf="../data/clean/epc_master.csv",
    index=False,
)

In [21]:
# The cleaned CSV is over 100MB, so store a DEFLATE-compressed copy next to it.
with zipfile.ZipFile(
    "../data/clean/epc_master.zip",
    mode='w',
    compression=zipfile.ZIP_DEFLATED,
) as archive:
    # arcname places the entry at the archive root rather than under ../data/clean/
    archive.write("../data/clean/epc_master.csv", arcname="epc_master.csv")


In [22]:
# Delete the uncompressed CSV to save space; the zipped copy remains.
clean_csv = Path("../data/clean/epc_master.csv")
clean_csv.unlink()