In [3]:
import requests
import zipfile
import pandas as pd
import os
import pickle

# Credential for the EPC API, read from the environment so it is never stored
# in the notebook. Raises KeyError when EPC_TOKEN is not set. download_data
# sends it verbatim after "Basic " in the Authorization header.
token = os.environ['EPC_TOKEN']

def download_data(file, output_folder, timeout=60):
    """Download one EPC bulk archive and unpack it next to the download.

    The archive is requested from the EPC files endpoint, using the
    module-level ``token`` in an HTTP ``Authorization: Basic`` header. It is
    written to ``<output_folder>/<file>`` and then extracted into
    ``<output_folder>/<file without its extension>/``.

    Failures are reported with ``print`` rather than raised, so a loop over
    several files keeps going. Extraction is skipped when the download failed.

    Parameters
    ----------
    file : str
        Archive name as served by the API, e.g. ``"domestic-2023.zip"``.
    output_folder : str
        Directory that receives the archive and the extracted folder. It is
        created when it does not exist yet.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get``. Defaults to 60.

    Returns
    -------
    None
    """
    output_file = os.path.join(output_folder, file)
    # Extract into a sibling directory named after the archive (extension dropped).
    extract_to = os.path.splitext(output_file)[0]

    if output_folder:
        # A fresh checkout may not have the scratch directory yet.
        os.makedirs(output_folder, exist_ok=True)

    url = f"https://epc.opendatacommunities.org/api/v1/files/{file}"
    headers = {"Authorization": f"Basic {token}"}

    try:
        # The context manager releases the streamed connection even on error;
        # the timeout keeps a stalled server from hanging the notebook.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(output_file, "wb") as out_file:
                for chunk in response.iter_content(chunk_size=10_000):
                    out_file.write(chunk)
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        # Nothing usable was downloaded, so there is nothing to extract.
        return

    print(f"{file} downloaded.")

    try:
        with zipfile.ZipFile(output_file, "r") as zip_ref:
            zip_ref.extractall(extract_to)
    except (zipfile.BadZipFile, OSError) as e:
        # Corrupt archive or filesystem problem: report it and carry on.
        print(f"Error: {e}")

In [6]:
# Scratch directory that receives the downloaded archives and their contents.
output_folder = "tmp/"

# One bulk archive per certificate type for 2023.
files = [
    'domestic-2023.zip',
    'non-domestic-2023.zip',
    'display-2023.zip',
]

# Fetch and unpack each archive in turn.
for file in files:
    download_data(file, output_folder)

domestic-2023.zip downloaded.
non-domestic-2023.zip downloaded.
display-2023.zip downloaded.


In [8]:
def _read_certificates_in_chunks(dataset):
    """Open ``certificates.csv`` of an extracted dataset as a 1000-row chunk reader."""
    return pd.read_csv(f"{output_folder}/{dataset}/certificates.csv", chunksize=1000)


# Only the first 1000-row chunk of each file is kept in memory.
domestic_it = _read_certificates_in_chunks("domestic-2023")
domestic = next(domestic_it)

non_domestic_it = _read_certificates_in_chunks("non-domestic-2023")
non_domestic = next(non_domestic_it)

display_it = _read_certificates_in_chunks("display-2023")
display = next(display_it)

In [10]:
# One lookup frame per dataset: the column name plus a constant True flag.
d = pd.DataFrame({"col": domestic.columns}).assign(in_dom=True)
nd = pd.DataFrame({"col": non_domestic.columns}).assign(**{"in_non-dom": True})
dis = pd.DataFrame({"col": display.columns}).assign(in_dis=True)

# Stack every column name, then left-join each flag so a missing flag (NaN)
# means the dataset lacks that column. The stacked names are not de-duplicated,
# so a column shared by several datasets appears once per dataset.
allcols = (
    pd.concat([frame[["col"]] for frame in (d, nd, dis)])
    .merge(d, how="left", on='col')
    .merge(nd, how="left", on='col')
    .merge(dis, how="left", on='col')
)

In [11]:
# Show which dataset(s) each column name occurs in (NaN = not in that dataset).
allcols

Unnamed: 0,col,in_dom,in_non-dom,in_dis
0,LMK_KEY,True,True,True
1,ADDRESS1,True,True,True
2,ADDRESS2,True,True,True
3,ADDRESS3,True,True,True
4,POSTCODE,True,True,True
...,...,...,...,...
180,OR_ASSESSMENT_END_DATE,,,True
181,LODGEMENT_DATETIME,True,True,True
182,OCCUPANCY_LEVEL,,,True
183,UPRN,True,True,True


In [24]:
# allcols lists a shared column once per dataset it occurs in, so drop the
# repeats (keeping first-seen order) to store each column name exactly once.
columns = allcols["col"].drop_duplicates().tolist()

# The path is relative to the notebook's working directory.
with open("../../../../data/epc-columns.pkl", "wb") as f:
    pickle.dump(columns, f)

In [25]:
# Remove the downloaded archives and extracted files; the tmp/ directory itself is kept.
!rm -rf tmp/*