In [28]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import pickle
from tqdm import tqdm

# WMCA LA Code

In [2]:
# Get all local authority codes and corresponding council names
response = requests.get(
    "https://epc.opendatacommunities.org/docs/api/domestic#domestic-local-authority",
    timeout=30,  # never hang the notebook on an unresponsive server
)
response.raise_for_status()  # fail here rather than while parsing an error page
page = response.text
# Name the parser explicitly so the parse does not depend on which optional
# parser libraries happen to be installed in the current environment.
soup = BeautifulSoup(page, "html.parser")

# The code/name listing is taken from the fourth table on the page
table = soup.find_all('table')[3]
la_code_dict = {}

# Skip the header row; each remaining row holds two cells: code, then council name
for tr in table.find_all('tr')[1:]:
    code, local_auth = tr.find_all('td')
    la_code_dict[local_auth.text] = code.text

# Get local authority codes for councils in WMCA
# The last comma-separated item is discarded (presumably the file ends with a
# trailing comma -- TODO confirm against the raw file).
with open("../data/raw/WMCA_council.txt") as f:
    WMCA_councils = f.read().split(",")[:-1]
# A KeyError here means a council name in the text file does not match the
# spelling used on the EPC documentation page.
WMCA_code = [la_code_dict[i] for i in WMCA_councils]
WMCA = dict(zip(WMCA_code, WMCA_councils))

# Save codes for future use
with open('../data/raw/WMCA_council_code.pkl', 'wb') as f:
    pickle.dump(WMCA, f)

# Electricity consumption

In [3]:
# Read the "2020" sheet of the LSOA-level domestic electricity workbook;
# the column header sits on spreadsheet row index 4.
elec_consump_df = pd.read_excel(
    '../data/raw/LSOA_domestic_elec_2010-20.xlsx',
    sheet_name="2020",
    header=4,
    engine='openpyxl',
)
# Replace the spreadsheet headers with short snake_case names.
# NOTE(review): 'mean_counsumption' is misspelt but kept as-is here, since it
# is the column name that ends up in the merged frame.
elec_consump_df.columns = [
    'la_code',
    'la',
    'msoa_code',
    'msoa',
    'lsoa_code',
    'lsoa',
    'num_meter',
    'total_consumption',
    'mean_counsumption',
    'median_consumption',
]
# Keep only the rows whose local authority code belongs to WMCA
elec_consump_df = elec_consump_df.loc[elec_consump_df['la_code'].isin(WMCA_code)]

In [4]:
# Lookup table used to convert postcode -> LSOA -> MSOA
postcode_df = pd.read_csv(
    "../data/raw/PCD_OA_LSOA_MSOA_LAD_AUG19_UK_LU.csv",
    low_memory=False,
)
# Keep only the postcodes whose local authority district code belongs to WMCA
postcode_df = postcode_df.loc[postcode_df['ladcd'].isin(WMCA_code)]

In [5]:
# Attach the LSOA-level electricity figures to every postcode (left join keeps
# all postcodes), then drop the columns duplicated by the join plus the
# alternative postcode formats.
postcode_elec_consump_df = postcode_df.merge(
    elec_consump_df,
    left_on="lsoa11cd",
    right_on="lsoa_code",
    how="left",
).drop(
    columns=['la_code', 'la', 'msoa_code', 'msoa', 'lsoa_code', 'lsoa', 'pcd7', 'pcd8']
)

In [6]:
# Preview the first rows of the merged postcode / electricity consumption frame
postcode_elec_consump_df.head()

Unnamed: 0,pcds,dointr,doterm,usertype,oa11cd,lsoa11cd,msoa11cd,ladcd,lsoa11nm,msoa11nm,ladnm,ladnmw,num_meter,total_consumption,mean_counsumption,median_consumption
0,B1 1AA,199002,201008.0,1,E00175658,E01033625,E02006899,E08000025,Birmingham 138D,Birmingham 138,Birmingham,,1644.0,8835614.0,5374.460963,4826.95
1,B1 1AD,199101,200307.0,1,E00175658,E01033625,E02006899,E08000025,Birmingham 138D,Birmingham 138,Birmingham,,1644.0,8835614.0,5374.460963,4826.95
2,B1 1AG,198704,200011.0,1,E00175626,E01033616,E02006896,E08000025,Birmingham 135D,Birmingham 135,Birmingham,,1000.0,3950506.0,3950.506152,3095.1
3,B1 1AH,198604,199405.0,1,E00175658,E01033625,E02006899,E08000025,Birmingham 138D,Birmingham 138,Birmingham,,1644.0,8835614.0,5374.460963,4826.95
4,B1 1AQ,199201,200011.0,1,E00175626,E01033616,E02006896,E08000025,Birmingham 135D,Birmingham 135,Birmingham,,1000.0,3950506.0,3950.506152,3095.1


# Lat and Lng

In [7]:
# If we need to get lat and lng for each postcode, then use this. For now, I'm not sure we need it yet.
def lookup_postcodes(postcodes):
    """
    Bulk look-up of postcodes using postcodes.io.

    Input:
    postcodes(list): List of postcode strings to search. A single string is
        treated as a one-element list.

    Output:
    (list): The concatenated 'result' entries of the API responses, in the
        same order as the requested postcodes. Empty list for empty input.

    Raises:
    requests.HTTPError: If the API answers with a non-success status.
    """
    url = 'https://api.postcodes.io/postcodes'
    # postcodes.io limits a bulk lookup to 100 postcodes per request
    batch_size = 100

    if isinstance(postcodes, str):
        postcodes = [postcodes]
    postcodes = list(postcodes)

    results = []
    for start in range(0, len(postcodes), batch_size):
        # The bulk endpoint expects a JSON body of the form {"postcodes": [...]}
        payload = {'postcodes': postcodes[start:start + batch_size]}
        response = requests.post(url, json=payload, timeout=30)
        response.raise_for_status()
        results.extend(response.json()['result'])
    return results

# EPC Rating Data

In [22]:
# Make private
import os
from dotenv import load_dotenv, find_dotenv

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

# Read the EPC API credential from the environment instead of keeping it in
# the notebook. It is interpolated into the 'Authorization: Basic ...' header.
# NOTE(review): the variable name ENV_AUTH_CODE is taken from the placeholder
# that was previously assigned here -- confirm it matches the key in .env.
AUTH_TOKEN = os.environ.get("ENV_AUTH_CODE")
if not AUTH_TOKEN:
    raise RuntimeError(
        "ENV_AUTH_CODE is not set; add it to your .env file or environment "
        "before calling the EPC API."
    )

In [23]:
def get_epc_data(postcode, num_rows=5000, timeout=30):
    """
    Pull data from Domestic Energy Performance Certificates API.

    Input:
    postcode(str): Postcode to search for
    num_rows(int): Number of rows to pull. Max 5000 allowed at one time
    timeout(float): Seconds to wait for the server before giving up, so one
        stalled connection cannot block a long-running pull indefinitely

    Output:
    (str): Raw response body returned by the API. The body is returned
        unparsed and may be an empty string, so check for that before
        JSON-decoding it.

    Raises:
    requests.exceptions.Timeout: If the server does not answer within `timeout`.
    """
    headers = {
        'Authorization': f'Basic {AUTH_TOKEN}',
        'Accept': 'application/json'
    }
    params = {
        'postcode': postcode,
        'size': num_rows
    }
    url = 'https://epc.opendatacommunities.org/api/v1/domestic/search'
    # The HTTP status is deliberately not checked here: the body is returned
    # as-is and left for the caller to interpret.
    res = requests.get(url, headers=headers, params=params, timeout=timeout)
    return res.text

In [24]:
# Show the distinct values of the pcds (postcode) column
postcode_elec_consump_df.pcds.unique()

array(['B1 1AA', 'B1 1AD', 'B1 1AG', ..., 'WV99 2HB', 'WV99 2HD',
       'WV99 2ND'], dtype=object)

In [29]:
# Query the EPC API once per distinct WMCA postcode and pool every returned row
result = []

for code in tqdm(postcode_elec_consump_df.pcds.unique()):
    requested_data = get_epc_data(code)
    # An empty body carries no rows, so there is nothing to decode
    if not requested_data:
        continue
    result.extend(json.loads(requested_data)['rows'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 136876/136876 [4:14:20<00:00,  8.97it/s]


In [31]:
# Build a single DataFrame from the row records collected in `result`
EPC_data = pd.DataFrame(result)

# NOTE: the merge below is commented out, so this cell does not create
# EPC_postcode_elec_consump.
# Merge EPC and electricity consumption data on postcode
# EPC_postcode_elec_consump = pd.merge(EPC_data, postcode_elec_consump_df, left_on="postcode", right_on="pcds", how="left")
# EPC_postcode_elec_consump.drop(columns=["pcds"], inplace=True)

In [33]:
# Save the EPC records; the row count is embedded in the file name
EPC_data.to_csv(f"../data/processed/EPC_data_{len(EPC_data)}_homes.csv", index=False)

In [None]:
# Export postcodes
# Use EPC_data here: EPC_postcode_elec_consump is not defined at this point in
# a top-to-bottom run (the merge that would create it is commented out), and a
# left merge onto EPC_data would leave its postcode column unchanged anyway.
with open('../data/processed/WMCA_postcodes.pkl', 'wb') as fp:
    pickle.dump(EPC_data.postcode.unique(), fp)

# Fuel Poverty

In [None]:
# Load "Table 3" of the sub-regional fuel poverty workbook and drop the
# descriptive columns that are not needed for the join.
fuel_poverty_df = pd.read_excel(
    "../data/external/sub-regional-fuel-poverty-2022-tables.xlsx",
    sheet_name="Table 3",
    header=2,
).drop(columns=["LSOA Name", "LA Code", "LA Name", "Region", "Unnamed: 8"])
# Rename the four remaining columns to short snake_case names
fuel_poverty_df.columns = [
    "lsoa11cd",
    "num_households",
    "num_households_fuel_poverty",
    "prop_households_fuel_poor",
]

In [None]:
# Preview the first rows of the fuel poverty table
fuel_poverty_df.head()

In [None]:
# Load the combined EPC / electricity consumption table from disk.
# NOTE(review): no cell visible in this notebook writes EPC_elec_consump.csv --
# confirm where this file is produced.
EPC_postcode_elec_consump = pd.read_csv("../data/processed/EPC_elec_consump.csv")

In [None]:
# Attach the fuel poverty figures to every record via its LSOA code
# (left join keeps all rows of the EPC / electricity table)
EPC_postcode_elec_consump_fuel_poverty = EPC_postcode_elec_consump.merge(
    fuel_poverty_df,
    on="lsoa11cd",
    how="left",
)

In [None]:
# Write the merged table to CSV without the DataFrame index
EPC_postcode_elec_consump_fuel_poverty.to_csv("../data/processed/EPC_elec_consump_fuel_poverty.csv", index=False)