In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
import os
from matplotlib.colors import ListedColormap
from matplotlib_scalebar.scalebar import ScaleBar
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

# Set the font family globally so every matplotlib figure in the notebook uses Arial
plt.rcParams['font.family'] = 'Arial'

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/geopandas deprecation and
# SettingWithCopy warnings raised by later cells; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

* LILEE (Low Income Low Energy Efficiency) fuel poverty validation

In [None]:
# Fuel poverty figures at 2021 LSOAs, used to validate the energy deprivation classification.
# The column names sit on the third row of sheet 'Table 3' (header=2).
fuel_poverty = (
    pd.read_excel(
        'validation/sub-regional-fuel-poverty-tables-2023-2021-data.xlsx',
        sheet_name='Table 3',
        header=2,
    )
    # keep the first 33,755 rows only (presumably one per LSOA, with footnotes below; TODO confirm)
    .iloc[:33755]
    .set_index('LSOA Code')
    .loc[:, ['Proportion of households fuel poor (%)', 'LA Name', 'Region']]
)

# Energy deprivation classification results with their geometries
energy_geo = gpd.read_file('EDC_tier1&2_gb.shp')

In [None]:
# Attach the fuel poverty figures to the classified areas; the inner join keeps only
# the area codes present in both tables.
joined = (
    energy_geo
    .set_index('index')[['supergroup', 'geometry']]
    .join(fuel_poverty, how='inner')
)

# Reader-friendly column labels, in the same order as the joined columns
joined.columns = [
    'Energy Deprivation Classification',
    'geometry',
    'Fuel Poor Households(%)',
    'LA Name',
    'Region',
]

# Order the rows by classification (ascending)
joined = joined.sort_values(by='Energy Deprivation Classification')

* IMD (Index of Multiple Deprivation) validation

In [None]:
# Load the three IMD tables (presumably England, Wales and Scotland, judging by the file
# names), each indexed by its first column and reduced to a single rank column.
eimd = pd.read_csv('validation/EIMD 2019.csv', index_col=0).iloc[:, [4]]
wimd = pd.read_csv('validation/WIMD(SOA lookup).csv', index_col=0)
simd = pd.read_csv('validation/simd2020_withinds.csv', index_col=0).iloc[:, [5]]

# Give every table the same column name, then bin its own ranks into deciles (1-10)
# and percentiles (1-100). Binning is done per table, so the bins are relative to
# each table rather than to the combined data.
for national_imd in (eimd, wimd, simd):
    national_imd.columns = ['IMD Rank (where 1 is most deprived)']
    rank = national_imd.iloc[:, 0]
    # labels=False yields 0-based bin numbers, hence the +1
    national_imd['decile'] = pd.qcut(rank, q=10, labels=False, duplicates='drop') + 1
    national_imd['percentile'] = pd.qcut(rank, q=100, labels=False, duplicates='drop') + 1

# harmonised IMD for GB: the three tables stacked on top of each other
gb_imd = pd.concat([eimd, wimd, simd])

In [None]:
# Read a lookup table from ONSPD UK 2023.11.
# Only the columns used below are parsed (usecols), instead of loading every column
# of the file and discarding most of them afterwards.
lookup_columns = ['pcd', 'oa21', 'lsoa21', 'msoa21', 'oslaua', 'rgn', 'oa11', 'lsoa11', 'msoa11']
lookup = pd.read_csv('ONSPD_NOV_2023_UK.csv', usecols=lookup_columns)
# usecols returns columns in file order; re-select to keep the order listed above.
# Missing codes become the string 'NA' so the .str filters below see strings only.
lookup = lookup[lookup_columns].fillna('NA')

# England and Wales: match on the leading letter of the 2021 LSOA code.
# An unanchored contains('E|W') would also keep any code that merely contains an
# E or W somewhere, so the test is anchored like the Scotland filter below.
is_england_or_wales = (
    lookup['lsoa21'].str.startswith('E') | lookup['lsoa21'].str.startswith('W')
)
lookup_EW = lookup[is_england_or_wales]

# Scotland only: selected on the 2011 code prefix
lookup_S = lookup[lookup['lsoa11'].str.startswith('S0')]

# merge for GB
lookup_gb = pd.concat([lookup_EW, lookup_S])

In [None]:
# Distinct (lsoa21, lsoa11) pairs from the lookup, keyed by the 2011 code
lsoa11_21 = (
    lookup_gb[['lsoa21', 'lsoa11']]
    .drop_duplicates()
    .set_index('lsoa11')
)

# Attach the 2021 code to each IMD row via its index (presumably 2011 area codes;
# verify against the IMD files), then average all rows sharing a 2021 code.
gb_imd = gb_imd.join(lsoa11_21).groupby('lsoa21').mean()

# Label each area with its country from the letters found in its code.
# np.select takes the first matching condition, so the tests are listed
# Scotland, Wales, England: a later letter test wins over an earlier one,
# and codes matching none of them get an empty string.
area_codes = gb_imd.index
gb_imd['Country'] = np.select(
    [
        area_codes.str.contains('S'),
        area_codes.str.contains('W'),
        area_codes.str.contains('E'),
    ],
    ['Scotland', 'Wales', 'England'],
    default='',
)

In [None]:
# Classification results with the harmonised IMD columns attached (left join on the
# area code), ordered by supergroup.
energy_imd = (
    energy_geo
    .set_index('index')
    .join(gb_imd)
    .sort_values(by='supergroup')
)

# One frame per distinct 'Country' value, in order of first appearance
energy_imd_country = [
    energy_imd[energy_imd['Country'] == country]
    for country in energy_imd['Country'].unique()
]

* Internal validation

In [None]:
# Key the classification table by its 'index' column for the cells that follow.
# Guarded so that re-running this cell is a no-op instead of a KeyError once the
# column has already been moved into the index.
# NOTE(review): the earlier cells that call energy_geo.set_index('index') will fail
# if they are re-run after this one, since this cell rebinds energy_geo.
if 'index' in energy_geo.columns:
    energy_geo = energy_geo.set_index('index')

In [None]:
# Drop the columns that were not used in the k-means clustering and fill the remaining
# gaps with 100.
# NOTE(review): the reason for the fill value 100 is not visible here; confirm.
X = energy_geo.drop(
    ['Renewable', 'Retired', 'Co2 emissi', 'Prepay ele', 'Under occu', 'Universal', 'Owns outri'],
    axis=1,
).fillna(100)

# One frame per supergroup, in sorted supergroup order, excluding X's last column
# (presumably the geometry; TODO confirm).
# The groupby is iterated once; previously it was rebuilt and materialised as a list
# for every cluster, and indexed by a count taken from energy_geo rather than from
# the groups themselves.
supergroup6 = [members for _, members in X.iloc[:, :-1].groupby('supergroup')]

In [None]:
def distance_to_mean(df):
    """Add an 'EDC Fit' column: each row's Euclidean distance to the column means.

    The distance is computed over the numeric columns of ``df`` and rounded to the
    nearest whole number. It is used to judge how close each member of a cluster is
    to that cluster's centre.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score. The frame is modified in place: an 'EDC Fit' column is
        added, or overwritten if it already exists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'EDC Fit' column.
    """
    # Leave out a previously computed 'EDC Fit' so that scoring the same frame twice
    # gives the same answer instead of feeding the old score back into the distance.
    mean_vector = df.drop(columns='EDC Fit', errors='ignore').mean(axis=0, numeric_only=True)

    # Restrict the rows to exactly the columns that have a mean; a non-numeric
    # column would otherwise break the subtraction below.
    features = df[mean_vector.index]

    # Squared difference per variable, summed across the columns of each row
    squared_distances = ((features - mean_vector) ** 2).sum(axis=1)

    # The square root of the summed squares is the Euclidean distance
    df['EDC Fit'] = np.sqrt(squared_distances).round()
    return df

In [None]:
# Score every supergroup frame with its distance-to-mean fit and order its members
# from the closest (smallest 'EDC Fit') to the farthest.
supergroup_count = len(energy_geo['supergroup'].unique())
supergroup6 = [
    distance_to_mean(supergroup6[position]).sort_values(by='EDC Fit')
    for position in range(supergroup_count)
]

In [None]:
# Stack the last three columns of every supergroup frame (the final one being the
# 'EDC Fit' score added by distance_to_mean) into a single table.
supergroup_count = len(energy_geo['supergroup'].unique())
last_three_columns = [-3, -2, -1]
cluster_fit = pd.concat(
    [supergroup6[position].iloc[:, last_three_columns] for position in range(supergroup_count)]
)

# Put the geometries back next to the fit scores, matched on the shared index
cluster_fit_geo = energy_geo[['geometry']].join(cluster_fit)

In [None]:
# Average fit for each country
def _country_fit_summary(code_letter):
    """Return (mean, median) of 'EDC Fit', truncated to int, for one country.

    Areas are selected when their index code contains ``code_letter``. The
    'EDC Fit' column is picked by name: reducing the whole frame and taking
    element [0] depended on column order and dtypes, and also pulled the
    geometry column into the reduction.

    Raises ValueError if no area matches, because the mean is then NaN and
    cannot be converted to an integer.
    """
    in_country = cluster_fit_geo.index.str.contains(code_letter)
    country_fit = cluster_fit_geo.loc[in_country, 'EDC Fit']
    return int(country_fit.mean()), int(country_fit.median())


fit_E_avg, fit_E_md = _country_fit_summary('E')
fit_W_avg, fit_W_md = _country_fit_summary('W')
fit_S_avg, fit_S_md = _country_fit_summary('S')

# Summary table: one row per country, mean and median fit side by side
fit_E_W_S = pd.DataFrame(
    {
        'Mean': [fit_E_avg, fit_W_avg, fit_S_avg],
        'Median': [fit_E_md, fit_W_md, fit_S_md],
    },
    index=['England', 'Wales', 'Scotland'],
)