In [None]:
"""
03 Load and Join IMD data
- Joins IMD data to the main dataframe using LSOA codes.
- Fields: join key `lsoa11cd`; added columns `IMD_Decile`, `IMD_Rank`.
- Purpose: enrich the dataset with deprivation indices.
- Saves the enriched dataframe to a new CSV file: ppd_with_geography_and_imd.csv
"""
import pandas as pd

# Read the Price Paid Data (PPD) that was already merged with the ONS Postcode Directory
PPD_GEOGRAPHY_CSV = '../data/clean/ppd_with_geography.csv'
df = pd.read_csv(PPD_GEOGRAPHY_CSV)

In [51]:
# Read the raw IMD extract (long format: one row per FeatureCode, Measurement and index)
IMD_RAW_CSV = '../data/raw/imd2019lsoa.csv'
imd = pd.read_csv(IMD_RAW_CSV)


In [52]:
# Preview the first rows to see the layout: each row holds one Value for a
# FeatureCode / Measurement / "Indices of Deprivation" combination
imd.head(50)


Unnamed: 0,FeatureCode,DateCode,Measurement,Units,Value,Indices of Deprivation
0,E01005278,2019,Rank,,11281.0,b. Income Deprivation Domain
1,E01006616,2019,Decile,,1.0,a. Index of Multiple Deprivation (IMD)
2,E01005236,2019,Rank,,4565.0,b. Income Deprivation Domain
3,E01031873,2019,Rank,,25826.0,b. Income Deprivation Domain
4,E01007367,2019,Decile,,2.0,a. Index of Multiple Deprivation (IMD)
5,E01007376,2019,Decile,,1.0,a. Index of Multiple Deprivation (IMD)
6,E01006594,2019,Decile,,9.0,a. Index of Multiple Deprivation (IMD)
7,E01007489,2019,Decile,,4.0,a. Index of Multiple Deprivation (IMD)
8,E01031455,2019,Rank,,10380.0,b. Income Deprivation Domain
9,E01031971,2019,Rank,,20299.0,b. Income Deprivation Domain


In [53]:
# Inspect column dtypes and non-null counts of the raw IMD frame
imd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 985320 entries, 0 to 985319
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   FeatureCode             985320 non-null  object 
 1   DateCode                985320 non-null  int64  
 2   Measurement             985320 non-null  object 
 3   Units                   0 non-null       float64
 4   Value                   985320 non-null  float64
 5   Indices of Deprivation  985320 non-null  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 45.1+ MB


In [54]:
# List the distinct index/domain labels found in the 'Indices of Deprivation' column
deprivation_labels = imd['Indices of Deprivation']
deprivation_labels.unique()

array(['b. Income Deprivation Domain',
       'a. Index of Multiple Deprivation (IMD)',
       'c. Employment Deprivation Domain',
       'd. Education, Skills and Training Domain',
       'j. Income Deprivation Affecting Older People Index (IDAOPI)',
       'i. Income Deprivation Affecting Children Index (IDACI)',
       'f. Crime Domain', 'e. Health Deprivation and Disability Domain',
       'h. Living Environment Deprivation Domain',
       'g. Barriers to Housing and Services Domain'], dtype=object)

In future, a more granular analysis could use the individual indices below rather than only the overall IMD index:
  - Index of Multiple Deprivation (IMD)
  - Income Deprivation Domain
  - Employment Deprivation Domain
  - Education, Skills and Training Domain
  - Health Deprivation and Disability Domain
  - Crime Domain
  - Barriers to Housing and Services Domain
  - Living Environment Deprivation Domain
  - Income Deprivation Affecting Children Index (IDACI)
  - Income Deprivation Affecting Older People Index (IDAOPI)

In [55]:
# Build a one-row-per-FeatureCode lookup holding the overall IMD decile and rank.
# The raw frame is long-format, so the rows for the headline IMD index are
# selected once per measurement and then joined back together on FeatureCode.

# Normalise whitespace once and reuse the masks instead of re-stripping each column twice
measurement = imd['Measurement'].str.strip()
is_overall_imd = imd['Indices of Deprivation'].str.strip() == 'a. Index of Multiple Deprivation (IMD)'

imd_decile = imd[(measurement == 'Decile') & is_overall_imd]
imd_rank = imd[(measurement == 'Rank') & is_overall_imd]
# rename measurement columns
imd_decile = imd_decile.rename(columns={'Value': 'IMD_Decile'})
imd_rank = imd_rank.rename(columns={'Value': 'IMD_Rank'})
# combine decile and rank dataframes; validate raises MergeError if a FeatureCode
# appears more than once on either side, rather than silently multiplying rows
imd_combined = pd.merge(imd_decile[['FeatureCode', 'IMD_Decile']], imd_rank[['FeatureCode', 'IMD_Rank']],
                        on='FeatureCode', how='inner', validate='one_to_one')
imd_combined.head()


Unnamed: 0,FeatureCode,IMD_Decile,IMD_Rank
0,E01006616,1.0,2556.0
1,E01007367,2.0,3334.0
2,E01007376,1.0,2048.0
3,E01006594,9.0,28228.0
4,E01007489,4.0,12952.0


In [56]:
# Left-join the IMD lookup onto the PPD rows by LSOA code, keeping every transaction.
# validate='many_to_one' raises MergeError if the lookup has duplicate FeatureCode
# keys, which would otherwise silently duplicate transactions.
merged_df = pd.merge(df, imd_combined, left_on='lsoa11cd', right_on='FeatureCode',
                     how='left', validate='many_to_one')
# Drop the `FeatureCode` column as it's no longer needed
merged_df = merged_df.drop(columns=['FeatureCode'])
# A left join against unique keys must not add or remove rows
assert len(merged_df) == len(df), "IMD join changed the number of PPD rows"
# check the result
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8231 entries, 0 to 8230
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   transaction    8231 non-null   object 
 1   price          8231 non-null   int64  
 2   transfer_date  8231 non-null   object 
 3   postcode       8231 non-null   object 
 4   property_type  8231 non-null   object 
 5   new_build      8231 non-null   object 
 6   tenure         8231 non-null   object 
 7   PAON           8231 non-null   object 
 8   SAON           469 non-null    object 
 9   Street         8193 non-null   object 
 10  Locality       3570 non-null   object 
 11  town_city      8231 non-null   object 
 12  district       8231 non-null   object 
 13  county         8231 non-null   object 
 14  PPD_category   8231 non-null   object 
 15  Status         8231 non-null   object 
 16  lsoa11cd       7619 non-null   object 
 17  msoa11nm       7619 non-null   object 
 18  ladnm   

In [57]:
# Report how many rows did not receive IMD values from the left join. The counts
# are printed explicitly because only a cell's final expression is displayed
# automatically; a bare expression in the middle of the cell is discarded.
n_missing_decile = merged_df['IMD_Decile'].isnull().sum()
n_missing_rank = merged_df['IMD_Rank'].isnull().sum()
print(f"Rows missing IMD_Decile: {n_missing_decile}, missing IMD_Rank: {n_missing_rank}")
# remove rows with missing IMD values
merged_df = merged_df.dropna(subset=['IMD_Decile', 'IMD_Rank'])
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7619 entries, 0 to 8230
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   transaction    7619 non-null   object 
 1   price          7619 non-null   int64  
 2   transfer_date  7619 non-null   object 
 3   postcode       7619 non-null   object 
 4   property_type  7619 non-null   object 
 5   new_build      7619 non-null   object 
 6   tenure         7619 non-null   object 
 7   PAON           7619 non-null   object 
 8   SAON           437 non-null    object 
 9   Street         7581 non-null   object 
 10  Locality       3199 non-null   object 
 11  town_city      7619 non-null   object 
 12  district       7619 non-null   object 
 13  county         7619 non-null   object 
 14  PPD_category   7619 non-null   object 
 15  Status         7619 non-null   object 
 16  lsoa11cd       7619 non-null   object 
 17  msoa11nm       7619 non-null   object 
 18  ladnm        

In [58]:
# Write the IMD-enriched dataframe to a new CSV file, leaving out the index column
enriched_csv_path = '../data/clean/ppd_with_geography_and_imd.csv'
merged_df.to_csv(path_or_buf=enriched_csv_path, index=False)