# Structural Data
As the structural values of the lsoa area, describe the background featuers of each lsoa.
- population density
- age groups (elderly)
- ethnic (minority)

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

## Population

Population density and population growth rate

In [2]:
# reading population density
density19_path = "data/stru_data/SAPE22DT11_mid_2019_lsoa_population_density.xlsx"
pop_den19 = pd.ExcelFile(density19_path)

In [3]:
# reading sheet
df_den_19 = pop_den19.parse(sheet_name="Mid-2019 Population Density", header = 4) 
df_den_19.head()

Unnamed: 0,LSOA Code,LSOA Name,Mid-2019 population,Area Sq Km,People per Sq Km
0,E01011949,Hartlepool 009A,1954,0.5189,3765.658123
1,E01011950,Hartlepool 008A,1257,0.1325,9486.792453
2,E01011951,Hartlepool 007A,1209,0.2086,5795.7814
3,E01011952,Hartlepool 002A,1740,0.4641,3749.191984
4,E01011953,Hartlepool 002B,2033,0.8833,2301.596287


In [4]:
# select columns
den19_col = ['LSOA Code', 'Mid-2019 population', 'People per Sq Km']
den_19 = df_den_19[den19_col]

In [5]:
# reading the density for 2015
density15_path = "data/stru_data/SAPE20DT11-mid-2015-lsoa-population-density.xls"
pop_den15 = pd.ExcelFile(density15_path)
df_den_15 = pop_den15.parse(sheet_name="Mid-2015 Population Density", header = 4) 
df_den_15.head()

Unnamed: 0,Code,Name,Mid-2015 population,Area Sq Km,People per Sq Km
0,E01020634,County Durham 001A,1524,6.4975,235
1,E01020635,County Durham 001B,1315,1.6963,775
2,E01020636,County Durham 001C,1749,0.9272,1886
3,E01020654,County Durham 001D,1860,0.9884,1882
4,E01020676,County Durham 001E,1479,5.4717,270


In [6]:
# select columns
den15_col = ['Code', 'Mid-2015 population', 'People per Sq Km']
den_15 = df_den_15[den15_col]

In [7]:
# join together
# 把 df2 的列名改成一致
den_15.rename(columns={'Code': 'LSOA Code'}, inplace=True)
den_15.rename(columns={'People per Sq Km': '2015_population_density'}, inplace=True)
den_19.rename(columns={'People per Sq Km': '2019_population_density'}, inplace=True)

# 然后 merge
den_all = den_19.merge(den_15, on='LSOA Code', how='left')
print(den_all.head())


   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01011949                 1954              3765.658123   
1  E01011950                 1257              9486.792453   
2  E01011951                 1209              5795.781400   
3  E01011952                 1740              3749.191984   
4  E01011953                 2033              2301.596287   

   Mid-2015 population  2015_population_density  
0                 2009                     3872  
1                 1443                    10891  
2                 1279                     6131  
3                 1745                     3760  
4                 2011                     2277  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_15.rename(columns={'Code': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_15.rename(columns={'People per Sq Km': '2015_population_density'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_19.rename(columns={'People per Sq Km': '2019_population_density'}, inplace=True)


In [8]:
den_all['pop_growth_rate'] = (den_all['Mid-2019 population'] - den_all['Mid-2015 population']) / den_all['Mid-2015 population']
den_all['avg_den'] = (den_all['2015_population_density'] + den_all['2019_population_density']) / 2
print(den_all.head())

   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01011949                 1954              3765.658123   
1  E01011950                 1257              9486.792453   
2  E01011951                 1209              5795.781400   
3  E01011952                 1740              3749.191984   
4  E01011953                 2033              2301.596287   

   Mid-2015 population  2015_population_density  pop_growth_rate       avg_den  
0                 2009                     3872        -0.027377   3818.829061  
1                 1443                    10891        -0.128898  10188.896226  
2                 1279                     6131        -0.054730   5963.390700  
3                 1745                     3760        -0.002865   3754.595992  
4                 2011                     2277         0.010940   2289.298143  


In [9]:
den_all.isna().sum()

LSOA Code                  0
Mid-2019 population        0
2019_population_density    0
Mid-2015 population        0
2015_population_density    0
pop_growth_rate            0
avg_den                    0
dtype: int64

In [10]:
# read in lsoa shapefile
london_gdf = gpd.read_file("data/statistical-gis-boundaries-london/statistical-gis-boundaries-london/ESRI/LSOA_2011_London_gen_MHW.shp")

# extract lsoa11cd as LSOA list
london_lsoa_list = london_gdf['LSOA11CD'].unique()

# select data in London area
df_london_den = den_all[den_all['LSOA Code'].isin(london_lsoa_list)]
df_london_den.shape

(4835, 7)

In [11]:
df_london_den.to_csv("data/population_data.csv", index=False)

## Age Group

focusing on the elderly(over 65) percentage, as a description date.

In [12]:
# read in files
age15_path = "data/stru_data/SAPE20DT12-mid-2015-lsoa-Broad_ages-estimates-formatted.xls"
age15 = pd.ExcelFile(age15_path)
age19_path = "data/stru_data/SAPE22DT13_mid_2019_lsoa_Broad_ages_estimates_unformatted.xlsx"
age19 = pd.ExcelFile(age19_path)

In [13]:
# reading sheet
age_group_15 = age15.parse(sheet_name="Mid-2015 Persons", header = 4) 
age_group_15.head()

Unnamed: 0,Area Codes,Area Names,Unnamed: 2,All Ages,0-15,16-29,30-44,45-64,65+
0,E06000047,County Durham,,519347,88643,93119,89995,144162,103428
1,E01020634,,County Durham 001A,1524,245,197,270,445,367
2,E01020635,,County Durham 001B,1315,204,158,258,397,298
3,E01020636,,County Durham 001C,1749,339,284,344,455,327
4,E01020654,,County Durham 001D,1860,311,248,350,534,417


In [14]:
# select columns
age15_col = ['Area Codes', '65+', 'All Ages']
age_15 = age_group_15[age15_col]

In [15]:
age_group_19 = age19.parse(sheet_name="Mid-2019 Persons", header = 4) 
age_group_19.head()

  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0,LSOA Code,LSOA Name,LA Code (2019 boundaries),LA name (2019 boundaries),LA Code (2020 boundaries),LA name (2020 boundaries),All Ages,0-15,16-29,30-44,45-64,65+
0,E01011949,Hartlepool 009A,E06000001,Hartlepool,E06000001,Hartlepool,1954,450,315,382,502,305
1,E01011950,Hartlepool 008A,E06000001,Hartlepool,E06000001,Hartlepool,1257,167,259,250,402,179
2,E01011951,Hartlepool 007A,E06000001,Hartlepool,E06000001,Hartlepool,1209,199,257,273,326,154
3,E01011952,Hartlepool 002A,E06000001,Hartlepool,E06000001,Hartlepool,1740,373,278,251,429,409
4,E01011953,Hartlepool 002B,E06000001,Hartlepool,E06000001,Hartlepool,2033,497,413,365,477,281


In [16]:
# select columns
age19_col = ['LSOA Code', '65+', 'All Ages']
age_19 = age_group_19[age19_col]

In [17]:
# join together
# 把 df2 的列名改成一致
age_15.rename(columns={'Area Codes': 'LSOA Code'}, inplace=True)
age_15.rename(columns={'All Ages': '2015_all'}, inplace=True)
age_15.rename(columns={'65+': '2015_senior'}, inplace=True)
age_19.rename(columns={'All Ages': '2019_all'}, inplace=True)
age_19.rename(columns={'65+': '2019_senior'}, inplace=True)

# 然后 merge
age_all = age_19.merge(age_15, on='LSOA Code', how='left')
print(age_all.head())

   LSOA Code  2019_senior  2019_all  2015_senior  2015_all
0  E01011949          305      1954          294      2009
1  E01011950          179      1257          191      1443
2  E01011951          154      1209          128      1279
3  E01011952          409      1740          420      1745
4  E01011953          281      2033          242      2011


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'Area Codes': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'All Ages': '2015_all'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'65+': '2015_senior'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [18]:
age_all[['2015_all', '2019_all', '2015_senior', '2019_senior']].isnull().sum()

2015_all       0
2019_all       0
2015_senior    0
2019_senior    0
dtype: int64

In [19]:
age_all['senior_per'] = (age_all['2019_senior'] / age_all['2019_all'] + age_all['2015_senior'] / age_all['2015_all']) * 100 /2
print(age_all.head())

   LSOA Code  2019_senior  2019_all  2015_senior  2015_all  senior_per
0  E01011949          305      1954          294      2009   15.121577
1  E01011950          179      1257          191      1443   13.738284
2  E01011951          154      1209          128      1279   11.372809
3  E01011952          409      1740          420      1745   23.787258
4  E01011953          281      2033          242      2011   12.927876


In [20]:
# select data in London area
df_london_age = age_all[age_all['LSOA Code'].isin(london_lsoa_list)]
df_london_age.shape

(4835, 6)

## Ethnic groups

Focusing on the minority percentage here.

In [21]:
ethnic_group = pd.read_csv("data/stru_data/bulk.csv")
ethnic_group.head(5)

Unnamed: 0,date,geography,geography code,Ethnic Group: All categories: Ethnic group; measures: Value,Ethnic Group: White; measures: Value,Ethnic Group: Gypsy / Traveller / Irish Traveller; measures: Value,Ethnic Group: Mixed / Multiple ethnic group; measures: Value,Ethnic Group: Asian / Asian British: Indian; measures: Value,Ethnic Group: Asian / Asian British: Pakistani; measures: Value,Ethnic Group: Asian / Asian British: Bangladeshi; measures: Value,Ethnic Group: Asian / Asian British: Chinese; measures: Value,Ethnic Group: Asian / Asian British: Other Asian; measures: Value,Ethnic Group: Black / African / Caribbean / Black British; measures: Value,Ethnic Group: Other Ethnic Group; measures: Value
0,2011,Darlington 001B,E01012334,2466,2401,3,29,18,3,0,5,1,1,5
1,2011,Darlington 001C,E01012335,1383,1362,0,9,0,0,5,3,3,0,1
2,2011,Darlington 001D,E01012366,2008,1961,5,16,8,1,1,1,9,6,0
3,2011,Darlington 001E,E01033481,1364,1311,5,16,11,3,0,5,3,6,4
4,2011,Darlington 001F,E01033482,1621,1540,26,20,14,0,0,6,3,11,1


In [22]:
# select columns
eth_col = ['geography code', 'Ethnic Group: All categories: Ethnic group; measures: Value', 'Ethnic Group: White; measures: Value']
df_eth = ethnic_group[eth_col]

In [23]:
df_eth.rename(columns={'geography code': 'LSOA Code'}, inplace=True)
df_eth.rename(columns={'Ethnic Group: All categories: Ethnic group; measures: Value': 'all'}, inplace=True)
df_eth.rename(columns={'Ethnic Group: White; measures: Value': 'white'}, inplace=True)
df_eth[['all', 'white']].isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'geography code': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'Ethnic Group: All categories: Ethnic group; measures: Value': 'all'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'Ethnic Group: White; measures: Value': 'white'}, inplace=True)


all      0
white    0
dtype: int64

In [24]:
df_eth['minority_per'] = (df_eth['all'] - df_eth['white']) *100 / df_eth['all']
print(df_eth.head())

   LSOA Code   all  white  minority_per
0  E01012334  2466   2401      2.635848
1  E01012335  1383   1362      1.518438
2  E01012366  2008   1961      2.340637
3  E01033481  1364   1311      3.885630
4  E01033482  1621   1540      4.996915


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth['minority_per'] = (df_eth['all'] - df_eth['white']) *100 / df_eth['all']


In [25]:
# select data in London area
df_london_eth = df_eth[df_eth['LSOA Code'].isin(london_lsoa_list)]
df_london_eth.shape

(4835, 4)

In [26]:
# merge all the data together
df_data_all = df_london_den.merge(df_london_age, on='LSOA Code', how='left')

In [27]:
df_data_all = df_data_all.merge(df_london_eth, on='LSOA Code', how='left')
print(df_data_all.head(5))

   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01000001                 1636             12604.006163   
1  E01000002                 1558              6821.366025   
2  E01000003                 1786             30219.966159   
3  E01000005                 1888              9957.805907   
4  E01032739                 1375               831.519110   

   Mid-2015 population  2015_population_density  pop_growth_rate  \
0                 1296                     9985         0.262346   
1                 1156                     5061         0.347751   
2                 1350                    22843         0.322963   
3                 1121                     5912         0.684211   
4                  802                      485         0.714464   

        avg_den  2019_senior  2019_all  2015_senior  2015_all  senior_per  \
0  11294.503082          451      1636          351      1296   27.325285   
1   5941.183012          371      1558          303      1156   

In [28]:
columns_to_keep = ['LSOA Code', 'pop_growth_rate', 'avg_den', 'senior_per', 'minority_per']
cleaned_df = df_data_all[columns_to_keep]

In [29]:
cleaned_df.to_csv("data/all_stru_data.csv", index=False)