# Structural Data
As the structural values of the lsoa area, describe the background featuers of each lsoa.
- population density
- age groups (elderly)
- ethnic (minority)

In [38]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

## Population

Population density and population growth rate

In [39]:
# reading population density
density19_path = "data/stru_data/SAPE22DT11_mid_2019_lsoa_population_density.xlsx"
pop_den19 = pd.ExcelFile(density19_path)

In [40]:
# reading sheet
df_den_19 = pop_den19.parse(sheet_name="Mid-2019 Population Density", header = 4) 
df_den_19.head()

Unnamed: 0,LSOA Code,LSOA Name,Mid-2019 population,Area Sq Km,People per Sq Km
0,E01011949,Hartlepool 009A,1954,0.5189,3765.658123
1,E01011950,Hartlepool 008A,1257,0.1325,9486.792453
2,E01011951,Hartlepool 007A,1209,0.2086,5795.7814
3,E01011952,Hartlepool 002A,1740,0.4641,3749.191984
4,E01011953,Hartlepool 002B,2033,0.8833,2301.596287


In [41]:
# select columns
den19_col = ['LSOA Code', 'Mid-2019 population', 'People per Sq Km']
den_19 = df_den_19[den19_col]

In [42]:
# reading the density for 2015
density15_path = "data/stru_data/SAPE20DT11-mid-2015-lsoa-population-density.xls"
pop_den15 = pd.ExcelFile(density15_path)
df_den_15 = pop_den15.parse(sheet_name="Mid-2015 Population Density", header = 4) 
df_den_15.head()

Unnamed: 0,Code,Name,Mid-2015 population,Area Sq Km,People per Sq Km
0,E01020634,County Durham 001A,1524,6.4975,235
1,E01020635,County Durham 001B,1315,1.6963,775
2,E01020636,County Durham 001C,1749,0.9272,1886
3,E01020654,County Durham 001D,1860,0.9884,1882
4,E01020676,County Durham 001E,1479,5.4717,270


In [43]:
# select columns
den15_col = ['Code', 'Mid-2015 population', 'People per Sq Km']
den_15 = df_den_15[den15_col]

In [44]:
# join together
# 把 df2 的列名改成一致
den_15.rename(columns={'Code': 'LSOA Code'}, inplace=True)
den_15.rename(columns={'People per Sq Km': '2015_population_density'}, inplace=True)
den_19.rename(columns={'People per Sq Km': '2019_population_density'}, inplace=True)

# 然后 merge
den_all = den_19.merge(den_15, on='LSOA Code', how='left')
print(den_all.head())


   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01011949                 1954              3765.658123   
1  E01011950                 1257              9486.792453   
2  E01011951                 1209              5795.781400   
3  E01011952                 1740              3749.191984   
4  E01011953                 2033              2301.596287   

   Mid-2015 population  2015_population_density  
0                 2009                     3872  
1                 1443                    10891  
2                 1279                     6131  
3                 1745                     3760  
4                 2011                     2277  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_15.rename(columns={'Code': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_15.rename(columns={'People per Sq Km': '2015_population_density'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den_19.rename(columns={'People per Sq Km': '2019_population_density'}, inplace=True)


In [45]:
den_all['pop_growth_rate'] = (den_all['Mid-2019 population'] - den_all['Mid-2015 population']) / den_all['Mid-2015 population']
den_all['avg_den'] = (den_all['2015_population_density'] + den_all['2019_population_density']) / 2
print(den_all.head())

   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01011949                 1954              3765.658123   
1  E01011950                 1257              9486.792453   
2  E01011951                 1209              5795.781400   
3  E01011952                 1740              3749.191984   
4  E01011953                 2033              2301.596287   

   Mid-2015 population  2015_population_density  pop_growth_rate       avg_den  
0                 2009                     3872        -0.027377   3818.829061  
1                 1443                    10891        -0.128898  10188.896226  
2                 1279                     6131        -0.054730   5963.390700  
3                 1745                     3760        -0.002865   3754.595992  
4                 2011                     2277         0.010940   2289.298143  


In [46]:
den_all.isna().sum()

LSOA Code                  0
Mid-2019 population        0
2019_population_density    0
Mid-2015 population        0
2015_population_density    0
pop_growth_rate            0
avg_den                    0
dtype: int64

In [47]:
# read in lsoa shapefile
london_gdf = gpd.read_file("data/statistical-gis-boundaries-london/statistical-gis-boundaries-london/ESRI/LSOA_2011_London_gen_MHW.shp")

# extract lsoa11cd as LSOA list
london_lsoa_list = london_gdf['LSOA11CD'].unique()

# select data in London area
df_london_den = den_all[den_all['LSOA Code'].isin(london_lsoa_list)]
df_london_den.shape

(4835, 7)

In [48]:
df_london_den.to_csv("data/population_data.csv", index=False)

## Age Group

focusing on the elderly(over 65) percentage, as a description date.

In [49]:
# read in files
age15_path = "data/stru_data/SAPE20DT12-mid-2015-lsoa-Broad_ages-estimates-formatted.xls"
age15 = pd.ExcelFile(age15_path)
age19_path = "data/stru_data/SAPE22DT13_mid_2019_lsoa_Broad_ages_estimates_unformatted.xlsx"
age19 = pd.ExcelFile(age19_path)

In [50]:
# reading sheet
age_group_15 = age15.parse(sheet_name="Mid-2015 Persons", header = 4) 
age_group_15.head()

Unnamed: 0,Area Codes,Area Names,Unnamed: 2,All Ages,0-15,16-29,30-44,45-64,65+
0,E06000047,County Durham,,519347,88643,93119,89995,144162,103428
1,E01020634,,County Durham 001A,1524,245,197,270,445,367
2,E01020635,,County Durham 001B,1315,204,158,258,397,298
3,E01020636,,County Durham 001C,1749,339,284,344,455,327
4,E01020654,,County Durham 001D,1860,311,248,350,534,417


In [51]:
# select columns
age15_col = ['Area Codes', '65+', 'All Ages']
age_15 = age_group_15[age15_col]

In [52]:
age_group_19 = age19.parse(sheet_name="Mid-2019 Persons", header = 4) 
age_group_19.head()

  warn("""Cannot parse header or footer so it will be ignored""")


Unnamed: 0,LSOA Code,LSOA Name,LA Code (2019 boundaries),LA name (2019 boundaries),LA Code (2020 boundaries),LA name (2020 boundaries),All Ages,0-15,16-29,30-44,45-64,65+
0,E01011949,Hartlepool 009A,E06000001,Hartlepool,E06000001,Hartlepool,1954,450,315,382,502,305
1,E01011950,Hartlepool 008A,E06000001,Hartlepool,E06000001,Hartlepool,1257,167,259,250,402,179
2,E01011951,Hartlepool 007A,E06000001,Hartlepool,E06000001,Hartlepool,1209,199,257,273,326,154
3,E01011952,Hartlepool 002A,E06000001,Hartlepool,E06000001,Hartlepool,1740,373,278,251,429,409
4,E01011953,Hartlepool 002B,E06000001,Hartlepool,E06000001,Hartlepool,2033,497,413,365,477,281


In [53]:
# select columns
age19_col = ['LSOA Code', '65+', 'All Ages']
age_19 = age_group_19[age19_col]

In [54]:
# join together
# 把 df2 的列名改成一致
age_15.rename(columns={'Area Codes': 'LSOA Code'}, inplace=True)
age_15.rename(columns={'All Ages': '2015_all'}, inplace=True)
age_15.rename(columns={'65+': '2015_senior'}, inplace=True)
age_19.rename(columns={'All Ages': '2019_all'}, inplace=True)
age_19.rename(columns={'65+': '2019_senior'}, inplace=True)

# 然后 merge
age_all = age_19.merge(age_15, on='LSOA Code', how='left')
print(age_all.head())

   LSOA Code  2019_senior  2019_all  2015_senior  2015_all
0  E01011949          305      1954          294      2009
1  E01011950          179      1257          191      1443
2  E01011951          154      1209          128      1279
3  E01011952          409      1740          420      1745
4  E01011953          281      2033          242      2011


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'Area Codes': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'All Ages': '2015_all'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_15.rename(columns={'65+': '2015_senior'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [55]:
age_all[['2015_all', '2019_all', '2015_senior', '2019_senior']].isnull().sum()

2015_all       0
2019_all       0
2015_senior    0
2019_senior    0
dtype: int64

In [56]:
age_all['senior_per'] = (age_all['2019_senior'] / age_all['2019_all'] + age_all['2015_senior'] / age_all['2015_all']) * 100 /2
print(age_all.head())

   LSOA Code  2019_senior  2019_all  2015_senior  2015_all  senior_per
0  E01011949          305      1954          294      2009   15.121577
1  E01011950          179      1257          191      1443   13.738284
2  E01011951          154      1209          128      1279   11.372809
3  E01011952          409      1740          420      1745   23.787258
4  E01011953          281      2033          242      2011   12.927876


In [57]:
# select data in London area
df_london_age = age_all[age_all['LSOA Code'].isin(london_lsoa_list)]
df_london_age.shape

(4835, 6)

## Ethnic groups

Focusing on the minority percentage here.

In [58]:
ethnic_group = pd.read_csv("data/stru_data/bulk.csv")
ethnic_group.head(5)

Unnamed: 0,date,geography,geography code,Ethnic Group: All categories: Ethnic group; measures: Value,Ethnic Group: White; measures: Value,Ethnic Group: Gypsy / Traveller / Irish Traveller; measures: Value,Ethnic Group: Mixed / Multiple ethnic group; measures: Value,Ethnic Group: Asian / Asian British: Indian; measures: Value,Ethnic Group: Asian / Asian British: Pakistani; measures: Value,Ethnic Group: Asian / Asian British: Bangladeshi; measures: Value,Ethnic Group: Asian / Asian British: Chinese; measures: Value,Ethnic Group: Asian / Asian British: Other Asian; measures: Value,Ethnic Group: Black / African / Caribbean / Black British; measures: Value,Ethnic Group: Other Ethnic Group; measures: Value
0,2011,Darlington 001B,E01012334,2466,2401,3,29,18,3,0,5,1,1,5
1,2011,Darlington 001C,E01012335,1383,1362,0,9,0,0,5,3,3,0,1
2,2011,Darlington 001D,E01012366,2008,1961,5,16,8,1,1,1,9,6,0
3,2011,Darlington 001E,E01033481,1364,1311,5,16,11,3,0,5,3,6,4
4,2011,Darlington 001F,E01033482,1621,1540,26,20,14,0,0,6,3,11,1


In [59]:
# select columns
eth_col = ['geography code', 'Ethnic Group: All categories: Ethnic group; measures: Value', 'Ethnic Group: White; measures: Value']
df_eth = ethnic_group[eth_col]

In [60]:
df_eth.rename(columns={'geography code': 'LSOA Code'}, inplace=True)
df_eth.rename(columns={'Ethnic Group: All categories: Ethnic group; measures: Value': 'all'}, inplace=True)
df_eth.rename(columns={'Ethnic Group: White; measures: Value': 'white'}, inplace=True)
df_eth[['all', 'white']].isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'geography code': 'LSOA Code'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'Ethnic Group: All categories: Ethnic group; measures: Value': 'all'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth.rename(columns={'Ethnic Group: White; measures: Value': 'white'}, inplace=True)


all      0
white    0
dtype: int64

In [61]:
df_eth['minority_per'] = (df_eth['all'] - df_eth['white']) *100 / df_eth['all']
print(df_eth.head())

   LSOA Code   all  white  minority_per
0  E01012334  2466   2401      2.635848
1  E01012335  1383   1362      1.518438
2  E01012366  2008   1961      2.340637
3  E01033481  1364   1311      3.885630
4  E01033482  1621   1540      4.996915


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eth['minority_per'] = (df_eth['all'] - df_eth['white']) *100 / df_eth['all']


In [62]:
# select data in London area
df_london_eth = df_eth[df_eth['LSOA Code'].isin(london_lsoa_list)]
df_london_eth.shape

(4835, 4)

In [63]:
# merge all the data together
df_data_all = df_london_den.merge(df_london_age, on='LSOA Code', how='left')

In [64]:
df_data_all = df_data_all.merge(df_london_eth, on='LSOA Code', how='left')
print(df_data_all.head(5))

   LSOA Code  Mid-2019 population  2019_population_density  \
0  E01000001                 1636             12604.006163   
1  E01000002                 1558              6821.366025   
2  E01000003                 1786             30219.966159   
3  E01000005                 1888              9957.805907   
4  E01032739                 1375               831.519110   

   Mid-2015 population  2015_population_density  pop_growth_rate  \
0                 1296                     9985         0.262346   
1                 1156                     5061         0.347751   
2                 1350                    22843         0.322963   
3                 1121                     5912         0.684211   
4                  802                      485         0.714464   

        avg_den  2019_senior  2019_all  2015_senior  2015_all  senior_per  \
0  11294.503082          451      1636          351      1296   27.325285   
1   5941.183012          371      1558          303      1156   

In [65]:
columns_to_keep = ['LSOA Code', 'pop_growth_rate', 'avg_den', 'senior_per', 'minority_per']
cleaned_df = df_data_all[columns_to_keep]

In [66]:
cleaned_df.to_csv("data/all_stru_data.csv", index=False)

## prepare the 2020-2022 data

In [67]:
# reading population density
density_path = "data/stru_data/sapelsoapopulationdensity20112022.xlsx"
pop_den = pd.ExcelFile(density_path)

In [68]:
# reading sheet
df_den = pop_den.parse(sheet_name="Mid-2011 to mid-2022 LSOA 2021", header = 3) 
df_den.head()

Unnamed: 0,LAD 2021 Code,LAD 2021 Name,LSOA 2021 Code,LSOA 2021 Name,Area Sq Km,Mid-2011: Population,Mid-2011: People per Sq Km,Mid-2012: Population,Mid-2012: People per Sq Km,Mid-2013: Population,...,Mid-2018: Population,Mid-2018: People per Sq Km,Mid-2019: Population,Mid-2019: People per Sq Km,Mid-2020: Population,Mid-2020: People per Sq Km,Mid-2021: Population,Mid-2021: People per Sq Km,Mid-2022: Population,Mid-2022: People per Sq Km
0,E09000001,City of London,E01000001,City of London 001A,0.1298,1472,11340.523883,1498,11540.832049,1624,...,1589,12241.910632,1677,12919.876733,1563,12041.602465,1573,12118.644068,1795,13828.967643
1,E09000001,City of London,E01000002,City of London 001B,0.2283,1438,6298.729742,1473,6452.036794,1494,...,1380,6044.678055,1423,6233.026719,1309,5733.683749,1407,6162.943495,1671,7319.316689
2,E09000001,City of London,E01000003,City of London 001C,0.0589,1348,22886.247878,1486,25229.202037,1488,...,1786,30322.580645,1841,31256.366723,1722,29235.993209,1610,27334.465195,1896,32190.152801
3,E09000001,City of London,E01000005,City of London 001E,0.1896,987,5205.696203,915,4825.949367,946,...,1349,7114.978903,1426,7521.097046,1360,7172.995781,1104,5822.78481,1737,9161.392405
4,E09000002,Barking and Dagenham,E01000006,Barking and Dagenham 016A,0.1466,1731,11807.639836,1779,12135.061392,1806,...,1844,12578.444748,1844,12578.444748,1834,12510.231924,1829,12476.125512,1837,12530.695771


In [69]:
# select columns
den_col = ['LSOA 2021 Code', 'Mid-2020: Population', 'Mid-2020: People per Sq Km', 'Mid-2022: Population', 'Mid-2022: People per Sq Km']
den = df_den[den_col]

In [70]:
# join together
# 把 df2 的列名改成一致
den.rename(columns={'Mid-2020: Population': '2020_population'}, inplace=True)
den.rename(columns={'Mid-2022: Population': '2022_population'}, inplace=True)
den.rename(columns={'Mid-2020: People per Sq Km': '2020_population_density'}, inplace=True)
den.rename(columns={'Mid-2022: People per Sq Km': '2022_population_density'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den.rename(columns={'Mid-2020: Population': '2020_population'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den.rename(columns={'Mid-2022: Population': '2022_population'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den.rename(columns={'Mid-2020: People per Sq Km': '2020_population_density'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.o

In [71]:
den['pop_growth_rate'] = (den['2022_population'] - den['2020_population']) / den['2020_population']
den['avg_den'] = (den['2020_population_density'] + den['2022_population_density']) / 2
print(den.head())

  LSOA 2021 Code  2020_population  2020_population_density  2022_population  \
0      E01000001             1563             12041.602465             1795   
1      E01000002             1309              5733.683749             1671   
2      E01000003             1722             29235.993209             1896   
3      E01000005             1360              7172.995781             1737   
4      E01000006             1834             12510.231924             1837   

   2022_population_density  pop_growth_rate       avg_den  
0             13828.967643         0.148433  12935.285054  
1              7319.316689         0.276547   6526.500219  
2             32190.152801         0.101045  30713.073005  
3              9161.392405         0.277206   8167.194093  
4             12530.695771         0.001636  12520.463847  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den['pop_growth_rate'] = (den['2022_population'] - den['2020_population']) / den['2020_population']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  den['avg_den'] = (den['2020_population_density'] + den['2022_population_density']) / 2


In [72]:
# read in files
age_path = "data/stru_data/sapelsoabroadage20112022.xlsx"
age = pd.ExcelFile(age_path)
age_group_20 = age.parse(sheet_name="Mid-2020 LSOA 2021", header = 3) 
age_group_22 = age.parse(sheet_name="Mid-2022 LSOA 2021", header = 3) 

In [74]:
print(age_group_20)

      LAD 2021 Code   LAD 2021 Name LSOA 2021 Code       LSOA 2021 Name  \
0         E06000001      Hartlepool      E01011949      Hartlepool 009A   
1         E06000001      Hartlepool      E01011950      Hartlepool 008A   
2         E06000001      Hartlepool      E01011951      Hartlepool 007A   
3         E06000001      Hartlepool      E01011952      Hartlepool 002A   
4         E06000001      Hartlepool      E01011953      Hartlepool 002B   
...             ...             ...            ...                  ...   
35667     W06000024  Merthyr Tydfil      W01001324  Merthyr Tydfil 003E   
35668     W06000024  Merthyr Tydfil      W01001898  Merthyr Tydfil 008F   
35669     W06000024  Merthyr Tydfil      W01001959  Merthyr Tydfil 005E   
35670     W06000024  Merthyr Tydfil      W01001960  Merthyr Tydfil 005F   
35671     W06000024  Merthyr Tydfil      W01001961  Merthyr Tydfil 006G   

       Total  F0 to 15  F16 to 29  F30 to 44  F45 to 64  F65 and over  \
0       1918       194    

In [78]:
# select columns
age_col = ['LSOA 2021 Code', 'F65 and over', 'M65 and over', 'Total']
age_20 = age_group_20[age_col]
age_22 = age_group_22[age_col]
age_20["elderly_20"] = age_20["F65 and over"] + age_20["M65 and over"]
age_20["elderly_per_20"] = age_20["elderly_20"] * 100 / age_20["Total"]
age_22["elderly_22"] = age_22["F65 and over"] + age_22["M65 and over"]
age_22["elderly_per_22"] = age_22["elderly_22"] * 100 / age_22["Total"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_20["elderly_20"] = age_20["F65 and over"] + age_20["M65 and over"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_20["elderly_per_20"] = age_20["elderly_20"] * 100 / age_20["Total"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_22["elderly_22"] = age_22["F65 and over"] + age_22["M65 an

In [80]:
age_group = age_20[["LSOA 2021 Code", "elderly_per_20"]].merge(age_22[["LSOA 2021 Code", "elderly_per_22"]], on = "LSOA 2021 Code", how = "left")

In [82]:
age_group["senior_per"] = (age_group["elderly_per_20"] + age_group["elderly_per_22"]) / 2
print(age_group.head(5))

  LSOA 2021 Code  elderly_per_20  elderly_per_22  senior_per
0      E01011949       15.589155       16.256684   15.922920
1      E01011950       15.739868       16.134913   15.937391
2      E01011951       13.943894       13.053989   13.498942
3      E01011952       19.797596       20.495356   20.146476
4      E01011953       14.494949       13.319879   13.907414


In [88]:
ethnic_group_20_22 = pd.read_csv("data/stru_data/1570121892025247.csv", skiprows=6)
ethnic_group_20_22.head(5)

Unnamed: 0,2021 super output area - lower layer,mnemonic,Total: All usual residents,White
0,Hartlepool 001A,E01011954,2284.0,2248.0
1,Hartlepool 001B,E01011969,1345.0,1329.0
2,Hartlepool 001C,E01011970,1070.0,1059.0
3,Hartlepool 001D,E01011971,1323.0,1311.0
4,Hartlepool 001F,E01033465,1954.0,1914.0


In [92]:
ethnic_group_20_22.rename(columns={'mnemonic': 'LSOA 2021 Code'}, inplace=True)
ethnic_group_20_22.rename(columns={'Total: All usual residents': 'All'}, inplace=True)
ethnic_group_20_22.drop(columns="2021 super output area - lower layer", inplace=True)

In [94]:
ethnic_group_20_22["minority_per"] = (ethnic_group_20_22["All"] - ethnic_group_20_22["White"])*100/ethnic_group_20_22["All"]
print(ethnic_group_20_22)

                                          LSOA 2021 Code     All   White  \
0                                              E01011954  2284.0  2248.0   
1                                              E01011969  1345.0  1329.0   
2                                              E01011970  1070.0  1059.0   
3                                              E01011971  1323.0  1311.0   
4                                              E01033465  1954.0  1914.0   
...                                                  ...     ...     ...   
35672  In order to protect against disclosure of pers...     NaN     NaN   
35673  have been swapped between different geographic...     NaN     NaN   
35674  by small amounts. Small counts at the lowest g...     NaN     NaN   
35675                                          affected.     NaN     NaN   
35676                                                NaN     NaN     NaN   

       minority_per  
0          1.576182  
1          1.189591  
2          1.028037  

In [None]:
df_20_22 = den[["LSOA 2021 Code", "pop_growth_rate", "avg_den"]].merge(age_group[["LSOA 2021 Code", "senior_per"]], 
                                                                       on="LSOA 2021 Code", how="left")
df_20_22 = df_20_22.merge(ethnic_group_20_22[["minority_per", "LSOA 2021 Code"]], on="LSOA 2021 Code", how="left")

In [101]:
print(df_20_22)

      LSOA 2021 Code  pop_growth_rate       avg_den  senior_per  minority_per
0          E01000001         0.148433  12935.285054   23.978012     19.877883
1          E01000002         0.276547   6526.500219   18.499190     21.789322
2          E01000003         0.101045  30713.073005   17.640567     25.868486
3          E01000005         0.277206   8167.194093    7.797017     61.489555
4          E01000006         0.001636  12520.463847    8.989160     67.208672
...              ...              ...           ...         ...           ...
35667      W01002036         0.006032   4330.150463   14.432804      9.922680
35668      W01002037         0.058416   4900.990099   17.737267      6.760563
35669      W01002038         0.066667    354.762109   25.013021      3.399209
35670      W01002039         0.137112   2696.033210    9.929882      4.375000
35671      W01002040         0.230984   2927.921315    3.974182      7.977355

[35672 rows x 5 columns]


In [103]:
import os

# 1. 设定 shapefile 存放路径
folder_path = "data/statistical-gis-boundaries-london/LB_LSOA2021_shp/LB_shp"

# 2. 收集所有 .shp 文件
shp_files = [f for f in os.listdir(folder_path) if f.endswith(".shp")]

# 3. 逐个读取并合并
gdf_list = []
for shp in shp_files:
    path = os.path.join(folder_path, shp)
    gdf = gpd.read_file(path)
    gdf["borough"] = shp.replace(".shp", "")  # 添加 borough 来源列（可选）
    gdf_list.append(gdf)

# 4. 合并所有 GeoDataFrame
london_lsoa21_combined = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True), crs=gdf_list[0].crs)

# 5. 保存成一个新的 shapefile
london_lsoa21_combined.to_file("london_lsoa21_combined.shp")

In [105]:
print(london_lsoa21_combined.columns)

Index(['lsoa21cd', 'lsoa21nm', 'msoa21cd', 'msoa21nm', 'lad22cd', 'lad22nm',
       'geometry', 'borough'],
      dtype='object')


In [106]:
london_lsoa21_combined.rename(columns={'lsoa21cd': 'LSOA 2021 Code'}, inplace=True)

stru_20_22_df = london_lsoa21_combined[["LSOA 2021 Code", "geometry", "borough"]].merge(df_20_22, on="LSOA 2021 Code", how="left")

In [107]:
print(stru_20_22_df.shape)

(4994, 7)


In [109]:
stru_20_22_df.to_file("data/gentri_data/all_stru_data_20_22.shp", index=False)

  stru_20_22_df.to_file("data/gentri_data/all_stru_data_20_22.shp", index=False)
  ogr_write(
  ogr_write(
  ogr_write(
