## Config

In [29]:
# change to raw data file
INPUT_DATA_PATH = "../../data/raw/leefbarometer/Leefbaarometer-scores-gemeenten-2002-2022_raw.csv"
INPUT_SHAPEFILE_PATH = "../../data/gemeente_shapefiles/gemeente_shapefile.shp"

# set true or false
SAVE_OUTPUT = True
# change to filename
OUTPUT_DATA_PATH = "../../data/processed/leefbarometer_clean.csv"

In [30]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize, LinearSegmentedColormap

## Load data

In [31]:
# load data
scores = pd.read_csv(INPUT_DATA_PATH, sep = ',')

# read shapefile
gdf = gpd.read_file(INPUT_SHAPEFILE_PATH, sep = ';')

## Data management

In [32]:
scores.head()

Unnamed: 0,gm_code,gm_naam,jaar,lbm,afw,fys,onv,soc,vrz,won
0,GM0014,Groningen,2002,4.045898,-0.056367,,,,,
1,GM0014,Groningen,2008,4.089955,-0.01231,,,,,
2,GM0014,Groningen,2012,4.123199,0.020934,,,,,
3,GM0014,Groningen,2014,4.129275,0.02701,0.023791,-0.031338,-0.023424,0.070398,-0.012416
4,GM0014,Groningen,2016,4.151465,0.0492,,,,,


In [33]:
gdf.head()

Unnamed: 0,gm_code,gm_naam,geometry
0,GM0003,Appingedam,"POLYGON ((249539.950 593217.579, 249543.450 59..."
1,GM0010,Delfzijl,"MULTIPOLYGON (((267829.905 587996.480, 267824...."
2,GM0014,Groningen,"POLYGON ((227979.943 584879.819, 227981.153 58..."
3,GM0024,Loppersum,"POLYGON ((244952.159 601043.418, 244954.500 60..."
4,GM0034,Almere,"MULTIPOLYGON (((142993.687 482642.725, 142993...."


In [34]:
# rename statcode to gm_code
gdf = gdf.rename(columns={'statnaam': 'gm_naam'})

In [35]:
# check for nans per column
scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2760 entries, 0 to 2759
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   gm_code  2760 non-null   object 
 1   gm_naam  2760 non-null   object 
 2   jaar     2760 non-null   int64  
 3   lbm      2760 non-null   float64
 4   afw      2760 non-null   float64
 5   fys      1380 non-null   float64
 6   onv      1380 non-null   float64
 7   soc      1380 non-null   float64
 8   vrz      1380 non-null   float64
 9   won      1380 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 215.8+ KB


In [36]:
scores['lbm'].max()

4.689251806830523

In [37]:
# check for nans per column
gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 355 entries, 0 to 354
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   gm_code   355 non-null    object  
 1   gm_naam   355 non-null    object  
 2   geometry  355 non-null    geometry
dtypes: geometry(1), object(2)
memory usage: 8.4+ KB


For some years of municiplaties fys, onv, soc, vrz, won are not available. However, all leefbarometers are available.

In [38]:
# merge with shapefile
merged_df = scores.merge(gdf, on=['gm_code', 'gm_naam'], how='right')

In [39]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2742 entries, 0 to 2741
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   gm_code   2742 non-null   object  
 1   gm_naam   2742 non-null   object  
 2   jaar      2728 non-null   float64 
 3   lbm       2728 non-null   float64 
 4   afw       2728 non-null   float64 
 5   fys       1364 non-null   float64 
 6   onv       1364 non-null   float64 
 7   soc       1364 non-null   float64 
 8   vrz       1364 non-null   float64 
 9   won       1364 non-null   float64 
 10  geometry  2742 non-null   geometry
dtypes: float64(8), geometry(1), object(2)
memory usage: 235.8+ KB


In [40]:
# make it a geofaraframe
merged_gdf = gpd.GeoDataFrame(merged_df, geometry='geometry')
# Reproject to WGS 84 (EPSG:4326)
merged_gdf = merged_gdf.to_crs(epsg=4326)

In [41]:
# print nan values of lbm
# some lbm scores are missing
nan_lbm = merged_df[merged_df['lbm'].isna()]
nan_lbm.head()

Unnamed: 0,gm_code,gm_naam,jaar,lbm,afw,fys,onv,soc,vrz,won,geometry
0,GM0003,Appingedam,,,,,,,,,"POLYGON ((249539.950 593217.579, 249543.450 59..."
1,GM0010,Delfzijl,,,,,,,,,"MULTIPOLYGON (((267829.905 587996.480, 267824...."
10,GM0024,Loppersum,,,,,,,,,"POLYGON ((244952.159 601043.418, 244954.500 60..."
859,GM0370,Beemster,,,,,,,,,"POLYGON ((119458.546 509093.046, 119460.054 50..."
956,GM0398,Heerhugowaard,,,,,,,,,"POLYGON ((117032.251 522096.714, 117032.300 52..."


In [42]:
merged_gdf

Unnamed: 0,gm_code,gm_naam,jaar,lbm,afw,fys,onv,soc,vrz,won,geometry
0,GM0003,Appingedam,,,,,,,,,"POLYGON ((6.80583 53.31703, 6.80588 53.31704, ..."
1,GM0010,Delfzijl,,,,,,,,,"MULTIPOLYGON (((7.07833 53.26658, 7.07826 53.2..."
2,GM0014,Groningen,2002.0,4.045898,-0.056367,,,,,,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
3,GM0014,Groningen,2008.0,4.089955,-0.012310,,,,,,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
4,GM0014,Groningen,2012.0,4.123199,0.020934,,,,,,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
...,...,...,...,...,...,...,...,...,...,...,...
2737,GM1978,Molenlanden,2014.0,4.102979,0.000715,-0.004494,0.080646,0.051251,-0.147532,0.020843,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2738,GM1978,Molenlanden,2016.0,4.118999,0.016734,,,,,,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2739,GM1978,Molenlanden,2018.0,4.114004,0.011739,-0.006625,0.086057,0.048165,-0.139700,0.023843,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2740,GM1978,Molenlanden,2020.0,4.125291,0.023026,-0.004530,0.089176,0.042746,-0.128014,0.023648,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."


In [43]:
# look which gm_naam is in scores and not in gdf
scores_gm_naam = set(scores["gm_naam"].unique())
gdf_gm_naam = set(gdf["gm_naam"].unique())

# Find gm_naam in scores but not in gdf
missing_in_gdf = scores_gm_naam - gdf_gm_naam

# Convert the result to a list or DataFrame for display
missing_in_gdf_list = list(missing_in_gdf)
print("gm_naam in scores but not in gdf:")
print(missing_in_gdf_list)

gm_naam in scores but not in gdf:
['Land van Cuijk', 'Dijk en Waard', 'Eemsdelta', 'Maashorst']


In [44]:
# look which gm_naam is in gdf and not in scores
scores_gm_naam = set(scores["gm_naam"].unique())
gdf_gm_naam = set(gdf["gm_naam"].unique())

# Find gm_naam in scores but not in gdf
missing_in_scores = gdf_gm_naam - scores_gm_naam

# Convert the result to a list or DataFrame for display
missing_in_scores_list = list(missing_in_scores)
print("gm_naam in gdf but not in scores:")
print(missing_in_scores)

gm_naam in gdf but not in scores:
{'Uden', 'Mill en Sint Hubert', 'Langedijk', 'Haaren', 'Delfzijl', 'Sint Anthonis', 'Landerd', 'Grave', 'Cuijk', 'Loppersum', 'Boxmeer', 'Appingedam', 'Heerhugowaard', 'Beemster'}


In [45]:
# for now, drop these nans
merged_gdf = merged_gdf.dropna(subset=['lbm'])

In [46]:
# drop all columns except for gm_code, gm_naam, jaar, lbm and geometry
merged_gdf = merged_gdf[['gm_code', 'gm_naam', 'jaar', 'lbm', 'geometry']]
merged_gdf.head()

Unnamed: 0,gm_code,gm_naam,jaar,lbm,geometry
2,GM0014,Groningen,2002.0,4.045898,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
3,GM0014,Groningen,2008.0,4.089955,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
4,GM0014,Groningen,2012.0,4.123199,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
5,GM0014,Groningen,2014.0,4.129275,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
6,GM0014,Groningen,2016.0,4.151465,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."


In [47]:
merged_gdf['jaar'] = merged_gdf['jaar'].astype('int')
merged_gdf

Unnamed: 0,gm_code,gm_naam,jaar,lbm,geometry
2,GM0014,Groningen,2002,4.045898,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
3,GM0014,Groningen,2008,4.089955,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
4,GM0014,Groningen,2012,4.123199,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
5,GM0014,Groningen,2014,4.129275,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
6,GM0014,Groningen,2016,4.151465,"POLYGON ((6.48047 53.24551, 6.48050 53.24579, ..."
...,...,...,...,...,...
2737,GM1978,Molenlanden,2014,4.102979,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2738,GM1978,Molenlanden,2016,4.118999,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2739,GM1978,Molenlanden,2018,4.114004,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."
2740,GM1978,Molenlanden,2020,4.125291,"POLYGON ((4.62940 51.88084, 4.62932 51.88088, ..."


In [48]:
# Make a backup of gm_code and geometry
metadata = merged_gdf[['gm_naam', 'gm_code', 'geometry']].drop_duplicates()

# Pivot for lbm
pivoted_lbm = merged_gdf.pivot(index='jaar', columns='gm_naam', values='lbm')

# Set the metadata index to gm_name and add it to the result
metadata = metadata.set_index('gm_naam')
pivoted_lbm = pivoted_lbm.T  # Transpose to make gm_name as index
pivoted_lbm = pivoted_lbm.join(metadata[['gm_code', 'geometry']])  # Add metadata

# Check the result
pivoted_lbm.head()

Unnamed: 0_level_0,2002,2008,2012,2014,2016,2018,2020,2022,gm_code,geometry
gm_naam,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
's-Gravenhage,4.062348,4.065541,4.04184,4.031019,4.077424,4.09934,4.106839,4.134549,GM0518,"MULTIPOLYGON (((4.19210 52.05797, 4.19197 52.0..."
's-Hertogenbosch,3.972506,4.0029,4.017644,4.054725,4.092917,4.101404,4.117857,4.118029,GM0796,"POLYGON ((5.22752 51.72715, 5.22749 51.72715, ..."
Aa en Hunze,4.19235,4.187574,4.215539,4.234321,4.241116,4.249099,4.225308,4.227829,GM1680,"POLYGON ((6.64801 53.02826, 6.64786 53.02879, ..."
Aalsmeer,4.065392,4.079211,4.091437,4.09297,4.063587,4.113765,4.118463,4.116335,GM0358,"POLYGON ((4.79444 52.29627, 4.79445 52.29628, ..."
Aalten,4.097104,4.088877,4.101236,4.10818,4.130035,4.129945,4.140398,4.136142,GM0197,"POLYGON ((6.63918 51.93247, 6.63973 51.93217, ..."


In [49]:
pivoted_lbm.reset_index(inplace = True)
pivoted_lbm

Unnamed: 0,gm_naam,2002,2008,2012,2014,2016,2018,2020,2022,gm_code,geometry
0,'s-Gravenhage,4.062348,4.065541,4.041840,4.031019,4.077424,4.099340,4.106839,4.134549,GM0518,"MULTIPOLYGON (((4.19210 52.05797, 4.19197 52.0..."
1,'s-Hertogenbosch,3.972506,4.002900,4.017644,4.054725,4.092917,4.101404,4.117857,4.118029,GM0796,"POLYGON ((5.22752 51.72715, 5.22749 51.72715, ..."
2,Aa en Hunze,4.192350,4.187574,4.215539,4.234321,4.241116,4.249099,4.225308,4.227829,GM1680,"POLYGON ((6.64801 53.02826, 6.64786 53.02879, ..."
3,Aalsmeer,4.065392,4.079211,4.091437,4.092970,4.063587,4.113765,4.118463,4.116335,GM0358,"POLYGON ((4.79444 52.29627, 4.79445 52.29628, ..."
4,Aalten,4.097104,4.088877,4.101236,4.108180,4.130035,4.129945,4.140398,4.136142,GM0197,"POLYGON ((6.63918 51.93247, 6.63973 51.93217, ..."
...,...,...,...,...,...,...,...,...,...,...,...
336,Zundert,4.068531,4.038181,4.040567,4.017688,4.053564,4.064086,4.075664,4.091302,GM0879,"POLYGON ((4.57230 51.48267, 4.57232 51.48271, ..."
337,Zutphen,4.070172,4.058033,4.082412,4.079431,4.094202,4.092603,4.098449,4.129023,GM0301,"POLYGON ((6.18588 52.12379, 6.18604 52.12421, ..."
338,Zwartewaterland,4.095010,4.084661,4.088307,4.104402,4.119927,4.134670,4.130526,4.120966,GM1896,"MULTIPOLYGON (((6.01886 52.64351, 6.02002 52.6..."
339,Zwijndrecht,3.969942,3.984897,4.010052,4.019172,4.034644,4.041020,4.062210,4.065264,GM0642,"POLYGON ((4.55930 51.82314, 4.55926 51.82329, ..."


## Save output

## Watermark

In [50]:
!python -m pip install watermark --quiet

In [51]:
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


In [52]:
%watermark

Last updated: 2025-01-27T11:05:24.749630+01:00

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.17.2

Compiler    : MSC v.1916 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 142 Stepping 10, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [53]:
%watermark --iversions

numpy     : 1.26.1
matplotlib: 3.8.1
geopandas : 0.14.0
pandas    : 2.1.2



In [54]:
!lsb_release -a

'lsb_release' is not recognized as an internal or external command,
operable program or batch file.
