This notebook contains code for calculating and analysing a walkability index for various units of analysis.

In [None]:
import pandas as pd
import geopandas as gpd
import os
from scipy.stats import zscore
import matplotlib.pyplot as plt
from matplotlib import colormaps
import seaborn as sns
from geocube.api.core import make_geocube
import rasterio
import glob
import pickle
from shapely.ops import unary_union
from shapely import Polygon
from statistics import mode
from collections import defaultdict
import numpy as np
from libpysal.weights import Queen
from esda.moran import Moran
from scipy.interpolate import interp1d

In [None]:
# glob pattern that matches every parquet grid file
# (the grids are expected to have been generated by 'generate_grids.ipynb')
grid_path = 'data/*.parquet'

# glob.glob already returns a list with all matching parquet grid files
grids_list = glob.glob(grid_path)
print(grids_list)

# # configure logging (recommended if you monitor processing over a lot of files)
# log_path = 'logs/walk_index.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)
    
# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

### NUTS 3 analysis

In [None]:
# NUTS boundaries; keep only the level-3 regions
nuts_gdf = gpd.read_file('data/NUTS_RG_01M_2024_3035.geojson')
nuts3_gdf = nuts_gdf[nuts_gdf.LEVL_CODE == 3]

# keep the regions that touch the OSM Europe bounding box (reprojected to EPSG:3035)
# NOTE(review): the LAU and country sections read this shapefile from
# 'data/osm_europe/osm_eu_bbox.shp' - confirm which location is the right one
osm_eu_gdf = gpd.read_file('data/osm_eu_bbox.shp').to_crs('epsg:3035')
osm_bbox_geom = osm_eu_gdf.geometry.values[0]
eu_gdf = nuts3_gdf[nuts3_gdf.intersects(osm_bbox_geom)]

# 100 km grid tiles; the index is shifted by one (presumably so it matches the
# 1-based tile ids used in the per-tile file names - verify)
grids_gdf = gpd.read_file('data/grid_100km_surf.gpkg')
grids_gdf.index = grids_gdf.index + 1

# one row per (region, intersecting tile) pair; the tile index becomes 'grid_id'
eu_gdf_with_grids = gpd.sjoin(
    eu_gdf, grids_gdf[['geometry']], how='left', predicate='intersects'
).rename(columns={'index_right': 'grid_id'})

# filter for a list of countries
# test_gdf = eu_gdf_with_grids.loc[eu_gdf_with_grids.CNTR_CODE.isin(['NL'])]
# grouped_test_gdf = eu_gdf_with_grids.groupby('CNTR_CODE')
eu_gdf_with_grids

In [None]:
# recursive function? not sure if this speeds up!
def process_window(geometry_arr, index_arr, x, y, window_size, poly_gdf, walk_gdf_list, merged_gdf):
    """Recursively collect the grid cells of a square window that overlap a target polygon.

    The window ``[x:x+window_size, y:y+window_size]`` is unioned and compared with
    the first geometry of ``poly_gdf``:

    * no overlap          -> nothing is collected;
    * fully inside        -> all rows of the window are collected in one go;
    * partial overlap     -> the window is split into smaller windows and each
                             one is processed recursively; at window size 1 the
                             single cell is collected if it intersects.

    Parameters
    ----------
    geometry_arr : 2-D array of geometries (``None`` entries are ignored).
    index_arr : 2-D array, same layout as ``geometry_arr``, holding the labels
        used to look rows up in ``merged_gdf`` via ``.loc``.
    x, y : int
        Upper-left corner of the window in array coordinates.
    window_size : int
        Edge length of the window; values below 1 are a no-op.
    poly_gdf : object exposing ``.geometry.values[0]`` (the target polygon).
    walk_gdf_list : list
        Output accumulator; selections from ``merged_gdf`` are appended in place.
    merged_gdf : frame indexed by the labels stored in ``index_arr``.

    Returns
    -------
    None
    """
    if window_size < 1:  # minimal window size to avoid too deep recursion
        return

    # extract the window (slicing silently truncates at the array borders)
    window = geometry_arr[x:x+window_size, y:y+window_size]
    window_index = index_arr[x:x+window_size, y:y+window_size]

    # flatten the array of polygons to a list
    polygons = [poly for row in window for poly in row if poly is not None]
    if not polygons:
        return

    target = poly_gdf.geometry.values[0]
    combined_polygon = unary_union(polygons)

    if not combined_polygon.intersects(target):
        return

    if combined_polygon.within(target):
        # the whole window lies inside the target: take every cell at once
        walk_gdf_list.append(merged_gdf.loc[list(window_index.flatten())])
        return

    if window_size == 1:
        # smallest window: keep the single cell only if it touches the target
        if window[0][0].intersects(target):
            walk_gdf_list.append(merged_gdf.loc[list(window_index.flatten())])
        return

    # Split into smaller windows. The integer division yields 0 for window sizes
    # 2 and 3, which would make range() fail with a zero step, so clamp to 1.
    new_window_size = max(1, window_size // 5 if window_size > 4 else window_size // 4)
    for new_x in range(x, x+window_size, new_window_size):
        for new_y in range(y, y+window_size, new_window_size):
            process_window(geometry_arr, index_arr, new_x, new_y, new_window_size, poly_gdf, walk_gdf_list, merged_gdf)

In [None]:
# For every NUTS 3 region, gather the grid cells of all intersecting 100 km tiles
# (indicators + isochrone area + population) and store them in one parquet file.
grouped_gdf = eu_gdf_with_grids.groupby('NUTS_ID')

for group_id, gdf in grouped_gdf:
    walk_gdf_list = []
    if os.path.exists(f'data/walkability/grids_nuts3/grids_{str(group_id)}.parquet'):
        print(f'nuts 3 grids already calculated, skipping {group_id}')
    else:
        # the group holds one row per intersecting tile; the region polygon is taken from its first row
        region_geometry = gdf.geometry.values[0]

        for i, row in gdf.iterrows():
            print(row.NUTS_ID, row.grid_id)
            if pd.isna(row.grid_id):
                print('NAN grid_id, skipping...')
                continue
            try:
                df = pd.read_csv(f'data/processed_data/processed_data/processed_data_{int(row.grid_id)}.csv')
            except Exception as e:
                print(f'error {e} at {int(row.grid_id)}, skipping...')
                continue

            df_ff = df[:1_000_000].copy()    # gpu threads > 1_000_000 are irrelevant!

            # clean the indicators with vectorized operations (the row-wise
            # apply(lambda ...) calls were needlessly slow on up to 1M rows)
            df_ff.loc[df_ff['ndvi'] < 0, 'ndvi'] = 0
            df_ff.loc[df_ff['population'] < 0, 'population'] = 0
            # negative or missing entropy -> 0 (NaN >= 0 is False, so NaN is replaced too)
            df_ff['ent_5'] = df_ff['ent_5'].where(df_ff['ent_5'] >= 0, 0)
            # missing slope -> 0
            df_ff['slope'] = df_ff['slope'].fillna(0)

            # open the corresponding iso_gdf for iso area weighing
            iso_gdf = gpd.read_parquet(f'data/isochrones/isochrones_{int(row.grid_id)}.parquet')
            iso_gdf['iso_area'] = iso_gdf.area/1e6
            iso_gdf_f = iso_gdf[['index', 'iso_area']].copy()

            # open the corresponding grid gdf for geometries
            grid_gdf = gpd.read_parquet(f'data/grids_100_{int(row.grid_id)}.parquet')
            grid_gdf_f = grid_gdf[['index', 'grid_100000_id', 'geometry', 'population']].copy()
            grid_gdf_f = grid_gdf_f.rename(columns={'population': 'population_full'})
            grid_gdf_f.loc[grid_gdf_f['population_full'] < 0, 'population_full'] = 0

            # merge df, gdf, and iso_gdf
            temp_merged_gdf = grid_gdf_f.merge(iso_gdf_f, on='index')
            merged_gdf = temp_merged_gdf.merge(df_ff, on='index')

            # bounding rectangle of the tile's cells
            bounds = merged_gdf.total_bounds
            temp_polygon = Polygon([
                (bounds[0], bounds[1]),  # (minx, miny)
                (bounds[0], bounds[3]),  # (minx, maxy)
                (bounds[2], bounds[3]),  # (maxx, maxy)
                (bounds[2], bounds[1]),  # (maxx, miny)
                (bounds[0], bounds[1])   # closing the polygon at the starting point
            ])

            if temp_polygon.within(region_geometry):
                # the whole tile lies inside the region: no per-cell test needed
                walk_gdf_list.append(merged_gdf)
            else:
                merged_gdf_f = merged_gdf[merged_gdf.intersects(region_geometry)]
                walk_gdf_list.append(merged_gdf_f)

                # recursion?
                # index_arr = np.flipud(merged_gdf['index'].to_numpy().reshape(1000, 1000).T)
                # geometry_arr = np.flipud(merged_gdf['geometry'].to_numpy().reshape(1000, 1000).T)
                # window_size = 500
                # width, height = geometry_arr.shape

                # for x in range(0, width, window_size):
                #     for y in range(0, height, window_size):
                #         process_window(geometry_arr, index_arr, x, y, window_size, gdf, walk_gdf_list, merged_gdf)

        # pd.concat raises on an empty list (no usable tile); that case is reported and skipped
        try:
            walk_gdf = gpd.GeoDataFrame(pd.concat(walk_gdf_list, ignore_index=True))
            walk_gdf.to_parquet(f'data/walkability/grids_nuts3/grids_{str(group_id)}.parquet')
            print(walk_gdf)
        except Exception as e:
            print(f'skipping {group_id} because of {e}')

In [None]:
# population-weighted mean of every indicator, per NUTS 3 region
p_cols = ['street_walk_length', 'iso_area', 'num_street_intersections', 'pub_trans_count', 'ndvi', 'slope', 'ent_5']
data_dict = {}

grouped_gdf = eu_gdf_with_grids.groupby('NUTS_ID')

for group_id, gdf in grouped_gdf:
    region_key = str(group_id)
    print(region_key)
    try:
        grid_gdf = gpd.read_parquet(f'data/walkability/grids_nuts3/grids_{region_key}.parquet')
    except Exception as e:
        print(f'skipping {region_key} due to error {e}')
        continue

    # nuts_3_area = grid_gdf.area.sum()/1e6
    # print(len(grid_gdf)/1e2)
    weights = grid_gdf['population_full']
    for col in p_cols:
        pop_weighted_value = (weights*grid_gdf[col]).sum() / weights.sum()
        # create the region entry on first use, then add the column value
        data_dict.setdefault(region_key, {})[f'{col}_pop_w'] = pop_weighted_value

In [None]:
# inspect the population-weighted indicators collected per NUTS 3 region
data_dict

In [None]:
# leave out every region whose NUTS id starts with 'TR'
data_dict_f = {
    nuts_id: indicators
    for nuts_id, indicators in data_dict.items()
    if not nuts_id.startswith('TR')
}
data_dict_f

In [None]:
# one row per NUTS 3 region, one column per weighted indicator
df = pd.DataFrame(data_dict_f).T

# A region whose population sums to zero gets NaN weighted means (0/0). scipy's
# zscore propagates NaN by default, so a single such region would turn the whole
# column into NaN. Drop incomplete rows first, as the LAU analysis in this
# notebook does before standardising.
df = df.dropna()

# Apply z-score standardization
standardized_df = df.apply(zscore)
standardized_df

In [None]:
# composite score: sum of the standardized indicators, with slope counted negatively
# (terms are added left to right in this exact order)
signed_terms = [
    standardized_df['street_walk_length_pop_w'],
    standardized_df['iso_area_pop_w'],
    standardized_df['num_street_intersections_pop_w'],
    standardized_df['ndvi_pop_w'],
    standardized_df['ent_5_pop_w'],
    -standardized_df['slope_pop_w'],
    standardized_df['pub_trans_count_pop_w'],
]
walkability = signed_terms[0]
for term in signed_terms[1:]:
    walkability = walkability + term
standardized_df['walkability'] = walkability

# min-max rescale the composite score to a 0-100 index
walk_min = standardized_df['walkability'].min()
walk_max = standardized_df['walkability'].max()
standardized_df['walk_index'] = ((standardized_df['walkability'] - walk_min) / (walk_max - walk_min))*100
standardized_df

In [None]:
# label the ten equal-frequency bins of the composite score 1..10
decile_labels = list(range(1, 11))
standardized_df['walk_decile'] = pd.qcut(standardized_df['walkability'], q=10, labels=decile_labels)

In [None]:
# Turn the region ids (the index) into a 'NUTS_ID' column on a new frame.
# The previous in-place reset_index/rename mutated standardized_df, so re-running
# the cell produced a second 'NUTS_ID' column and broke the merge.
nuts3_walk_df = standardized_df.rename_axis('NUTS_ID').reset_index()

# attach the NUTS 3 geometries and attributes
standardized_gdf = nuts3_gdf.merge(nuts3_walk_df, on='NUTS_ID', how='inner')
standardized_gdf

In [None]:
# regions whose rescaled walk index is at least 90
standardized_gdf.loc[standardized_gdf['walk_index'] >= 90]

In [None]:
# persist the NUTS 3 results (geometries + indicators + index) as parquet
standardized_gdf.to_parquet('data/walkability/nuts3_walk.parquet')

### LAU analysis

In [None]:
# load the LAU boundaries (judging by the file name: 2023 release, EPSG:3035)
lau_gdf = gpd.read_file('data/LAU_RG_01M_2023_3035.geojson')
lau_gdf

In [None]:
# keep the LAUs that touch the OSM Europe bounding box (reprojected to EPSG:3035)
osm_eu_gdf = gpd.read_file('data/osm_europe/osm_eu_bbox.shp').to_crs('epsg:3035')
osm_bbox_geom = osm_eu_gdf.geometry.values[0]
eu_gdf = lau_gdf[lau_gdf.intersects(osm_bbox_geom)]

# 100 km grid tiles, with the index shifted by one
grids_gdf = gpd.read_file('data/grid_100km_surf.gpkg')
grids_gdf.index = grids_gdf.index + 1

# one row per (LAU, intersecting tile) pair; the tile index becomes 'grid_id'
eu_gdf_with_grids = gpd.sjoin(
    eu_gdf, grids_gdf[['geometry']], how='left', predicate='intersects'
).rename(columns={'index_right': 'grid_id'})

# filter for a list of countries
# test_gdf = eu_gdf_with_grids.loc[eu_gdf_with_grids.CNTR_CODE.isin(['NL'])]
# grouped_test_gdf = eu_gdf_with_grids.groupby('CNTR_CODE')
eu_gdf_with_grids

In [None]:
# one group per LAU (GISCO_ID); each group holds one row per intersecting 100 km tile
grouped_gdf = eu_gdf_with_grids.groupby('GISCO_ID')

In [None]:
# Single-entry cache holding the most recently prepared 100 km tile, already
# cleaned and merged. Caching the merged frame (instead of the three raw inputs)
# means consecutive rows on the same tile skip the reads, the cleaning and both merges.
cached_grid_id = None
cached_merged_gdf = None

for group_id, gdf in grouped_gdf:
    walk_gdf_list = []

    if os.path.exists(f'data/walkability/grids_lau/grids_{str(group_id)}.parquet'):
        print(f'lau grids already calculated, skipping {group_id}')
        continue

    # the group holds one row per intersecting tile; the LAU polygon is taken from its first row
    lau_geometry = gdf.geometry.values[0]

    for i, row in gdf.iterrows():
        print(row.GISCO_ID, row.grid_id)
        if pd.isna(row.grid_id):
            print('NAN grid_id, skipping...')
            continue

        if row.grid_id != cached_grid_id:
            # cache miss: load the three inputs of this tile
            try:
                df = pd.read_csv(f'data/processed_data/processed_data/processed_data_{int(row.grid_id)}.csv')
                iso_gdf = gpd.read_parquet(f'data/isochrones/isochrones_{int(row.grid_id)}.parquet')
                grid_gdf = gpd.read_parquet(f'data/grids_100_{int(row.grid_id)}.parquet')
            except Exception as e:
                print(f'error {e} at {int(row.grid_id)}, skipping...')
                continue

            # clean the indicators (vectorized instead of row-wise apply)
            df_ff = df[:1_000_000].copy()  # gpu threads > 1_000_000 are irrelevant!
            df_ff.loc[df_ff['ndvi'] < 0, 'ndvi'] = 0
            df_ff.loc[df_ff['population'] < 0, 'population'] = 0
            # negative or missing entropy -> 0 (NaN >= 0 is False, so NaN is replaced too)
            df_ff['ent_5'] = df_ff['ent_5'].where(df_ff['ent_5'] >= 0, 0)
            # missing slope -> 0
            df_ff['slope'] = df_ff['slope'].fillna(0)

            # isochrone area, divided by 1e6
            iso_gdf['iso_area'] = iso_gdf.area / 1e6
            iso_gdf_f = iso_gdf[['index', 'iso_area']].copy()

            # grid geometries and the (clamped) full population
            grid_gdf_f = grid_gdf[['index', 'grid_100000_id', 'geometry', 'population']].copy()
            grid_gdf_f = grid_gdf_f.rename(columns={'population': 'population_full'})
            grid_gdf_f.loc[grid_gdf_f['population_full'] < 0, 'population_full'] = 0

            temp_merged_gdf = grid_gdf_f.merge(iso_gdf_f, on='index')
            merged_gdf = temp_merged_gdf.merge(df_ff, on='index')

            # only remember the tile once it has been prepared successfully
            cached_grid_id = row.grid_id
            cached_merged_gdf = merged_gdf
        else:
            print(f'using cached data for {row.GISCO_ID, row.grid_id}')
            merged_gdf = cached_merged_gdf

        # filter by intersection with the geometry of the current group
        # (boolean indexing returns a new frame, the cached tile is left untouched)
        merged_gdf_f = merged_gdf[merged_gdf.intersects(lau_geometry)]
        walk_gdf_list.append(merged_gdf_f)

    # pd.concat raises on an empty list (no usable tile); that case is reported and skipped
    try:
        walk_gdf = gpd.GeoDataFrame(pd.concat(walk_gdf_list, ignore_index=True))
        walk_gdf.to_parquet(f'data/walkability/grids_lau/grids_{str(group_id)}.parquet')
    except Exception as e:
        print(f'skipping {group_id} because of {e}')

In [None]:
# population-weighted mean of every indicator plus the most frequent degree of
# urbanisation ('dou'), per LAU
p_cols = ['street_walk_length', 'iso_area', 'num_street_intersections', 'pub_trans_count', 'ndvi', 'slope', 'ent_5']
data_dict = {}

for group_id, gdf in grouped_gdf:
    print(str(group_id))
    try:
        grid_gdf = gpd.read_parquet(f'data/walkability/grids_lau/grids_{str(group_id)}.parquet')
    except Exception as e:
        print(f'skipping {str(group_id)} due to error {e}')
        continue

    # add degree of urbanization: look the 'dou' of every cell up in its 100 km tile.
    # groupby also covers the single-tile case, so no separate branch is needed.
    dou_values = []
    for grid_id, tile_cells in grid_gdf.groupby('grid_100000_id'):
        indices_oi = list(tile_cells['index'].values)

        grid_100k_gdf = gpd.read_parquet(f'data/grids_100_{grid_id}.parquet')
        dou_values.extend(grid_100k_gdf.loc[indices_oi, 'dou'].values)

    # statistics.mode raises StatisticsError on empty data, which happens when
    # the saved file of this LAU contains no cells; skip such units explicitly
    if not dou_values:
        print(f'skipping {str(group_id)}: no grid cells to derive dou from')
        continue

    mode_dou = mode(dou_values)
    print(mode_dou)
    data_dict[str(group_id)] = {'dou': mode_dou}

    # nuts_3_area = grid_gdf.area.sum()/1e6
    # print(len(grid_gdf)/1e2)
    for col in p_cols:
        pop_weighted_value = (grid_gdf['population_full']*grid_gdf[col]).sum() / grid_gdf['population_full'].sum()
        data_dict[str(group_id)][f'{col}_pop_w'] = pop_weighted_value
    # break

In [None]:
## save results to a pickle file so you don't have to process them again
# pickle.dump(data_dict, open('/Volumes/ssd1/eurostat_grid/grids_100/logs/data_grids_lau.p', 'wb'))

In [None]:
# Reload the per-LAU dictionary from a pickle; this replaces whatever `data_dict`
# currently holds.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
with open('logs/data_grids_lau.p', 'rb') as f:
    data_dict = pickle.load(f)

In [None]:
# inspect the per-LAU dictionary loaded above
data_dict

In [None]:
# one row per LAU; rows with any missing value are dropped
df = pd.DataFrame(data_dict).T.dropna()

# exclude the units whose dominant degree of urbanisation is class 10
# (presumably the water class of the dou coding - verify)
df = df.loc[df['dou'] != 10]
df

In [None]:
# standardize (z-score) every population-weighted indicator column; 'dou' is left as is
indicator_names = ['street_walk_length', 'iso_area', 'num_street_intersections',
                   'pub_trans_count', 'ndvi', 'slope', 'ent_5']
columns_to_zscore = [f'{name}_pop_w' for name in indicator_names]

zscored = df[columns_to_zscore].apply(zscore)
df[columns_to_zscore] = zscored
df

In [None]:
# composite score: standardized indicators summed in a fixed order, slope subtracted
df['walkability'] = (
    df['street_walk_length_pop_w']
    + df['iso_area_pop_w']
    + df['num_street_intersections_pop_w']
    + df['ndvi_pop_w']
    + df['ent_5_pop_w']
    - df['slope_pop_w']
    + df['pub_trans_count_pop_w']
)

# min-max rescale the composite score to a 0-100 index
lowest = df['walkability'].min()
highest = df['walkability'].max()
df['walk_index'] = ((df['walkability'] - lowest) / (highest - lowest))*100
df

In [None]:
# label the ten equal-frequency bins of the composite score 1..10
decile_labels = list(range(1, 11))
df['walk_decile'] = pd.qcut(df['walkability'], q=10, labels=decile_labels)
df

In [None]:
# Turn the LAU ids (the index) into a 'GISCO_ID' column on a new frame.
# The previous in-place reset_index/rename mutated df, so re-running the cell
# produced a second 'GISCO_ID' column and broke the merge.
lau_walk_df = df.rename_axis('GISCO_ID').reset_index()

# attach the LAU geometries and attributes, then drop rows with any missing value
gdf = lau_gdf.merge(lau_walk_df, on='GISCO_ID', how='inner').dropna()
gdf

In [None]:
# # save

# gdf.to_parquet('data/walkability/lau_walk.parquet')

#### correlation analysis

In [None]:
sns.set_theme(style="white")

# long column names and the short labels used in the figure
abbreviations = {
    'street_walk_length_pop_w': 'SWL',
    'iso_area_pop_w': 'ISO',
    'num_street_intersections_pop_w': 'SI',
    'pub_trans_count_pop_w': 'PT',
    'ndvi_pop_w': 'GS',
    'slope_pop_w': 'SLOPE',
    'ent_5_pop_w': 'LUM',
    'walkability': 'WALK'
}
numeric_cols = list(abbreviations)

# load the LAU results (replace with actual file path) and rename the columns
gdf = gpd.read_parquet('data/walkability/lau_walk.parquet').rename(columns=abbreviations)

# Spearman rank correlation between all abbreviated columns
corr = gdf[list(abbreviations.values())].corr(method='spearman')

# blank out the diagonal and everything below it, then reshape to long format
mask = np.tril(np.ones_like(corr, dtype=bool))
masked_corr = corr.mask(mask)
melt = masked_corr.melt(ignore_index=False).reset_index()
melt["size"] = melt["value"].abs()

fig, ax = plt.subplots(figsize=(8, 6))

# colorbar spanning the full correlation range [-1, 1]
cmap = plt.cm.RdBu
norm = plt.Normalize(-1, 1)
sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
cbar = plt.colorbar(sm, ax=ax)
cbar.ax.tick_params(labelsize="x-small")

# square markers, sized by |rho| and coloured by rho
sns.scatterplot(ax=ax, data=melt, x="index", y="variable", size="size",
                hue="value", hue_norm=norm, palette=cmap,
                style=0, markers=["s"], legend=False)

# grid lines between the cells; the y axis is flipped so the first row is on top
xmin, xmax = (-0.5, corr.shape[0] - 0.5)
ymin, ymax = (-0.5, corr.shape[1] - 0.5)
ax.vlines(np.arange(xmin, xmax + 1), ymin, ymax, lw=1, color="silver")
ax.hlines(np.arange(ymin, ymax + 1), xmin, xmax, lw=1, color="silver")
ax.set(aspect=1, xlim=(xmin, xmax), ylim=(ymax, ymin), xlabel="", ylabel="")
ax.tick_params(labelbottom=False, labeltop=True)
plt.xticks(rotation=90)

# write the coefficient into every cell that survived the mask
masked_values = masked_corr.to_numpy()
for (row_pos, col_pos), value in np.ndenumerate(masked_values):
    if pd.notna(value):
        plt.text(col_pos, row_pos, f"{value:.2f}", size="x-small",
                 ha="center", va="center")

# plt.savefig('plots_paper/corrmatrix.png', dpi=720, bbox_inches='tight')
# plt.show()

In [None]:
# display the Spearman correlation matrix computed above
corr

#### pair plot

In [None]:
# subset for the pair plots: keep five degree-of-urbanisation classes and make
# 'dou' categorical so it can be used as a discrete hue
dou_classes_of_interest = [13, 21, 22, 23, 30]
sampled_gdf = gdf.loc[gdf['dou'].isin(dou_classes_of_interest)].copy()
sampled_gdf['dou'] = sampled_gdf['dou'].astype('category')
sampled_gdf

In [None]:
# create directory for plots
output_dir = "joint_plots"
os.makedirs(output_dir, exist_ok=True)

# every indicator is plotted against the composite walkability score
variable_pairs = [
    ('SWL', 'WALK'),
    ('ISO', 'WALK'),
    ('SI', 'WALK'),
    ('PT', 'WALK'),
    ('GS', 'WALK'),
    ('SLOPE', 'WALK'),
    ('LUM', 'WALK'),
]

abbreviations = {
    'street_walk_length_pop_w': 'SWL',
    'iso_area_pop_w': 'ISO',
    'num_street_intersections_pop_w': 'SI',
    'pub_trans_count_pop_w': 'PT',
    'ndvi_pop_w': 'GS',
    'slope_pop_w': 'SLOPE',
    'ent_5_pop_w': 'LUM',
    'walkability': 'WALK'
}

# rename columns (has no effect on columns that already carry the short names)
sampled_gdf.rename(columns=abbreviations, inplace=True)

# one colour of the 'Accent' colormap per dou category
unique_categories = sampled_gdf['dou'].cat.categories if sampled_gdf['dou'].dtype.name == 'category' else sampled_gdf['dou'].unique()
cmap = plt.get_cmap('Accent')
palette = {category: cmap(i / len(unique_categories)) for i, category in enumerate(unique_categories)}

# readable names of the dou classes (used by the standalone legend cell that follows)
dou_labels = {
    13: "Rural cluster",
    21: "Suburban or peri-urban",
    22: "Semi-dense urban cluster",
    23: "Dense urban cluster",
    30: "Urban centre"
}

# generate and save plots
for x_var, y_var in variable_pairs:
    # sns.jointplot creates its own figure; the extra plt.figure() call that used
    # to precede it only produced an empty figure that was shown and never closed.
    joint = sns.jointplot(
        data=sampled_gdf,
        x=x_var,
        y=y_var,
        hue='dou',
        palette=palette,
        kind='scatter',
        marker='+',
        s=100,
        alpha=0.5,
        height=5,
        ratio=5,
        space=0.2,
    )

    # add regression lines: a dashed black line plus a coloured line per dou class
    for hue_val, color in palette.items():
        subset = sampled_gdf[sampled_gdf['dou'] == hue_val]
        sns.regplot(
            data=subset,
            x=x_var,
            y=y_var,
            scatter=False,
            line_kws={'color': 'black', 'linewidth': 1, 'linestyle': '--', 'alpha': 0.8},
            ax=joint.ax_joint
        )
        sns.regplot(
            data=subset,
            x=x_var,
            y=y_var,
            scatter=False,
            line_kws={'color': color, 'linewidth': 2, 'alpha': 0.5},
            ax=joint.ax_joint
        )

    # set labels; the per-plot legend is removed because the legend is rendered
    # as a separate figure
    joint.set_axis_labels(x_var, y_var, fontsize=20)
    joint.ax_joint.tick_params(labelsize=20)
    joint.ax_marg_x.tick_params(labelsize=20)
    joint.ax_marg_y.tick_params(labelsize=20)
    joint.ax_joint.get_legend().remove()

    # save as PNG
    plot_filename = f"{x_var}_vs_{y_var}.png"
    joint.savefig(os.path.join(output_dir, plot_filename), format="png", bbox_inches='tight', dpi=150)
    plt.show()
    # close exactly the figure created by this iteration
    plt.close(joint.ax_joint.figure)

    # break

In [None]:
# standalone legend for the joint plots, rendered as its own figure

unique_categories = (
    sampled_gdf['dou'].cat.categories
    if sampled_gdf['dou'].dtype.name == 'category'
    else sampled_gdf['dou'].unique()
)
cmap = plt.get_cmap('Accent')
n_categories = len(unique_categories)

# one '+' marker per category, coloured like in the joint plots
handles = []
for i in range(n_categories):
    handles.append(
        plt.Line2D(
            [0], [0],
            marker='+',
            color=cmap(i / n_categories),
            linestyle='',
            markersize=30,       # larger marker size for readability
            markeredgewidth=4    # bold marker edges
        )
    )

# readable label per category, falling back to the raw category value
labels = [dou_labels.get(cat, str(cat)) for cat in unique_categories]

legend_style = dict(
    loc='center',
    ncol=5,               # arrange in a single row
    fontsize=20,          # larger font size for readability
    handletextpad=1.0,    # space between marker and text
    columnspacing=2.0,    # space between columns
    frameon=False,
)

# wide, very flat figure that contains nothing but the legend
fig, ax = plt.subplots(figsize=(10, 1))
legend = ax.legend(handles, labels, **legend_style)
ax.axis('off')  # turn off the axis
plt.tight_layout()

# save the legend as an image
plt.savefig("joint_plots/custom_legend.svg", format="svg", bbox_inches="tight")
plt.show()
plt.close()

### country analysis

In [None]:
# country-wise analysis: start again from the NUTS level-3 regions
nuts_gdf = gpd.read_file('data/NUTS_RG_01M_2024_3035.geojson')
nuts3_gdf = nuts_gdf[nuts_gdf.LEVL_CODE == 3]

# keep the regions that touch the OSM Europe bounding box (reprojected to EPSG:3035)
osm_eu_gdf = gpd.read_file('data/osm_europe/osm_eu_bbox.shp').to_crs('epsg:3035')
osm_bbox_geom = osm_eu_gdf.geometry.values[0]
eu_gdf = nuts3_gdf[nuts3_gdf.intersects(osm_bbox_geom)]

eu_gdf

In [None]:
# list the country codes that will be processed
grouped_gdf = eu_gdf.groupby('CNTR_CODE')
for country_code, _country_rows in grouped_gdf:
    print(country_code)

In [None]:
# For every country, stack the grid cells of all its NUTS 3 regions and attach
# the degree of urbanisation ('dou') looked up in the corresponding 100 km tiles.
grouped_gdf = eu_gdf.groupby('CNTR_CODE')

for group_id, gdf in grouped_gdf:
    # print(group_id, gdf)
    if os.path.exists(f'data/walkability/grids_country/grids_{str(group_id)}.parquet'):
        print(f'country grids already calculated, skipping {group_id}')
        continue

    country_nuts3_list = []
    for i, row in gdf.iterrows():
        try:
            grids_nuts3_gdf = gpd.read_parquet(f'data/walkability/grids_nuts3/grids_{row.NUTS_ID}.parquet')
        except Exception as e:
            print(f'skipping {row.NUTS_ID} because of {e}')
            # without this the cells of the previously loaded region were
            # processed a second time (or a NameError was raised on the first row)
            continue

        # groupby also covers regions that lie in a single tile
        for grid_id, nuts3_grid_gdf in grids_nuts3_gdf.groupby('grid_100000_id'):
            # work on a copy so the new column is not written to a slice
            nuts3_grid_gdf = nuts3_grid_gdf.copy()
            indices_to_join_on = list(nuts3_grid_gdf['index'].values)

            grid_gdf = gpd.read_parquet(f'data/grids_100_{grid_id}.parquet')
            column_data = grid_gdf.loc[indices_to_join_on, 'dou']

            # map every cell's 'index' value to the 'dou' of the tile row with that label
            nuts3_grid_gdf['dou'] = nuts3_grid_gdf['index'].map(column_data)
            country_nuts3_list.append(nuts3_grid_gdf)

    # pd.concat raises on an empty list; report the country instead of aborting the run
    if not country_nuts3_list:
        print(f'skipping {group_id}: no nuts 3 grids available')
        continue

    country_gdf = gpd.GeoDataFrame(pd.concat(country_nuts3_list, ignore_index=True))
    country_gdf.to_parquet(f'data/walkability/grids_country/grids_{str(group_id)}.parquet')

In [None]:
# population-weighted indicator means per country and degree of urbanisation

country_gdf_paths = glob.glob("data/walkability/grids_country/*.parquet")

data_dict = {}
p_cols = ['street_walk_length', 'iso_area', 'num_street_intersections', 'pub_trans_count', 'ndvi', 'slope', 'ent_5']

for country_gdf_path in country_gdf_paths:
    # the country code is the part of the file name after the last underscore
    country_code = country_gdf_path.split('_')[-1].split('.')[0]
    print(country_code)
    country_gdf = gpd.read_parquet(country_gdf_path)

    inner_data_dict = {}
    for dou, gdf in country_gdf.groupby('dou'):
        total_pop = gdf['population_full'].sum()
        dou_values = {}
        for col in p_cols:
            # a dou class without any population gets 0 instead of a 0/0 result
            if total_pop != 0:
                pop_weighted_value = (gdf['population_full']*gdf[col]).sum() / total_pop
            else:
                pop_weighted_value = 0
            dou_values[f'{col}_pop_w'] = pop_weighted_value
        inner_data_dict[dou] = dou_values

    # keep the first result if two files map to the same country code
    data_dict.setdefault(country_code, inner_data_dict)
    # break

In [None]:
# inspect the per-country, per-dou weighted indicators
data_dict

In [None]:
# population-weighted indicator means per degree of urbanisation, pooled over all
# country files and stored under the single key 'EU'

country_gdf_paths = glob.glob("data/walkability/grids_country/*.parquet")
p_cols = ['street_walk_length', 'iso_area', 'num_street_intersections', 'pub_trans_count', 'ndvi', 'slope', 'ent_5']

# running totals per dou class:
#   denoms[dou]          -> summed population
#   numerators[dou][col] -> summed population * indicator
denoms = defaultdict(float)
numerators = defaultdict(lambda: {col: 0.0 for col in p_cols})

# stream the files one at a time so only one country is held in memory
for country_gdf_path in country_gdf_paths:
    country_code = country_gdf_path.split('_')[-1].split('.')[0]
    print("Processing country code:", country_code)

    gdf = gpd.read_parquet(country_gdf_path)

    for dou, group in gdf.groupby('dou'):
        weights = group['population_full']
        denoms[dou] += weights.sum()

        running = numerators[dou]
        for col in p_cols:
            running[col] += (weights * group[col]).sum()

# weighted mean = numerator / denominator; classes without population get 0
data_dict = {'EU': {}}
for dou, total_pop in denoms.items():
    data_dict['EU'][dou] = {
        f'{col}_pop_w': (numerators[dou][col] / total_pop if total_pop != 0 else 0)
        for col in p_cols
    }

print(data_dict)

In [None]:
# pickle.dump(data_dict, open('logs/data_grids_country_full.p', 'wb'))

In [None]:
# Load two previously pickled result dictionaries (judging by the file names:
# the per-country results and the pooled results).
# NOTE(review): these are absolute paths on a local volume, whereas other cells
# use the relative 'logs/' directory - confirm before running elsewhere.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
with open('/Volumes/ssd1/eurostat_grid/grids_100/logs/data_grids_country.p', 'rb') as f:
    data_dict_1 = pickle.load(f)

with open('/Volumes/ssd1/eurostat_grid/grids_100/logs/data_grids_country_full.p', 'rb') as f:
    data_dict_2 = pickle.load(f)

In [None]:
# combine both dictionaries; on duplicate keys the entry from data_dict_2 wins
data_dict = dict(data_dict_1)
data_dict.update(data_dict_2)
data_dict

In [None]:
# number of top-level keys in the combined dictionary
len(data_dict)

In [None]:
# flatten {country: {dou: {indicator: value}}} into one record per (country, dou)
records = []
for country_key, dou_dict in data_dict.items():
    for dou, vals in dou_dict.items():
        records.append({"CNTR_CODE": country_key, "dou": dou, **vals})

df = pd.json_normalize(records)

# leave out degree-of-urbanisation class 10
df = df[df['dou'] != 10]
df

In [None]:
# standardize (z-score) every population-weighted indicator on a copy of df
p_cols = ['street_walk_length', 'iso_area', 'num_street_intersections', 'pub_trans_count', 'ndvi', 'slope', 'ent_5']
pop_w_cols = [f'{col}_pop_w' for col in p_cols]

standardized_df = df.copy()
standardized_df[pop_w_cols] = df[pop_w_cols].apply(zscore)

standardized_df

In [None]:
# composite score: the standardized indicators are added in this order, slope with a negative sign
ordered_terms = [
    ('street_walk_length_pop_w', 1),
    ('iso_area_pop_w', 1),
    ('num_street_intersections_pop_w', 1),
    ('ndvi_pop_w', 1),
    ('ent_5_pop_w', 1),
    ('slope_pop_w', -1),
    ('pub_trans_count_pop_w', 1),
]
first_col, _ = ordered_terms[0]
composite = standardized_df[first_col]
for col_name, sign in ordered_terms[1:]:
    term = standardized_df[col_name] if sign > 0 else -standardized_df[col_name]
    composite = composite + term
standardized_df['walkability'] = composite

# min-max rescale the composite score to a 0-100 index
score_min = standardized_df['walkability'].min()
score_max = standardized_df['walkability'].max()
standardized_df['walk_index'] = ((standardized_df['walkability'] - score_min) / (score_max - score_min))*100

# deciles of the composite score, labelled 1..10
decile_labels = list(range(1, 11))
standardized_df['walk_decile'] = pd.qcut(standardized_df['walkability'], q=10, labels=decile_labels)
standardized_df

In [None]:
# standardized_df.to_csv('test.csv')

In [None]:
# # Plot the updated scatter plot with 'x' markers and dotted lines
# plt.figure(figsize=(12, 8))

# # Get unique DOU values and assign discrete colors
# unique_dou = df['dou'].unique()
# discrete_colors = plt.cm.get_cmap('tab10_r', len(unique_dou))

# # Draw lines connecting points with the same DOU
# for i, dou_value in enumerate(unique_dou):
#     dou_df = standardized_df[standardized_df['dou'] == dou_value]  # Adjust to the correct DataFrame
#     plt.plot(
#         dou_df['CNTR_CODE'],
#         dou_df['decile_class'],  # Replace 'iso_area_pop_w' with 'decile_class' if needed
#         marker='x',
#         linestyle='--',
#         linewidth=0.5,
#         color=discrete_colors(i),  # Map color from colormap
#         markersize=4,  # Increase marker size
#         markeredgewidth=1,  # Make marker edges bold
#         label=f"DOU {dou_value}"
#     )

# # Customize the plot
# plt.xlabel('CNTR_CODE')
# plt.ylabel('Walkability Decile')  # Replace with 'Walkability Decile' if needed
# # plt.title('Comparison of ISO_AREA_POP_W across Countries with DOU Connection Lines (Dotted)')
# plt.legend(title="Degree of Urbanization", bbox_to_anchor=(1.05, 1), loc='upper left')

# plt.grid(False)
# plt.gca().spines['top'].set_visible(False)
# plt.gca().spines['right'].set_visible(False)
# plt.tight_layout()

# plt.savefig("test_plot.svg", format="svg", bbox_inches="tight")
# # Display plot
# plt.show()


In [None]:
# # Create a strip-like visualization for each country with points inside the strips
# fig = plt.figure(figsize=(16, 8))

# # Get unique countries and DOU values
# unique_countries = standardized_df['CNTR_CODE'].unique()
# unique_dou = standardized_df['dou'].unique()
# discrete_colors = plt.cm.get_cmap('tab10_r', len(unique_dou))

# # Create a strip (subplot) for each country
# for i, cntr_code in enumerate(unique_countries):
#     ax = fig.add_subplot(1, len(unique_countries), i + 1)

#     # Add points for the decile_class inside the strip
#     country_df = standardized_df[standardized_df['CNTR_CODE'] == cntr_code]
#     for _, row in country_df.iterrows():        
#         ax.scatter(
#             0.5,  # Fixed x-coordinate for all points (center of the strip)
#             row['decile_class'],  # Y position corresponds to decile_class
#             color=discrete_colors(unique_dou.tolist().index(row['dou'])),
#             s=50,  # Marker size
#             alpha=0.8,
#             label=f"DOU {row['dou']}" if f"DOU {row['dou']}" not in ax.get_legend_handles_labels()[1] else ""  # Add legend only once per DOU
#         )
    
#     # Customize the strip
#     ax.set_xlim(0, 1)  # Limit x-axis to create a strip effect
#     ax.set_ylim(0.5, 10.5)  # Match y-axis range to decile_class
#     ax.set_xticks([])  # Remove x-axis ticks
#     ax.spines['top'].set_visible(True)  # Add the top spine
#     ax.spines['right'].set_visible(True)  # Add the right spine
#     ax.spines['bottom'].set_visible(True)  # Add the bottom spine
#     ax.spines['left'].set_visible(True)  # Add the left spine
#     ax.set_yticks(range(1, 11))  # Add y-axis ticks for decile_class
#     if i == 0:
#         ax.set_ylabel('Walkability Decile', fontsize=10)
#     else:
#         ax.set_yticklabels([])  # Remove y-axis labels for all but the first strip
#     ax.set_xlabel(cntr_code, fontsize=10, labelpad=10)  # Add country name at the bottom

# # Adjust the layout for better spacing
# plt.subplots_adjust(wspace=0.4)

# # Add a global legend
# handles, labels = ax.get_legend_handles_labels()
# fig.legend(
#     handles=handles,
#     labels=labels,
#     title="Degree of Urbanization",
#     bbox_to_anchor=(0.5, -0.1),
#     loc='lower center',
#     ncol=6
# )

# # Save and display the plot
# plt.savefig("strip_plot_countries_refined.svg", format="svg", bbox_inches="tight")
# plt.show()

#### plots

In [None]:
# data for latex tikz plot:
# emits one \drawGroupPoints{decile}{row}{n}{colors} command per (country, walkability decile) pair

# country codes to keep: the EU-27 members plus the code 'EU'
# NOTE(review): 'EU' is presumably an EU-wide aggregate row in the data — confirm
eu_27_countries = [
    "AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR", "DE", "EL",
    "HU", "IE", "IT", "LV", "LT", "LU", "MT", "NL", "PL", "PT", "RO", "SK",
    "SI", "ES", "SE", "EU"
]

# filter the dataset to keep only the codes listed above
df_eu27 = standardized_df[standardized_df['CNTR_CODE'].isin(eu_27_countries)]

# group by country and walk_decile and collect the DOU values of each group
# (every value is kept, duplicates included).
# 'walk_decile' is categorical: observed=True asks only for the combinations that
# actually occur, instead of relying on the deprecated implicit default and on
# dropna() to discard the unobserved ones.
grouped_data = (
    df_eu27
    .groupby(['CNTR_CODE', 'walk_decile'], observed=True)['dou']
    .apply(list)
    .reset_index()
    .dropna()
)

# create an ordered list of countries with 'EU' first
ordered_countries = sorted(grouped_data['CNTR_CODE'].unique(), key=lambda x: (x != 'EU', x))

# convert CNTR_CODE to a categorical with our custom ordering and re-sort;
# sorting on the decile as well makes the order of the emitted commands deterministic
grouped_data = (
    grouped_data
    .assign(CNTR_CODE=pd.Categorical(grouped_data['CNTR_CODE'], categories=ordered_countries, ordered=True))
    .sort_values(['CNTR_CODE', 'walk_decile'])
)

# now build the mapping for y-axis positions using the same ordering:
country_index_map = {country: i+1 for i, country in enumerate(ordered_countries)}

# check ordering by printing the unique values
print(", ".join(map(str, grouped_data.CNTR_CODE.unique())))

# mapping for Degree of Urbanization to LaTeX color names
dou_color_map = {
    11: "dou11",
    12: "dou12",
    13: "dou13",
    21: "dou21",
    22: "dou22",
    23: "dou23",
    30: "dou30",
}

# generate LaTeX commands
latex_commands = []
for country, decile, dou_list in zip(grouped_data['CNTR_CODE'], grouped_data['walk_decile'], grouped_data['dou']):
    # convert DOU values to LaTeX color names; values without a color are skipped
    dou_colors = [dou_color_map[dou] for dou in dou_list if dou in dou_color_map]

    # get country row index from our mapping
    y_pos = country_index_map[country]

    # generate LaTeX command if there are any valid colors
    if dou_colors:
        color_str = ",".join(dou_colors)
        latex_commands.append(f"\\drawGroupPoints{{{decile}}}{{{y_pos}}}{{{len(dou_colors)}}}{{{color_str}}}")

# display the generated LaTeX code
latex_code = "\n".join(latex_commands)
print(latex_code)

### city analysis

In [None]:
# local administrative units (LAU)
lau_gdf = gpd.read_file('data/LAU_RG_01M_2023_3035.geojson')

# clip by osm eu bbox (reprojected to EPSG:3035 before the intersection test)
# NOTE(review): the NUTS 3 section reads this shapefile from 'data/osm_eu_bbox.shp' — confirm which path is current
# NOTE: this rebinds `eu_gdf`, which the NUTS 3 section also assigns
osm_eu_gdf = gpd.read_file('data/osm_europe/osm_eu_bbox.shp')
osm_eu_gdf = osm_eu_gdf.to_crs('epsg:3035')
bbox_geometry = osm_eu_gdf.geometry.values[0]
eu_gdf = lau_gdf[lau_gdf.intersects(bbox_geometry)]

eu_gdf

In [None]:
# the 20 units with the highest POP_2023 value
lau_by_population = eu_gdf.sort_values(by='POP_2023', ascending=False)
top_20_units = lau_by_population.head(20)
top_20_units

In [None]:
# top_20_units.to_parquet('top_20_cities.parquet')

In [None]:
# read the per-LAU grid file of every selected unit and tag its rows with the unit's id and name
grid_gdfs = []

for gisco_id, lau_name in zip(top_20_units['GISCO_ID'], top_20_units['LAU_NAME']):
    grid_gdf = gpd.read_parquet(f'data/walkability/grids_lau/grids_{gisco_id}.parquet')
    grid_gdfs.append(grid_gdf.assign(GISCO_ID=gisco_id, LAU_Name=lau_name))

# stack all units into one frame with a fresh 0..n-1 index
t_20_grids_gdf = gpd.GeoDataFrame(pd.concat(grid_gdfs, ignore_index=True))
t_20_grids_gdf

In [None]:
# zscore and build index
columns_to_zscore = [
    'street_walk_length',
    'iso_area',
    'num_street_intersections',
    'pub_trans_count',
    'ndvi',
    'slope',
    'ent_5',
]

# each column is standardized over the rows of all loaded units together; the raw values are overwritten
zscored_columns = t_20_grids_gdf[columns_to_zscore].apply(zscore)
t_20_grids_gdf[columns_to_zscore] = zscored_columns
t_20_grids_gdf

In [None]:
# signed components of the composite score, summed in this order:
# slope counts against walkability, every other component counts for it
walk_components = [
    ('street_walk_length', 1),
    ('iso_area', 1),
    ('num_street_intersections', 1),
    ('ndvi', 1),
    ('ent_5', 1),
    ('slope', -1),
    ('pub_trans_count', 1),
]
t_20_grids_gdf['walkability'] = sum(sign * t_20_grids_gdf[name] for name, sign in walk_components)

# min-max rescale the composite score to a 0-100 index
walk_min = t_20_grids_gdf['walkability'].min()
walk_max = t_20_grids_gdf['walkability'].max()
t_20_grids_gdf['walk_index'] = ((t_20_grids_gdf['walkability'] - walk_min) / (walk_max - walk_min)) * 100
t_20_grids_gdf

In [None]:
# decile class of the composite score over all loaded grid cells (1 = lowest tenth, 10 = highest tenth)
decile_labels = list(range(1, 11))
t_20_grids_gdf['walk_decile'] = pd.qcut(t_20_grids_gdf['walkability'], q=10, labels=decile_labels)
t_20_grids_gdf

In [None]:
# t_20_grids_gdf.to_parquet('test_t_20_units.parquet')

In [None]:
# Reload the gridded results from the parquet cache when it exists.
# The cell that writes this file is commented out, so on a fresh run the file may be
# missing; in that case keep the frame built by the previous cells instead of failing.
t_20_cache_path = 'test_t_20_units.parquet'
if os.path.exists(t_20_cache_path):
    t_20_grids_gdf = gpd.read_parquet(t_20_cache_path)
else:
    print(f"{t_20_cache_path} not found; using the in-memory t_20_grids_gdf")
print(t_20_grids_gdf)

In [None]:
# Moran's I of walk_index computed separately for every GISCO_ID,
# with Queen contiguity between that unit's grid cells as spatial weights
grouped = t_20_grids_gdf.groupby('GISCO_ID')

# one record per successfully processed unit
results = []

for group_id, group_gdf in grouped:
    try:
        # keep only rows that have a geometry, then only rows whose geometry is valid
        group_gdf = group_gdf[group_gdf.geometry.notnull()]
        group_gdf = group_gdf[group_gdf.is_valid]

        # fewer than two usable cells: nothing to correlate, move on
        if len(group_gdf) < 2:
            print(f"Skipping group {group_id}: too few observations")
            continue

        # spatial weights matrix (Queen contiguity)
        w = Queen.from_dataframe(group_gdf)

        # variable of interest, passed on as-is (no extra standardization is applied here)
        y = group_gdf['walk_index'].values
        # Moran's I for this unit
        mi = Moran(y, w)

        results.append(dict(
            gisco_id=group_id,
            lau_name=group_gdf.LAU_Name.unique()[0],
            moran_I=mi.I,
            expected_I=mi.EI,
            p_value=mi.p_norm,
            z_score=mi.z_norm,
        ))

    except Exception as e:
        # report the failure and continue with the next unit
        print(f"Error processing group {group_id}: {e}")

In [None]:
# display the list of Moran's I records (one dict per processed GISCO_ID)
results

### city analysis population

In [None]:
# Load the top-20 units from the parquet file created in 'city analysis' when it exists.
# The cell that writes this file is commented out, so on a fresh run the file may be
# missing; in that case reuse the `top_20_units` frame built in that section.
top_20_cache_path = 'top_20_cities.parquet'
if os.path.exists(top_20_cache_path):
    top_20_gdf = gpd.read_parquet(top_20_cache_path)
else:
    print(f"{top_20_cache_path} not found; using the in-memory top_20_units")
    top_20_gdf = top_20_units
print(top_20_gdf.LAU_NAME.unique())

In [None]:
# read the per-LAU grid file of every selected unit and tag its rows with the unit's id and name
grid_gdfs = []

for gisco_id, lau_name in zip(top_20_gdf['GISCO_ID'], top_20_gdf['LAU_NAME']):
    grid_gdf = gpd.read_parquet(f'data/walkability/grids_lau/grids_{gisco_id}.parquet')
    grid_gdfs.append(grid_gdf.assign(GISCO_ID=gisco_id, LAU_NAME=lau_name))

# stack all units into one frame with a fresh 0..n-1 index
t_20_grids_gdf = gpd.GeoDataFrame(pd.concat(grid_gdfs, ignore_index=True))
t_20_grids_gdf

In [None]:
# zscore and build index
columns_to_zscore = [
    'street_walk_length',
    'iso_area',
    'num_street_intersections',
    'pub_trans_count',
    'ndvi',
    'slope',
    'ent_5',
]

# each column is standardized over the rows of all loaded units together; the raw values are overwritten
zscored_columns = t_20_grids_gdf[columns_to_zscore].apply(zscore)
t_20_grids_gdf[columns_to_zscore] = zscored_columns

# signed components of the composite score, summed in this order:
# slope counts against walkability, every other component counts for it
walk_components = [
    ('street_walk_length', 1),
    ('iso_area', 1),
    ('num_street_intersections', 1),
    ('ndvi', 1),
    ('ent_5', 1),
    ('slope', -1),
    ('pub_trans_count', 1),
]
t_20_grids_gdf['walkability'] = sum(sign * t_20_grids_gdf[name] for name, sign in walk_components)

# min-max rescale the composite score to a 0-100 index
walk_min = t_20_grids_gdf['walkability'].min()
walk_max = t_20_grids_gdf['walkability'].max()
t_20_grids_gdf['walk_index'] = ((t_20_grids_gdf['walkability'] - walk_min) / (walk_max - walk_min)) * 100

# decile class of the composite score (1 = lowest tenth, 10 = highest tenth)
decile_labels = list(range(1, 11))
t_20_grids_gdf['walk_decile'] = pd.qcut(t_20_grids_gdf['walkability'], q=10, labels=decile_labels)
t_20_grids_gdf

In [None]:
bin_edges = np.arange(0, 101, 2)  # bins: 0-2, 2-4, ..., 98-100
bin_labels = (bin_edges[:-1] + bin_edges[1:]) / 2  # bin midpoints; they become the keys of each city's dict

# group data and calculate population percentages
g_t_20_grids_gdf = t_20_grids_gdf.groupby('LAU_NAME')
results_dict = {}

for lau_name, gdf in g_t_20_grids_gdf:
    walk_index = gdf['walk_index'].to_numpy()

    # assign each walk_index to a right-closed bin (lower, upper], as a 0-based position
    bin_idx = np.digitize(walk_index, bins=bin_edges, right=True) - 1
    # a value exactly on the lowest edge (walk_index == 0, the minimum of the min-max
    # scaled index) falls before the first right-closed bin and would be dropped together
    # with its population; count it in the first bin instead
    bin_idx[walk_index == bin_edges[0]] = 0

    # translate positions to bin midpoints; anything outside the edges (or NaN) stays NaN
    in_range = (bin_idx >= 0) & (bin_idx < len(bin_labels))
    midpoints = np.full(len(walk_index), np.nan)
    midpoints[in_range] = bin_labels[bin_idx[in_range]]
    # kept as a separate Series so no column is written onto the groupby slice
    walk_index_bin = pd.Series(midpoints, index=gdf.index, name='walk_index_bin')

    # group by binned walk_index and compute population percentages (NaN bins are ignored)
    pop_by_bin = gdf['population_full'].groupby(walk_index_bin).sum()
    total_population = pop_by_bin.sum()
    pop_percentage = (pop_by_bin / total_population) * 100

    # {bin midpoint: % of the unit's population}, keyed by LAU name
    results_dict[lau_name] = pop_percentage.to_dict()

In [None]:
# display the per-LAU mapping {walk_index bin midpoint: % of population}
results_dict

In [None]:
# define a function to translate the dictionary keys
def translate_dict_keys(data_dict, translation_map=None):
    """Return a copy of ``data_dict`` whose keys are replaced by display names.

    Parameters
    ----------
    data_dict : dict
        Mapping keyed by LAU name; the values are passed through untouched.
    translation_map : dict, optional
        Mapping from original key to replacement key. When omitted, the built-in
        table of LAU names to English city names below is used.

    Returns
    -------
    dict
        New dictionary in the same order as ``data_dict``. Keys without an entry
        in the translation table are kept unchanged; ``data_dict`` is not modified.
    """
    if translation_map is None:
        translation_map = {
            "Berlin, Stadt": "Berlin",
            "Frankfurt am Main, Stadt": "Frankfurt",
            "Grad Zagreb": "Zagreb",
            "Hamburg, Freie und Hansestadt": "Hamburg",
            "Kraków": "Krakow",
            "Köln, Stadt": "Cologne",
            "Municipiul Bucureşti": "Bucharest",
            "München, Landeshauptstadt": "Munich",
            "Praha": "Prague",
            "València": "Valencia",
            "Warszawa": "Warsaw",
            "Столична": "Sofia"
        }

    # translate dictionary keys, falling back to the original key when it has no translation
    translated_dict = {translation_map.get(city, city): values for city, values in data_dict.items()}
    return translated_dict

# re-key the per-LAU bin percentages with the translated city names and display them
translated_results_dict = translate_dict_keys(results_dict)
translated_results_dict

In [None]:
# the poster shows only 10 cities;
# extend this list to plot the curves of all 20 cities
poster_cities = [
    'Amsterdam', 'Barcelona', 'Berlin', 'Marseille', 'Munich',
    'Oslo', 'Paris', 'Prague', 'Valencia', 'Warsaw',
]

# keep the poster cities that have results, in the order listed above
poster_dict = {}
for city in poster_cities:
    if city in translated_results_dict:
        poster_dict[city] = translated_results_dict[city]

In [None]:
# Cumulative share of population by walkability index, one smoothed curve per city.
# NOTE: despite the variable name, `gisco_ids` holds the keys of poster_dict, which are city names.
gisco_ids = list(poster_dict.keys())
colors = plt.cm.tab10(np.linspace(0, 1, len(gisco_ids)))  # one color per city, sampled from tab10
# colors = plt.cm.tab20(np.linspace(0, 1, len(gisco_ids)))  # Unique color per GISCO ID -> 20 cities

plt.figure(figsize=(6, 10))

# define smoother x-values (continuous walk_index from 0 to 100)
smooth_x = np.linspace(0, 100, 500)  # 500 points for smoothness

# loop through each city and plot its smooth curve
for i, gisco_id in enumerate(gisco_ids):
    # keys are the walk_index bin midpoints, values the population percentage of each bin
    walk_index_bins = np.array(list(poster_dict[gisco_id].keys()))
    pop_counts = np.array(list(poster_dict[gisco_id].values()))

    # sort by walk_index (x-axis)
    sorted_indices = np.argsort(walk_index_bins)
    sorted_walk_index = walk_index_bins[sorted_indices]
    sorted_pop = pop_counts[sorted_indices]

    # compute cumulative population percentage (y-axis)
    cumulative_population = np.cumsum(sorted_pop)
    cumulative_population = (cumulative_population / cumulative_population[-1]) * 100  # normalize to 0-100%

    # cubic interpolation through the cumulative points; x-values outside the data range
    # get 0 below the first bin and 100 above the last one instead of raising
    interp_func = interp1d(sorted_walk_index, cumulative_population, kind='cubic', bounds_error=False, fill_value=(0, 100))

    # get smooth y-values and prevent values from going below 0 or above 100
    smooth_y = np.clip(interp_func(smooth_x), 0, 100)

    # plot the smooth curve; the filled area is drawn with alpha=0, i.e. fully transparent
    plt.fill_between(smooth_x, smooth_y, alpha=0, color=colors[i])
    plt.plot(smooth_x, smooth_y, label=gisco_id, color=colors[i], linewidth=2)

# customize plot
plt.yticks(np.arange(0, 110, 10))
plt.xticks(np.arange(0, 110, 10))

plt.xlabel("Walkability Index")
plt.ylabel("Cumulative Population %")
plt.ylim(0, 100)  # Explicitly enforce Y-axis limits
plt.xlim(30, 100)  # only index values from 30 upwards are shown
plt.legend(loc="upper left")
# plt.grid(True, linestyle="--", alpha=0.5)

# the 'plots_paper' directory must already exist for the save to succeed
plt.savefig("plots_paper/curve_poster.svg", format="svg", bbox_inches='tight')
plt.show()

In [None]:
# group data and calculate population percentages per walkability decile
g_t_20_grids_gdf = t_20_grids_gdf.groupby('LAU_NAME')
n_results_dict = {}

for lau_name, gdf in g_t_20_grids_gdf:
    # group by walk_decile and sum the population of each decile.
    # 'walk_decile' is categorical: observed=False is stated explicitly so that every
    # decile 1..10 is a key of the result (0 when a city has no cell in that decile),
    # independent of the pandas default; the bar-chart cell indexes deciles 1..5 directly.
    pop_decile = gdf.groupby('walk_decile', observed=False)['population_full'].sum()
    total_population = pop_decile.sum()
    pop_percentage = (pop_decile / total_population) * 100

    # store {decile: % of the unit's population}, keyed by LAU name
    n_results_dict[lau_name] = pop_percentage.to_dict()

In [None]:
# re-key the per-LAU decile percentages with the translated city names and display them
translated_n_results_dict = translate_dict_keys(n_results_dict)
translated_n_results_dict

In [None]:
# share of each city's population living in walkability deciles 1 to 5 (i.e. below the 6th decile)
city_names = list(translated_n_results_dict)
total_below_6th_decile = [
    sum(deciles[d] for d in range(1, 6))
    for deciles in translated_n_results_dict.values()
]

# order the cities by that share, smallest first
sorted_indices = sorted(range(len(total_below_6th_decile)), key=total_below_6th_decile.__getitem__)
sorted_city_names = [city_names[i] for i in sorted_indices]
sorted_totals = [total_below_6th_decile[i] for i in sorted_indices]

# horizontal bar chart, one bar per city
fig, ax = plt.subplots(figsize=(6, 10))

bars = ax.barh(sorted_city_names, sorted_totals, color='orange', edgecolor='black', height=0.5)

# write the percentage just right of each bar's end, vertically centred on the bar
max_value = max(sorted_totals)
label_offset = max_value * 0.02
for bar, value in zip(bars, sorted_totals):
    label_x = bar.get_width() + label_offset
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(label_x, label_y, f"{value:.1f}%", va='center')

# axis styling; the x-range leaves room right of the longest bar for its label
ax.set_xlim(0, max_value * 1.13)
ax.set_xlabel(r"% Pop <6th decile")
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')

plt.savefig("plots_paper/barh.svg", format="svg", bbox_inches='tight')
plt.show()