In [None]:
# Import required libraries
import geopandas as gpd
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import json
from shapely import wkt
from shapely.ops import unary_union
from sklearn.neighbors import NearestNeighbors

## Load UK Postal Code Boundaries (GeoJSON)

In [None]:
# Load the UK postal code boundaries from the GeoJSON file
shapefile_path = 'uk.geojson'
gdf = gpd.read_file(shapefile_path)

# Quick summary of what was loaded: row count, CRS and available columns
n_postal_areas = len(gdf)
column_names = gdf.columns.tolist()
print(f"Loaded {n_postal_areas} postal code areas")
print(f"CRS: {gdf.crs}")
print(f"\nColumns: {column_names}")

In [None]:
# In the source data the postal code lives in a column called 'name'. The commuting
# zone table also has a 'name' column, so rename this one up front to keep the
# two apart once the tables are spatially joined.
pc_column = 'postcode'
gdf = gdf.rename(columns={'name': pc_column})

sample_postcodes = gdf[pc_column].head().tolist()
print(f"Postal code column: {pc_column}")
print(f"\nSample postal codes: {sample_postcodes}")

## Load Meta Commuting Zones for United Kingdom

In [None]:
# Read the full Meta commuting zone table (all countries)
meta_cz = pd.read_csv('../meta_commuting_zones.csv')

print(f"Total commuting zones loaded: {len(meta_cz)}")

# Keep only the UK rows; copy so a geometry column can be added without touching meta_cz
is_uk = meta_cz['country'] == 'United Kingdom'
meta_cz_uk = meta_cz[is_uk].copy()

# Parse the WKT text in 'geography' into shapely geometries
meta_cz_uk['geometry'] = meta_cz_uk['geography'].apply(wkt.loads)

# A zone name can span several rows. Collapse each name into a single record:
# geometries are unioned, population and area are summed, and the zone id is
# taken from the first row of the group.
consolidated = [
    {
        'fbcz_id': zone_rows['fbcz_id'].iloc[0],
        'name': zone_name,
        'win_population': zone_rows['win_population'].sum(),
        'area': zone_rows['area'].sum(),
        'geometry': unary_union(zone_rows['geometry'].tolist()),
    }
    for zone_name, zone_rows in meta_cz_uk.groupby('name')
]

meta_cz_gdf = gpd.GeoDataFrame(consolidated, geometry='geometry', crs='EPSG:4326')

geometry_type_counts = meta_cz_gdf.geometry.geom_type.value_counts()
print(f"Created GeoDataFrame with {len(meta_cz_gdf)} commuting zones")
print(f"CRS: {meta_cz_gdf.crs}")
print(f"\nGeometry types:")
print(geometry_type_counts)

In [None]:
# Plot the individual (pre-merge) commuting zone shapes.
# Each zone may have multiple disjoint polygons before unary_union consolidation.
meta_cz_uk_gdf = gpd.GeoDataFrame(meta_cz_uk, geometry='geometry', crs='EPSG:4326')

# Serialize only the geometry column: the map matches features to rows by feature id
# (the frame index), and hover values come from the data frame itself. This avoids
# embedding every attribute - including the raw WKT 'geography' text - in the GeoJSON.
pre_merge_geojson = json.loads(meta_cz_uk_gdf.geometry.to_json())

fig = px.choropleth_map(
    meta_cz_uk_gdf,
    geojson=pre_merge_geojson,
    locations=meta_cz_uk_gdf.index,
    color='name',
    hover_name='name',
    hover_data={'fbcz_id': True, 'win_population': ':,.0f', 'area': ':,.2f'},
    # Superscript two is written as an escape so the label cannot be corrupted by a
    # file-encoding round trip (it previously rendered as "kmÂ²").
    labels={'win_population': 'Population', 'area': 'Area (km\u00b2)', 'fbcz_id': 'Zone ID'},
    map_style="carto-positron",
    center={"lat": 54.5, "lon": -3.5},
    zoom=4.75,
    opacity=0.6,
    title=f"UK Commuting Zones - Pre-Merge ({len(meta_cz_uk_gdf)} individual shapes)"
)

fig.update_layout(
    height=600,
    width=1000,
    margin={"r": 0, "t": 50, "l": 0, "b": 0},
    # One legend entry per zone name would swamp the figure
    showlegend=False
)

fig.show()

# Compare shape counts before and after the per-name consolidation
print(f"Pre-merge shape count: {len(meta_cz_uk_gdf)}")
print(f"Post-merge zone count: {len(meta_cz_gdf)}")
print(f"Zones with multiple shapes: {(meta_cz_uk_gdf.groupby('name').size() > 1).sum()}")

## Calculate Postal Code Centroids

In [None]:
# Centroids are computed in British National Grid (EPSG:27700), a projected CRS
# in meters, rather than directly on lat/lon coordinates.
gdf_projected = gdf.to_crs(epsg=27700)
projected_centroids = gdf_projected.geometry.centroid
gdf_projected['centroid'] = projected_centroids

# WGS84 copy of the postal code areas, used by the later mapping and join cells
gdf_wgs84 = gdf.to_crs(epsg=4326)

# Bring the projected centroids back to WGS84 (lat/lon) and attach them
gdf_wgs84['centroid'] = projected_centroids.to_crs(epsg=4326)

print(f"Calculated centroids for {len(gdf_wgs84)} postal codes")
print(f"\nSample centroids:")
print(gdf_wgs84[[pc_column, 'centroid']].head())

In [None]:
# Visual check: draw the first ten postal code areas together with their centroids
sample_gdf = gdf_wgs84.head(10).copy()
sample_centroids = sample_gdf['centroid']

# Center the map on the mean centroid position of the sample
center_lat = sample_centroids.y.mean()
center_lon = sample_centroids.x.mean()

# Only the polygon geometry is serialized for the choropleth layer
sample_geojson = json.loads(sample_gdf.geometry.to_json())

fig = px.choropleth_map(
    sample_gdf,
    geojson=sample_geojson,
    locations=sample_gdf.index,
    color=pc_column,
    hover_name=pc_column,
    map_style="carto-positron",
    zoom=8,
    center={"lat": center_lat, "lon": center_lon},
    opacity=0.5,
    title="Sample Postal Codes with Centroids"
)

# Overlay the centroids as red markers labelled with their postal code
centroid_markers = go.Scattermap(
    lat=sample_centroids.y,
    lon=sample_centroids.x,
    mode='markers',
    marker=dict(size=8, color='red'),
    name='Centroids',
    text=sample_gdf[pc_column],
    hoverinfo='text'
)
fig.add_trace(centroid_markers)

fig.update_layout(margin=dict(r=0, t=40, l=0, b=0))
fig.show()

## Spatial Join: Assign Postal Codes to Commuting Zones

In [None]:
# Build a point layer holding one centroid per postal code (WGS84, same CRS as the zones)
centroids_gdf = gpd.GeoDataFrame(
    gdf_wgs84[[pc_column]],
    geometry=gdf_wgs84['centroid'],
    crs='EPSG:4326'
)

# Spatial join to find which commuting zone each centroid falls into.
# how='left' keeps centroids with no containing zone (their zone columns are NaN).
pc_to_cz = gpd.sjoin(
    centroids_gdf,
    meta_cz_gdf[['fbcz_id', 'name', 'geometry']],
    how='left',
    predicate='within'
)

# A left sjoin emits one row per (centroid, zone) match, so a centroid lying inside
# two overlapping zone polygons would appear twice. Keep a single row per centroid
# (the first match) so every postal code maps to exactly one zone and the
# percentages below are computed over postal codes rather than over matches.
pc_to_cz = pc_to_cz[~pc_to_cz.index.duplicated(keep='first')]

# Count matches
matched_count = pc_to_cz['fbcz_id'].notna().sum()
unmatched_count = pc_to_cz['fbcz_id'].isna().sum()

print(f"Spatial join results:")
print(f"  Matched postal codes: {matched_count} ({matched_count/len(pc_to_cz)*100:.1f}%)")
print(f"  Unmatched postal codes: {unmatched_count} ({unmatched_count/len(pc_to_cz)*100:.1f}%)")

## Nearest Neighbor Assignment for Unmatched Postal Codes

In [None]:
# For postal codes whose centroid does not fall within any commuting zone,
# assign the commuting zone whose centroid is nearest (great-circle distance).

if unmatched_count > 0:
    print(f"Processing {unmatched_count} unmatched postal codes...\n")

    unmatched_mask = pc_to_cz['fbcz_id'].isna()
    unmatched_indices = pc_to_cz[unmatched_mask].index
    unmatched_points = centroids_gdf.geometry.loc[unmatched_indices]

    # scikit-learn's haversine metric expects each point as [latitude, longitude]
    # in radians, i.e. (y, x) - not the (x, y) order used by shapely points.
    unmatched_coords = np.column_stack([unmatched_points.y, unmatched_points.x])

    # Compute zone centroids in British National Grid (as done for the postal codes)
    # and convert them back to WGS84, instead of taking centroids on lat/lon degrees.
    cz_centroids = meta_cz_gdf.geometry.to_crs(epsg=27700).centroid.to_crs(epsg=4326)
    cz_coords = np.column_stack([cz_centroids.y, cz_centroids.x])

    # Fit nearest neighbor model using haversine distance (great circle)
    nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree', metric='haversine')
    nn.fit(np.radians(cz_coords))  # Convert to radians for haversine

    # Find nearest commuting zone for each unmatched postal code
    distances, indices = nn.kneighbors(np.radians(unmatched_coords))

    # indices holds positions into meta_cz_gdf; translate them to index labels and
    # fill all unmatched rows at once. The boolean mask selects rows in the same
    # order as unmatched_indices, so the value arrays line up positionally.
    nearest_cz_labels = meta_cz_gdf.index[indices[:, 0]]
    nearest_cz = meta_cz_gdf.loc[nearest_cz_labels]
    pc_to_cz.loc[unmatched_mask, 'fbcz_id'] = nearest_cz['fbcz_id'].to_numpy()
    pc_to_cz.loc[unmatched_mask, 'name'] = nearest_cz['name'].to_numpy()
    pc_to_cz.loc[unmatched_mask, 'index_right'] = nearest_cz_labels.to_numpy()

    # Convert distances from radians to kilometers (Earth radius ~ 6371 km)
    distances_km = distances * 6371

    print(f"Assigned {len(unmatched_indices)} postal codes using nearest neighbor")
    print(f"\nDistance statistics (km):")
    print(f"  Mean: {distances_km.mean():.2f} km")
    print(f"  Median: {np.median(distances_km):.2f} km")
    print(f"  Max: {distances_km.max():.2f} km")
    print(f"  Min: {distances_km.min():.2f} km")
else:
    print("All postal codes were successfully matched directly!")

# Report the coverage actually achieved rather than assuming it is complete
final_matched = pc_to_cz['fbcz_id'].notna().sum()
print(f"\nTotal postal codes assigned: {final_matched} / {len(pc_to_cz)} ({final_matched/len(pc_to_cz)*100:.1f}%)")

## Distribution Analysis

In [None]:
# Number of postal codes assigned to each commuting zone name, largest first
cz_distribution = (
    pc_to_cz
    .groupby('name')
    .size()
    .sort_values(ascending=False)
)

print("Postal Codes per Commuting Zone\n" + "="*50)
print(f"\nTop 15 commuting zones by postal code count:")
print(cz_distribution.head(15))
print(f"\nTotal commuting zones with postal codes: {len(cz_distribution)}")

# Summary statistics over the per-zone counts
mean_per_zone = cz_distribution.mean()
median_per_zone = cz_distribution.median()
largest_zone = cz_distribution.max()
smallest_zone = cz_distribution.min()
print(f"\nStatistics:")
print(f"  Mean: {mean_per_zone:.1f} postal codes per zone")
print(f"  Median: {median_per_zone:.1f} postal codes per zone")
print(f"  Max: {largest_zone} postal codes")
print(f"  Min: {smallest_zone} postal codes")

## Prepare Data for Visualization

In [None]:
# Count postal codes per zone id
cz_counts = (
    pc_to_cz
    .groupby('fbcz_id')
    .size()
    .reset_index(name='postal_code_count')
)

# Attach the counts to the zone geometries; zones without any postal code get 0
meta_cz_viz = meta_cz_gdf.merge(cz_counts, on='fbcz_id', how='left')
filled_counts = meta_cz_viz['postal_code_count'].fillna(0)
meta_cz_viz['postal_code_count'] = filled_counts.astype(int)

zones_with_codes = (meta_cz_viz['postal_code_count'] > 0).sum()
print(f"Prepared {len(meta_cz_viz)} commuting zones for visualization")
print(f"\nCommuting zones with postal codes: {zones_with_codes}")
print(f"\nGeometry types in visualization data:")
print(meta_cz_viz.geometry.geom_type.value_counts())

## Interactive Plotly Map

This map shows UK Meta Commuting Zones colored by the number of postal codes assigned to each zone. The map handles both Polygon and MultiPolygon geometries.

In [None]:
# Interactive choropleth of the consolidated zones, colored by postal code count
cz_geojson = json.loads(meta_cz_viz.to_json())

# Hover formatting per column and the human-readable label shown for each
hover_columns = {
    'fbcz_id': True,
    'postal_code_count': ':,',
    'win_population': ':,.0f',
    'area': ':,.2f'
}
display_labels = {
    'postal_code_count': 'Postal Codes',
    'win_population': 'Population',
    'area': 'Area (km\u00b2)',
    'fbcz_id': 'Zone ID'
}

fig = px.choropleth_map(
    meta_cz_viz,
    geojson=cz_geojson,
    locations=meta_cz_viz.index,
    color='postal_code_count',
    hover_name='name',
    hover_data=hover_columns,
    labels=display_labels,
    map_style="carto-positron",
    center={"lat": 54.5, "lon": -3.5},
    zoom=4.75,
    opacity=0.65,
    color_continuous_scale="Viridis",
    title="UK Meta Commuting Zones - Postal Code Coverage"
)

fig.update_layout(height=600, width=1000, margin=dict(r=0, t=50, l=0, b=0))

fig.show()

# Report how many zones were drawn and how they split by geometry type
geometry_kinds = meta_cz_viz.geometry.geom_type
polygon_count = (geometry_kinds == 'Polygon').sum()
multipolygon_count = (geometry_kinds == 'MultiPolygon').sum()
print(f"\nMap created successfully")
print(f"  Zones displayed: {len(meta_cz_viz)}")
print(f"  Geometry types: Polygon ({polygon_count}), "
      f"MultiPolygon ({multipolygon_count})")

## Export Results

In [None]:
# Number the distinct zones 1..N in ascending fbcz_id order
unique_zones = (
    pc_to_cz[['fbcz_id', 'name']]
    .drop_duplicates()
    .sort_values('fbcz_id')
    .reset_index(drop=True)
)
unique_zones['NumericID'] = range(1, len(unique_zones) + 1)

# Build the postal code -> commuting zone table and attach the numeric id
zone_id_lookup = unique_zones[['fbcz_id', 'NumericID']]
output_df = pc_to_cz[[pc_column, 'fbcz_id', 'name']].copy()
output_df = output_df.merge(zone_id_lookup, on='fbcz_id', how='left')

# Rename to the export headers, then put the columns in their final order
output_df = output_df.rename(columns={
    pc_column: 'PostCode',
    'fbcz_id': 'CommutingZoneID',
    'name': 'CommutingZoneName',
})
export_columns = ['PostCode', 'NumericID', 'CommutingZoneID', 'CommutingZoneName']
output_df = output_df[export_columns]

output_file = 'uk_postal_code_to_commuting_zone.csv'
output_df.to_csv(output_file, index=False)

print(f"Exported mapping to: {output_file}")
print(f"  Total mappings: {len(output_df):,}")
print(f"  Unique commuting zones: {len(unique_zones)}")
print(f"\nSample mappings:")
output_df.head(10)

In [None]:
# Attach each zone's NumericID to the visualization frame before exporting
numeric_id_lookup = unique_zones[['fbcz_id', 'NumericID']]
meta_cz_viz_export = meta_cz_viz.merge(numeric_id_lookup, on='fbcz_id', how='left')

# Write the zones (geometry, postal code counts, numeric ids) as GeoJSON
output_geojson = 'uk_commuting_zones.geojson'
meta_cz_viz_export.to_file(output_geojson, driver='GeoJSON')

exported_zone_count = len(meta_cz_viz_export)
print(f"Exported commuting zones to: {output_geojson}")
print(f"  Includes {exported_zone_count} zones with numeric IDs and postal code counts")