# Hard-lockdown scenario

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib
import seaborn as sns
import numpy as np
import os
from tqdm import tqdm
from dateutil.relativedelta import relativedelta
from matplotlib.ticker import FuncFormatter
from matplotlib.lines import Line2D
from shapely.geometry import Point, LineString
from utils import *

matplotlib.rcParams['figure.dpi'] = 70

## Total infections (bar plots)

In [None]:
# Two colours used to tell visitors and residents apart
colors = ['#FF2C00', '#0C5DA5']

# 2020 population of each district of interest, keyed by district name.
# Alternative pair kept for reference:
#   'Wijk 44 Leidschenveen': 20_896 (44), 'Wijk 29 Schildersbuurt': 31_669 (29)
population_by_district = {
    'Wijk 42 Ypenburg': 21_643,  # 42
    'Wijk 28 Centrum': 26_833,   # 28
}

# District geometries
districts = gpd.read_file('../data/processed/cbs/wijk_buurt_kaart/districts.json')

# Scenario whose results folder is analysed
scenario = 'lockdown'

# Collect every infectedPersons.csv of this experimental setup and report how many were found
file_paths = find_files(f'../results/{scenario}', 'infectedPersons.csv')
print(f"Number of experiments: {len(file_paths)}")

# Number of runs to load
n_runs = 10

# Each experiment is named after the folder that contains its result file
experiment_dirs = map(os.path.dirname, file_paths[:n_runs])
experiment_names = list(map(os.path.basename, experiment_dirs))

In [None]:
# Store start and end dates to find the smallest and largest
start_end_dates = {}

print('Loading experiment results...')
for experiment_name, file_path in tqdm(zip(experiment_names[:n_runs], file_paths[:n_runs]), total=len(file_paths[:n_runs])):
    # Load experiment results
    results = pd.read_csv(file_path)

    # Add date_time column
    results['date_time'] = results['Time(h)'].apply(lambda x: relativedelta(
        years=50, months=2) + datetime.datetime(*time.gmtime(x * 3600)[:6]))

    # Get the start and end dates of the experiment
    experiment_start = results['date_time'].min().date()
    experiment_end = results['date_time'].max().date()

    # Store start and end dates
    start_end_dates[experiment_name] = (
        experiment_start, experiment_end)
    
# Find the smallest and largest start and end dates
min_start = min(start for start, _ in start_end_dates.values())
max_end = max(end for _, end in start_end_dates.values())

print(f"Smallest start date: {min_start}")
print(f"Largest end date: {max_end}")

# Define complete dates
complete_dates = pd.date_range(start=min_start, end=max_end, freq='d')
complete_dates[:5]

In [None]:
# Report the simulated period of every experiment and how many days it spans
for experiment_name, period in start_end_dates.items():
    start, end = period
    n_days = (end - start).days
    print(f"{experiment_name}: {start} - {end}")
    print(f"Number of days in simulation: {n_days}")

In [None]:
def _daily_counts(frame, group_keys, dates):
    """Count infections per day (and per group), aligned on a common calendar.

    Parameters
    ----------
    frame : DataFrame with a datetime ``date_time`` column and an ``infection``
        column holding 1 for every row.
    group_keys : list of column names to break the counts down by; may be empty.
    dates : DatetimeIndex of all days that must be present in the result.

    Returns
    -------
    Series of counts. With no group keys it is indexed by ``dates``; otherwise
    by ``(*group_keys, date)`` with every observed key combination carrying all
    of ``dates``. Days without infections hold 0.
    """
    day = frame['date_time'].dt.date

    if not group_keys:
        return frame.groupby(day)['infection'].sum().reindex(dates).fillna(0)

    counts = frame.groupby([*group_keys, day])['infection'].sum()

    # Move the date level (always the last one) to the columns so the calendar
    # can be completed for every key combination at once
    wide = counts.unstack(level=-1).reindex(columns=dates).fillna(0)

    # Back to long format: index is (*group_keys, date)
    return wide.stack()


def _combine_runs(per_run):
    """Put the per-run Series side by side, one column per experiment name."""
    combined = pd.concat(per_run.values(), axis=1)
    combined.columns = list(per_run)
    return combined


# Per-experiment results, keyed by experiment name
all_infections = {}
infections_by_day = {}
district_infections_by_day = {}
resident_visitor_infections_by_day = {}
district_resident_visitor_infections_by_day = {}
district_resident_visitor_infections_by_day_location = {}

n_runs = 10

experiment_names = [os.path.basename(os.path.dirname(
    file_path)) for file_path in file_paths][:n_runs]

# Results are stored by experiment name, so a repeated name would silently
# overwrite an earlier run
if len(set(experiment_names)) != len(experiment_names):
    raise ValueError('Experiment names must be unique')

# Process each file
print('Loading experiment results...')
for experiment_name, file_path in tqdm(zip(experiment_names, file_paths[:n_runs]), total=len(experiment_names)):
    # Load experiment results
    results = pd.read_csv(file_path)

    # Convert to GeoDataFrame with infection location as geometry
    results = gpd.GeoDataFrame(results, geometry=gpd.points_from_xy(
        results['infectLocationLon'], results['infectLocationLat']))

    # Add date_time column (simulation hours -> calendar datetime)
    results['date_time'] = results['Time(h)'].apply(lambda x: relativedelta(
        years=50, months=2) + datetime.datetime(*time.gmtime(x * 3600)[:6]))

    # Add residence and infection districts to the data
    results = results.pipe(assign_residence_district, districts)\
                     .pipe(assign_infection_district, districts)

    # Every row is one infection
    results['infection'] = 1

    # 'Resident' when the infection happened in the district of residence,
    # 'Visitor' otherwise (including rows where either district is missing)
    same_district = results['infection_district_name'] == results['residence_district_name']
    results['infection_type'] = np.where(same_district, 'Resident', 'Visitor')

    # 1. Store all results
    all_infections[experiment_name] = results

    # 2. Infections by day
    infections_by_day[experiment_name] = _daily_counts(
        results, [], complete_dates)

    # 3. Infections by day, split into residents and visitors
    #    index: (infection_type, date)
    resident_visitor_infections_by_day[experiment_name] = _daily_counts(
        results, ['infection_type'], complete_dates)

    # 4. Infections by day per district of infection
    #    index: (infection_district_name, date)
    district_infections_by_day[experiment_name] = _daily_counts(
        results, ['infection_district_name'], complete_dates)

    # 5. Infections by day per district and infection type
    #    index: (infection_district_name, infection_type, date)
    district_resident_visitor_infections_by_day[experiment_name] = _daily_counts(
        results, ['infection_district_name', 'infection_type'], complete_dates)

    # 6. Infections by day per district, infection type and location type
    #    index: (infection_district_name, infection_type, infectLocationType, date)
    # Kept in `results_grouped` because a later cell inspects its index levels
    results_grouped = _daily_counts(
        results,
        ['infection_district_name', 'infection_type', 'infectLocationType'],
        complete_dates)
    district_resident_visitor_infections_by_day_location[experiment_name] = results_grouped

# Replace every dict by one DataFrame with a column per experiment
infections_by_day = _combine_runs(infections_by_day)
resident_visitor_infections_by_day = _combine_runs(resident_visitor_infections_by_day)
district_infections_by_day = _combine_runs(district_infections_by_day)
district_resident_visitor_infections_by_day = _combine_runs(district_resident_visitor_infections_by_day)
district_resident_visitor_infections_by_day_location = _combine_runs(district_resident_visitor_infections_by_day_location)

In [None]:
# Plot colours for the two infection types: blue for residents, red for visitors
resident_color, visitor_color = '#0C5DA5', '#FF2C00'

In [None]:
# Districts to plot: district name in the data -> label shown on the figure.
# NOTE: this rebinds `districts`, which held the district GeoDataFrame in earlier cells.
districts = {'Wijk 28 Centrum': 'Central',
             'Wijk 42 Ypenburg': 'Outer residential'}
# districts = {'Wijk 29 Schildersbuurt': 'Inner-residential', 'Wijk 44 Leidschenveen': 'Outer-residential'}

# Bar colours, defined here so the cell does not depend on another cell for them
resident_color = '#0C5DA5'
visitor_color = '#FF2C00'

# Mean and spread (across runs) of the infection share, one entry per district
resident_percentages = []
visitor_percentages = []
resident_errors = []
visitor_errors = []

for district_code, district_name in districts.items():
    # Daily infections in this district, one column per run
    district_resident = district_resident_visitor_infections_by_day.loc[(
        district_code, 'Resident')]
    district_visitor = district_resident_visitor_infections_by_day.loc[(
        district_code, 'Visitor')]

    # Total infections per run, expressed as % of the district population
    pct_res = (district_resident.sum(axis=0) / population_by_district[district_code]) * 100
    pct_vis = (district_visitor.sum(axis=0) / population_by_district[district_code]) * 100

    resident_percentages.append(pct_res.mean())
    visitor_percentages.append(pct_vis.mean())

    # Sample standard deviation across runs; it is undefined for a single run,
    # so draw no error bar in that case
    resident_errors.append(pct_res.std(ddof=1) if len(pct_res) > 1 else 0.0)
    visitor_errors.append(pct_vis.std(ddof=1) if len(pct_vis) > 1 else 0.0)

# Create grouped bar plot
fig, ax = plt.subplots(figsize=(6, 4))

# Tick labels follow the districts actually plotted
district_names = list(districts.values())
bar_width = 0.35
x_pos = np.arange(len(district_names))

# One resident and one visitor bar per district, side by side
bars1 = ax.bar(x_pos - bar_width/2, resident_percentages, bar_width,
               color=resident_color, label='Resident',
               yerr=resident_errors, capsize=8, ecolor='black', error_kw=dict(lw=1))

bars2 = ax.bar(x_pos + bar_width/2, visitor_percentages, bar_width,
               color=visitor_color, label='Visitor',
               yerr=visitor_errors, capsize=8, ecolor='black', error_kw=dict(lw=1))

# Set labels and ticks
ax.set_xlabel('District')
ax.set_ylabel('Infections by type (% of district population)')
ax.set_xticks(x_pos)
ax.set_xticklabels(district_names)

# Set y axis to start at 0 and end at 100
ax.set_ylim(0, 100)

# Annotate every bar with its mean percentage
for bars, percentages in ((bars1, resident_percentages), (bars2, visitor_percentages)):
    for bar, pct in zip(bars, percentages):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                f'{pct:.2f}%', ha='center', va='bottom')

# Add legend and styling
ax.legend(loc='upper right', title='Infection type', frameon=False)
sns.despine(ax=ax)
fig.tight_layout()

# Save the figure in every format (loop variable avoids shadowing the builtin `format`)
formats = ['png', 'pdf', 'svg', 'eps']
for fmt in formats:
    fig.savefig(
        f'../figures/fig5b.{fmt}', dpi=300, bbox_inches='tight')

In [None]:
# District name in the data -> label shown in the table
districts = {
    'Wijk 28 Centrum': 'Central',
    'Wijk 42 Ypenburg': 'Outer residential',
}

# One row of summary statistics per (district, infection type) pair
stats_data = []

for district_key, district_name in districts.items():
    for infection_type in ('Resident', 'Visitor'):
        # Total infections per run for this district and infection type
        selection = district_resident_visitor_infections_by_day.loc[(
            district_key, infection_type)]
        data = selection.sum(axis=0)

        # Summary of the per-run totals
        stats_data.append({
            'District': district_name,
            'Infection type': infection_type,
            'Mean': data.mean(),
            'Median': data.median(),
            'Min': data.min(),
            'Max': data.max(),
            'Std': data.std(),
            'P10': data.quantile(0.1),
            'P90': data.quantile(0.9)
        })

# Build the table, round to two decimals and display it
stats_df = pd.DataFrame(stats_data).round(2)
stats_df

In [None]:
# Total infections per district in every run, then the mean over runs
per_district_totals = district_resident_visitor_infections_by_day.groupby('infection_district_name').sum()
district_infections = per_district_totals.mean(axis=1)

# Rank districts from most to fewest infections (truncated to whole numbers)
district_infections_ranked = district_infections.sort_values(ascending=False).astype(int)

# Shorten the district names: drop the 'Wijk ' prefix, every digit, and surrounding whitespace
without_prefix = district_infections_ranked.index.str.replace('Wijk ', '')
without_digits = without_prefix.str.replace(r'\d+', '', regex=True)
district_names_cleaned = without_digits.str.strip()

# Use the shortened names as index and display the ranking
district_infections_ranked.index = district_names_cleaned
district_infections_ranked

In [None]:
# Share of each district in the ranked total, in percent
district_shares = district_infections_ranked / district_infections_ranked.sum() * 100
print(district_shares.round(2))

In [None]:
# Show the distinct values of the third index level (position 2) of `results_grouped`
results_grouped.index.get_level_values(2).unique()

In [None]:
# Infection locations to show on the x-axis
locations_of_interest = ['Accommodation', 'Supermarket']

# Mean and spread (across runs) per (district, location), in plotting order
res_means, vis_means = [], []
res_errs,  vis_errs = [], []
cat_labels = []  # e.g., "Central\nAccommodation", "Central\nSupermarket", ...

# Expects `districts` to be the {district name: label} dict set in an earlier
# cell, not the district GeoDataFrame
for district_code, district_name in districts.items():
    # Daily infections in this district, indexed by (location type, date),
    # one column per run; sorted to enable MultiIndex slicing
    d_res = district_resident_visitor_infections_by_day_location.loc[(
        district_code, 'Resident')].sort_index()
    d_vis = district_resident_visitor_infections_by_day_location.loc[(
        district_code, 'Visitor')].sort_index()

    # Optional: subset to the dates after the first 20 days
    # d_res = d_res.loc[(slice(None), slice(
    #     pd.Timestamp('2020-03-23'), None)), :]
    # d_vis = d_vis.loc[(slice(None), slice(
    #     pd.Timestamp('2020-03-23'), None)), :]

    # Sum across days to get totals PER LOCATION per run (columns are runs);
    # locations without any infection are added with 0
    res_totals = d_res.groupby(level=0).sum().reindex(locations_of_interest).fillna(0)
    vis_totals = d_vis.groupby(level=0).sum().reindex(locations_of_interest).fillna(0)

    denom = population_by_district[district_code]

    for loc in locations_of_interest:
        # Per-run totals for this (district, loc) as % of the district population
        pct_res = (res_totals.loc[loc] / denom) * 100.0
        pct_vis = (vis_totals.loc[loc] / denom) * 100.0

        res_means.append(float(pct_res.mean()))
        vis_means.append(float(pct_vis.mean()))

        # Sample std across runs; undefined for a single run, so use 0 there
        res_errs.append(float(pct_res.std(ddof=1)) if len(pct_res) > 1 else 0.0)
        vis_errs.append(float(pct_vis.std(ddof=1)) if len(pct_vis) > 1 else 0.0)

        # Put the location on the x-axis (with district on a new line)
        cat_labels.append(f"{district_name}\n{loc}")

# ---- Plot ----
# widen if many categories
fig, ax = plt.subplots(figsize=(max(6, 0.45*len(cat_labels)), 4))

bar_w = 0.35
x_pos = np.arange(len(cat_labels))

bars1 = ax.bar(x_pos - bar_w/2, res_means, bar_w, label='Resident', color=resident_color,
               yerr=res_errs, capsize=6, ecolor='black')
bars2 = ax.bar(x_pos + bar_w/2, vis_means, bar_w, label='Visitor', color=visitor_color,
               yerr=vis_errs, capsize=6, ecolor='black')

ax.set_xticks(x_pos)
ax.set_xticklabels(cat_labels, rotation=0, ha='center')
ax.set_xlabel('District × Infection location')
ax.set_ylabel('Infections per location (% of district population)')

# Fixed 0-100 range so the figure shares its scale with the per-district bar plot
ax.set_ylim(0, 100)

# Annotate bars (may get busy with many categories)
for bars, vals in ((bars1, res_means), (bars2, vis_means)):
    for bar, val in zip(bars, vals):
        ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                f'{val:.1f}%', ha='center', va='bottom', fontsize=8)

# Despine and legend
for spine in ['top', 'right']:
    ax.spines[spine].set_visible(False)
ax.legend(loc='upper right', title='Infection type', frameon=False)

fig.tight_layout()

# Save the figure in every format (loop variable avoids shadowing the builtin `format`)
for fmt in ['png', 'pdf', 'svg', 'eps']:
    fig.savefig(
        f'../figures/fig6.{fmt}', dpi=300, bbox_inches='tight')

## Spatial distribution of infections (maps)

In [None]:
# `districts` was rebound to a plain dict by the plotting cells above,
# so read the district geometries from disk again
districts_path = '../data/processed/cbs/wijk_buurt_kaart/districts.json'
districts = gpd.read_file(districts_path)

In [None]:
# Reset the result containers to empty dicts
all_infections = {}
infections_by_day = {}
district_infections_by_day = {}
resident_visitor_infections_by_day = {}
district_resident_visitor_infections_by_day = {}

# Only a single run is used for the map
n_runs = 1

# Each experiment is named after the folder that contains its result file
experiment_names = list(
    map(os.path.basename, map(os.path.dirname, file_paths)))[:n_runs]

# Process each file
print('Loading experiment results...')
runs = zip(experiment_names[:n_runs], file_paths[:n_runs])
for experiment_name, file_path in tqdm(runs, total=len(file_paths[:n_runs])):
    # Load experiment results
    results = pd.read_csv(file_path)

    # Add date_time column (simulation hours -> calendar datetime)
    results['date_time'] = results['Time(h)'].apply(
        lambda x: relativedelta(years=50, months=2) + datetime.datetime(*time.gmtime(x * 3600)[:6]))

    # Add residence district first, then infection district
    results = assign_residence_district(results, districts)
    results = assign_infection_district(results, districts)

    # Every row is one infection
    results['infection'] = 1

    # 'Resident' when infected in the district of residence, 'Visitor' otherwise
    is_resident = results['infection_district_name'] == results['residence_district_name']
    results['infection_type'] = 'Visitor'
    results.loc[is_resident, 'infection_type'] = 'Resident'

    # Daily calendar covering this run (not used further in this cell)
    start, end = results['date_time'].min(), results['date_time'].max()
    complete_dates = pd.date_range(start=start, end=end, freq='d')

    # Infections per day
    daily_key = results['date_time'].dt.date
    results_grouped = results.groupby(daily_key)['infection'].sum()

    # Print the day with the most infections
    print(results_grouped.idxmax())

# Keep only the infections of the peak day of the run processed last
peak_day = results_grouped.idxmax()
results = results[results['date_time'].dt.date == peak_day]

In [None]:
# Districts highlighted on the map and the labels drawn on them
focus_districts_names = ['Wijk 28 Centrum', 'Wijk 42 Ypenburg']
district_labels = {'Wijk 28 Centrum': 'Central', 'Wijk 42 Ypenburg': 'Outer residential'}

# Keep infections whose residence OR infection district is a focus district,
# and drop rows without complete home / infection coordinates
coord_cols = ['homeLat', 'homeLon', 'infectLocationLat', 'infectLocationLon']
in_focus = (
    results['residence_district_name'].isin(focus_districts_names)
    | results['infection_district_name'].isin(focus_districts_names)
)
gdf = results[in_focus].dropna(subset=coord_cols).copy()

# One line per infection, from the home location to the infection location (x=lon, y=lat)
lines = [
    LineString([(home_lon, home_lat), (inf_lon, inf_lat)])
    for home_lat, home_lon, inf_lat, inf_lon in zip(
        gdf['homeLat'], gdf['homeLon'], gdf['infectLocationLat'], gdf['infectLocationLon'])
]

# Lines with their attributes, declared as EPSG:4326 and reprojected to EPSG:28992
lines_gdf = gpd.GeoDataFrame(
    gdf[['residence_district_name', 'infection_district_name', 'infection_type']].reset_index(drop=True),
    geometry=lines,
    crs='EPSG:4326'
).to_crs("EPSG:28992")

# District boundaries in the same CRS: the focus districts and the outline of all districts merged
focus_districts = districts[districts['WK_NAAM'].isin(focus_districts_names)].to_crs("EPSG:28992")
city_boundary = districts.to_crs("EPSG:28992").dissolve()

# Create plot
fig, ax = plt.subplots(figsize=(12, 8))
color_map = {'Resident': '#0C5DA5', 'Visitor': '#FF2C00'}

# Plot city boundary and focus districts
city_boundary.boundary.plot(ax=ax, color='gray', linewidth=1, alpha=0.75)
focus_districts.plot(ax=ax, facecolor='none', edgecolor='black', linewidth=1, alpha=1)

# Label every focus district at its centroid
for _, row in focus_districts.iterrows():
    centroid = row.geometry.centroid
    label = district_labels.get(row['WK_NAAM'], row['WK_NAAM'])
    ax.annotate(label, (centroid.x, centroid.y), ha='center', va='center',
                fontsize=10)

# Plot infection lines by type; both types use the same transparency,
# and a type missing from the colour map falls back to gray
alpha_value = 0.5
for infection_type, group in lines_gdf.groupby('infection_type', dropna=False):
    group.plot(ax=ax, linewidth=0.5, alpha=alpha_value,
               color=color_map.get(infection_type, 'gray'), label=infection_type)

# Configure plot; the legend is built by hand so its lines are thicker than the plotted ones
ax.set_aspect('equal')
legend_elements = [Line2D([0], [0], color=color, lw=2, label=label)
                   for label, color in color_map.items()]
ax.legend(handles=legend_elements, frameon=False, loc='upper right', title='Infection type')
ax.axis('off');

# Save in 4 formats (loop variable avoids shadowing the builtin `format`)
formats = ['png', 'pdf', 'svg', 'eps']
for fmt in formats:
    fig.savefig(
        f'../figures/fig5a.{fmt}', dpi=300, bbox_inches='tight')