In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from scipy.stats import gamma, lognorm, norm, skewnorm
import random
import os
import rasterio
import glob
import datetime as dt
from scipy.optimize import curve_fit

def sample_categorical_census(df: pd.DataFrame, category_col: str, value_col: str, ignore_categories: list = None):
    """
    Draw one category at random, weighted by its share of the summed counts.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding at least `category_col` and `value_col`.
    category_col : str
        Column with the category labels.
    value_col : str
        Column with the counts; they are summed per category and used as weights.
    ignore_categories : list, optional
        Labels to drop before the weights are computed. None or an empty
        list keeps every category.

    Returns
    -------
    One label taken from `category_col`.

    Raises
    ------
    ValueError
        If no category is left after filtering, or the summed counts are not
        positive, so no probability distribution can be formed.
    """

    # Filter ignored categories
    if ignore_categories:
        df = df[~df[category_col].isin(ignore_categories)]

    totals = df.groupby(category_col)[value_col].sum()
    total_weight = totals.sum()

    # Fail with an explicit message instead of letting np.random.choice
    # complain about an empty array or NaN probabilities.
    if totals.empty or not total_weight > 0:
        raise ValueError(
            f"Cannot sample '{category_col}': no rows with a positive total "
            f"'{value_col}' remain after filtering"
        )

    probs = totals / total_weight

    samples = np.random.choice(
        probs.index.to_numpy(),
        size=1,
        p=probs.to_numpy()
    )

    return samples[0]


## Hydrological

### 24-Hour Precipitation

In [None]:
# Zero-inflated rainfall model: a Bernoulli draw decides whether the 24-hour
# total is zero; otherwise the amount comes from a gamma distribution fitted
# to the non-zero observations.
precipitation = pd.read_csv("data/precipitation.csv")
rainfall = precipitation["rainfall"]
pct_zero = (rainfall == 0).sum() / len(rainfall)
zero_sample = np.random.binomial(n=1, p=pct_zero, size=1)[0]

precipitation_nonzero = precipitation[precipitation["rainfall"] != 0].copy()
# Fit on the filtered frame: the zeros are already modelled by the Bernoulli
# draw above, so they must not be part of the gamma fit as well.
rainfall_nonzero = precipitation_nonzero["rainfall"]
max_prec = max(rainfall_nonzero)
shape, loc, scale = gamma.fit(rainfall_nonzero)
prec_sample = 0 if zero_sample == 1 else gamma.rvs(a=shape, loc=loc, scale=scale, size=1)[0]
print(prec_sample)

### Groundwater level (WORK IN PREC)

In [None]:
# Groundwater level (mAOD): pick one station file at random and draw a level
# from a skew-normal distribution fitted to that file's "value" column.
# TODO: add prec dependency, i.e. modify the groundwater level based on the previous prec_sample

groundwater_files = [
    name
    for name in os.listdir("data/groundwater_level")
    if name.lower().endswith(".csv")
]
gw_file = random.choice(groundwater_files)

path = os.path.join("data/groundwater_level", gw_file)
df = pd.read_csv(path)

# Entries that cannot be parsed as numbers become NaN and are dropped before the fit
values = pd.to_numeric(df["value"], errors="coerce").dropna()
a, loc, scale = skewnorm.fit(values)
gw_sample = skewnorm.rvs(a, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {gw_file}")
print(gw_sample)

### River flow (WORK IN PREC)

In [None]:
# River flow (m3/s): pick one river station file at random and draw a flow
# from a gamma distribution fitted to that file's "value" column.
# TODO: add prec dependency, i.e. modify the river flow based on the previous prec_sample

river_flow_files = [
    name
    for name in os.listdir("data/river_flow")
    if name.lower().endswith(".csv")
]
rf_file = random.choice(river_flow_files)

path = os.path.join("data/river_flow", rf_file)
df = pd.read_csv(path)

# Entries that cannot be parsed as numbers become NaN and are dropped;
# the location parameter is pinned at 0 for the fit.
values = pd.to_numeric(df["value"], errors="coerce").dropna()
shape, loc, scale = gamma.fit(values, floc=0)
rf_sample = gamma.rvs(shape, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {rf_file}")
print(rf_sample)


### River level (WORK IN PREC)

In [None]:
# River level (m): reuse the station chosen for river flow and draw a level
# from a gamma distribution fitted to that station's "value" column.
# TODO: add prec dependency, i.e. modify the river level based on the previous prec_sample

# The text before the first '-' in the river-flow filename identifies the station
file_prefix = rf_file.split('-')[0]
river_level_files = [f for f in os.listdir("data/river_level") if f.lower().endswith(".csv")]

# First river-level file whose name contains the station prefix. Failing
# loudly here avoids a NameError on a fresh kernel, or silently reusing the
# rl_file left over from a previous run of this cell.
rl_file = next((name for name in river_level_files if file_prefix in name), None)
if rl_file is None:
    raise FileNotFoundError(
        f"No CSV in data/river_level matches station prefix '{file_prefix}' (from {rf_file})"
    )

path = os.path.join("data/river_level", rl_file)
df = pd.read_csv(path)

# Entries that cannot be parsed as numbers become NaN and are dropped;
# the location parameter is pinned at 0 for the fit.
values = pd.to_numeric(df["value"], errors="coerce").dropna()
shape, loc, scale = gamma.fit(values, floc=0)
rl_sample = gamma.rvs(a=shape, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {rl_file}")
print(rl_sample)



## Other

### Urban/Rural

In [None]:
# Urban/rural indicator: Bernoulli draw whose success probability is the
# share of rows flagged 'Urban' (1 = urban, 0 = not urban).
urban = pd.read_csv("data/urban_rural.csv")

counts = urban['Urban_rural_flag'].value_counts()
urban_probs = counts.div(counts.sum())
p_urban = urban_probs.loc['Urban']

urban_sample = np.random.binomial(n=1, p=p_urban, size=1)[0]
print(urban_sample)

### Population density

In [None]:
# Population density: log-normal draw fitted to the "Total" values of the
# output areas whose urban/rural flag matches the sampled urban_sample.
popden = pd.read_csv("data/population_density.csv", dtype={"LAD2021": "string", "OA21CD": "string", "Total": "Int64"})
merged = popden.merge(urban, on="OA21CD", how="left")
popden_urbanrural = merged[['LAD2021','OA21CD','Total','Urban_rural_flag']]

flag = 'urban' if urban_sample == 1 else 'rural'
popden_total  = popden_urbanrural[popden_urbanrural["Urban_rural_flag"].str.lower() == flag]["Total"]

# The log is only defined for positive values: a single zero would give -inf
# and make mu/sigma unusable, so missing and non-positive totals are left out
# of the fit. popden_total itself is kept unchanged.
popden_positive = popden_total.dropna()
popden_positive = popden_positive[popden_positive > 0].astype("float64")
log = np.log(popden_positive)

mu, sigma = log.mean(), log.std()

popden_sample = np.random.lognormal(mean=mu, sigma=sigma, size=1)[0]
print(popden_sample)

### Mean property value

In [None]:
# Property value: skew-normal fitted to log(price), then a single draw mapped
# back with exp. Rows without price/property_type/duration, and rows whose
# price is not positive, are excluded first.
# NOTE(review): the name `property` shadows the Python builtin of the same name.
property = (
    pd.read_csv("data/property_value.csv")
    .dropna(subset=['price', 'property_type', 'duration'])
    .loc[lambda table: table['price'] > 0]
)

log_property = np.log(property['price'])
shape, loc, scale = skewnorm.fit(log_property)

sample_log = skewnorm.rvs(shape, loc=loc, scale=scale, size=1)
property_sample = np.exp(sample_log)[0]
print(property_sample)

### Building age

In [None]:
# Building age: sample one construction-period band, weighted by the column
# totals of the property-age table.
building_age = pd.read_csv("data/property_age.csv")

# Source column -> label used for the sampled band
age_labels = {
    "BP_PRE_1900": "Pre-1900",
    "BP_1900_1918": "1900-1918",
    "BP_1919_1929": "1919-1929",
    "BP_1930_1939": "1930-1939",
    "BP_1945_1954": "1945-1954",
    "BP_1955_1964": "1955-1964",
    "BP_1965_1972": "1965-1972",
    "BP_1973_1982": "1973-1982",
    "BP_1983_1992": "1983-1992",
    "BP_1993_1999": "1993-1999",
    "BP_2000_2009": "2000-2009",
    "BP_2010_2015": "2010-2015",
}
age_columns = list(age_labels.keys())

age_totals = building_age[age_columns].sum()
age_totals.index = list(age_labels.values())

age_probs = age_totals / age_totals.sum()
age_categories = age_totals.index.tolist()

building_age_sample = np.random.choice(age_categories, size=1, p=age_probs)[0]
print(building_age_sample)

### Season

In [None]:
# Month numbers belonging to each season; used by later cells to filter
# date-stamped data to the sampled season.
SEASON_MONTHS = {
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
    "Winter": [12, 1, 2],
}

# Every season is equally likely
season = ['Winter', 'Spring', 'Summer', 'Autumn']
season_sample = random.choice(season)
print(season_sample)

### Holiday

In [None]:
'''
Min 28 days off annually
'''
# Probability that a given day is a day off: 28 days out of 365
p = 28 / 365
# 1 = holiday, 0 = not a holiday
holiday_binary_sample = int(random.random() < p)
print(holiday_binary_sample)

### Emergency Response Times

In [None]:
# Ambulance response-time table; the columns read further down in this cell are
# 'month', 'C2_mean', 'C3_mean', 'C2_count' and 'C3_count'.
response = pd.read_csv("data/response_times.csv")
def hhmmss_to_hours(s):
    """Convert a duration string "H:M:S" (three integer fields) to fractional hours."""
    hours, minutes, seconds = (int(part) for part in s.split(":"))
    return hours + minutes/60 + seconds/3600

# Month names per season, matching the spelled-out 'month' column of the response table
SEASON_MONTHS_FULL = {
    "Spring": ["March", "April", "May"],
    "Summer": ["June", "July", "August"],
    "Autumn": ["September", "October", "November"],
    "Winter": ["December", "January", "February"],
}
months = SEASON_MONTHS_FULL[season_sample]
response_season = response[response['month'].isin(months)]

# Mean response times in hours, and call counts with thousands separators removed
c2_mean = response_season["C2_mean"].apply(hhmmss_to_hours)
c3_mean = response_season["C3_mean"].apply(hhmmss_to_hours)
c2_count = response_season['C2_count'].str.replace(',', '').astype(int).sum()
c3_count = response_season['C3_count'].str.replace(',', '').astype(int).sum()

# Choose the call category in proportion to its share of the seasonal calls
total_count = c2_count + c3_count
c2_prob = c2_count / total_count
c3_prob = c3_count / total_count
response_category = np.random.choice(['C2', 'C3'], size=1, p=[c2_prob, c3_prob])[0]

# Log-normal (loc fixed at 0) fitted to the chosen category's seasonal means
category_means = c2_mean if response_category == 'C2' else c3_mean
shape, loc, scale = lognorm.fit(category_means, floc=0)
response_sample = lognorm.rvs(shape, loc=loc, scale=scale, size=1)[0]

# Scale by population density relative to its mean. The original below-mean
# and above-mean branches were algebraically the same expression
# (1 - 0.5*(m - s)/m == 1 + 0.5*(s - m)/m), so one formula covers both.
popden_mean = popden_total.mean()
popden_factor = 1 + 0.5 * ((popden_sample - popden_mean) / popden_mean)
prec_factor = np.exp(prec_sample / 50) # small rain = minimal effect, heavy rain = large effect
print(popden_factor, prec_factor)

adjusted_response = response_sample * popden_factor * prec_factor
print(adjusted_response)

### Ambulance handover delays

In [None]:
# Ambulance handover delay: sample one delay band, weighted by the seasonal
# number of handovers that fell in each band.
handover = pd.read_csv("data/ambulance_handover.csv")

# Counts are stored as text with thousands separators
count_columns = [
    "Handover time known",
    "Over 15 minutes",
    "Over 30 minutes",
    "Over 60 minutes",
    "Handover time unknown",
    "All handovers",
]
for col in count_columns:
    handover[col] = handover[col].str.replace(",", "").astype(int)

# The "Over N minutes" columns are cumulative; differencing gives disjoint bands
handover["Under 15 min"] = handover["Handover time known"] - handover["Over 15 minutes"]
handover["15–30 min"] = handover["Over 15 minutes"] - handover["Over 30 minutes"]
handover["30–60 min"] = handover["Over 30 minutes"] - handover["Over 60 minutes"]
handover["Over 60 min"] = handover["Over 60 minutes"]

# Keep only the months of the sampled season
handover["Date_parsed"] = pd.to_datetime(handover["Date"], format="%b'%y")
in_season = handover["Date_parsed"].dt.month.isin(SEASON_MONTHS[season_sample])
handover_season = handover[in_season]

band_columns = ["Under 15 min", "15–30 min", "30–60 min", "Over 60 min"]
counts = handover_season[band_columns].sum()
probs = counts / counts.sum()

samples = np.random.choice(probs.index.to_numpy(), size=1, p=probs.to_numpy())
handover_sample = samples[0]
print(handover_sample)


### Hospital bed availability

In [None]:
# Hospital bed occupancy: normal distribution fitted to the seasonal occupancy
# ratios (Occupied / Available, summed per period), with the draw clipped to [0, 1].
beds = (
    pd.read_csv("data/hospital_beds.csv")
    .assign(Period=lambda table: pd.to_datetime(table["Period"], format="%d/%m/%Y"))
    # Aggregate by period (sum across all rows sharing the same date)
    .groupby('Period')[['Available', 'Occupied', 'Free']]
    .sum()
    .reset_index()
)

# Occupancy as a fraction of available beds
beds['OccupancyPct'] = beds['Occupied'].div(beds['Available'])

in_season = beds["Period"].dt.month.isin(SEASON_MONTHS[season_sample])
beds_season = beds[in_season]
values = beds_season["OccupancyPct"].dropna().to_numpy()

mu, sigma = norm.fit(values)
occ_pct_sample = np.clip(norm.rvs(mu, sigma), 0, 1)

print(occ_pct_sample)

## Grid

In [None]:
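# Builds the overall grid shapefile that combines watercourse, flood risk, elevation,
# impervious surface, historic flood, road and hospital-location features.
# - The area is cropped with the boundary shapefile (GM_shapefile/CAUTH_MAY_2025_EN_BSC.shp),
#   so the grid can be rebuilt whenever the source shapefile/tiff files are updated.
# - Cells that touch or lie outside the boundary are ignored.
# - Cells are 1 km x 1 km.
# - Cell centroids are used for the distance calculations.
# NOTE(review): the builder below is kept as a disabled string literal. It calls `box` and
# `zonal_stats`, which the notebook's import cell does not import, and its IMPERVIOUS path
# (data/impervious.tif) differs from the file named in the description (data/impervious_surface.tif).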
''' 
data/watercourse/Watercourse.shp
- closest watercourse to centre of cell (m)
- density of watercourses in cell
data/flood_risk/rofsw_4bandPolygon/merged_rofsw_4bandPolygon.shp
- confidence-weighted average risk
data/elevation.tif
- average elevation in cell (m)
data/impervious_surface.tif
- fraction of impervious surface in cell
data/historic_flood_map/Historic_Flood_MapPolygon.shp
- if cell has been flooded in the past
data/road/RoadLink.shp
- closest major road to centre of cell (km)
- density of roads in cell
data/hospital_locations/hospital_locations.shp
- distance to nearest hospital (km)
'''

'''CELL_SIZE = 1000  # metres
CELL_AREA_M2 = CELL_SIZE ** 2
CELL_AREA_KM2 = CELL_AREA_M2 / 1e6

WATERCOURSE = "data/watercourse/Watercourse.shp"
FLOOD_RISK = "data/flood_risk/rofsw_4bandPolygon/merged_rofsw_4bandPolygon.shp"
ELEVATION = "data/elevation.tif"
IMPERVIOUS = "data/impervious.tif"
HISTORIC_FLOOD = "data/historic_flood_map/Historic_Flood_MapPolygon.shp"
ROAD = "data/road/RoadLink.shp"
HOSPITAL = "data/hospital_locations/hospital_locations.shp"
BOUNDARY = "GM_shapefile/CAUTH_MAY_2025_EN_BSC.shp"

OUTPUT = "data/grid/grid.shp"

RISK_SCORES = {"Very low": 1, "Low": 2, "Medium": 3, "High": 4}

def build_grid(boundary):
    minx, miny, maxx, maxy = boundary.total_bounds

    xs = np.arange(minx, maxx, CELL_SIZE)
    ys = np.arange(miny, maxy, CELL_SIZE)

    grid = gpd.GeoDataFrame(
        geometry=[box(x, y, x + CELL_SIZE, y + CELL_SIZE) for x in xs for y in ys],
        crs="EPSG:27700"
    )

    grid = grid[grid.geometry.within(boundary.geometry.union_all())]
    grid["cell_id"] = grid.index
    return grid

def line_density(grid, lines, colname, cell_size):
    grid[colname] = 0.0
    cell_area_km2 = (cell_size * cell_size) / 1e6  # km²
    for i, cell in grid.geometry.items():
        inter = lines.geometry.intersection(cell)
        length_m = sum(geom.length for geom in inter if not geom.is_empty)
        grid.at[i, colname] = (length_m / 1000) / cell_area_km2
    return grid


def nearest_distance(grid, targets, colname, km=True):
    centroids = grid.copy()
    centroids["geometry"] = centroids.geometry.centroid

    nearest = gpd.sjoin_nearest(
        centroids,
        targets[["geometry"]],
        how="left",
        distance_col=colname
    )

    dist = nearest.groupby(nearest.index)[colname].first()
    if km:
        dist = dist / 1000

    grid[colname] = dist
    return grid


def flood_risk_score(grid, risk):
    inter = gpd.overlay(
        grid[["cell_id", "geometry"]],
        risk,
        how="intersection"
    )

    if inter.empty:
        grid["risk_score"] = 0.0
        return grid

    inter["area"] = inter.geometry.area
    inter["risk_value"] = inter["risk_band"].map(RISK_SCORES).fillna(0)
    inter["conf_weight"] = inter["confidence"] / 10

    inter["num"] = inter["area"] * inter["risk_value"] * inter["conf_weight"]
    inter["den"] = inter["area"] * inter["conf_weight"]

    agg = inter.groupby("cell_id")[["num", "den"]].sum()
    grid["risk_score"] = (agg["num"] / agg["den"]).reindex(grid.cell_id).fillna(0)
    return grid


def zonal_mean(grid, raster_path, colname):
    stats = zonal_stats(
        grid.geometry,
        raster_path,
        stats="mean",
        nodata=0
    )
    grid[colname] = [s["mean"] for s in stats]
    return grid


def zonal_fraction_nonzero(grid, raster_path, colname):
    stats = zonal_stats(
        grid.geometry,
        raster_path,
        stats=["count", "nodata"],
        add_stats={"nonzero": lambda x: np.count_nonzero(x)}
    )
    grid[colname] = [
        s["nonzero"] / s["count"] if s["count"] else 0
        for s in stats
    ]
    return grid


def historic_flood_flag(grid, historic):
    inter = gpd.overlay(
        grid[["cell_id", "geometry"]],
        historic,
        how="intersection"
    )
    inter["area"] = inter.geometry.area
    flooded = inter.groupby("cell_id")["area"].sum()

    grid["historic"] = (
        flooded / CELL_AREA_M2 > 0.5
    ).reindex(grid.cell_id).fillna(False).astype(int)

    return grid


# LOAD DATA
print("Loading data...")

water = gpd.read_file(WATERCOURSE).set_crs(epsg=27700, allow_override=True)
risk = gpd.read_file(FLOOD_RISK).set_crs(epsg=27700, allow_override=True)
historic = gpd.read_file(HISTORIC_FLOOD).set_crs(epsg=27700, allow_override=True)
road = gpd.read_file(ROAD).set_crs(epsg=27700, allow_override=True)
hospital = gpd.read_file(HOSPITAL).set_crs(epsg=27700, allow_override=True)
boundary = gpd.read_file(BOUNDARY).set_crs(epsg=27700, allow_override=True)

# GRID
print("Building grid...")
grid = build_grid(boundary)
grid.to_file("data/grid/grid_step_01.shp")

# LINE DENSITIES
print("Calculating watercourse density...")
grid = line_density(grid, water, "water_dens", CELL_SIZE)

print("Calculating road density...")
grid = line_density(grid, road, "road_dens", CELL_SIZE)

# NEAREST DISTANCES
print("Calculating nearest watercourse distance...")
grid = nearest_distance(grid, water, "water_dist", km=False)

print("Calculating nearest hospital distance...")
grid = nearest_distance(grid, hospital, "hospital")

print("Calculating nearest major road distance...")
major_roads = road[road["function"].isin(["A Road", "Motorway"])]
grid = nearest_distance(grid, major_roads, "road_dist")

# FLOOD RISK
print("Calculating flood risk score...")
grid = flood_risk_score(grid, risk)

# RASTER FEATURES
print("Calculating elevation...")
grid = zonal_mean(grid, ELEVATION, "elevation")

print("Calculating impervious fraction...")
grid = zonal_fraction_nonzero(grid, IMPERVIOUS, "impervious")

# HISTORIC FLOOD
print("Calculating historic flood flag...")
grid = historic_flood_flag(grid, historic)

# SAVE OUTPUT
cols = [
    "water_dens", "water_dist", "risk_score", "elevation",
    "impervious", "historic", "road_dens", "road_dist", "hospital", "geometry"
]

grid[cols].to_file(OUTPUT)'''

In [None]:
# Read from overall grid shapefile
# Load the pre-built grid shapefile (one row per grid cell)
gdf = gpd.read_file('data/grid/grid.shp')

# Per-cell attributes that are sampled from the grid
features = [
    "water_dens",
    "water_dist",
    "risk_score",
    "elevation",
    "impervious",
    "historic",
    "road_dens",
    "road_dist",
    "hospital",
]

def sample_cell_with_noise(gdf, features, noise_scale=0.05):
    """
    Pick one random row of `gdf` and return its features with Gaussian jitter.

    Each feature receives zero-mean normal noise whose standard deviation is
    `noise_scale * abs(value + 1e-6)`; the noisy value is floored at 0.
    The result is a dict keyed by feature name, plus a "geometry" entry that
    holds the picked row's geometry unchanged.
    """
    row = gdf.sample(1).iloc[0]

    def _jittered(value):
        # Noise spread is proportional to the magnitude of the value itself
        spread = noise_scale * abs(value + 1e-6)
        return max(value + np.random.normal(0, spread), 0)

    sample = {name: _jittered(row[name]) for name in features}
    sample["geometry"] = row.geometry
    return sample

def sample_neighborhood(gdf, features, k=5):
    """
    Average `features` over the `k` cells closest to one randomly picked cell.

    Closeness is the geometry-to-geometry distance to the picked cell's
    geometry. Returns a dict mapping each feature name to its mean.
    NOTE(review): cells that touch the picked cell presumably have distance 0,
    so which of them make the top `k` depends on how `nsmallest` orders ties.
    """
    anchor = gdf.sample(1).geometry.iloc[0]
    nearest_index = gdf.geometry.distance(anchor).nsmallest(k).index
    return gdf.loc[nearest_index, features].mean().to_dict()

# One jittered grid cell (keeps its geometry), and one neighbourhood average
# taken around an independently sampled cell
grid_sample = sample_cell_with_noise(gdf=gdf, features=features)
neighborhood_sample = sample_neighborhood(gdf=gdf, features=features)
print(grid_sample)


### Soil moisture saturation (WORK IN PREC)

In [None]:
# Soil moisture for the sampled grid cell:
# - take the raster pixel that contains the centroid of the sampled cell
# - build that pixel's time series from the rasters of the sampled season
# - sample from a normal distribution fitted to the series

cell_geom = grid_sample["geometry"]  # polygon of the sampled grid cell

# Keep only the rasters whose filename date falls in the sampled season
tiff_files = sorted(glob.glob("data/soil_moisture/*.tif"))
season_tiffs = []

for f in tiff_files:
    # Extract date from filename: dt_smuk_2023-12-22.tif
    date_str = f.split("_")[-1].replace(".tif", "")
    file_date = dt.datetime.strptime(date_str, "%Y-%m-%d")
    if file_date.month in SEASON_MONTHS[season_sample]:
        season_tiffs.append(f)

if not season_tiffs:
    # No raster for this season: season_tiffs[0] would raise IndexError, so
    # fall through to the "no data" branch below with an empty series.
    values = np.array([], dtype=np.float32)
else:
    # The transform and nodata value of the first raster are applied to all of them
    with rasterio.open(season_tiffs[0]) as src:
        transform = src.transform
        nodata = src.nodata

    # Load soil moisture rasters, masking nodata cells as NaN
    stack = []
    for f in season_tiffs:
        with rasterio.open(f) as s:
            data = s.read(1).astype(np.float32)
            if nodata is not None:
                data[data == nodata] = np.nan
            stack.append(data)

    stack = np.stack(stack, axis=0)  # shape: (time, rows, cols)

    # Find the single pixel containing the centroid of the sampled grid cell
    centroid_x, centroid_y = cell_geom.centroid.x, cell_geom.centroid.y
    col, row = ~transform * (centroid_x, centroid_y)
    row, col = int(row), int(col)

    # Ensure row/col are within raster bounds (a centroid outside the raster
    # is moved to the nearest edge pixel)
    row = np.clip(row, 0, stack.shape[1]-1)
    col = np.clip(col, 0, stack.shape[2]-1)

    # Extract time series for that pixel
    values = stack[:, row, col]
    values = values[~np.isnan(values)]

# Fit normal distribution safely
if len(values) == 0:
    soil_sample = np.nan
else:
    mu, sigma = norm.fit(values)
    sigma = max(sigma, 1e-6)  # keep the scale strictly positive for rvs
    soil_sample = norm.rvs(mu, sigma)

print(soil_sample)



### Flood depth

In [None]:
'''
to get the probability of exceeding each depth threshold
- each shapefile shows area at or above a certain flood depth
- e.g. the clipped cell of each shapefile contains multiple polygons overlapping
- using rofsw_4bandPolygon.shp get the confidence (0-1), take into account the risk_band (Very low, Low, Medium, High), using the area of the cell that is flooded at each threshold
- calculate the probability of flood depth less than each range
- e.g. P(depth > 0.2) = area_flooded_at_0.2 / total_area
- P(depth <= 0.2) = 1 - P(depth > 0.2)
- e.g. P(depth > 0.3) = area_flooded_at_0.3 / total_area
- P(depth <= 0.3 and > 0.2) = P(depth <= 0.3) - P(depth <= 0.2)
- based on those sample flood depth range probabilistically (0, 0-0.2, 0.2-0.3, 0.3-0.6, 0.6-0.9, 0.9-1.2, >1.2)
'''
# Clip every flood-risk shapefile to the bounding box of the sampled grid
# cell, keyed by the shapefile name without its ".shp" suffix.
clipped_data = {}
cell_bounds = grid_sample["geometry"].bounds

for shp_path in glob.glob('data/flood_risk/*/*.shp'):
    gdf_shp = gpd.read_file(shp_path)
    gdf_shp_clipped = gdf_shp.clip(cell_bounds)

    # Store clipped data
    key = os.path.basename(shp_path).replace(".shp","")
    clipped_data[key] = gdf_shp_clipped

print("Clipped shapefiles:", list(clipped_data))


In [None]:
# Configuration: depth thresholds (m) and risk-band weights
DEPTH_THRESHOLDS = [0.2, 0.3, 0.6, 0.9, 1.2]

RISK_BAND_WEIGHTS = {
    "Very Low": 0.25,
    "Low": 0.5,
    "Medium": 0.75,
    "High": 1.0
}

# Mask holding only the sampled grid cell; every layer is clipped to it
cell_mask = gpd.GeoSeries([grid_sample["geometry"]])

# Load and clip RoFSW polygons
rofsw = gpd.read_file(
    "data/flood_risk/rofsw_4bandPolygon/merged_rofsw_4bandPolygon.shp"
)
rofsw = rofsw.clip(cell_mask)


# Load and clip depth-threshold shapefiles
threshold_layers = {}

for shp_path in glob.glob("data/flood_risk/*/*.shp"):
    if "rofsw_4bandPolygon" in shp_path:
        continue

    # Use a dedicated name here: assigning to `gdf` would overwrite the grid
    # GeoDataFrame that the grid-sampling cell loads under that name.
    depth_layer = gpd.read_file(shp_path).clip(cell_mask)

    # Only layers that intersect the cell are stored
    if not depth_layer.empty:
        key = os.path.basename(shp_path)
        threshold_layers[key] = depth_layer


In [None]:
# Prepare weights and grid-cell area
# Area of the sampled grid cell (denominator of every flooded fraction)
cell_area = grid_sample["geometry"].area

# Look up the band weight case-insensitively: the band label is spelled both
# "Very low" and "Very Low" in this notebook, and an unmatched label would
# otherwise become a NaN weight that silently drops out of the sums below.
band_weights = {band.lower(): weight for band, weight in RISK_BAND_WEIGHTS.items()}
rofsw["risk_weight"] = (
    rofsw["risk_band"].astype(str).str.strip().str.lower().map(band_weights)
)
rofsw["combined_weight"] = rofsw["confidence"] * rofsw["risk_weight"]

# Calculate P(depth > 0) based on RoFSW
if rofsw.empty:
    p_flood = 0.0
else:
    # Weighted fraction of the cell that is flooded (any depth > 0), capped at 1
    rofsw_weighted_area = (rofsw.geometry.area * rofsw["combined_weight"]).sum()
    p_flood = min(rofsw_weighted_area / cell_area, 1.0)

# Function to compute weighted exceedance probability
def weighted_flood_fraction(threshold_gdf):
    """
    Weighted share of the sampled cell covered by a depth-threshold layer.

    The layer is intersected with the clipped RoFSW polygons (`rofsw`); each
    intersection area is multiplied by that polygon's `combined_weight`, and
    the sum is divided by `cell_area`. Returns 0.0 when the layer is empty or
    does not intersect any RoFSW polygon.
    """
    if threshold_gdf.empty:
        return 0.0

    pieces = gpd.overlay(threshold_gdf, rofsw, how="intersection")
    if pieces.empty:
        return 0.0

    pieces["weighted_area"] = pieces.geometry.area * pieces["combined_weight"]
    total_weighted_area = pieces["weighted_area"].sum()
    return total_weighted_area / cell_area

# Compute P(depth > d) for each threshold
# Compute P(depth > d) for each threshold
exceedance_probs = {0: p_flood}
previous = p_flood
for depth in DEPTH_THRESHOLDS:
    # A layer matches when its filename (underscores removed) contains the
    # depth digits without the decimal point, e.g. 0.2 -> "02"
    token = f"{depth}".replace(".", "")
    matching = [layer for name, layer in threshold_layers.items() if token in name.replace("_", "")]

    # Layers that do not intersect the cell are never stored in
    # threshold_layers, so a missing match means "nothing floods to this
    # depth here" rather than an error (matching[0] would raise IndexError).
    fraction = weighted_flood_fraction(matching[0]) if matching else 0.0

    # Exceedance must not increase with depth; without this the range
    # probabilities below can turn negative and np.random.choice rejects them.
    previous = min(max(fraction, 0.0), previous)
    exceedance_probs[depth] = previous

# Calculate probabilities for each depth range
breaks = [0, *DEPTH_THRESHOLDS]
deepest = DEPTH_THRESHOLDS[-1]
# (0, 0) is the "no flooding" bin; (deepest, inf) is the open-ended top bin
probs = {(0.0, 0.0): 1 - exceedance_probs[0], (deepest, np.inf): exceedance_probs[deepest]}
for i in range(len(breaks)-1):
    a, b = breaks[i], breaks[i+1]
    # P(a < depth <= b) = P(depth > a) - P(depth > b)
    probs[(a, b)] = exceedance_probs[a] - exceedance_probs[b]


In [None]:
print(probs)

# Sample a depth range according to the range probabilities
bins = list(probs)
range_probs = np.array(list(probs.values()))
range_index = np.random.choice(len(bins), p=range_probs)
low, high = bins[range_index]

# Sample within the selected range
if np.isinf(high):
    # Open-ended top range: lower bound plus an exponential tail
    sampled_depth = low + np.random.exponential(scale=0.3)
elif low == 0.0 and high == 0.0:
    # The "no flooding" range
    sampled_depth = 0.0
else:
    sampled_depth = np.random.uniform(low, high)

print("\nSelected depth range:", (low, high))
print("Sampled flood depth (m):", round(sampled_depth, 3))

### Depth-damage

In [None]:
df = pd.read_csv("data/depth_damage.csv")

# Every column after the first is named by a depth; parse those names as floats
depths = np.array(list(map(float, df.columns[1:])))

# Compute overall damage fraction (mean across all rows) per depth column
overall_damage = df.iloc[:, 1:].astype(float).mean(axis=0).to_numpy()

# Define exponential damage function
def exp_damage(d, k):
    """Saturating exponential damage curve: 0 at depth d = 0, approaching 1 as k * d grows."""
    decay = np.exp(-k * d)
    return 1 - decay

# Fit the exponential model
# Fit the single rate parameter k, constrained to be non-negative
params, _ = curve_fit(exp_damage, depths, overall_damage, bounds=(0, np.inf))
k = params[0]

# Expected damage at the sampled depth, plus Gaussian noise, kept within [0, 1]
expected_damage = exp_damage(sampled_depth, k)
noisy_damage = expected_damage + np.random.normal(0, 0.05)
damage_fraction_sample = np.clip(noisy_damage, 0, 1)
print(damage_fraction_sample)


## Household

### Disability rate

In [None]:
# Disability rate: draw a proportion from Beta(disabled + 1, not disabled + 1),
# using the summed observations of each group.
disability = pd.read_csv("data/disabled.csv")
total_disability = disability.groupby('Disability (3 categories)')['Observation'].sum()

alpha = total_disability.loc['Disabled under the Equality Act'] + 1
beta = total_disability.loc['Not disabled under the Equality Act'] + 1

disabled_sample = np.random.beta(alpha, beta, size=1)[0]
print(disabled_sample)

### English proficiency

In [None]:
def map_proficiency(category):
    """
    Collapse a census English-proficiency category into "Good English
    Proficiency" or "Bad English Proficiency"; any other value maps to None.
    """
    good_categories = (
        "Main language is English (English or Welsh in Wales)",
        "Main language is not English (English or Welsh in Wales): Can speak English very well or well",
    )
    bad_category = "Main language is not English (English or Welsh in Wales): Cannot speak English or cannot speak English well"

    if category in good_categories:
        return "Good English Proficiency"
    if category == bad_category:
        return "Bad English Proficiency"
    return None

# English proficiency: draw a proportion from Beta(good + 1, bad + 1) after
# collapsing the census categories into good / bad proficiency groups.
english = pd.read_csv("data/english_proficiency.csv")

proficiency_col = 'Proficiency in English language (4 categories)'
total_english = english.groupby(proficiency_col)['Observation'].sum()

# Categories that map_proficiency returns None for are left out by the groupby
english['Proficiency_Group'] = english[proficiency_col].apply(map_proficiency)
grouped_english = english.groupby('Proficiency_Group')['Observation'].sum()
#english_probs = grouped_english / grouped_english.sum()

alpha = grouped_english.loc['Good English Proficiency'] + 1
beta = grouped_english.loc['Bad English Proficiency'] + 1

english_sample = np.random.beta(alpha, beta, size=1)[0]
print(english_sample)

### General health

In [None]:
# General health category, weighted by observations; 'Does not apply' is excluded
df = pd.read_csv('data/general_health.csv')
gen_health_sample = sample_categorical_census(
    df,
    category_col='General health (4 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(gen_health_sample)

### Age

In [None]:
# Age band, weighted by observations; no category is excluded
df = pd.read_csv('data/age.csv')
age_sample = sample_categorical_census(
    df,
    category_col='Age (6 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(age_sample)

### Elderly

In [None]:
# 1 when the sampled age band is the 65-and-over band, otherwise 0
elderly_sample = int(age_sample == 'Aged 65 years and over')
print(elderly_sample)

### Children

In [None]:
# 1 when the sampled age band is the 15-and-under band, otherwise 0
child_sample = int(age_sample == 'Aged 15 years and under')
print(child_sample)

### Employment history

In [None]:
'''
Does not apply: Either child or in employment
'''

# Employment history, conditioned on the sampled age band
employ = pd.read_csv('data/employment-age.csv')
filtered_employ = employ.loc[employ['Age (6 categories)'].eq(age_sample)]

employ_sample = sample_categorical_census(
    filtered_employ,
    category_col='Employment history (4 categories)',
    value_col='Observation',
    ignore_categories=[],
)

print(employ_sample)

### Highest level of qualification

In [None]:
'''
Does not apply is only for <15, 
'''
# Highest qualification, conditioned on the sampled age band
qual = pd.read_csv('data/qualification-age.csv')
filtered_qual = qual.loc[qual['Age (6 categories)'].eq(age_sample)]

qual_sample = sample_categorical_census(
    filtered_qual,
    category_col='Highest level of qualification (7 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(qual_sample)

### Lifestage of household reference person

In [None]:
# Lifestage of the household reference person, conditioned on the sampled
# age band; 'Does not apply' is excluded
lifestage = pd.read_csv('data/lifestage_hrp_age.csv')
filtered_lifestage = lifestage.loc[lifestage['Age (6 categories)'].eq(age_sample)]
lifestage_sample = sample_categorical_census(
    filtered_lifestage,
    category_col='Lifestage of Household Reference Person(13 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(lifestage_sample)

### Accommodation type

In [None]:
# Accommodation type, weighted by observations; no category is excluded
acco_type = pd.read_csv('data/accomodation_type.csv')
acco_type_sample = sample_categorical_census(
    acco_type,
    category_col='Accommodation type (5 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(acco_type_sample)

### Vehicle

In [None]:
# Car or van availability, weighted by observations; 'Does not apply' is excluded
vehicle = pd.read_csv('data/vehicle.csv')
vehicle_sample = sample_categorical_census(
    vehicle,
    category_col='Car or van availability (3 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(vehicle_sample)

### Second address

In [None]:
# Second address indicator, weighted by observations; no category is excluded
second_add = pd.read_csv('data/second_address.csv')
second_add_sample = sample_categorical_census(
    second_add,
    category_col='Second address indicator (3 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(second_add_sample)

### Household size

In [None]:
# Household size, weighted by observations; the '0 people in household'
# category is excluded
house_size = pd.read_csv('data/household_size.csv')
house_size_sample = sample_categorical_census(
    house_size,
    category_col='Household size (5 categories)',
    value_col='Observation',
    ignore_categories=['0 people in household'],
)
print(house_size_sample)

### Economic activity status

In [None]:
'''
Does not apply is only for <15, 
'''
# Economic activity status, conditioned on the sampled age band
eas = pd.read_csv('data/nssec_economic_age.csv')
filtered_eas = eas.loc[eas['Age (6 categories)'].eq(age_sample)]

eas_sample = sample_categorical_census(
    filtered_eas,
    category_col='Economic activity status (4 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(eas_sample)

### Ns-SeC

In [None]:
# NS-SeC class, conditioned on both the sampled age band and the sampled
# economic activity status
nssec = pd.read_csv('data/nssec_economic_age.csv')
filtered_nssec = nssec.loc[nssec['Age (6 categories)'].eq(age_sample)]
filtered_nssec = filtered_nssec.loc[
    filtered_nssec['Economic activity status (4 categories)'].eq(eas_sample)
]
nssec_sample = sample_categorical_census(
    filtered_nssec,
    category_col='National Statistics Socio-economic Classification (NS-SeC) (10 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(nssec_sample)

### Mean income

In [None]:
# Income: skew-normal fitted to log(income), one draw mapped back with exp,
# then scaled by a multiplier for the sampled NS-SeC class.
income = pd.read_csv("data/mean_income.csv")

# Income is stored as text with padding and thousands separators
raw_income = income['Total annual income (£)']
income['Total annual income (£)'] = raw_income.str.strip().str.replace(',', '').astype(float)

log_income = np.log(income['Total annual income (£)'])
shape, loc, scale = skewnorm.fit(log_income)
sample_log = skewnorm.rvs(shape, loc=loc, scale=scale, size=1)
income_sample = np.exp(sample_log)[0]

# Income multiplier applied per NS-SeC class
NSSEC = {
    "L1, L2 and L3: Higher managerial, administrative and professional occupations": 1.90,
    "L4, L5 and L6: Lower managerial, administrative and professional occupations": 1.35,
    "L7: Intermediate occupations": 1.00,
    "L8 and L9: Small employers and own account workers": 1.10,
    "L10 and L11: Lower supervisory and technical occupations": 0.90,
    "L12: Semi-routine occupations": 0.75,
    "L13: Routine occupations": 0.65,
    "L14.1 and L14.2: Never worked and long-term unemployed": 0.40,
    "L15: Full-time students": 0.35,
    "Does not apply": 0.00
}

income_sample = income_sample * NSSEC[nssec_sample]
print(income_sample)

### Low-income fraction

In [None]:

# Low-income threshold: 60% of the median income in the table
median = income['Total annual income (£)'].median()
low_income_threshold = 0.6 * median
print(low_income_threshold)

# 1 when the sampled income falls below the threshold, otherwise 0
low_income_sample = int(income_sample < low_income_threshold)
print(low_income_sample)


### No. adults employed in household

In [None]:
# Number of employed adults, conditioned on the sampled household size;
# 'Does not apply' is excluded
num_adults = pd.read_csv('data/household_employed_size.csv')
filtered_num_adults = num_adults.loc[
    num_adults['Household size (5 categories)'].eq(house_size_sample)
]

num_adults_sample = sample_categorical_census(
    filtered_num_adults,
    category_col='Number of adults in employment in household (5 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(num_adults_sample)

### No. disabled people household 

In [None]:
# Number of disabled people, conditioned on the sampled household size;
# 'Does not apply' is excluded
num_disable = pd.read_csv('data/household_disabled_size.csv')
filtered_num_disable = num_disable.loc[
    num_disable['Household size (5 categories)'].eq(house_size_sample)
]

num_disable_sample = sample_categorical_census(
    filtered_num_disable,
    category_col='Number of disabled people in household (4 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(num_disable_sample)

### No. long-term health in household

In [None]:
# Number of people with a long-term health condition (not disabled),
# conditioned on the sampled household size; 'Does not apply' is excluded
num_long = pd.read_csv('data/household_long-term_size.csv')
filtered_num_long = num_long.loc[
    num_long['Household size (5 categories)'].eq(house_size_sample)
]

num_long_sample = sample_categorical_census(
    filtered_num_long,
    category_col='Number of people in household with a long-term heath condition but are not disabled (4 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(num_long_sample)

### Deprived in education

In [None]:
# Education deprivation, conditioned on the sampled qualification level;
# 'Does not apply' is excluded
dep_edu = pd.read_csv('data/deprived_education+deps.csv')
filtered_dep_edu = dep_edu.loc[
    dep_edu['Highest level of qualification (7 categories)'].eq(qual_sample)
]

dep_edu_sample = sample_categorical_census(
    filtered_dep_edu,
    category_col='Household deprived in the education dimension (3 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(dep_edu_sample)

### Deprived in employment

In [None]:
dep_employ = pd.read_csv('data/deprived_employment+deps.csv')

# Keep rows that match BOTH previously sampled attributes
# (employment history and NS-SeC class).
filtered_dep_employ = dep_employ[
    (dep_employ['Employment history (4 categories)'] == employ_sample)
    & (dep_employ['National Statistics Socio-economic Classification (NS-SeC) (10 categories)'] == nssec_sample)
]

# No categories are excluded here, so every category present can be drawn.
dep_employ_sample = sample_categorical_census(
    df=filtered_dep_employ,
    category_col='Household deprived in the employment dimension (3 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(dep_employ_sample)

### Deprived in health and disability

In [None]:
dep_health = pd.read_csv('data/deprived_health+deps.csv')

# Keep rows that match BOTH previously sampled counts
# (long-term health condition and disabled people in household).
filtered_dep_health = dep_health[
    (dep_health['Number of people in household with a long-term heath condition but are not disabled (4 categories)'] == num_long_sample)
    & (dep_health['Number of disabled people in household (4 categories)'] == num_disable_sample)
]

# No categories are excluded here, so every category present can be drawn.
dep_health_sample = sample_categorical_census(
    df=filtered_dep_health,
    category_col='Household deprived in the health and disability dimension (3 categories)',
    value_col='Observation',
    ignore_categories=[],
)
print(dep_health_sample)

### No. of people per room in household 

In [None]:
# Load the table and restrict it to the sampled household size.
num_people = pd.read_csv('data/people_per_room_hsize.csv')
filtered_num_people = num_people.loc[
    num_people['Household size (5 categories)'].eq(house_size_sample)
]

# Draw one category with probability proportional to its summed 'Observation'.
num_people_sample = sample_categorical_census(
    df=filtered_num_people,
    category_col='Number of people per room in household (5 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(num_people_sample)

### Occupancy rating for rooms

In [None]:
# Load the table and restrict it to the sampled people-per-room category.
occupancy = pd.read_csv('data/occupancy_rating_nopeopleper.csv')
filtered_occupancy = occupancy.loc[
    occupancy['Number of people per room in household (5 categories)'].eq(num_people_sample)
]

# Draw one category with probability proportional to its summed 'Observation'.
num_occupancy = sample_categorical_census(
    df=filtered_occupancy,
    category_col='Occupancy rating for rooms (5 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(num_occupancy)

### Deprived in housing

In [None]:
dep_housing = pd.read_csv('data/deprived_housing+deps.csv')

# Keep rows that match BOTH previously sampled attributes
# (people per room and occupancy rating).
filtered_dep_housing = dep_housing[
    (dep_housing['Number of people per room in household (5 categories)'] == num_people_sample)
    & (dep_housing['Occupancy rating for rooms (5 categories)'] == num_occupancy)
]

# Draw one category with probability proportional to its summed 'Observation'.
dep_housing_sample = sample_categorical_census(
    df=filtered_dep_housing,
    category_col='Household deprived in the housing dimension (3 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(dep_housing_sample)

### Household deprivation

In [None]:
# Count how many of the four dimensions (education, employment, health and
# disability, housing) were sampled as "deprived". Smaller = less deprived.
is_edu_deprived = dep_edu_sample == 'Household is deprived in the education dimension'
is_employ_deprived = dep_employ_sample == 'Household is deprived in the employment dimension'
is_health_deprived = dep_health_sample == 'Household is deprived in the health and disability dimension'
is_housing_deprived = dep_housing_sample == 'Household is deprived in the housing dimension'

# Adding the boolean flags yields the number of deprived dimensions.
household_dep_sample = is_edu_deprived + is_employ_deprived + is_health_deprived + is_housing_deprived
print(household_dep_sample)

### Tenure of household

In [None]:
# Tenure is drawn from the whole table, without conditioning on any
# previously sampled attribute.
tenure = pd.read_csv('data/tenure.csv')
tenure_sample = sample_categorical_census(
    df=tenure,
    category_col='Tenure of household (7 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(tenure_sample)

### Household access to internet

In [None]:
# Probability of having internet access: lower (0.85) for one-person
# households whose reference person is 66 or over, 0.98 for everyone else.
prob = (
    0.85
    if lifestage_sample == 'Household reference person is aged 66 years or over: One-person household'
    else 0.98
)

# Single Bernoulli draw: 1 = has internet access, 0 = does not.
internet_sample = np.random.binomial(1, prob)
print(internet_sample)


### Home insurance coverage 

In [None]:
# Probability used for the home-insurance Bernoulli draw.
home_insure = 0.75

# One draw (1 or 0); take the single element of the length-1 array.
home_insure_sample = np.random.binomial(n=1, p=home_insure, size=1)[0]
print(home_insure_sample)

### Health insurance coverage

In [None]:
# Probability used for the health-insurance Bernoulli draw.
health_insure = 0.14

# One draw (1 or 0); take the single element of the length-1 array.
health_insure_sample = np.random.binomial(n=1, p=health_insure, size=1)[0]
print(health_insure_sample)

### Household (INCORPORATE INSURANCE)

In [None]:
# Risk score per tenure category (0 = lowest risk, 1 = highest).
# Author's rationale:
#   - Home ownership: wealth accumulation and tenure security.
#   - Mortgage holders retain equity but face some financial exposure.
#   - Private renters: higher housing-cost volatility, lower security.
#   - Social renters: low-income, benefit-dependent, high-deprivation populations.
TENURE_RISK = {
    # Owner-occupied
    "Owned: Owns outright": 0.0,
    "Owned: Owns with a mortgage or loan or shared ownership": 0.1,
    # Privately rented
    "Private rented: Private landlord or letting agency": 0.6,
    "Private rented: Other private rented or lives rent free": 0.6,
    # Socially rented
    "Social rented: Rents from council or Local Authority": 0.8,
    "Social rented: Other social rented": 0.8,
}

# Risk score per accommodation type (0 = lowest risk, 1 = highest).
# Author's rationale:
#   - Housing quality, space, and permanence decrease down the list.
#   - Flats and temporary housing show higher overcrowding and energy risk.
#   - Temporary structures are near-maximal risk.
ACCO_RISK = {
    "Whole house or bungalow: Detached": 0.1,
    "Whole house or bungalow: Semi-detached": 0.1,
    "Whole house or bungalow: Terraced": 0.3,
    "Flat, maisonette or apartment": 0.5,
    "A caravan or other mobile or temporary structure": 0.9,  # near-maximal
}

# Risk score per household-size category (0 = lowest risk, 1 = highest).
# Author's rationale:
#   - Single-person households: income fragility, social isolation risk.
#   - Two-person households: risk-sharing.
#   - Large households: crowding, higher costs, child dependency.
SIZE_RISK = {
    "1 person in household": 0.5,          # single-person
    "2 people in household": 0.2,          # risk-sharing
    "3 people in household": 0.3,
    "4 or more people in household": 0.6,  # large household
}

# Risk score keyed by the internet-access flag.
# Author's rationale: internet access is needed for weather warning services.
INTERNET_RISK = {
    1: 0.0,  # flag 1
    0: 0.6,  # flag 0
}

# Risk score per "adults in employment" category (0 = lowest risk, 1 = highest).
# Author's rationale: risk increases as the number of earners falls.
EMPLOYMENT_RISK = {
    "3 or more adults in employment in household": 0.0,
    "2 adults in employment in household": 0.2,
    "1 adult in employment in household": 0.5,
    "No adults in employment in household": 0.9,
}

# Risk score keyed by the number of deprived dimensions (education,
# employment, health, housing): each deprived dimension adds 0.25, so the
# score runs from 0.0 (none) to 1.0 (all four).
DEPRIVATION_RISK = {n_deprived: n_deprived / 4 for n_deprived in range(5)}

def lifestage_risk_map(x):
    """
    Map a lifestage label of the household reference person to a risk score.

    Author's rationale for the scores:
    - Older single households (0.7): health risks, lower chance of internet
      access, elderly vulnerable.
    - Households with dependent children (0.6): cost pressure, children
      vulnerable.
    - Child-free households of two or more people (0.3): most resilient.
    - Anything else (0.4): default.

    Matching is case-insensitive, so a label spelled "With dependent children"
    is recognised as well as "Dependent children". A purely case-sensitive
    match on "Dependent children" would let the former fall through to the
    0.4 default.

    Parameters
    ----------
    x : str
        Lifestage category label.

    Returns
    -------
    float
        Risk score: one of 0.7, 0.6, 0.3 or 0.4.
    """
    label = x.lower()

    if "66 years or over" in label and "one-person" in label:
        return 0.7

    # "no dependent children" contains "dependent children" as a substring,
    # so it has to be excluded explicitly before treating the household as
    # one with dependent children.
    if "dependent children" in label and "no dependent children" not in label:
        return 0.6

    if "two or more person household: no dependent children" in label:
        return 0.3

    return 0.4

# Look up the risk score of each sampled household attribute.
tenure_risk      = TENURE_RISK[tenure_sample]
acco_risk        = ACCO_RISK[acco_type_sample]
size_risk        = SIZE_RISK[house_size_sample]
internet_risk    = INTERNET_RISK[internet_sample]
employment_risk  = EMPLOYMENT_RISK[num_adults_sample]
deprivation_risk = DEPRIVATION_RISK[household_dep_sample]
lifestage_risk   = lifestage_risk_map(lifestage_sample)

# Component weights (nominally summing to 1.0). Author's rationale:
#   - Deprivation: looks at 4 factors, hence the highest weight.
#   - Employment: income stability.
#   - Tenure: wealth and security (the original note labels this line
#     "Employment" as well; presumably tenure was meant).
#   - Others: secondary modifiers.
WEIGHTS = dict(
    tenure_risk=0.15,
    acco_risk=0.10,
    size_risk=0.10,
    internet_risk=0.10,
    lifestage_risk=0.10,
    employment_risk=0.20,
    deprivation_risk=0.25,
)

# Weighted sum of the components, accumulated in WEIGHTS order.
component_risks = {
    "tenure_risk": tenure_risk,
    "acco_risk": acco_risk,
    "size_risk": size_risk,
    "internet_risk": internet_risk,
    "lifestage_risk": lifestage_risk,
    "employment_risk": employment_risk,
    "deprivation_risk": deprivation_risk,
}
risk_score = 0.0
for _component, _weight in WEIGHTS.items():
    risk_score += _weight * component_risks[_component]

# Echo the sampled attributes that fed into the score.
for _label, _value in (
    ('Tenure -', tenure_sample),
    ('Accomodation type -', acco_type_sample),
    ('Household size -', house_size_sample),
    ('Internet access -', internet_sample),
    ('Adults employed -', num_adults_sample),
    ('Household deprivation -', household_dep_sample),
    ('Lifestage HRP -', lifestage_sample),
):
    print(_label, _value)

# Add a small Gaussian perturbation (sd 0.03) and clamp the result to [0, 1].
noise = np.random.normal(0, 0.03)
household_risk_score = np.clip(risk_score + noise, 0, 1)
print(household_risk_score)

## Derived