In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from scipy.stats import gamma, lognorm, norm, skewnorm
import random
import os
import rasterio
import glob
import datetime as dt
from scipy.optimize import curve_fit

def sample_categorical_census(df: pd.DataFrame, category_col: str, value_col: str, ignore_categories: list = None):
    """
    Draw one category at random, weighted by its summed observations.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one row per (category, ...) combination.
    category_col : str
        Column holding the category labels.
    value_col : str
        Column holding the counts that are summed per category.
    ignore_categories : list, optional
        Category labels to drop before sampling. ``None`` or an empty list
        keeps every category.

    Returns
    -------
    A single category label taken from ``df[category_col]``.

    Raises
    ------
    ValueError
        If no category is left after filtering, or the remaining counts do not
        sum to a positive number (no valid probability vector exists).
    """
    # Filter ignored categories
    if ignore_categories:
        df = df[~df[category_col].isin(ignore_categories)]

    totals = df.groupby(category_col)[value_col].sum()
    grand_total = totals.sum()

    # Fail with a clear message instead of numpy's "probabilities contain NaN"
    # / "'a' cannot be empty" errors.
    if len(totals) == 0 or not grand_total > 0:
        raise ValueError(
            f"Cannot sample '{category_col}': no categories with a positive "
            f"'{value_col}' total remain after filtering."
        )

    probs = totals / grand_total

    samples = np.random.choice(
        probs.index.to_numpy(),
        size=1,
        p=probs.to_numpy()
    )

    return samples[0]


## Hydrological

### 24-Hour Precipitation

In [None]:
# Two-part sample of 24-hour rainfall: first decide whether the day is dry,
# then (if not) draw an amount from a gamma fitted to the wet days only.
precipitation = pd.read_csv("data/precipitation.csv")
rainfall = precipitation["rainfall"]

# Share of records with exactly zero rainfall, used as the "dry day" probability.
pct_zero = (rainfall == 0).sum() / len(rainfall)
zero_sample = np.random.binomial(n=1, p=pct_zero, size=1)[0]

precipitation_nonzero = precipitation[precipitation["rainfall"] != 0].copy()
# Fit on the filtered frame: the unfiltered column still contains the zeros,
# which are already modelled by the binomial draw above.
rainfall_nonzero = precipitation_nonzero["rainfall"]
max_prec = max(rainfall_nonzero)
shape, loc, scale = gamma.fit(rainfall_nonzero)

# zero_sample == 1 means "dry day"; otherwise draw one wet-day amount.
prec_sample = 0 if zero_sample == 1 else gamma.rvs(a=shape, loc=loc, scale=scale, size=1)[0]
print(prec_sample)

### Groundwater level (WORK IN PREC)

In [None]:
# Pick one groundwater station file at random and draw a level from a
# skew-normal distribution fitted to that station's "value" column.
#
# Units noted by the author: mAOD.
# TODO (author's note): add a precipitation dependency, i.e. modify the
# groundwater level based on the previously drawn prec_sample.

groundwater_files = [
    name
    for name in os.listdir("data/groundwater_level")
    if name.lower().endswith(".csv")
]
gw_file = random.choice(groundwater_files)

path = os.path.join("data/groundwater_level", gw_file)
df = pd.read_csv(path)

# Non-numeric entries become NaN and are dropped before fitting.
values = pd.to_numeric(df["value"], errors="coerce").dropna()
a, loc, scale = skewnorm.fit(values)
gw_sample = skewnorm.rvs(a, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {gw_file}")
print(gw_sample)

### River flow (WORK IN PREC)

In [None]:
# Pick one river-flow station file at random and draw a flow from a gamma
# distribution (location pinned at 0) fitted to that station's "value" column.
#
# Units noted by the author: m3/s.
# TODO (author's note): add a precipitation dependency, i.e. modify the river
# flow based on the previously drawn prec_sample.

river_flow_files = [
    name
    for name in os.listdir("data/river_flow")
    if name.lower().endswith(".csv")
]
rf_file = random.choice(river_flow_files)

path = os.path.join("data/river_flow", rf_file)
df = pd.read_csv(path)

# Non-numeric entries become NaN and are dropped before fitting.
values = pd.to_numeric(df["value"], errors="coerce").dropna()
shape, loc, scale = gamma.fit(values, floc=0)
rf_sample = gamma.rvs(shape, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {rf_file}")
print(rf_sample)


### River level (WORK IN PREC)

In [None]:
# Draw a river level for the same station that was chosen for river flow,
# using a gamma distribution (location pinned at 0) fitted to its "value" column.
#
# Units noted by the author: m.
# TODO (author's note): add a precipitation dependency, i.e. modify the river
# level based on the previously drawn prec_sample.

# The station is identified by the part of the flow file name before the first '-'.
file_prefix = rf_file.split('-')[0]
river_level_files = [f for f in os.listdir("data/river_level") if f.lower().endswith(".csv")]

# First level file whose name contains the station prefix. Fail loudly when
# there is none: otherwise rl_file would be undefined on a fresh kernel, or
# silently keep the station from a previous run.
rl_file = next((f for f in river_level_files if file_prefix in f), None)
if rl_file is None:
    raise FileNotFoundError(
        f"No river level file in data/river_level matches station prefix {file_prefix!r}"
    )

path = os.path.join("data/river_level", rl_file)
df = pd.read_csv(path)

# Non-numeric entries become NaN and are dropped before fitting.
values = pd.to_numeric(df["value"], errors="coerce").dropna()
shape, loc, scale = gamma.fit(values, floc=0)
rl_sample = gamma.rvs(a=shape, loc=loc, scale=scale, size=1)[0]

print(f"Selected file: {rl_file}")
print(rl_sample)



## Other

### Urban/Rural

In [None]:
# Bernoulli draw for urban (1) vs. not urban (0), using the share of rows
# flagged 'Urban' in the urban/rural table as the success probability.
urban = pd.read_csv("data/urban_rural.csv")

counts = urban['Urban_rural_flag'].value_counts()
urban_probs = counts.div(counts.sum())
p_urban = urban_probs.loc['Urban']

urban_sample = np.random.binomial(n=1, p=p_urban, size=1)[0]
print(urban_sample)

### Population density

In [None]:
# Log-normal draw of the "Total" column, restricted to the rows whose
# urban/rural flag matches the urban_sample drawn earlier.
popden = pd.read_csv(
    "data/population_density.csv",
    dtype={"LAD2021": "string", "OA21CD": "string", "Total": "Int64"},
)

# Attach the urban/rural flag to each row via its OA21CD code.
merged = pd.merge(popden, urban, on="OA21CD", how="left")
popden_urbanrural = merged[['LAD2021', 'OA21CD', 'Total', 'Urban_rural_flag']]

if urban_sample == 1:
    flag = 'urban'
else:
    flag = 'rural'

flag_matches = popden_urbanrural["Urban_rural_flag"].str.lower() == flag
popden_total = popden_urbanrural.loc[flag_matches, "Total"]

# Log-normal parameters are the mean / standard deviation of the logged values.
# NOTE(review): a "Total" of 0 would give log(0) = -inf here - confirm the data has none.
log = np.log(popden_total)
mu = log.mean()
sigma = log.std()

popden_sample = np.random.lognormal(mean=mu, sigma=sigma, size=1)[0]
print(popden_sample)

### Mean property value

In [None]:
# Draw a property price: fit a skew-normal to log(price), sample once in log
# space and transform back.
# NOTE(review): the name `property` shadows the Python builtin; kept so that
# any other cell relying on it still works.
property = (
    pd.read_csv("data/property_value.csv")
    .dropna(subset=['price', 'property_type', 'duration'])
    .loc[lambda frame: frame['price'] > 0]  # log() needs strictly positive prices
)

log_property = np.log(property['price'])
shape, loc, scale = skewnorm.fit(log_property)

sample_log = skewnorm.rvs(shape, loc=loc, scale=scale, size=1)
property_sample = np.exp(sample_log)[0]
print(property_sample)

### Building age

In [None]:
# Draw a building-age band with probability proportional to the summed counts
# in the corresponding BP_* column of the property-age table.
building_age = pd.read_csv("data/property_age.csv")

# (source column, readable label) for every age band, in table order.
age_bands = [
    ("BP_PRE_1900", "Pre-1900"),
    ("BP_1900_1918", "1900-1918"),
    ("BP_1919_1929", "1919-1929"),
    ("BP_1930_1939", "1930-1939"),
    ("BP_1945_1954", "1945-1954"),
    ("BP_1955_1964", "1955-1964"),
    ("BP_1965_1972", "1965-1972"),
    ("BP_1973_1982", "1973-1982"),
    ("BP_1983_1992", "1983-1992"),
    ("BP_1993_1999", "1993-1999"),
    ("BP_2000_2009", "2000-2009"),
    ("BP_2010_2015", "2010-2015"),
]
age_columns = [column for column, _label in age_bands]

age_totals = building_age[age_columns].sum()
age_totals.index = [label for _column, label in age_bands]

age_probs = age_totals / age_totals.sum()
age_categories = age_totals.index.tolist()

building_age_sample = np.random.choice(age_categories, size=1, p=age_probs)[0]
print(building_age_sample)

### Season

In [None]:
# Calendar month numbers belonging to each season; later cells use this to
# filter date-based data by the sampled season.
SEASON_MONTHS = {
    "Spring": [3, 4, 5],
    "Summer": [6, 7, 8],
    "Autumn": [9, 10, 11],
    "Winter": [12, 1, 2],
}

# Every season is equally likely.
season = ['Winter', 'Spring', 'Summer', 'Autumn']
season_sample = random.choice(season)
print(season_sample)

### Holiday

In [None]:
# Holiday flag: 1 with probability 28/365, else 0.
# Author's rationale: minimum of 28 days off annually.
p = 28 / 365
holiday_binary_sample = int(random.random() < p)
print(holiday_binary_sample)

### Emergency Response Times

In [None]:
# Response-time table; the code below reads its month, C2_mean, C3_mean,
# C2_count and C3_count columns.
response = pd.read_csv("data/response_times.csv")
def hhmmss_to_hours(s):
    """Convert an 'H:M:S' string of integer fields into fractional hours."""
    hours, minutes, seconds = (int(part) for part in s.split(":"))
    return hours + minutes/60 + seconds/3600

# Month names, as matched against the table's "month" column, for each season.
SEASON_MONTHS_FULL = {
    "Spring": ["March", "April", "May"],
    "Summer": ["June", "July", "August"],
    "Autumn": ["September", "October", "November"],
    "Winter": ["December", "January", "February"],
}

# Keep only the rows for the sampled season.
months = SEASON_MONTHS_FULL[season_sample]
response_season = response.loc[response['month'].isin(months)]

# Mean response times per row, converted from "H:M:S" strings to hours.
c2_mean = response_season["C2_mean"].apply(hhmmss_to_hours)
c3_mean = response_season["C3_mean"].apply(hhmmss_to_hours)


def _season_total(column):
    """Sum a count column whose values are strings with thousands separators."""
    return response_season[column].str.replace(',', '').astype(int).sum()


# Choose the call category in proportion to the seasonal incident counts.
c2_count = _season_total('C2_count')
c3_count = _season_total('C3_count')
c2_prob = c2_count / (c2_count + c3_count)
c3_prob = c3_count / (c2_count + c3_count)
response_category = np.random.choice(['C2', 'C3'], size=1, p=[c2_prob, c3_prob])[0]

# Log-normal (location pinned at 0) fitted to the chosen category's means.
shape, loc, scale = lognorm.fit(
    c2_mean if response_category == 'C2' else c3_mean,
    floc=0,
)
response_sample = lognorm.rvs(shape, loc=loc, scale=scale, size=1)[0]

# Scale by population density relative to its mean: half of the relative
# deviation is added to (or, below the mean, removed from) 1. The original
# below-mean and above-mean branches were the same expression, so it is
# written once.
popden_mean = popden_total.mean()
popden_factor = 1 + 0.5 * ((popden_sample - popden_mean) / popden_mean)

# small rain = minimal effect, heavy rain = large effect
prec_factor = np.exp(prec_sample / 50)
print(popden_factor, prec_factor)

adjusted_response = response_sample * popden_factor * prec_factor
print(adjusted_response)

### Ambulance handover delays

In [None]:
# Draw a handover-delay band with probability proportional to the seasonal
# number of handovers in each band.
handover = pd.read_csv("data/ambulance_handover.csv")

# Count columns are read as strings with thousands separators; make them ints.
handover_count_cols = [
    "Handover time known", "Over 15 minutes", "Over 30 minutes",
    "Over 60 minutes", "Handover time unknown", "All handovers",
]
handover[handover_count_cols] = handover[handover_count_cols].apply(
    lambda column: column.str.replace(",", "").astype(int)
)

# The "Over N minutes" columns are cumulative, so adjacent differences give
# the count falling inside each band.
handover["Under 15 min"] = handover["Handover time known"].sub(handover["Over 15 minutes"])
handover["15–30 min"] = handover["Over 15 minutes"].sub(handover["Over 30 minutes"])
handover["30–60 min"] = handover["Over 30 minutes"].sub(handover["Over 60 minutes"])
handover["Over 60 min"] = handover["Over 60 minutes"]

# Dates are parsed with the "%b'%y" pattern; keep months of the sampled season.
handover["Date_parsed"] = pd.to_datetime(handover["Date"], format="%b'%y")
in_season = handover["Date_parsed"].dt.month.isin(SEASON_MONTHS[season_sample])
handover_season = handover[in_season]

counts = handover_season[["Under 15 min", "15–30 min", "30–60 min", "Over 60 min"]].sum()
probs = counts / counts.sum()

samples = np.random.choice(probs.index.to_numpy(), size=1, p=probs.to_numpy())
handover_sample = samples[0]
print(handover_sample)


### Hospital bed availability

In [None]:
# Draw a bed-occupancy fraction from a normal distribution fitted to the
# per-period occupancy of the sampled season, clipped to [0, 1].
beds = pd.read_csv("data/hospital_beds.csv")
beds["Period"] = pd.to_datetime(beds["Period"], format="%d/%m/%Y")

# Sum the bed counts over all rows that share a period.
bed_columns = ['Available', 'Occupied', 'Free']
beds = beds.groupby('Period')[bed_columns].sum().reset_index()

# Occupancy as a fraction of available beds.
beds['OccupancyPct'] = beds['Occupied'] / beds['Available']

in_season = beds["Period"].dt.month.isin(SEASON_MONTHS[season_sample])
beds_season = beds[in_season]
values = beds_season["OccupancyPct"].dropna().values

mu, sigma = norm.fit(values)
occ_pct_sample = np.clip(norm.rvs(mu, sigma), 0, 1)

print(occ_pct_sample)

## Grid

In [None]:
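# One-off builder for the overall grid shapefile (data/grid/grid.shp) that holds the watercourse,
# flood risk, elevation, impervious surface area, historic flood, road and hospital-location features.
# The builder is currently DISABLED: it is kept inside the second triple-quoted string below and does not run.
# To handle updated shapefile/TIFF files, re-crop the area using the boundary shapefile (GM_shapefile/CAUTH_MAY_2025_EN_BSC.shp).
# A cell is ignored if it lies on or past the boundary.
# Cells are 1 km x 1 km.
# Cell centroids are used for distance calculations.
# NOTE(review) before re-enabling: `box` and `zonal_stats` are used but not imported in this notebook
# (presumably shapely.geometry.box and rasterstats.zonal_stats - confirm); RISK_SCORES is defined twice with
# different key casing ("Very low" vs "Very Low"); the "Low" score 0.00055 disagrees with its own comment
# ((0.1% + 1%)/2 = 0.0055); and IMPERVIOUS points at data/impervious.tif while the description lists
# data/impervious_surface.tif.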
''' 
data/watercourse/Watercourse.shp
- closest watercourse to centre of cell (m)
- density of watercourses in cell
data/flood_risk/rofsw_4bandPolygon/merged_rofsw_4bandPolygon.shp
- confidence-weighted average risk
data/elevation.tif
- average elevation in cell (m)
data/impervious_surface.tif
- fraction of impervious surface in cell
data/historic_flood_map/Historic_Flood_MapPolygon.shp
- if cell has been flooded in the past
data/road/RoadLink.shp
- closest major road to centre of cell (km)
- density of roads in cell
data/hospital_locations/hospital_locations.shp
- distance to nearest hospital (km)
'''

'''CELL_SIZE = 1000  # metres
CELL_AREA_M2 = CELL_SIZE ** 2
CELL_AREA_KM2 = CELL_AREA_M2 / 1e6

WATERCOURSE = "data/watercourse/Watercourse.shp"
FLOOD_RISK = "data/flood_risk/rofsw_4bandPolygon/merged_rofsw_4bandPolygon.shp"
ELEVATION = "data/elevation.tif"
IMPERVIOUS = "data/impervious.tif"
HISTORIC_FLOOD = "data/historic_flood_map/Historic_Flood_MapPolygon.shp"
ROAD = "data/road/RoadLink.shp"
HOSPITAL = "data/hospital_locations/hospital_locations.shp"
BOUNDARY = "GM_shapefile/CAUTH_MAY_2025_EN_BSC.shp"

OUTPUT = "data/grid/grid.shp"

RISK_SCORES = {"Very low": 1, "Low": 2, "Medium": 3, "High": 4}
RISK_SCORES = {
    "Very Low": 0.0005, # (0 + 0.1)/2 
    "Low": 0.00055, # (0.1 + 1)/2
    "Medium": 0.0215, # (1 + 3.3)/2
    "High": 0.033 # (3.3 + 3.3)/2
}

def build_grid(boundary):
    minx, miny, maxx, maxy = boundary.total_bounds

    xs = np.arange(minx, maxx, CELL_SIZE)
    ys = np.arange(miny, maxy, CELL_SIZE)

    grid = gpd.GeoDataFrame(
        geometry=[box(x, y, x + CELL_SIZE, y + CELL_SIZE) for x in xs for y in ys],
        crs="EPSG:27700"
    )

    grid = grid[grid.geometry.within(boundary.geometry.union_all())]
    grid["cell_id"] = grid.index
    return grid

def line_density(grid, lines, colname, cell_size):
    grid[colname] = 0.0
    cell_area_km2 = (cell_size * cell_size) / 1e6  # km²
    for i, cell in grid.geometry.items():
        inter = lines.geometry.intersection(cell)
        length_m = sum(geom.length for geom in inter if not geom.is_empty)
        grid.at[i, colname] = (length_m / 1000) / cell_area_km2
    return grid


def nearest_distance(grid, targets, colname, km=True):
    centroids = grid.copy()
    centroids["geometry"] = centroids.geometry.centroid

    nearest = gpd.sjoin_nearest(
        centroids,
        targets[["geometry"]],
        how="left",
        distance_col=colname
    )

    dist = nearest.groupby(nearest.index)[colname].first()
    if km:
        dist = dist / 1000

    grid[colname] = dist
    return grid


def flood_risk_score(grid, risk):
    inter = gpd.overlay(
        grid[["cell_id", "geometry"]],
        risk,
        how="intersection"
    )

    if inter.empty:
        grid["risk_score"] = 0.0
        return grid

    inter["area"] = inter.geometry.area
    inter["risk_value"] = inter["risk_band"].map(RISK_SCORES).fillna(0)
    inter["conf_weight"] = inter["confidence"] / 10

    inter["num"] = inter["area"] * inter["risk_value"] * inter["conf_weight"]
    inter["den"] = inter["area"] * inter["conf_weight"]

    agg = inter.groupby("cell_id")[["num", "den"]].sum()
    grid["risk_score"] = (agg["num"] / agg["den"]).reindex(grid.cell_id).fillna(0)
    return grid


def zonal_mean(grid, raster_path, colname):
    stats = zonal_stats(
        grid.geometry,
        raster_path,
        stats="mean",
        nodata=0
    )
    grid[colname] = [s["mean"] for s in stats]
    return grid


def zonal_fraction_nonzero(grid, raster_path, colname):
    stats = zonal_stats(
        grid.geometry,
        raster_path,
        stats=["count", "nodata"],
        add_stats={"nonzero": lambda x: np.count_nonzero(x)}
    )
    grid[colname] = [
        s["nonzero"] / s["count"] if s["count"] else 0
        for s in stats
    ]
    return grid


def historic_flood_flag(grid, historic):
    inter = gpd.overlay(
        grid[["cell_id", "geometry"]],
        historic,
        how="intersection"
    )
    inter["area"] = inter.geometry.area
    flooded = inter.groupby("cell_id")["area"].sum()

    grid["historic"] = (
        flooded / CELL_AREA_M2 > 0.5
    ).reindex(grid.cell_id).fillna(False).astype(int)

    return grid


# LOAD DATA
print("Loading data...")

water = gpd.read_file(WATERCOURSE).set_crs(epsg=27700, allow_override=True)
risk = gpd.read_file(FLOOD_RISK).set_crs(epsg=27700, allow_override=True)
historic = gpd.read_file(HISTORIC_FLOOD).set_crs(epsg=27700, allow_override=True)
road = gpd.read_file(ROAD).set_crs(epsg=27700, allow_override=True)
hospital = gpd.read_file(HOSPITAL).set_crs(epsg=27700, allow_override=True)
boundary = gpd.read_file(BOUNDARY).set_crs(epsg=27700, allow_override=True)

# GRID
print("Building grid...")
grid = build_grid(boundary)
grid.to_file("data/grid/grid_step_01.shp")

# LINE DENSITIES
print("Calculating watercourse density...")
grid = line_density(grid, water, "water_dens", CELL_SIZE)

print("Calculating road density...")
grid = line_density(grid, road, "road_dens", CELL_SIZE)

# NEAREST DISTANCES
print("Calculating nearest watercourse distance...")
grid = nearest_distance(grid, water, "water_dist", km=False)

print("Calculating nearest hospital distance...")
grid = nearest_distance(grid, hospital, "hospital")

print("Calculating nearest major road distance...")
major_roads = road[road["function"].isin(["A Road", "Motorway"])]
grid = nearest_distance(grid, major_roads, "road_dist")

# FLOOD RISK
print("Calculating flood risk score...")
grid = flood_risk_score(grid, risk)

# RASTER FEATURES
print("Calculating elevation...")
grid = zonal_mean(grid, ELEVATION, "elevation")

print("Calculating impervious fraction...")
grid = zonal_fraction_nonzero(grid, IMPERVIOUS, "impervious")

# HISTORIC FLOOD
print("Calculating historic flood flag...")
grid = historic_flood_flag(grid, historic)

# SAVE OUTPUT
cols = [
    "water_dens", "water_dist", "risk_score", "elevation",
    "impervious", "historic", "road_dens", "road_dist", "hospital", "geometry"
]

grid[cols].to_file(OUTPUT)'''

In [None]:
# Load the pre-built overall grid shapefile (one row per grid cell).
gdf = gpd.read_file('data/grid/grid.shp')

# Per-cell attribute columns that are sampled from the grid.
features = [
    "water_dens",
    "water_dist",
    "risk_score",
    "elevation",
    "impervious",
    "historic",
    "road_dens",
    "road_dist",
    "hospital",
]

def sample_cell_with_noise(gdf, features, noise_scale=0.05):
    """
    Pick one random row of ``gdf`` and return its features with Gaussian jitter.

    For each name in ``features`` the row's value gets zero-mean normal noise
    whose standard deviation is ``noise_scale * abs(value + 1e-6)``; the result
    is floored at 0. The row's ``geometry`` is returned unchanged under the
    key ``"geometry"``.
    """
    picked = gdf.sample(1).iloc[0]

    def _jittered(value):
        # Noise magnitude scales with the value itself.
        jitter = np.random.normal(0, noise_scale * abs(value + 1e-6))
        return max(value + jitter, 0)

    noisy = {}
    for name in features:
        noisy[name] = _jittered(picked[name])
    noisy["geometry"] = picked.geometry
    return noisy

def sample_neighborhood(gdf, features, k=5):
    """
    Average ``features`` over the ``k`` rows closest to one randomly picked row.

    Distances are measured from every row's geometry to the picked row's
    geometry, so the picked row itself (distance 0) is among the candidates.
    Returns a ``{feature: mean}`` dict.
    """
    seed_geometry = gdf.sample(1).geometry.iloc[0]
    nearest_index = gdf.geometry.distance(seed_geometry).nsmallest(k).index
    return gdf.loc[nearest_index][features].mean().to_dict()

# Draw one jittered grid cell (later cells read its "geometry") and,
# independently, a neighbourhood average around another random cell.
grid_sample = sample_cell_with_noise(gdf, features)
neighborhood_sample = sample_neighborhood(gdf, features)
print(grid_sample)


### Soil moisture saturation (WORK IN PREC)

In [None]:
# Soil moisture sample for the chosen grid cell:
# - keep only the rasters whose file date falls in the sampled season
# - take the raster pixel under the centroid of the sampled grid cell
# - fit a normal distribution to that pixel's time series and draw once
# If there is no usable data (no seasonal raster, or only nodata at the pixel)
# soil_sample is NaN.

cell_geom = grid_sample["geometry"]  # polygon of the sampled grid cell

# Select the rasters belonging to the sampled season
tiff_files = sorted(glob.glob("data/soil_moisture/*.tif"))
season_tiffs = []

for f in tiff_files:
    # Extract date from filename: dt_smuk_2023-12-22.tif
    # Only the base name is parsed so the directory part can never interfere.
    date_str = os.path.basename(f).split("_")[-1].replace(".tif", "")
    file_date = dt.datetime.strptime(date_str, "%Y-%m-%d")
    if file_date.month in SEASON_MONTHS[season_sample]:
        season_tiffs.append(f)

# Stays empty when no raster is available for the season (previously this
# case crashed with IndexError on season_tiffs[0]).
values = np.array([], dtype=np.float32)

if season_tiffs:
    # The first raster supplies the georeferencing and the fallback nodata value.
    with rasterio.open(season_tiffs[0]) as src:
        transform = src.transform
        nodata = src.nodata

    # Load soil moisture rasters
    stack = []
    for f in season_tiffs:
        with rasterio.open(f) as s:
            data = s.read(1).astype(np.float32)
            # Prefer the file's own nodata value; fall back to the first file's.
            file_nodata = s.nodata if s.nodata is not None else nodata
            if file_nodata is not None:
                data[data == file_nodata] = np.nan
            stack.append(data)

    stack = np.stack(stack, axis=0)  # shape: (time, rows, cols)

    # Find the single pixel containing the centroid of the sampled grid cell
    centroid_x, centroid_y = cell_geom.centroid.x, cell_geom.centroid.y
    col, row = ~transform * (centroid_x, centroid_y)
    row, col = int(row), int(col)

    # Ensure row/col are within raster bounds
    # NOTE(review): a centroid outside the raster silently maps to an edge pixel.
    row = np.clip(row, 0, stack.shape[1]-1)
    col = np.clip(col, 0, stack.shape[2]-1)

    # Extract time series for that pixel
    values = stack[:, row, col]
    values = values[~np.isnan(values)]

# Fit normal distribution safely
if len(values) == 0:
    soil_sample = np.nan
else:
    mu, sigma = norm.fit(values)
    sigma = max(sigma, 1e-6)  # a constant series would give sigma == 0
    soil_sample = norm.rvs(mu, sigma)

print(soil_sample)



### Flood depth

In [None]:
# Clip every flood-risk shapefile to the bounding box of the sampled grid
# cell, keyed by the shapefile's base name with ".shp" removed.
clipped_data = {}

for shp_path in glob.glob('data/flood_risk/*/*.shp'):
    gdf_shp = gpd.read_file(shp_path)

    # (minx, miny, maxx, maxy) of the sampled cell is used as the clip mask.
    cell_bounds = grid_sample["geometry"].bounds
    gdf_shp_clipped = gdf_shp.clip(cell_bounds)

    # Store clipped data
    key = os.path.basename(shp_path).replace(".shp","")
    clipped_data[key] = gdf_shp_clipped


In [None]:
# Sampled grid cell as a one-element GeoSeries; it is built without a CRS, so
# EPSG:27700 is assigned.
grid_geom = gpd.GeoSeries([grid_sample["geometry"]])
if grid_geom.crs is None:
    grid_geom = grid_geom.set_crs(epsg=27700)

# Load each flood-risk shapefile, bring it into the cell's CRS and clip it to
# the cell. Layers with nothing inside the cell are left out of the dict.
# The loop variable is named `layer` (not `gdf`) so the grid GeoDataFrame
# loaded earlier as `gdf` is not overwritten.
threshold_layers = {}
for shp_path in glob.glob("data/flood_risk/*/*.shp"):
    layer = gpd.read_file(shp_path)
    if layer.crs != grid_geom.crs:
        layer = layer.to_crs(grid_geom.crs)
    layer = layer.clip(grid_geom)

    if not layer.empty:
        key = os.path.basename(shp_path)  # file name including ".shp"
        threshold_layers[key] = layer


In [None]:
# Depth thresholds (m) for which a separate flood-risk layer exists.
DEPTH_THRESHOLDS = [0.2, 0.3, 0.6, 0.9, 1.2]

# Probability assigned to each risk band: midpoint of the band's percentage
# range, expressed as a fraction.
# NOTE(review): keys must match the spelling in the layers' "risk_band" column;
# unmatched bands map to NaN and drop out of the sums below.
RISK_BAND_WEIGHTS = {
    "Very Low": 0.0005,  # (0% + 0.1%) / 2
    "Low": 0.0055,       # (0.1% + 1%) / 2  (was 0.00055, ten times too small)
    "Medium": 0.0215,    # (1% + 3.3%) / 2
    "High": 0.033        # (3.3% + 3.3%) / 2
}

# Depth threshold (m) represented by each flood-risk shapefile.
FILE_DEPTH = {
    "merged_rofsw_4bandPolygon.shp": 0.0,
    "merged_rofsw_4band_0_2m_depthPolygon.shp": 0.2,
    "merged_rofsw_4band_0_3m_depthPolygon.shp": 0.3,
    "merged_rofsw_4band_0_6m_depthPolygon.shp": 0.6,
    "merged_rofsw_4band_0_9m_depthPolygon.shp": 0.9,
    "merged_rofsw_4band_1_2m_depthPolygon.shp": 1.2
}

cell_area = grid_sample["geometry"].area

# Start every depth at probability 0: layers with nothing inside the cell are
# absent from threshold_layers, and a missing depth would otherwise raise
# KeyError when the bins are built below.
exceedance_probs = {depth: 0.0 for depth in FILE_DEPTH.values()}

# Exceedance probability = risk-weighted flooded area / cell area.
for file, layer in threshold_layers.items():
    risk_weight = layer["risk_band"].map(RISK_BAND_WEIGHTS)
    weighted_area = layer.geometry.area * risk_weight
    total_weighted_area = weighted_area.sum()
    prob = total_weighted_area / cell_area
    exceedance_probs[FILE_DEPTH[file]] = prob

# Exceeding a deeper threshold cannot be more likely than exceeding a
# shallower one, and no probability can exceed 1. Enforcing this keeps every
# bin probability below non-negative (np.random.choice rejects negatives).
running_max = 1.0
for depth in sorted(exceedance_probs):
    running_max = min(running_max, exceedance_probs[depth])
    exceedance_probs[depth] = running_max

for depth in sorted(exceedance_probs):
    prob = exceedance_probs[depth]
    print(f"Depth > {depth} m: Probability = {prob:.20f}")

print(exceedance_probs)

# Turn exceedance probabilities into probabilities per depth bin:
# (0, 0) = no flooding, (1.2, inf) = deeper than the last threshold, and each
# (a, b) in between = P(depth > a) - P(depth > b). The bins sum to 1.
breaks = [0, *DEPTH_THRESHOLDS]
probs = {(0.0, 0.0): 1 - exceedance_probs[0], (1.2, np.inf): exceedance_probs[1.2]}
for i in range(len(breaks)-1):
    a, b = breaks[i], breaks[i+1]
    probs[(a, b)] = exceedance_probs[a] - exceedance_probs[b]
print(probs)




In [None]:
# Step 1: choose a depth bin according to the bin probabilities.
bins = list(probs)
range_probs = np.array([probs[depth_bin] for depth_bin in bins])
range_index = np.random.choice(len(bins), p=range_probs)
low, high = bins[range_index]

# Step 2: choose a depth inside the selected bin.
if np.isinf(high):
    # Open-ended top bin: lower edge plus an exponential tail (scale 0.3).
    depth_sample = low + np.random.exponential(scale=0.3)
elif low != 0.0 or high != 0.0:
    # Ordinary bin: uniform between its edges.
    depth_sample = np.random.uniform(low, high)
else:
    # The (0, 0) bin means no flooding.
    depth_sample = 0.0

print("\nSelected depth range:", (low, high))
print(depth_sample)



### Depth-damage

In [None]:
# Depth-damage table: every column after the first is named by a depth value.
df = pd.read_csv("data/depth_damage.csv")
depth_columns = df.columns[1:]
depths = np.array([float(label) for label in depth_columns])

# Compute overall damage fraction (mean across all rows, i.e. all types)
overall_damage = df[depth_columns].astype(float).mean(axis=0).values

# Saturating exponential damage curve
def exp_damage(d, k):
    """Damage fraction 1 - exp(-k * d) for depth ``d`` and rate ``k``."""
    remaining = np.exp(-k * d)
    return 1 - remaining

# Fit the rate k of the exponential curve to the averaged damage values
# (k constrained to be non-negative).
params, _ = curve_fit(exp_damage, depths, overall_damage, bounds=(0, np.inf))
k = params[0]

# Damage at the sampled depth plus Gaussian noise (sd 0.05), kept within [0, 1].
# NOTE(review): the noise is also added when depth_sample is 0, so an unflooded
# cell can get a small positive damage fraction - confirm this is intended.
expected_damage = exp_damage(depth_sample, k)
damage_noise = np.random.normal(0, 0.05)
damage_fraction_sample = np.clip(expected_damage + damage_noise, 0, 1)
print(damage_fraction_sample)


## Other TEMP

### Disability rate

In [None]:
# Draw a disability rate from a Beta distribution parameterised by the
# observed counts of disabled / not disabled people.
disability = pd.read_csv("data/disabled.csv")
total_disability = disability.groupby('Disability (3 categories)')['Observation'].sum()

# +1 on each count, matching the English-proficiency cell: it keeps both Beta
# parameters strictly positive, so a zero count cannot make np.random.beta raise.
alpha = total_disability['Disabled under the Equality Act'] + 1
beta = total_disability['Not disabled under the Equality Act'] + 1

disabled_sample = np.random.beta(alpha, beta, size=1)[0]
print(disabled_sample)

### English proficiency

In [None]:
def map_proficiency(category):
    """
    Collapse a census English-proficiency category into two groups.

    Returns "Good English Proficiency" for English as main language or for
    speaking it well / very well, "Bad English Proficiency" for not speaking
    it (well), and None for any other category.
    """
    good_categories = (
        "Main language is English (English or Welsh in Wales)",
        "Main language is not English (English or Welsh in Wales): Can speak English very well or well",
    )
    bad_category = "Main language is not English (English or Welsh in Wales): Cannot speak English or cannot speak English well"

    if category in good_categories:
        return "Good English Proficiency"
    if category == bad_category:
        return "Bad English Proficiency"
    return None

# Draw the share of people with good English proficiency from a Beta
# distribution parameterised by the grouped census counts (+1 each).
english = pd.read_csv("data/english_proficiency.csv")

proficiency_col = 'Proficiency in English language (4 categories)'
total_english = english.groupby(proficiency_col)['Observation'].sum()

# Collapse the four census categories into good / bad; rows mapped to None
# are left out by the groupby.
english['Proficiency_Group'] = english[proficiency_col].apply(map_proficiency)
grouped_english = english.groupby('Proficiency_Group')['Observation'].sum()
#english_probs = grouped_english / grouped_english.sum()

alpha = grouped_english['Good English Proficiency'] + 1
beta = grouped_english['Bad English Proficiency'] + 1

english_sample = np.random.beta(alpha, beta, size=1)[0]
print(english_sample)

### General health

In [None]:
# Draw one general-health category, weighted by its census observations and
# leaving out the 'Does not apply' category.
df = pd.read_csv('data/general_health.csv')
gen_health_sample = sample_categorical_census(
    df,
    category_col='General health (4 categories)',
    value_col='Observation',
    ignore_categories=['Does not apply'],
)
print(gen_health_sample)

### Elderly rate

In [None]:
# The age table is loaded but not used yet.
age = pd.read_csv('data/age.csv')
# Hard-coded value: no elderly rate is derived from the data so far.
elderly_sample = 1 
print(elderly_sample)

### Children rate

In [None]:
# Hard-coded value: no children rate is derived from any data so far.
child_sample = 1
print(child_sample)

### Vehicle

In [None]:
# The vehicle table is loaded but not used yet.
vehicle = pd.read_csv('data/vehicle.csv')
# Hard-coded value: no vehicle rate is derived from the data so far.
vehicle_sample = 1
print(vehicle_sample)

### Second address

In [None]:
# The second-address table is loaded but not used yet.
second_add = pd.read_csv('data/second_address.csv')
# Hard-coded value: no second-address rate is derived from the data so far.
second_add_sample = 1
print(second_add_sample)

## Derived

In [None]:
'''
UNUSED:
- disabled_sample -- CHANGE TO VARIABLE PERCENTAGE
- english_sample
- low_income_sample
- property_sample
- income_sample (used by low_income_sample)
- building_age_sample -- TURN INTO ACTUAL NUMBER NOT RANGE
- adjusted_response
- depth_sample (used by damage_fraction_sample)
- damage_fraction_sample
- grid_sample
    "water_dens", "water_dist", "risk_score", "elevation",
    "impervious", "historic", "road_dens", "road_dist", "hospital"
- gen_health_sample -- MAYBE TURN INTO PERCENTAGE SOMEHOW OR REMOVE ENTIRELY
- handover_sample -- TURN INTO ACTUAL NUMBER IN MINS NOT RANGE
- occ_pct_sample
- elderly -- TURN INTO PERCENTAGE
- children -- TURN INTO PERCENTAGE
- vehicle_sample -- TURN INTO PERCENTAGE
- second address -- TURN INTO PERCENTAGE

UNUSED, use for depth:
- soil_sample
- gw_sample
- rf_sample
- rl_sample

'''

### Physical vulnerability

In [None]:
'''
    "Mean building age",
    "Proportion of buildings brick/stone",
    "Impervious surface area",
    "Drainage",
    "Road network density",
    "Distance to water",
    "Urban/rural",
    "Elevation above sea level",
    "Population density",
'''

### Socioeconomic vulnerability

In [None]:
'''
    "Age",
    "Elderly rate",
    "Children rate",
    "Disability rate",
    "English proficiency",
    "Mean income",
    "Low-income fraction",
    "Employment history",
    "Highest qualification",
    "Economic activity status",
    "Occupation current",
    "Ns-Sec",
    "Tenure of household",
    "HBAI statistics",
    "General health",
    "Long-term health condition",
    "Mean property value",
    "Vehicle",
    "Second address",
    "Accommodation type",
    "Household",
    "Deprived in education",
    "Deprived in employment",
    "Deprived in health and disability",
    "Deprived in housing",
    "Household size",
    "Families in household",
    "Adults and children in household",
    "Adults employed in household",
    "Disabled in household",
    "Long-term health in household",
    "People per room in household",
    "Occupancy rating for rooms",
    "Household composition",
    "Household deprivation",
    "Lifestage of HRP",
    "Mental health cost",
    "Mental health types",
'''

### Preparedness

In [None]:
'''
    "Warning issued",
    "Emergency response time",
    "Access to communications",
    "Household access to internet",
'''

### Recovery capacity

In [None]:
'''
    "Ambulance handover delays",
    "Hospital bed availability",
    "Hospital locations",
    "Reconstruction time",
    "Home insurance coverage",
    "Health insurance coverage",
    "Local government budget",
'''

### Exposure

In [None]:
'''
Population density
Mean property value
Holiday binary
'''

### Overall vulnerability

In [None]:
'''
Physical vulnerability
Socioeconomic vulnerability
Preparedness
Response capacity
'''

### Impact score

In [None]:
'''
Physical vulnerability
Socioeconomic vulnerability
Preparedness 
Response capacity
Overall vulnerability
Exposure
Depth-damage curve
Flood depth
Population density
'''