In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Seed NumPy's legacy global generator so every random draw below is repeatable.
np.random.seed(42)

# Folder layout, expressed relative to the notebook's working directory.
BASE_DIR = Path("..")
RAW_DIR = BASE_DIR.joinpath("data", "raw")
PROC_DIR = BASE_DIR.joinpath("data", "processed")

# Create the output folder (and any missing parents); harmless if it exists.
PROC_DIR.mkdir(parents=True, exist_ok=True)

print("Directories ready")

n = 5_000  # number of grid cells



Directories ready


In [2]:
# Build the synthetic grid table one column at a time.
# The draws use NumPy's global random state, so the order of the statements
# below determines the generated values -- do not reorder them.
columns = {"grid_id": range(n)}
columns["LST"] = np.random.normal(35, 3, size=n)
columns["NDVI"] = np.random.uniform(0.05, 0.6, size=n)
columns["NDBI"] = np.random.uniform(0.2, 0.8, size=n)
columns["AirTemp"] = np.random.normal(33, 2.5, size=n)
columns["RH"] = np.random.uniform(50, 85, size=n)
columns["Building_density"] = np.random.uniform(0.2, 0.9, size=n)
columns["Road_density"] = np.random.uniform(0.1, 0.7, size=n)
columns["Dist_green"] = np.random.uniform(0, 500, size=n)
columns["Population"] = np.random.poisson(150, size=n)
columns["Elderly_pct"] = np.random.uniform(0.05, 0.18, size=n)
columns["Elevation"] = np.random.uniform(0, 20, size=n)

# One row per grid cell; column order follows insertion order above.
df = pd.DataFrame(columns)



In [3]:
# NOTE(review): this expression is not the last statement of the cell, so its
# value appears to be discarded rather than rendered -- confirm whether it was
# meant to live in its own cell.
df.head()
def normalize(col):
    """Min-max scale ``col`` so its smallest value maps to 0 and largest to 1.

    Parameters
    ----------
    col : pandas.Series (or any object with ``min``/``max``/``astype`` and
        element-wise arithmetic)
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as ``col``, holding ``(col - min) / (max - min)``.
    When every value is identical (``max == min``) the result is all 0.0
    instead of the NaNs a 0/0 division would produce.
    """
    lowest = col.min()
    span = col.max() - lowest
    if span == 0:
        # Constant column: there is no range to scale by, so map it to 0.0.
        return (col - lowest).astype(float)
    return (col - lowest) / span

# Columns that receive a min-max scaled "<name>_norm" companion column.
norm_cols = [
    "LST",
    "NDVI",
    "NDBI",
    "AirTemp",
    "RH",
    "Building_density",
    "Road_density",
    "Dist_green",
    "Population",
    "Elderly_pct",
    "Elevation",
]

# Append the scaled columns to df in the order listed above.
for col in norm_cols:
    scaled_name = f"{col}_norm"
    df[scaled_name] = normalize(df[col])

# Final expression of the cell: preview the first rows, including new columns.
df.head()


Unnamed: 0,grid_id,LST,NDVI,NDBI,AirTemp,RH,Building_density,Road_density,Dist_green,Population,...,NDVI_norm,NDBI_norm,AirTemp_norm,RH_norm,Building_density_norm,Road_density_norm,Dist_green_norm,Population_norm,Elderly_pct_norm,Elevation_norm
0,0,36.490142,0.14253,0.387594,29.816968,66.567484,0.255843,0.227492,143.292821,144,...,0.168248,0.312571,0.29109,0.473489,0.079735,0.212527,0.286605,0.4,0.836729,0.532846
1,1,34.585207,0.154464,0.30694,29.705501,64.674033,0.345089,0.515208,455.893012,129,...,0.189954,0.178117,0.285595,0.419363,0.207273,0.692157,0.911884,0.242105,0.57103,0.785819
2,2,36.943066,0.303392,0.715509,27.46433,81.047442,0.301683,0.246787,227.57312,151,...,0.460834,0.859225,0.175115,0.887414,0.145242,0.244692,0.455187,0.473684,0.629511,0.290095
3,3,39.56909,0.207454,0.442114,35.043836,58.580181,0.781546,0.666102,401.862749,167,...,0.286336,0.403459,0.548753,0.245163,0.830991,0.943702,0.80381,0.642105,0.358033,0.804869
4,4,34.29754,0.186105,0.439126,35.069144,63.940852,0.803713,0.32057,127.908442,167,...,0.247504,0.398479,0.55,0.398404,0.862668,0.367691,0.255832,0.642105,0.841634,0.309114


In [4]:
# Write the feature table (raw + normalised columns) to the processed folder,
# leaving the DataFrame index out of the file.
output_path = PROC_DIR.joinpath("mumbai_grid_features.csv")
df.to_csv(path_or_buf=output_path, index=False)

print(f"Saved file to: {output_path}")


Saved file to: ..\data\processed\mumbai_grid_features.csv


In [5]:
# Sanity check: the table must not contain missing values in any column.
missing_total = int(df.isnull().sum().sum())
assert missing_total == 0, f"Found {missing_total} missing values in df"
print("No missing values ✔")

# Keep the summary as the cell's final expression so Jupyter renders it; a bare
# expression that is not last in a cell is evaluated and then discarded.
df.describe()[["LST", "AirTemp", "NDVI", "Population"]]


No missing values ✔
