### inspect shapefiles


### package instalation

In [0]:
!pip install geopandas rasterio folium shapely

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting geopandas
  Downloading geopandas-1.1.1-py3-none-any.whl (338 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 338.4/338.4 kB 7.4 MB/s eta 0:00:00
Collecting rasterio
  Downloading rasterio-1.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 22.2/22.2 MB 35.5 MB/s eta 0:00:00
Collecting folium
  Downloading folium-0.20.0-py2.py3-none-any.whl (113 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 113.4/113.4 kB 13.6 MB/s eta 0:00:00
Collecting shapely
  Downloading shapely-2.1.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 36.6 MB/s eta 0:00:00
Collecting numpy>=1.24
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.8/16.8 MB 11

In [0]:
import sys, os, psutil, importlib
libs = ["geopandas","shapely","rasterio","pandas","numpy","pyproj","requests"]
status = {}
for lib in libs:
    try:
        importlib.import_module(lib)
        status[lib] = "OK"
    except ImportError:
        status[lib] = "MISSING"
print("Python:", sys.version)
print("Working dir:", os.getcwd())
print("Lib status:", status)
print("CPUs logical:", psutil.cpu_count(logical=True))
print("RAM GB:", round(psutil.virtual_memory().total/1024**3,2))

Python: 3.10.12 (main, Aug 15 2025, 14:32:43) [GCC 11.4.0]
Working dir: /Workspace/Users/npokkiri@munichre.com/jrc
Lib status: {'geopandas': 'OK', 'shapely': 'OK', 'rasterio': 'OK', 'pandas': 'OK', 'numpy': 'OK', 'pyproj': 'OK', 'requests': 'OK'}
CPUs logical: 8
RAM GB: 57.39


## Grid generation


### config

In [0]:
# EDIT THESE BEFORE RUN
ADMIN_PATH = "/Volumes/prp_mr_bdap_projects/geospatialsolutions/external/geozones/RMS_Admin0_geozones.gpkg"   # confirm path
# INPUT FILES (adjust these)
          # India boundary
TILE_FOOTPRINT_PATH = "./extracted/GHSL2_0_MWD_L1_tile_schema_land.shp"

# NAME OF ADMIN FIELD & VALUE FOR INDIA (fill in exact field & value)
ADMIN_FIELD = "ISO3"         # e.g., 'ADM0_NAME' or 'CNTRY_NAME'
ADMIN_VALUE = "IND"     # exact case-sensitive match

# TILE ID field in tile footprint shapefile (inspect soon)
TILE_ID_FIELD = "tile_id"  # change after inspection if needed

CELL_SIZE = 5000        # meters
TARGET_CRS = "ESRI:54009"
EXPORT_CRS = "EPSG:4326" # lat/lon for output points
DRY_RUN = False           # set False after validation
OUTPUT_CENTROIDS_CSV = "data/grids/india_5km_grid_centroids.csv"
OUTPUT_CENTROIDS_GPKG = "data/grids/india_5km_grid_centroids.gpkg"

MAKE_INDIA_PIXEL_MASK = True
MASK_CACHE_DIR = "data/masks"  
os.makedirs("data/masks", exist_ok=True) 
os.makedirs("data/grids", exist_ok=True)
print("Config loaded. DRY_RUN =", DRY_RUN)

Config loaded. DRY_RUN = False


### Load and inspect

In [0]:
import geopandas as gpd, os

assert os.path.exists(ADMIN_PATH), f"Missing admin file: {ADMIN_PATH}"
assert os.path.exists(TILE_FOOTPRINT_PATH), f"Missing tile footprint: {TILE_FOOTPRINT_PATH}"

admin_full = gpd.read_file(ADMIN_PATH)
tiles_full = gpd.read_file(TILE_FOOTPRINT_PATH)

print("Admin columns:", admin_full.columns.tolist())
print("Tiles columns:", tiles_full.columns.tolist())
print("Admin CRS:", admin_full.crs)
print("Tiles CRS:", tiles_full.crs)
print("Admin sample:\n", admin_full.head(2))
print("Tiles sample:\n", tiles_full.head(2))

if ADMIN_FIELD not in admin_full.columns:
    raise ValueError(f"ADMIN_FIELD '{ADMIN_FIELD}' not in admin columns.")

admin_india = admin_full[admin_full[ADMIN_FIELD]==ADMIN_VALUE].copy()
print("India features:", len(admin_india))
if admin_india.empty:
    raise ValueError("No admin rows matched India. Check ADMIN_FIELD / ADMIN_VALUE.")

Admin columns: ['CountryNam', 'CountryGeo', 'ISO3', 'FIPS', 'ISO3N', 'Region', 'Area_in_sq', 'RMS_ISO2A', 'CountryCod', 'Shape_Leng', 'Shape_Area', 'GKD_ISO2A', 'INSIDE_X', 'INSIDE_Y', 'geometry']
Tiles columns: ['tile_id', 'left', 'top', 'right', 'bottom', 'geometry']
Admin CRS: EPSG:4326
Tiles CRS: PROJCS["World_Mollweide",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Mollweide"],PARAMETER["central_meridian",0],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["ESRI","54009"]]
Admin sample:
             CountryNam  ...                                           geometry
0                Aruba  ...  MULTIPOLYGON (((-70.0126 12.49184, -70.01232 1...
1  Antigua and Barbuda  ...  MULTIPOLYGON (((-61.90275 17.08461, -61.90047 ...

[2 rows x 1




### reproject and filter tiles

In [0]:
# Reproject to target CRS (Mollweide)
admin_india_m = admin_india.to_crs(TARGET_CRS)
tiles_m = tiles_full.to_crs(TARGET_CRS)

# Keep only tiles that spatially intersect India
tiles_m['__idx'] = range(len(tiles_m))
candidate_tiles = tiles_m[tiles_m.geometry.intersects(admin_india_m.unary_union)].copy()
print("Intersecting tiles:", len(candidate_tiles))
print(candidate_tiles[[TILE_ID_FIELD]].head())

if candidate_tiles.empty:
    raise ValueError("No tiles intersect India in provided footprint.")

  candidate_tiles = tiles_m[tiles_m.geometry.intersects(admin_india_m.unary_union)].copy()


Intersecting tiles: 13
    tile_id
259  R5_C25
260  R5_C26
279  R6_C25
280  R6_C26
281  R6_C27


In [0]:
# from grid_generation import assign_tile_id, centroid_table
# grid_with_tiles = assign_tile_id(grid_polygons, tiles_land, tile_id_field=TILE_ID_FIELD)
# print("After tile assignment rows:", len(grid_with_tiles))
# print(grid_with_tiles[['grid_id','tile_id']].head())

# centroids_df = centroid_table(grid_with_tiles)
# print("Centroid table preview:")
# print(centroids_df.head())


### build the global grid

In [0]:
import numpy as np
from shapely.geometry import Point

# Determine global bounds from intersecting tile footprints
xmin, ymin, xmax, ymax = candidate_tiles.total_bounds

# Snap origin for stability
def snap_down(v, step): import math; return math.floor(v/step)*step
def snap_up(v, step): import math; return math.ceil(v/step)*step

x0 = snap_down(xmin, CELL_SIZE)
y0 = snap_down(ymin, CELL_SIZE)
x1 = snap_up(xmax, CELL_SIZE)
y1 = snap_up(ymax, CELL_SIZE)

# Centers, not corners: add half cell
x_centers = np.arange(x0 + CELL_SIZE/2, x1, CELL_SIZE)
y_centers = np.arange(y0 + CELL_SIZE/2, y1, CELL_SIZE)

print("x centers:", len(x_centers), "y centers:", len(y_centers), "total raw:", len(x_centers)*len(y_centers))

# Create all centroids
xx, yy = np.meshgrid(x_centers, y_centers)
flat_x = xx.ravel()
flat_y = yy.ravel()

import geopandas as gpd
centroids_all = gpd.GeoDataFrame(
    {"centroid_x": flat_x, "centroid_y": flat_y},
    geometry=[Point(xy) for xy in zip(flat_x, flat_y)],
    crs=TARGET_CRS
)
print("Full mesh cells:", len(centroids_all))

x centers: 801 y centers: 1000 total raw: 801000
Full mesh cells: 801000


### filter centroids within india

In [0]:
centroids_in = gpd.sjoin(
    centroids_all, 
    admin_india_m[['geometry']], 
    how='inner', 
    predicate='within'
).drop(columns=['index_right'], errors='ignore')
print("Centroids inside India:", len(centroids_in))

Centroids inside India: 131298


### assign tile IDs

In [0]:
# Spatial join with tiles
centroids_tile = gpd.sjoin(
    centroids_in,
    candidate_tiles[[TILE_ID_FIELD,'geometry']],
    how='left',
    predicate='within'  # or intersects if tile edges ambiguous
).rename(columns={TILE_ID_FIELD:'tile_id'}).drop(columns=['index_right'], errors='ignore')

print("Rows after tile join:", len(centroids_tile))

# Resolve duplicates where one centroid got multiple tile rows
# (Occurs if we used 'intersects' and centroid lies exactly at overlapping tile edge.)
dups = centroids_tile.duplicated(subset=['centroid_x','centroid_y'], keep=False)
if dups.any():
    # Keep only first tile per centroid deterministically by sorting tile_id
    centroids_tile = (centroids_tile
                      .sort_values(['centroid_x','centroid_y','tile_id'])
                      .drop_duplicates(subset=['centroid_x','centroid_y'], keep='first'))
    print("After de-dup tile overlaps:", len(centroids_tile))

Rows after tile join: 131298


### compute stable IDs and bounds -

is this really needed though?

In [0]:
import numpy as np

# grid indices from stable origin
i_index = np.round((centroids_tile['centroid_x'] - (x0 + CELL_SIZE/2)) / CELL_SIZE).astype(int)
j_index = np.round((centroids_tile['centroid_y'] - (y0 + CELL_SIZE/2)) / CELL_SIZE).astype(int)

centroids_tile['i_idx'] = i_index
centroids_tile['j_idx'] = j_index
centroids_tile['grid_id'] = "G_" + centroids_tile['j_idx'].astype(str) + "_" + centroids_tile['i_idx'].astype(str)

# Compute square bounds directly from centroid
centroids_tile['grid_minx'] = centroids_tile['centroid_x'] - CELL_SIZE/2
centroids_tile['grid_maxx'] = centroids_tile['centroid_x'] + CELL_SIZE/2
centroids_tile['grid_miny'] = centroids_tile['centroid_y'] - CELL_SIZE/2
centroids_tile['grid_maxy'] = centroids_tile['centroid_y'] + CELL_SIZE/2

print("Sample rows with IDs and bounds:")
print(centroids_tile[['grid_id','tile_id','centroid_x','centroid_y','grid_minx','grid_miny']].head())

Sample rows with IDs and bounds:
          grid_id tile_id  centroid_x  centroid_y  grid_minx  grid_miny
134448  G_167_681  R9_C28   9362500.0    837500.0  9360000.0   835000.0
135249  G_168_681  R9_C28   9362500.0    842500.0  9360000.0   840000.0
135250  G_168_682  R9_C28   9367500.0    842500.0  9365000.0   840000.0
136049  G_169_680  R9_C28   9357500.0    847500.0  9355000.0   845000.0
136050  G_169_681  R9_C28   9362500.0    847500.0  9360000.0   845000.0


### convert to lat lon

In [0]:
centroids_wgs84 = centroids_tile.to_crs(EXPORT_CRS)
centroids_tile['lon'] = centroids_wgs84.geometry.x
centroids_tile['lat'] = centroids_wgs84.geometry.y
print("Added lon/lat.")

Added lon/lat.



### export


In [0]:
EXPORT_COLS = [
    'grid_id','tile_id',
    'centroid_x','centroid_y','lon','lat',
    'grid_minx','grid_miny','grid_maxx','grid_maxy',
    'i_idx','j_idx'
]
out_df = centroids_tile[EXPORT_COLS].copy()

if DRY_RUN:
    out_df.to_csv(OUTPUT_CENTROIDS_CSV, index=False)
    centroids_tile[['grid_id','tile_id','geometry']].to_file(OUTPUT_CENTROIDS_GPKG, driver="GPKG")
    print("Saved CSV:", OUTPUT_CENTROIDS_CSV)
    print("Saved GPKG:", OUTPUT_CENTROIDS_GPKG)
else:
    out_df.to_csv(OUTPUT_CENTROIDS_CSV, index=False)
    centroids_tile[['grid_id','tile_id','geometry']].to_file(OUTPUT_CENTROIDS_GPKG, driver="GPKG")
    print("Saved CSV:", OUTPUT_CENTROIDS_CSV)
    print("Saved GPKG:", OUTPUT_CENTROIDS_GPKG)

Saved CSV: data/grids/india_5km_grid_centroids.csv
Saved GPKG: data/grids/india_5km_grid_centroids.gpkg


In [0]:
# Basic checks
import pandas as pd
if not DRY_RUN:
    df = pd.read_csv(OUTPUT_CENTROIDS_CSV, nrows=5)
    print("CSV columns:", df.columns.tolist())
    print(df.head())
print("Unique tile count:", centroids_tile['tile_id'].nunique())
print("Total grids:", len(centroids_tile))
print("Any missing tile_id?", centroids_tile['tile_id'].isna().sum())

CSV columns: ['grid_id', 'tile_id', 'centroid_x', 'centroid_y', 'lon', 'lat', 'grid_minx', 'grid_miny', 'grid_maxx', 'grid_maxy', 'i_idx', 'j_idx']
     grid_id tile_id  centroid_x  ...  grid_maxy  i_idx  j_idx
0  G_167_681  R9_C28   9362500.0  ...   840000.0    681    167
1  G_168_681  R9_C28   9362500.0  ...   845000.0    681    168
2  G_168_682  R9_C28   9367500.0  ...   845000.0    682    168
3  G_169_680  R9_C28   9357500.0  ...   850000.0    680    169
4  G_169_681  R9_C28   9362500.0  ...   850000.0    681    169

[5 rows x 12 columns]
Unique tile count: 12
Total grids: 131298
Any missing tile_id? 0


## Raster data extraction

In [0]:
import os, pandas as pd, geopandas as gpd

# === EDIT AS NEEDED ===
CENTROIDS_CSV = "data/grids/india_5km_grid_centroids.csv"
TILES_ROOT    = "data/tiles"
OUTPUT_COUNTS_DIR = "data/class_counts_output"
DATASETS = ["built_c","smod"]          # choose subset if needed
DRY_RUN = True                         # set to False once paths verified
INCLUDE_NODATA = True
KEEP_ORIGINAL_SMOD = True              # set False if you only want reclass_0/1/2
# MAKE_INDIA_PIXEL_MASK = True          # set True later if you want per-pixel border masking
# MASK_CACHE_DIR = "data/masks"          # used only if MAKE_INDIA_PIXEL_MASK=True

# # For optional mask: path to India admin shapefile in same CRS as rasters or WGS84 (will reproject)
# ADMIN_PATH = "data/extracted/GAUL_2024_L1.shp"
# ADMIN_FIELD = "ADM0_NAME"   # <-- adjust
# ADMIN_VALUE = "INDIA"       # <-- adjust
# TARGET_CRS = "ESRI:54009"   # raster CRS for both datasets

print("CONFIG LOADED")
print("Centroids CSV exists?", os.path.exists(CENTROIDS_CSV))
print("Tiles root exists?", os.path.isdir(TILES_ROOT))
print("Output dir?", OUTPUT_COUNTS_DIR)
os.makedirs(OUTPUT_COUNTS_DIR, exist_ok=True)
if MAKE_INDIA_PIXEL_MASK:
    os.makedirs(MASK_CACHE_DIR, exist_ok=True)

# Peek at centroids
if os.path.exists(CENTROIDS_CSV):
    centroids_preview = pd.read_csv(CENTROIDS_CSV, nrows=5)
    print("Centroids columns:", centroids_preview.columns.tolist())
    display(centroids_preview)
else:
    print("Centroids CSV missing. STOP before next cell.")
import os, pandas as pd, geopandas as gpd

# === EDIT AS NEEDED ===
CENTROIDS_CSV = "data/grids/india_5km_grid_centroids.csv"
TILES_ROOT    = "data/tiles"
OUTPUT_COUNTS_DIR = "data/class_counts_output"
DATASETS = ["built_c","smod"]          # choose subset if needed
DRY_RUN = False                       # set to False once paths verified
INCLUDE_NODATA = False
KEEP_ORIGINAL_SMOD = False              # set False if you only want reclass_0/1/2
MAKE_INDIA_PIXEL_MASK = False          # set True later if you want per-pixel border masking
MASK_CACHE_DIR = "data/masks"          # used only if MAKE_INDIA_PIXEL_MASK=True

# For optional mask: path to India admin shapefile in same CRS as rasters or WGS84 (will reproject)
# ADMIN_PATH = "data/extracted/GAUL_2024_L1.shp"
# ADMIN_FIELD = "ADM0_NAME"   # <-- adjust
# ADMIN_VALUE = "INDIA"       # <-- adjust
# TARGET_CRS = "ESRI:54009"   # raster CRS for both datasets

print("CONFIG LOADED")
print("Centroids CSV exists?", os.path.exists(CENTROIDS_CSV))
print("Tiles root exists?", os.path.isdir(TILES_ROOT))
print("Output dir?", OUTPUT_COUNTS_DIR)
os.makedirs(OUTPUT_COUNTS_DIR, exist_ok=True)
if MAKE_INDIA_PIXEL_MASK:
    os.makedirs(MASK_CACHE_DIR, exist_ok=True)

# Peek at centroids
if os.path.exists(CENTROIDS_CSV):
    centroids_preview = pd.read_csv(CENTROIDS_CSV, nrows=5)
    print("Centroids columns:", centroids_preview.columns.tolist())
    display(centroids_preview)
else:
    print("Centroids CSV missing. STOP before next cell.")

CONFIG LOADED
Centroids CSV exists? True
Tiles root exists? True
Output dir? data/class_counts_output
Centroids columns: ['grid_id', 'tile_id', 'centroid_x', 'centroid_y', 'lon', 'lat', 'grid_minx', 'grid_miny', 'grid_maxx', 'grid_maxy', 'i_idx', 'j_idx']


grid_id,tile_id,centroid_x,centroid_y,lon,lat,grid_minx,grid_miny,grid_maxx,grid_maxy,i_idx,j_idx
G_167_681,R9_C28,9362500.0,837500.0,93.82220089409869,6.779494965967758,9360000.0,835000.0,9365000.0,840000.0,681,167
G_168_681,R9_C28,9362500.0,842500.0,93.82708664568874,6.820042844564184,9360000.0,840000.0,9365000.0,845000.0,681,168
G_168_682,R9_C28,9367500.0,842500.0,93.87719456913102,6.820042844564184,9365000.0,840000.0,9370000.0,845000.0,682,168
G_169_680,R9_C28,9357500.0,847500.0,93.78189170071516,6.860592037295733,9355000.0,845000.0,9360000.0,850000.0,680,169
G_169_681,R9_C28,9362500.0,847500.0,93.83200224931292,6.860592037295733,9360000.0,845000.0,9365000.0,850000.0,681,169


CONFIG LOADED
Centroids CSV exists? True
Tiles root exists? True
Output dir? data/class_counts_output
Centroids columns: ['grid_id', 'tile_id', 'centroid_x', 'centroid_y', 'lon', 'lat', 'grid_minx', 'grid_miny', 'grid_maxx', 'grid_maxy', 'i_idx', 'j_idx']


grid_id,tile_id,centroid_x,centroid_y,lon,lat,grid_minx,grid_miny,grid_maxx,grid_maxy,i_idx,j_idx
G_167_681,R9_C28,9362500.0,837500.0,93.82220089409869,6.779494965967758,9360000.0,835000.0,9365000.0,840000.0,681,167
G_168_681,R9_C28,9362500.0,842500.0,93.82708664568874,6.820042844564184,9360000.0,840000.0,9365000.0,845000.0,681,168
G_168_682,R9_C28,9367500.0,842500.0,93.87719456913102,6.820042844564184,9365000.0,840000.0,9370000.0,845000.0,682,168
G_169_680,R9_C28,9357500.0,847500.0,93.78189170071516,6.860592037295733,9355000.0,845000.0,9360000.0,850000.0,680,169
G_169_681,R9_C28,9362500.0,847500.0,93.83200224931292,6.860592037295733,9360000.0,845000.0,9365000.0,850000.0,681,169



### download tiles

In [0]:
import pandas as pd, os

centroids_df = pd.read_csv(CENTROIDS_CSV)
print("Total grids (rows):", len(centroids_df))
print("Columns:", centroids_df.columns.tolist())

tile_ids = sorted(centroids_df['tile_id'].dropna().unique())
print("Tile IDs:", tile_ids)

# Check local directories for each dataset
status_rows = []
for tile in tile_ids:
    for ds in DATASETS:
        tile_dir = os.path.join(TILES_ROOT, ds, tile)
        tif_ok = False
        tif_name = None
        if os.path.isdir(tile_dir):
            tifs = [f for f in os.listdir(tile_dir) if f.lower().endswith(".tif")]
            if tifs:
                tif_ok = True
                tif_name = tifs[0]
        status_rows.append({"tile_id": tile, "dataset": ds, "dir_exists": os.path.isdir(tile_dir), "tif_found": tif_ok, "tif_name": tif_name})
import pandas as pd
tile_status = pd.DataFrame(status_rows)
display(tile_status)
missing = tile_status[(tile_status.dir_exists==False) | (tile_status.tif_found==False)]
print("Tiles needing download:", len(missing), "rows")
# display(missing.head(10))

Total grids (rows): 131298
Columns: ['grid_id', 'tile_id', 'centroid_x', 'centroid_y', 'lon', 'lat', 'grid_minx', 'grid_miny', 'grid_maxx', 'grid_maxy', 'i_idx', 'j_idx']
Tile IDs: ['R5_C25', 'R5_C26', 'R6_C25', 'R6_C26', 'R6_C27', 'R6_C28', 'R7_C25', 'R7_C26', 'R7_C27', 'R8_C26', 'R8_C28', 'R9_C28']


tile_id,dataset,dir_exists,tif_found,tif_name
R5_C25,built_c,True,True,GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R5_C25.tif
R5_C25,smod,True,True,GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R5_C25.tif
R5_C26,built_c,True,True,GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R5_C26.tif
R5_C26,smod,True,True,GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R5_C26.tif
R6_C25,built_c,True,True,GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R6_C25.tif
R6_C25,smod,True,True,GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R6_C25.tif
R6_C26,built_c,True,True,GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R6_C26.tif
R6_C26,smod,True,True,GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R6_C26.tif
R6_C27,built_c,True,True,GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R6_C27.tif
R6_C27,smod,True,True,GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R6_C27.tif


Tiles needing download: 0 rows


In [0]:
from tile_downloader import download_tiles
import pandas as pd

if not tile_ids:
    raise ValueError("No tile IDs found in centroids.")

print("Starting tile download (dry run =", DRY_RUN, ")")
dl_status = download_tiles(
    tile_ids=tile_ids,
    datasets=DATASETS,
    dest_root=TILES_ROOT,
    concurrency=3,    # adjust if network allows
    retries=2,
    dry_run=DRY_RUN
)
dl_df = pd.DataFrame(dl_status)
display(dl_df.head())
print("Status counts:")
print(dl_df['status'].value_counts())

if (not DRY_RUN):
    dl_df.to_csv(os.path.join(OUTPUT_COUNTS_DIR, "download_status.csv"), index=False)

Starting tile download (dry run = False )


tile_id,dataset,status
R5_C25,smod,exists
R5_C25,built_c,exists
R5_C26,built_c,exists
R5_C26,smod,exists
R6_C25,built_c,exists


Status counts:
exists    24
Name: status, dtype: int64



### download verification

In [0]:
import rasterio, os, random

sample_tile = random.choice(tile_ids)
print("Sample tile:", sample_tile)

for ds in DATASETS:
    rdir = os.path.join(TILES_ROOT, ds, sample_tile)
    if not os.path.isdir(rdir):
        print(f"[{ds}] directory missing for tile {sample_tile}")
        continue
    tifs = [f for f in os.listdir(rdir) if f.lower().endswith(".tif")]
    if not tifs:
        print(f"[{ds}] no tif in {rdir}")
        continue
    rpath = os.path.join(rdir, tifs[0])
    with rasterio.open(rpath) as src:
        print(f"{ds} raster: {rpath}")
        print("  shape:", (src.height, src.width), "res:", src.res, "crs:", src.crs, "nodata:", src.nodata)

Sample tile: R6_C26
built_c raster: data/tiles/built_c/R6_C26/GHS_BUILT_C_MSZ_E2018_GLOBE_R2023A_54009_10_V1_0_R6_C26.tif
  shape: (100000, 100000) res: (10.0, 10.0) crs: ESRI:54009 nodata: 255.0
smod raster: data/tiles/smod/R6_C26/GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V2_0_R6_C26.tif
  shape: (1000, 1000) res: (1000.0, 1000.0) crs: ESRI:54009 nodata: -200.0


### india mask pixel needed?

In [0]:
# import os, numpy as np, rasterio, pandas as pd

# MASK_CACHE_DIR = "data/masks"
# missing_masks = []
# sample_checks = 0

# for tile in tile_ids:
#     mask_file = os.path.join(MASK_CACHE_DIR, f"india_mask_{tile}.npy")
#     built_dir = os.path.join(TILES_ROOT, "built_c", tile)
#     tif_list = [f for f in os.listdir(built_dir)] if os.path.isdir(built_dir) else []
#     tif = next((f for f in tif_list if f.lower().endswith(".tif")), None)
#     if tif is None:
#         print(f"[WARN] no built_c tif for tile {tile}")
#         continue
#     with rasterio.open(os.path.join(built_dir, tif)) as src:
#         expected_shape = (src.height, src.width)
#     if not os.path.exists(mask_file):
#         missing_masks.append(tile)
#     else:
#         if sample_checks < 3:
#             m = np.load(mask_file)
#             print(f"Mask {tile}: shape={m.shape} match={m.shape==expected_shape} unique={np.unique(m)}")
#             sample_checks += 1

# print("Missing mask tiles:", missing_masks if missing_masks else "None")


### counting starts

In [0]:
from class_counts_pipeline import ClassCountsPipeline
import time

FULL_GRID = "data/grids/india_5km_grid_centroids_with_boundary.csv"

start = time.time()
pipeline_full = ClassCountsPipeline(
    grid_csv_path=FULL_GRID,
    built_root="data/tiles/built_c",
    smod_root="data/tiles/smod",
    output_dir="data/class_counts_boundary",
    use_smod=True,             # include smod counts
    keep_original_smod=True,
    include_nodata=False,
    add_percentages=True,
    use_boundary_mask=False,
    mask_built=True,
    mask_smod=False,           # only built_c masked
    admin_path="/Volumes/prp_mr_bdap_projects/geospatialsolutions/external/geozones/RMS_Admin0_geozones.gpkg",
    admin_field="ISO3",
    admin_value="IND",
    tile_footprint_path="./extracted/GHSL2_0_MWD_L1_tile_schema_land.shp",
    tile_id_field="tile_id",
    target_crs="ESRI:54009",
    chunk_size=250,
    max_workers=5,
    verbose=True
)

combined = pipeline_full.process_all(datasets=("built_c","smod"), save_per_tile=True)

elapsed = time.time() - start
print(f"Full run elapsed: {elapsed/60:.2f} minutes")

Loaded 131,298 grids | tiles=12 | boundary_mask=False | use_smod=True
Pass-through columns: ['centroid_x', 'centroid_y', 'lon', 'lat', 'i_idx', 'j_idx', 'tile_class']

=== Tile 1/12: R5_C25 ===
  [built_c] windows=6549 chunks=27 mask=False
    [built_c] progress 5/27 (18.5%)
    [built_c] progress 10/27 (37.0%)
    [built_c] progress 15/27 (55.6%)
    [built_c] progress 20/27 (74.1%)
    [built_c] progress 25/27 (92.6%)
    [built_c] progress 27/27 (100.0%)
  [smod] windows=6549 chunks=27 mask=False
    [smod] progress 5/27 (18.5%)
    [smod] progress 10/27 (37.0%)
    [smod] progress 15/27 (55.6%)
    [smod] progress 20/27 (74.1%)
    [smod] progress 25/27 (92.6%)
    [smod] progress 27/27 (100.0%)
  Saved per-tile data/class_counts_boundary/class_counts_R5_C25_20251008_141702.csv

=== Tile 2/12: R5_C26 ===
  [built_c] windows=1565 chunks=7 mask=False
    [built_c] progress 1/7 (14.3%)
    [built_c] progress 2/7 (28.6%)
    [built_c] progress 3/7 (42.9%)
    [built_c] progress 4/7 (57

## Post processing


### Long format or wide format??

In [0]:
import json, time

audit = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "dry_run": DRY_RUN,
    "tiles": tile_ids,
    "datasets": DATASETS,
    "include_nodata": INCLUDE_NODATA,
    "keep_original_smod": KEEP_ORIGINAL_SMOD,
    "masking_enabled": MAKE_INDIA_PIXEL_MASK,
    "centroids_rows": int(len(centroids_df)),
}
print(audit)
if not DRY_RUN:
    audit_path = os.path.join(OUTPUT_COUNTS_DIR, "audit_summary.json")
    with open(audit_path, "w") as f:
        json.dump(audit, f, indent=2)
    print("Saved audit:", audit_path)

{'timestamp': '2025-10-08 14:21:55', 'dry_run': False, 'tiles': ['R5_C25', 'R5_C26', 'R6_C25', 'R6_C26', 'R6_C27', 'R6_C28', 'R7_C25', 'R7_C26', 'R7_C27', 'R8_C26', 'R8_C28', 'R9_C28'], 'datasets': ['built_c', 'smod'], 'include_nodata': False, 'keep_original_smod': False, 'masking_enabled': False, 'centroids_rows': 131298}
Saved audit: data/class_counts_output/audit_summary.json


In [0]:
# identify columns


# counts_path1 = "./data/class_counts_boundary/class_counts_all_20251007_164711.csv"
# df = pd.read_csv(counts_path1)

###
import re
from datetime import datetime
from pathlib import Path
import pandas as pd

FILENAME_RE = re.compile(r"class_counts_all_(\d{8})_(\d{6})\.csv")

def get_latest_by_name_timestamp(dir_path: str | Path):
    dir_path = Path(dir_path)
    best = None
    best_dt = None
    for p in dir_path.glob("class_counts_all_*.csv"):
        m = FILENAME_RE.fullmatch(p.name)
        if not m:
            continue
        dt = datetime.strptime(m.group(1)+m.group(2), "%Y%m%d%H%M%S")
        if best_dt is None or dt > best_dt:
            best_dt = dt
            best = p
    if best is None:
        raise FileNotFoundError("No valid combined files with expected naming pattern.")
    return best, best_dt

latest_file, ts = get_latest_by_name_timestamp("data/class_counts_boundary")
print(f"Selected latest by timestamp: {latest_file.name} (UTC-ish timestamp {ts})")
df = pd.read_csv(latest_file)
print("Rows:", len(df))
###
id_cols = ['grid_id','tile_id','centroid_x','centroid_y','lon', 'lat' ]
smod_cols = [c for c in df.columns if c.startswith('smod_reclass_')]
# any other columns you want preserved during the melt (e.g. built_c_class_)
other_preserve = [c for c in df.columns if c.startswith('built_c_class_')]

# melt, keeping id and other_preserve columns
df_long = df.melt(
    id_vars=id_cols + other_preserve,
    value_vars=smod_cols,
    var_name='smod_band',
    value_name='smod_value'
).reset_index(drop=True)

# replace NA in the smod_value column with 0
df_long['smod_value'] = df_long['smod_value'].fillna(0)

df_long.head(20)

Selected latest by timestamp: class_counts_all_20251008_142150.csv (UTC-ish timestamp 2025-10-08 14:21:50)
Rows: 131298


Unnamed: 0,grid_id,tile_id,centroid_x,centroid_y,lon,lat,built_c_class_11,built_c_class_12,built_c_class_13,built_c_class_14,built_c_class_15,built_c_class_21,built_c_class_22,built_c_class_23,built_c_class_24,built_c_class_25,smod_band,smod_value
0,G_807_133,R5_C25,6622500.0,4037500.0,73.893787,33.385615,12438,0,0,0,0,0,0,0,0,0,smod_reclass_0,10
1,G_807_134,R5_C25,6627500.0,4037500.0,73.949577,33.385615,10111,318,0,0,0,0,0,0,0,0,smod_reclass_0,13
2,G_807_135,R5_C25,6632500.0,4037500.0,74.005367,33.385615,16324,299,0,0,0,0,0,0,0,0,smod_reclass_0,8
3,G_807_136,R5_C25,6637500.0,4037500.0,74.061157,33.385615,15924,897,0,0,0,0,0,0,0,0,smod_reclass_0,11
4,G_807_137,R5_C25,6642500.0,4037500.0,74.116947,33.385615,9188,0,0,0,0,0,0,0,0,0,smod_reclass_0,14
5,G_807_138,R5_C25,6647500.0,4037500.0,74.172737,33.385615,5063,0,0,0,0,0,0,0,0,0,smod_reclass_0,21
6,G_807_139,R5_C25,6652500.0,4037500.0,74.228527,33.385615,4857,0,0,0,0,0,0,0,0,0,smod_reclass_0,22
7,G_807_140,R5_C25,6657500.0,4037500.0,74.284317,33.385615,13010,2466,1137,0,0,0,0,0,0,0,smod_reclass_0,7
8,G_807_141,R5_C25,6662500.0,4037500.0,74.340107,33.385615,12335,3250,8007,357,0,0,0,38,0,0,smod_reclass_0,6
9,G_807_142,R5_C25,6667500.0,4037500.0,74.395897,33.385615,10684,0,0,0,0,0,0,0,0,0,smod_reclass_0,14


In [0]:
import pandas as pd
import numpy as np
from itertools import product

# counts_df = df_sorted
proportions_df = pd.read_csv('./data/input.csv')

## df_long is need beofre this
# # extract numeric suffix from smod_band and convert to int
df_long['smod_class'] = df_long['smod_band'].str.replace('smod_reclass_', '', regex=False).astype(int)

# replace NA values in the value column with 0
df_long['smod_value'] = df_long['smod_value'].fillna(0)

# sort so each grid_id appears with smod_class 1,2,3
df_sorted = df_long.sort_values(['grid_id', 'smod_class']).reset_index(drop=True)

def compute_expected_and_ordered(counts_df, proportions_csv_path, floor_bins=None, out_prefix='expected'):
    # defaults
    if floor_bins is None:
        floor_bins = ['1','2','3','4-5','6-9','10-20','20+']
    expected_cols = [f'expected_{b}' for b in floor_bins]

    # 1) load & normalize proportions
    proportions_df = pd.read_csv(proportions_csv_path)
    if 'urban' in proportions_df.columns and 'smod' not in proportions_df.columns:
        proportions_df = proportions_df.rename(columns={'urban': 'smod'})
    # ensure numeric floor cols and key columns
    proportions_df[floor_bins] = proportions_df[floor_bins].apply(pd.to_numeric, errors='coerce').fillna(0.0)
    proportions_df[['smod','built']] = proportions_df[['smod','built']].astype(int)

    # build combo index and P matrix
    proportions_df['combo'] = list(zip(proportions_df['smod'], proportions_df['built']))
    proportions_df = proportions_df.set_index('combo', drop=False)
    combo_index = {combo: i for i, combo in enumerate(proportions_df.index)}
    K = len(combo_index)
    P = np.zeros((K, len(floor_bins)), dtype=np.float32)
    for combo, row in proportions_df.iterrows():
        P[combo_index[combo], :] = row[floor_bins].values.astype(np.float32)

    # 2) melt counts once and aggregate observed counts
    # detect built columns and useful id cols
    built_cols = [c for c in counts_df.columns if c.startswith('built_c_class_')]
    possible_id_cols = ['grid_id','tile_id','centroid_x','centroid_y','smod_band','smod_value','smod_class']
    id_cols = [c for c in possible_id_cols if c in counts_df.columns]

    counts_long = counts_df.melt(id_vars=id_cols, value_vars=built_cols,
                                 var_name='built_col', value_name='count')
    counts_long['built'] = counts_long['built_col'].str.replace('built_c_class_', '', regex=False).astype(int)

    if 'smod_class' in counts_long.columns:
        counts_long['smod'] = counts_long['smod_class'].astype(int)
    else:
        counts_long['smod'] = counts_long['smod_band'].str.replace('smod_reclass_', '', regex=False).astype(int)

    # aggregate obs counts (one row per grid_id, smod, built)
    obs_counts = counts_long.groupby(['grid_id','smod','built'], observed=True, as_index=False)['count'].sum()

    # 3) expand to full grid x combos so zeros are explicit
    grid_order = counts_df['grid_id'].astype(str).drop_duplicates().tolist()
    combos_list = list(combo_index.keys())
    full_rows = [(g, int(s), int(b)) for g in grid_order for (s,b) in combos_list]
    full_df = pd.DataFrame(full_rows, columns=['grid_id','smod','built'])

    full_counts = full_df.merge(obs_counts, on=['grid_id','smod','built'], how='left')
    full_counts['count'] = full_counts['count'].fillna(0).astype(float)

    # attach combo_idx and filter combos not in combo_index (safety)
    full_counts['combo'] = list(zip(full_counts['smod'].astype(int), full_counts['built'].astype(int)))
    full_counts = full_counts[full_counts['combo'].isin(combo_index)].reset_index(drop=True)
    full_counts['combo_idx'] = full_counts['combo'].map(combo_index).astype(int)

    # 4) vectorized expected per-row (M x F)
    combo_idx_arr = full_counts['combo_idx'].to_numpy(dtype=int)
    row_counts = full_counts['count'].to_numpy(dtype=np.float32)[:, None]
    combo_P_rows = P[combo_idx_arr]                      # (M,F)
    expected_per_row = combo_P_rows * row_counts         # (M,F)

    exp_df = pd.DataFrame(expected_per_row, columns=expected_cols)
    combo_wide_full = pd.concat([full_counts[['grid_id','smod','built','count']].reset_index(drop=True),
                                 exp_df.reset_index(drop=True)], axis=1)

    # 5) grid-level expected via fast matrix multiply: pivot to grid x combo then dot
    counts_pivot = full_counts.pivot_table(index='grid_id', columns='combo_idx', values='count',
                                           aggfunc='sum', fill_value=0)
    all_cols = np.arange(K)
    counts_pivot = counts_pivot.reindex(columns=all_cols, fill_value=0).reindex(index=grid_order, fill_value=0)
    C = counts_pivot.to_numpy(dtype=np.float32)
    expected_matrix = C.dot(P)
    grid_wide = pd.DataFrame(expected_matrix, index=counts_pivot.index, columns=floor_bins).reset_index()

    # 6) build built-summary (smod='ALL') and ordered rows where summary comes first
    # create smod_sort BEFORE selecting columns to avoid missing-column error
    built_summary = combo_wide_full.groupby(['grid_id','built'], observed=True, as_index=False)[expected_cols].sum()
    count_sum = combo_wide_full.groupby(['grid_id','built'], observed=True, as_index=False)['count'].sum().rename(columns={'count':'count'})
    built_summary = built_summary.merge(count_sum, on=['grid_id','built'], how='left')[['grid_id','built','count'] + expected_cols]
    built_summary['smod'] = 'ALL'
    built_summary['smod_sort'] = -1    # <- ensure this exists BEFORE selecting/concatenating

    detailed = combo_wide_full.copy()
    # if smod dtype is not int, ensure numeric for sorting
    detailed['smod_sort'] = detailed['smod'].astype(int)

    cols_order = ['grid_id','smod','built','count'] + expected_cols
    # built_summary currently has no smod column; we added it above; now reorder safely
    built_summary = built_summary[['grid_id','smod','built','count'] + expected_cols + ['smod_sort']]
    detailed = detailed[['grid_id','smod','built','count'] + expected_cols + ['smod_sort']]

    combined_ordered = pd.concat([built_summary, detailed], ignore_index=True, sort=False)
    combined_ordered['grid_id'] = pd.Categorical(combined_ordered['grid_id'], categories=grid_order, ordered=True)
    combined_ordered['built'] = combined_ordered['built'].astype(int)
    combined_ordered = combined_ordered.sort_values(['grid_id','smod_sort','built']).reset_index(drop=True)
    combined_ordered['smod'] = combined_ordered['smod'].astype(str)
    combined_ordered = combined_ordered.drop(columns=['smod_sort'])

    # write outputs
    grid_wide.to_csv(f'{out_prefix}_by_grid.csv', index=False)
    combo_wide_full.to_csv(f'{out_prefix}_combo_wide_full.csv', index=False)
    combined_ordered.to_csv(f'{out_prefix}_ordered_full_combos.csv', index=False)

    print(f"Wrote: {out_prefix}_by_grid.csv, {out_prefix}_combo_wide_full.csv, {out_prefix}_ordered_full_combos.csv")
    print("\nGrid preview:")
    print(grid_wide.head(6).to_string(index=False))
    print("\nOrdered preview (first 12 rows):")
    print(combined_ordered.head(12).to_string(index=False))

    return {
        'grid_wide': grid_wide,
        'combo_wide_full': combo_wide_full,
        'ordered_full_combos': combined_ordered
    }

# Example call (adjust path if needed)
outputs = compute_expected_and_ordered(df_sorted, './data/input.csv')

Wrote: expected_by_grid.csv, expected_combo_wide_full.csv, expected_ordered_full_combos.csv

Grid preview:
  grid_id          1          2          3       4-5       6-9  10-20   20+
G_167_681  43.200001  33.919998   7.680000  5.760000  2.880000   1.60  0.64
G_168_681  62.099998  48.759998  11.040000  8.280000  4.140000   2.30  0.92
G_168_682 699.299988 549.080017 124.320007 93.239998 46.619999  25.90 10.36
G_169_680  55.350002  43.459999   9.840000  7.380000  3.690000   2.05  0.82
G_169_681   5.400000   4.240000   0.960000  0.720000  0.360000   0.20  0.08
G_169_682  75.599998  59.360001  13.440000 10.080000  5.040000   2.80  1.12

Ordered preview (first 12 rows):
  grid_id smod  built  count  expected_1  expected_2  expected_3  expected_4-5  expected_6-9  expected_10-20  expected_20+
G_167_681  ALL     11   96.0   43.200001   33.919998        7.68          5.76          2.88            1.60          0.64
G_167_681  ALL     12    0.0    0.000000    0.000000        0.00          0.00   

In [0]:
grid_df = outputs['grid_wide']
combo_df = outputs['combo_wide_full']
ordered_df = outputs['ordered_full_combos']

print(grid_df.shape)
print(combo_df.head())
display(ordered_df.head())

(131298, 8)
     grid_id  smod  built  ...  expected_6-9  expected_10-20  expected_20+
0  G_167_681     0     11  ...          0.96            0.64          0.32
1  G_167_681     0     12  ...          0.00            0.00          0.00
2  G_167_681     0     13  ...          0.00            0.00          0.00
3  G_167_681     0     14  ...          0.00            0.00          0.00
4  G_167_681     0     21  ...          0.00            0.00          0.00

[5 rows x 11 columns]


grid_id,smod,built,count,expected_1,expected_2,expected_3,expected_4-5,expected_6-9,expected_10-20,expected_20+
G_167_681,ALL,11,96.0,43.2,33.92,7.6800003,5.76,2.8799999,1.5999999,0.64
G_167_681,ALL,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G_167_681,ALL,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G_167_681,ALL,14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
G_167_681,ALL,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
