In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import geopandas as gpd
import rasterio
from rasterio.features import shapes, rasterize
from rasterio.windows import from_bounds, Window
from shapely.geometry import shape, box
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics


In [2]:
def load_and_prepare_training(training_shp, class_field="class_id", target_crs=None):
    gdf = gpd.read_file(training_shp)
    if class_field not in gdf.columns:
        raise ValueError(f"Class field '{class_field}' not found. Available fields: {list(gdf.columns)}")
    gdf[class_field] = gdf[class_field].astype(int)
    gdf = gdf[~gdf.geometry.is_empty].copy()
    gdf['geometry'] = gdf['geometry'].buffer(0)
    if target_crs is not None and gdf.crs != target_crs:
        gdf = gdf.to_crs(target_crs)
    return gdf


def sample_training_pixels_windowed(gdf, raster_path, class_field="class_id", max_pixels_per_poly=None):
    X_list = []
    y_list = []
    with rasterio.open(raster_path) as src:
        nbands = src.count
        raster_crs = src.crs
        if gdf.crs != raster_crs:
            gdf = gdf.to_crs(raster_crs)

        for idx, row in gdf.iterrows():
            geom = row.geometry
            cls = int(row[class_field])
            if geom is None or geom.is_empty:
                continue
            minx, miny, maxx, maxy = geom.bounds
            pad_x = (maxx - minx) * 0.001 + 1e-6
            pad_y = (maxy - miny) * 0.001 + 1e-6
            win = from_bounds(minx - pad_x, miny - pad_y, maxx + pad_x, maxy + pad_y, transform=src.transform)
            col_off = max(0, int(win.col_off))
            row_off = max(0, int(win.row_off))
            width = int(min(src.width - col_off, win.width))
            height = int(min(src.height - row_off, win.height))
            if width <= 0 or height <= 0:
                continue
            window = Window(col_off, row_off, width, height)
            arr = src.read(window=window)
            if arr.size == 0:
                continue
            window_transform = rasterio.windows.transform(window, src.transform)
            mask = rasterize(
                [(geom, 1)],
                out_shape=(height, width),
                transform=window_transform,
                fill=0,
                dtype='uint8'
            )
            pixels_idx = np.where(mask.flatten() == 1)[0]
            if pixels_idx.size == 0:
                continue
            arr_reshaped = arr.reshape(nbands, -1)
            sel_pixels = arr_reshaped[:, pixels_idx].T
            if max_pixels_per_poly is not None and sel_pixels.shape[0] > max_pixels_per_poly:
                choice = np.random.choice(sel_pixels.shape[0], max_pixels_per_poly, replace=False)
                sel_pixels = sel_pixels[choice]
            X_list.append(sel_pixels)
            y_list.append(np.full((sel_pixels.shape[0],), cls, dtype=np.int32))
    if len(X_list) == 0:
        return np.zeros((0,0)), np.zeros((0,), dtype=np.int32)
    X = np.vstack(X_list)
    y = np.concatenate(y_list)
    return X, y


def train_random_forest(X, y, n_estimators=200, random_state=42):
    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=random_state)
    clf.fit(X, y)
    return clf


def accuracy_report_holdout(clf, X, y, test_size=0.2):
    if X.shape[0] < 10:
        print("Not enough samples for holdout accuracy.")
        return None, None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)
    clf2 = RandomForestClassifier(n_estimators=clf.n_estimators, n_jobs=-1, random_state=42)
    clf2.fit(X_train, y_train)
    preds = clf2.predict(X_test)
    report = metrics.classification_report(y_test, preds)
    cm = metrics.confusion_matrix(y_test, preds)
    return report, cm


def classify_and_polygonize_blockwise(raster_path, clf, class_field='class_id'):
    """
    Classify raster blockwise and polygonize each block's predicted classes on the fly.
    Returns a GeoDataFrame of polygons (not yet dissolved).
    """
    recs = []
    with rasterio.open(raster_path) as src:
        nbands = src.count
        for ji, window in src.block_windows(1):
            arr = src.read(window=window)  # (bands, h, w)
            bands, h, w = arr.shape
            arr2 = arr.reshape(bands, -1).T
            arr2 = np.nan_to_num(arr2, nan=0.0, posinf=0.0, neginf=0.0)
            # predict
            try:
                preds = clf.predict(arr2)
            except Exception:
                mask_nonzero = ~(np.all(arr2 == 0, axis=1))
                preds = np.zeros(arr2.shape[0], dtype=int)
                if np.any(mask_nonzero):
                    preds_masked = clf.predict(arr2[mask_nonzero])
                    preds[mask_nonzero] = preds_masked
            preds2d = preds.reshape(h, w).astype('int32')
            # polygonize only non-zero classes
            window_transform = rasterio.windows.transform(window, src.transform)
            mask = preds2d != 0
            if not np.any(mask):
                continue
            for geom_json, val in shapes(preds2d, mask=mask, transform=window_transform):
                recs.append({ 'geometry': shape(geom_json), class_field: int(val) })
    if len(recs) == 0:
        return gpd.GeoDataFrame(columns=['geometry', class_field])
    gdf = gpd.GeoDataFrame(recs)
    # set CRS from source raster
    with rasterio.open(raster_path) as src:
        gdf.crs = src.crs
    return gdf


# -----------------------
# MAIN: user parameters
# -----------------------
if __name__ == "__main__":
    # ---------- USER PARAMETERS (edit) ----------
    raster_path = r"D:/2_Analytics/9_LULC_classification/demo_ortho/KRB_BGD_ORTHO_3CM_UTM_GEOTIFF.tif"
    training_shp = r"D:/2_Analytics/9_LULC_classification/training_shp/training_shp.shp"
    class_field = "class_id"
    out_vector = r"D:/2_Analytics/9_LULC_classification/classified_polygons.shp"
    rf_n_estimators = 200
    max_samples_per_polygon = 2000

    legend = {
        1: ("Water", "#1f78b4"),
        2: ("Built-up", "#b15928"),
        3: ("Vegetation", "#33a02c"),
        4: ("Bare soil", "#fdbf6f"),
        5: ("Road", "#6a3d9a"),
        6: ("Shadow", "#666666"),
        7: ("Agriculture", "#b2df8a"),
        8: ("Forest", "#228B22"),
    }
    # -------------------------------------------

    print("Loading training shapefile and preparing geometries...")
    with rasterio.open(raster_path) as _src:
        raster_crs = _src.crs
        raster_bounds = _src.bounds

    gdf = load_and_prepare_training(training_shp, class_field=class_field, target_crs=raster_crs)
    print(f"Training features loaded: {len(gdf)}")

    bbox = box(*raster_bounds)
    gdf_in = gdf[gdf.geometry.intersects(bbox)]
    print(f"Features intersecting raster extent: {len(gdf_in)} / {len(gdf)}")
    if len(gdf_in) == 0:
        raise RuntimeError("No training polygons intersect the raster. Check CRS and locations.")

    print("Sampling training pixels (windowed per polygon)...")
    X, y = sample_training_pixels_windowed(gdf_in, raster_path, class_field=class_field, max_pixels_per_poly=max_samples_per_polygon)
    print(f"Sampled pixels: {X.shape[0]}, bands: {X.shape[1] if X.size else 0}")
    if X.shape[0] == 0:
        raise RuntimeError("No training pixels sampled. Check raster and polygons.")

    print("Training Random Forest...")
    clf = train_random_forest(X, y, n_estimators=rf_n_estimators)
    print("Training completed.")

    print("Internal accuracy (holdout):")
    report, cm = accuracy_report_holdout(clf, X, y)
    if report is not None:
        print(report)
        print("Confusion matrix:\n", cm)

    print("Classifying raster blockwise and polygonizing on-the-fly...")
    gdf_polys = classify_and_polygonize_blockwise(raster_path, clf, class_field=class_field)
    print(f"Collected polygons before dissolve: {len(gdf_polys)}")

    if len(gdf_polys) == 0:
        raise RuntimeError("No polygons generated from classification. Exiting.")

    # dissolve by class_id
    gdf_polys = gdf_polys.set_crs(raster_crs)
    gdf_dissolved = gdf_polys.dissolve(by=class_field, as_index=False)

    # add class_name from legend
    name_map = {k: v[0] for k, v in legend.items()}
    color_map = {k: v[1] for k, v in legend.items()}
    gdf_dissolved['class_name'] = gdf_dissolved[class_field].map(name_map).fillna('unknown')

    # save legend CSV next to vector
    legend_csv = os.path.splitext(out_vector)[0] + "_legend.csv"
    with open(legend_csv, 'w', encoding='utf-8') as f:
        f.write("class_id,class_name,color\n")
        for cid, (nm, col) in legend.items():
            f.write(f"{cid},{nm},{col}\n")
    print("Saved legend CSV to:", legend_csv)

    # write vector
    gdf_dissolved.to_file(out_vector, driver='ESRI Shapefile')
    print("Saved polygon vector to:", out_vector)

    print("Done. No TIFF outputs were created.")


Loading training shapefile and preparing geometries...
Training features loaded: 26
Features intersecting raster extent: 26 / 26
Sampling training pixels (windowed per polygon)...
Sampled pixels: 52000, bands: 4
Training Random Forest...
Training completed.
Internal accuracy (holdout):
              precision    recall  f1-score   support

           1       0.59      0.54      0.56      1600
           2       0.41      0.41      0.41      2400
           5       0.64      0.65      0.64      2000
           7       0.51      0.53      0.52      2000
           8       0.54      0.54      0.54      2400

    accuracy                           0.53     10400
   macro avg       0.54      0.54      0.54     10400
weighted avg       0.53      0.53      0.53     10400

Confusion matrix:
 [[ 865  221   82  186  246]
 [ 192  988  322  438  460]
 [  74  265 1298  160  203]
 [ 131  444  154 1066  205]
 [ 204  488  183  220 1305]]
Classifying raster blockwise and polygonizing on-the-fly...


MemoryError: Unable to allocate 128. MiB for an array with shape (3350432, 1, 5) and data type float64

In [16]:
# ===== Install deps if needed (uncomment) =====
# !pip install rasterio geopandas scikit-learn numpy tqdm joblib

# ===== Full pipeline with checks + colorized RGB output =====
import json
from pathlib import Path
import numpy as np
import rasterio
from rasterio.windows import Window
from rasterio import windows
from rasterio.features import rasterize
import geopandas as gpd
from shapely.geometry import box
from sklearn.ensemble import RandomForestClassifier
from joblib import dump
from tqdm import tqdm
import warnings
import sys

# --------------------
# Helper / check funcs
# --------------------
def check_files_exist(raster_path, shp_path):
    rp = Path(raster_path)
    sp = Path(shp_path)
    if not rp.exists():
        raise FileNotFoundError(f"Raster not found: {rp}")
    if not sp.exists():
        raise FileNotFoundError(f"Shapefile not found: {sp}")
    # shapefile requires .shp and its sidecar files; at minimum check .shp exists
    return True

def safe_read_gdf(shp_path):
    try:
        gdf = gpd.read_file(shp_path)
    except Exception as e:
        raise RuntimeError(f"Failed to read shapefile '{shp_path}': {e}")
    if gdf.empty:
        raise RuntimeError("Shapefile contains no features.")
    return gdf

def build_mapping_from_user_and_gdf(user_classes, gdf, class_field):
    """
    Use user_classes order as primary. If shapefile contains additional classes,
    append them after user_classes and warn.
    Returns mapping name->code (int) with 'unclassified' mapped to 0 if present.
    """
    # clean shapefile class names
    gdf[class_field] = gdf[class_field].astype(str).str.strip().str.lower()
    shp_unique = list(sorted(set(gdf[class_field].unique())))
    mapping = {}
    code = 1
    # start with user classes if provided
    if user_classes:
        for cname in user_classes:
            key = str(cname).strip().lower()
            if key == "unclassified":
                mapping[key] = 0
            else:
                if key not in mapping:
                    mapping[key] = code
                    code += 1
    # include any shapefile classes not present
    extra = [c for c in shp_unique if c not in mapping]
    if extra:
        warnings.warn(f"Shapefile contains classes not in user_classes: {extra}. They will be appended to the mapping.")
        for cname in extra:
            if cname == "unclassified":
                mapping[cname] = 0
            else:
                mapping[cname] = code
                code += 1
    return mapping

def windows_from_raster_obj(src, tile_size):
    nrows, ncols = src.height, src.width
    for row_off in range(0, nrows, tile_size):
        height = min(tile_size, nrows - row_off)
        for col_off in range(0, ncols, tile_size):
            width = min(tile_size, ncols - col_off)
            yield Window(row_off=row_off, col_off=col_off, height=height, width=width)

# -------------------------
# Training sample extraction
# -------------------------
def collect_training_samples(raster_path, shp_path, class_field, mapping,
                             tile_size=512, max_samples=200000, verbose=False, treat_zeros_as_nodata=False):
    """
    Windowed rasterize-then-sample approach. Returns X (n_samples x bands) and y (n_samples,)
    """
    # read shapefile
    gdf = safe_read_gdf(shp_path)
    if class_field not in gdf.columns:
        raise ValueError(f"class_field '{class_field}' not in shapefile columns: {list(gdf.columns)}")
    # normalize class strings
    gdf[class_field] = gdf[class_field].astype(str).str.strip().str.lower()

    with rasterio.open(raster_path) as src:
        raster_crs = src.crs
        band_count = src.count
        if band_count < 3:
            warnings.warn(f"Raster has {band_count} bands. Expected RGB (3 bands) — still proceeding but check your input.")
        # reproject gdf to raster CRS if needed
        if gdf.crs is None:
            raise RuntimeError("Shapefile has no CRS. Assign a CRS in QGIS or geopandas before running.")
        if raster_crs != gdf.crs:
            if verbose:
                print(f"Reprojecting shapefile from {gdf.crs} to raster CRS {raster_crs}.")
            gdf = gdf.to_crs(raster_crs)
        # spatial index
        try:
            sindex = gdf.sindex
        except Exception:
            sindex = None

        X_list = []
        y_list = []
        total_samples = 0

        # quick check: do any polygons intersect raster extent?
        raster_box = box(*src.bounds)
        intersect_count = int(gdf.intersects(raster_box).sum())
        if intersect_count == 0:
            raise RuntimeError("No training polygons intersect the raster extent. Check CRS and geometry locations.")

        if verbose:
            print(f"Raster size: {src.width} x {src.height}; Bands: {band_count}")
            print(f"Training polygons intersecting raster: {intersect_count}")

        for win in tqdm(list(windows_from_raster_obj(src, tile_size)), desc="Collect windows"):
            win_bounds = windows.bounds(win, src.transform)  # (left, bottom, right, top)
            # candidate polygons
            if sindex is not None:
                candidate_idx = list(sindex.intersection(win_bounds))
                if not candidate_idx:
                    continue
                sub = gdf.iloc[candidate_idx]
                # precise intersect
                sub = sub[sub.intersects(box(*win_bounds))]
                if sub.empty:
                    continue
            else:
                sub = gdf[gdf.intersects(box(*win_bounds))]
                if sub.empty:
                    continue

            # shapes for rasterize
            shapes = []
            for _, row in sub.iterrows():
                cname = str(row[class_field]).strip().lower()
                if cname in mapping:
                    shapes.append((row.geometry, int(mapping[cname])))
            if not shapes:
                continue

            # rasterize labels for the window
            win_transform = windows.transform(win, src.transform)
            label = rasterize(
                shapes,
                out_shape=(win.height, win.width),
                transform=win_transform,
                fill=0,
                all_touched=True,
                dtype="uint8"
            )

            # read image window
            img = src.read(window=win)  # (bands, h, w)
            # mask nodata or all-zero background
            if src.nodata is not None:
                mask = np.any(img == src.nodata, axis=0)
            else:
                if treat_zeros_as_nodata:
                    mask = np.all(img == 0, axis=0)
                else:
                    mask = np.zeros((win.height, win.width), dtype=bool)

            flat_label = label.ravel()
            flat_img = img.reshape((img.shape[0], -1)).T  # (n_pixels, bands)
            flat_mask = mask.ravel()

            cond = (flat_label > 0) & (~flat_mask)
            if np.any(cond):
                X_list.append(flat_img[cond])
                y_list.append(flat_label[cond])
                total_samples += int(cond.sum())

            if max_samples and total_samples >= max_samples:
                if verbose:
                    print(f"Reached max_samples={max_samples}. Collected {total_samples} samples.")
                break

    if not X_list:
        raise RuntimeError("No training samples were collected. Check shapefile geometry, class_field, and CRS.")

    X = np.vstack(X_list)
    y = np.hstack(y_list)

    # downsample uniformly if too many samples
    if max_samples and X.shape[0] > max_samples:
        if verbose:
            print(f"Downsampling training samples from {X.shape[0]} to {max_samples}")
        rng = np.random.default_rng(42)
        idx = rng.choice(X.shape[0], size=max_samples, replace=False)
        X = X[idx]
        y = y[idx]

    if verbose:
        print("Collected samples:", X.shape[0], "features:", X.shape[1])

    return X, y

# -------------------------
# Train and prediction funcs
# -------------------------
def train_rf(X, y, n_estimators=100, verbose=False):
    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, class_weight="balanced", random_state=42)
    clf.fit(X, y)
    if verbose:
        print("Trained RandomForest on", X.shape[0], "samples.")
    return clf

def classify_to_label_tiff(raster_path, label_out_path, clf, out_dtype="uint8", tile_size=512, verbose=False, treat_zeros_as_nodata=False):
    """
    Classify window-by-window and write single-band integer label TIFF.
    """
    with rasterio.open(raster_path) as src:
        meta = src.meta.copy()
        meta.update({"count": 1, "dtype": out_dtype, "compress": "lzw", "tiled": True})
        with rasterio.open(label_out_path, "w", **meta) as dst:
            for win in tqdm(list(windows_from_raster_obj(src, tile_size)), desc="Classify windows"):
                img = src.read(window=win)  # (bands, h, w)
                h, w = win.height, win.width
                flat = img.reshape((img.shape[0], -1)).T  # (n_pixels, bands)

                if src.nodata is not None:
                    mask = np.any(img == src.nodata, axis=0).ravel()
                    valid_idx = ~mask
                else:
                    if treat_zeros_as_nodata:
                        mask = np.all(img == 0, axis=0).ravel()
                        valid_idx = ~mask
                    else:
                        valid_idx = np.ones(flat.shape[0], dtype=bool)

                pred = np.zeros(flat.shape[0], dtype=np.uint8)
                if np.any(valid_idx):
                    # predict only on valid pixels
                    pred_valid = clf.predict(flat[valid_idx])
                    pred[valid_idx] = pred_valid

                out_arr = pred.reshape((h, w)).astype(out_dtype)
                dst.write(out_arr, 1, window=win)
    if verbose:
        print("Wrote label TIFF to:", label_out_path)

def create_rgb_tiff_from_label(label_tiff, rgb_out_path, mapping, color_by_name, tile_size=512, verbose=False):
    """
    Create a 3-band RGB GeoTIFF by mapping integer labels to RGB colors (windowed).
    mapping: dict name->code
    color_by_name: dict name->(R,G,B)
    """
    # build code->color mapping
    code2color = {}
    for name, code in mapping.items():
        col = color_by_name.get(name, (255, 0, 255))  # fallback magenta
        # ensure tuple of 3 ints
        col = tuple(int(c) for c in col)
        code2color[int(code)] = col

    with rasterio.open(label_tiff) as src_label:
        meta = src_label.meta.copy()
        meta.update({"count": 3, "dtype": "uint8", "compress": "lzw", "tiled": True})
        with rasterio.open(rgb_out_path, "w", **meta) as dst_rgb:
            for win in tqdm(list(src_label.block_windows(1)), desc="Colorize windows"):
                ji, win_obj = win
                lbl = src_label.read(1, window=win_obj)  # (h,w)
                h, w = lbl.shape
                rgb = np.zeros((3, h, w), dtype=np.uint8)
                # vectorized mapping: iterate codes present
                vals = np.unique(lbl)
                for v in vals:
                    if v in code2color:
                        r, g, b = code2color[v]
                    else:
                        # unknown label -> black
                        r, g, b = (0, 0, 0)
                    mask = (lbl == v)
                    rgb[0][mask] = r
                    rgb[1][mask] = g
                    rgb[2][mask] = b
                dst_rgb.write(rgb, window=win_obj)
    if verbose:
        print("Wrote RGB TIFF to:", rgb_out_path)

# -------------------------
# Main wrapper with checks
# -------------------------
def run_full_pipeline(
    raster_path,
    shp_path,
    class_field="class",
    user_classes=None,
    tile_size=512,
    n_estimators=100,
    max_samples=200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,
    label_out="classified_labels.tif",
    rgb_out="classified_rgb.tif",
    model_out="rf_model.joblib",
    verbose=True,
    treat_zeros_as_nodata=False
):
    raster_path = Path(raster_path)
    shp_path = Path(shp_path)
    label_out = Path(label_out)
    rgb_out = Path(rgb_out)
    model_out = Path(model_out)

    # 1) basic filesystem checks
    check_files_exist(raster_path, shp_path)

    # 2) read shapefile and confirm class_field
    gdf = safe_read_gdf(shp_path)
    if class_field not in gdf.columns:
        raise ValueError(f"class_field '{class_field}' not found. Available fields: {list(gdf.columns)}")

    # 3) open raster to get metadata and check tile_size sanity
    with rasterio.open(raster_path) as src:
        if tile_size <= 0:
            raise ValueError("tile_size must be > 0")
        if tile_size > max(src.width, src.height):
            warnings.warn("tile_size is larger than raster dimensions; using raster size windows will still work.")
        raster_crs = src.crs
        if raster_crs is None:
            raise RuntimeError("Raster has no CRS. Cannot safely align shapefile. Exiting.")

    # 4) build class mapping (user order first, append extras from shapefile)
    mapping = build_mapping_from_user_and_gdf(user_classes, gdf, class_field)

    # 5) show mapping and counts per class in shapefile
    gdf[class_field] = gdf[class_field].astype(str).str.strip().str.lower()
    counts = gdf[class_field].value_counts().to_dict()
    if verbose:
        print("Class mapping (name -> code):", mapping)
        print("Shapefile counts per class (raw polygons):", counts)

    # 6) Collect training samples
    X, y = collect_training_samples(
        str(raster_path), str(shp_path), class_field, mapping,
        tile_size=tile_size, max_samples=max_samples, verbose=verbose, treat_zeros_as_nodata=treat_zeros_as_nodata
    )

    # 7) Train RF
    clf = train_rf(X, y, n_estimators=n_estimators, verbose=verbose)
    dump(clf, str(model_out))
    if verbose:
        print("Saved RF model to:", model_out)

    # 8) Classify to label TIFF
    classify_to_label_tiff(str(raster_path), str(label_out), clf, out_dtype="uint8", tile_size=tile_size, verbose=verbose, treat_zeros_as_nodata=treat_zeros_as_nodata)

    # 9) Save mapping JSON
    mapping_file = label_out.with_suffix(".classes.json")
    with open(mapping_file, "w") as f:
        json.dump(mapping, f, indent=2)
    if verbose:
        print("Saved class mapping to:", mapping_file)

    # 10) Create RGB color map (you can customize colors here)
    # Default color palette (change as desired)
    color_by_name = {
        "water":       (0, 0, 255),
        "forest":      (34, 139, 34),
        "buildup":     (200, 200, 200),
        "roads":       (255, 255, 0),
        "agriculture": (255, 165, 0),
        "unclassified":(0, 0, 0)
    }
    # If shapefile had extra classes not in defaults, fallback magenta for unknown names

    create_rgb_tiff_from_label(str(label_out), str(rgb_out), mapping, color_by_name, tile_size=tile_size, verbose=verbose)

    return {"model": str(model_out), "labels": str(label_out), "rgb": str(rgb_out), "mapping": str(mapping_file)}

# -------------------------
# Run (edit these paths to your real files)
# -------------------------
#raster_input = r"D:/2_Analytics/9_LULC_classification/demo_ortho/cropped_ortho.tif"
raster_input = r"D:\2_Analytics\9_LULC_classification\demo_ortho\KRB_BGD_ORTHO_3CM_UTM_GEOTIFF.tif"
training_shp = r"D:\2_Analytics\9_LULC_classification\training_shp\training_new.shp"
output_label = r"D:/2_Analytics/9_LULC_classification/demo_ortho/classified_rf_labels.tif"
output_rgb = r"D:/2_Analytics/9_LULC_classification/demo_ortho/classified_rf_rgb.tif"
model_file = r"D:/2_Analytics/9_LULC_classification/demo_ortho/rf_model.joblib"

# Run pipeline (will print progress)
results = run_full_pipeline(
    raster_input,
    training_shp,
    class_field="class",
    user_classes=["water","forest","buildup","roads","agriculture","unclassified"],
    tile_size=512,
    n_estimators=100,
    max_samples=2000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,
    label_out=output_label,
    rgb_out=output_rgb,
    model_out=model_file,
    verbose=True,
    treat_zeros_as_nodata=False  # set True if your ortho background is black and should be treated as nodata
)

print("Finished. Outputs:", results)


Class mapping (name -> code): {'water': 1, 'forest': 2, 'buildup': 3, 'roads': 4, 'agriculture': 5, 'unclassified': 0}
Shapefile counts per class (raw polygons): {'forest': 1, 'agriculture': 1, 'roads': 1, 'water': 1, 'buildup': 1}
Raster size: 104701 x 121220; Bands: 4
Training polygons intersecting raster: 5


Collect windows: 100%|██████████| 48585/48585 [00:05<00:00, 8100.79it/s] 


Collected samples: 7732344 features: 4


MemoryError: Unable to allocate 59.0 MiB for an array with shape (7732344, 1) and data type int64

In [None]:
# ====== Report per-class pixel value ranges from training data ======
# Requires collect_training_samples() defined earlier

import numpy as np
import geopandas as gpd
from pathlib import Path
import json

# ---------- Edit paths ----------
raster_input = r"D:\2_Analytics\9_LULC_classification\demo_ortho\KRB_BGD_ORTHO_3CM_UTM_GEOTIFF.tif"
training_shp = r"D:\2_Analytics\9_LULC_classification\training_shp\training_new.shp"
class_field = "class"
out_json = Path("geoai_landcover_output") / "class_value_ranges.json"
out_json.parent.mkdir(parents=True, exist_ok=True)
# --------------------------------

# Build mapping from shapefile classes
gdf = gpd.read_file(training_shp)
gdf[class_field] = gdf[class_field].astype(str).str.strip().str.lower()
unique_labels = sorted(gdf[class_field].unique())
mapping = {}
code = 1
for name in unique_labels:
    if name == "unclassified":
        mapping[name] = 0
    else:
        mapping[name] = code
        code += 1
print("Mapping (name -> code):", mapping)

# Collect training samples
X, y = collect_training_samples(
    raster_path=raster_input,
    shp_path=training_shp,
    class_field=class_field,
    mapping=mapping,
    tile_size=512,
    max_samples=20000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000,
    verbose=True,
    treat_zeros_as_nodata=False
)

# Compute ranges per class
n_bands = X.shape[1]
ranges = {}
for name, code in mapping.items():
    if code == 0:
        continue
    mask = (y == code)
    if not np.any(mask):
        ranges[name] = {"code": code, "count": 0, "band_ranges": []}
        continue
    class_vals = X[mask]  # (N_class_samples, n_bands)
    band_ranges = []
    for b in range(n_bands):
        vmin = int(class_vals[:, b].min())
        vmax = int(class_vals[:, b].max())
        band_ranges.append({"band": b+1, "min": vmin, "max": vmax})
    ranges[name] = {"code": code, "count": int(mask.sum()), "band_ranges": band_ranges}

# Print neatly
print("\nPer-class training pixel value ranges:")
for name, info in ranges.items():
    print(f"\n{name} (code={info['code']}, samples={info['count']})")
    for br in info["band_ranges"]:
        print(f"  Band {br['band']}: {br['min']} – {br['max']}")

# Save to JSON
with open(out_json, "w") as f:
    json.dump(ranges, f, indent=2)
print("\nSaved value ranges to:", out_json)


Mapping (name -> code): {'agriculture': 1, 'buildup': 2, 'forest': 3, 'roads': 4, 'water': 5}
Raster size: 104701 x 121220; Bands: 4
Training polygons intersecting raster: 5


Collect windows: 100%|██████████| 48585/48585 [00:06<00:00, 7810.23it/s]  


Collected samples: 7732344 features: 4

Per-class training pixel value ranges:

agriculture (code=1, samples=228131)
  Band 1: 102 – 221
  Band 2: 83 – 200
  Band 3: 57 – 174
  Band 4: 255 – 255

buildup (code=2, samples=343643)
  Band 1: 25 – 253
  Band 2: 18 – 247
  Band 3: 11 – 244
  Band 4: 255 – 255

forest (code=3, samples=6672913)
  Band 1: 27 – 249
  Band 2: 18 – 243
  Band 3: 0 – 212
  Band 4: 255 – 255

roads (code=4, samples=151947)
  Band 1: 76 – 232
  Band 2: 50 – 199
  Band 3: 26 – 163
  Band 4: 255 – 255

water (code=5, samples=335710)
  Band 1: 47 – 198
  Band 2: 38 – 182
  Band 3: 15 – 162
  Band 4: 255 – 255

Saved value ranges to: geoai_landcover_output\class_value_ranges.json


In [None]:
# ====== Stratified subsample, train RF safely, then predict & colorize ======
import numpy as np
import rasterio
from rasterio import windows
from collections import Counter
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
import json
from tqdm import tqdm

# ---------- Config ----------
# Adjust these to control memory/time tradeoffs
samples_per_class = 40000    # <= choose 20k..100k depending on available RAM/CPU
total_max_samples = None     # optional hard cap (None to use samples_per_class * n_classes)
rf_n_estimators = 100        # fewer trees -> less memory/time
rf_n_jobs = 1                # IMPORTANT: avoid -1 to reduce memory spikes
rf_random_state = 42

raster_input = r"D:\2_Analytics\9_LULC_classification\demo_ortho\KRB_BGD_ORTHO_3CM_UTM_GEOTIFF.tif"
output_label = r"D:/2_Analytics/9_LULC_classification/demo_ortho/classified_rf_labels.tif"
output_rgb   = r"D:/2_Analytics/9_LULC_classification/demo_ortho/classified_rf_rgb.tif"
model_file   = r"D:/2_Analytics/9_LULC_classification/demo_ortho/rf_model.joblib"
mapping_json = Path("geoai_landcover_output") / "class_value_ranges.json"

# color_by_name as before (edit if needed)
color_by_name = {
    "agriculture": (255, 200, 0),
    "buildup":     (200, 200, 200),
    "forest":      (34, 139, 34),
    "roads":       (180, 180, 180),
    "water":       (0, 0, 255),
    "unclassified":(0, 0, 0)
}
tile_size = 512

# ---------- Ensure X,y exist ----------
if "X" not in globals() or "y" not in globals():
    raise RuntimeError("X,y not found in notebook. Run the sample collection cell first.")

# If more than 3 bands, keep first 3
if X.shape[1] > 3:
    X = X[:, :3]

# Build inverse mapping (name->code) must exist
if "mapping" not in globals():
    if mapping_json.exists():
        # mapping was saved in a previous JSON (ranges file): load it
        info = json.load(open(mapping_json, "r"))
        # if the saved structure is ranges, convert to mapping
        if isinstance(info, dict) and all("code" in v for v in info.values()):
            mapping = {name:int(v["code"]) for name,v in info.items()}
        else:
            # try fallback: if info is mapping directly
            mapping = {k:int(v) for k,v in info.items()}
    else:
        raise RuntimeError("mapping not in memory and mapping JSON not found.")

inv_map = {v:k for k,v in mapping.items()}  # code->name

# ---------- Stratified subsample ----------
print("Stratified subsample: target samples per class:", samples_per_class)
unique_codes = sorted([c for n,c in mapping.items() if int(c) != 0])  # skip code 0 (unclassified)
per_class_idx = {}
rng = np.random.default_rng(42)

for code in unique_codes:
    idxs = np.nonzero(y == code)[0]
    n = idxs.size
    if n == 0:
        print(f"Warning: no samples for code {code} ({inv_map.get(code,'?')})")
        per_class_idx[code] = np.array([], dtype=int)
        continue
    if n <= samples_per_class:
        chosen = idxs
    else:
        chosen = rng.choice(idxs, size=samples_per_class, replace=False)
    per_class_idx[code] = chosen
    print(f" - {inv_map.get(code,'?')} (code={code}): available={n:,}, selected={chosen.size:,}")

# Combine indices
all_idx = np.hstack([per_class_idx[c] for c in unique_codes if per_class_idx[c].size>0])
if total_max_samples:
    if all_idx.size > total_max_samples:
        all_idx = rng.choice(all_idx, size=total_max_samples, replace=False)

# Shuffle combined indices
perm = rng.permutation(all_idx.size)
all_idx = all_idx[perm]

X_sub = X[all_idx]
y_sub = y[all_idx]
print("Final training set shape:", X_sub.shape, y_sub.shape)

# Free some memory by deleting big X,y if you won't use them later
# del X, y   # uncomment if you want

# ---------- Train RandomForest (safer params) ----------
print("Training RandomForestClassifier with n_estimators=", rf_n_estimators, "n_jobs=", rf_n_jobs)
clf = RandomForestClassifier(n_estimators=rf_n_estimators, n_jobs=rf_n_jobs,
                             class_weight="balanced", random_state=rf_random_state)
clf.fit(X_sub, y_sub)
dump(clf, model_file)
print("Model trained and saved to:", model_file)

# ---------- Predict full raster (windowed) ----------
print("Starting windowed prediction...")
with rasterio.open(raster_input) as src:
    meta = src.meta.copy()
    meta.update({"count": 1, "dtype": "uint8", "compress": "lzw", "tiled": True})
    with rasterio.open(output_label, "w", **meta) as dst:
        # iterate windows in the same tiling pattern used earlier
        for row_off in range(0, src.height, tile_size):
            for col_off in range(0, src.width, tile_size):
                height = min(tile_size, src.height - row_off)
                width = min(tile_size, src.width - col_off)
                win = windows.Window(col_off=col_off, row_off=row_off, width=width, height=height)
                img = src.read(window=win)  # (bands, h, w)
                if img.shape[0] > 3:
                    img = img[:3, :, :]
                h, w = win.height, win.width
                flat = img.reshape((img.shape[0], -1)).T

                if src.nodata is not None:
                    mask = np.any(img == src.nodata, axis=0).ravel()
                    valid_idx = ~mask
                else:
                    valid_idx = np.ones(flat.shape[0], dtype=bool)

                pred = np.zeros(flat.shape[0], dtype=np.uint8)
                if np.any(valid_idx):
                    pred_valid = clf.predict(flat[valid_idx])
                    pred[valid_idx] = pred_valid

                out_arr = pred.reshape((h, w)).astype(np.uint8)
                dst.write(out_arr, 1, window=win)

print("Label TIFF written to:", output_label)

# ---------- Colorized RGB TIFF ----------
print("Colorizing label TIFF to RGB...")
code2color = {int(code): color_by_name.get(name, (255,0,255)) for name,code in mapping.items()}
with rasterio.open(output_label) as src_label:
    meta = src_label.meta.copy()
    meta.update({"count": 3, "dtype": "uint8", "compress": "lzw", "tiled": True})
    with rasterio.open(output_rgb, "w", **meta) as dst_rgb:
        for ji, win in src_label.block_windows(1):
            lbl = src_label.read(1, window=win)
            h, w = lbl.shape
            rgb = np.zeros((3, h, w), dtype=np.uint8)
            vals = np.unique(lbl)
            for v in vals:
                col = code2color.get(int(v), (0,0,0))
                mask = (lbl == v)
                rgb[0][mask] = col[0]
                rgb[1][mask] = col[1]
                rgb[2][mask] = col[2]
            dst_rgb.write(rgb, window=win)

print("RGB TIFF written to:", output_rgb)

# ---------- Pixel counts ----------
from collections import Counter
with rasterio.open(output_label) as src:
    counts = Counter()
    for ji, win in src.block_windows(1):
        data = src.read(1, window=win)
        vals, cnt = np.unique(data, return_counts=True)
        for v,c in zip(vals, cnt):
            counts[int(v)] += int(c)

inv_map = {v:k for k,v in mapping.items()}
print("\nPixel counts per class:")
for code, cnt in sorted(counts.items()):
    name = inv_map.get(code, "unknown")
    print(f" {code:>2d} ({name:12s}): {cnt:,}")

print("\nDone.")


Stratified subsample: target samples per class: 40000
 - agriculture (code=1): available=228,131, selected=40,000
 - buildup (code=2): available=343,643, selected=40,000
 - forest (code=3): available=6,672,913, selected=40,000
 - roads (code=4): available=151,947, selected=40,000
 - water (code=5): available=335,710, selected=40,000
Final training set shape: (200000, 3) (200000,)
Training RandomForestClassifier with n_estimators= 100 n_jobs= 1
Model trained and saved to: D:/2_Analytics/9_LULC_classification/demo_ortho/rf_model.joblib
Starting windowed prediction...
