In [2]:
import os, glob, json
import numpy as np
import rasterio
from sklearn.model_selection import train_test_split


In [3]:
# ---------------------------------------------------------------------------
# Configuration: input location, sequence/patch geometry, band layout, output.
# ---------------------------------------------------------------------------

# Folder holding the daily GeoTIFFs. The original machine-specific location is
# kept as the default, but it can be overridden without editing the notebook by
# setting the AIRQ_TIFF_FOLDER environment variable before starting the kernel
# (an unset or empty variable falls back to the default).
TIFF_FOLDER = os.environ.get("AIRQ_TIFF_FOLDER") or (
    r"C:\Users\NNadi\Downloads\Air_pollution_Agentic_AI\GEE_AirQuality_Delhi_Daily_100m_FLOAT"
)
TIFF_GLOB = os.path.join(TIFF_FOLDER, "AirQuality_Delhi_*.tif")

SEQ_LEN = 5        # number of consecutive time steps fed to the model as input
PATCH_SIZE = 10    # side length (pixels) of each square spatial patch
# Step (pixels) between patch origins. It is larger than PATCH_SIZE, so patches
# do not overlap and only a sparse sample of each raster is extracted.
PATCH_STRIDE = 64
ALL_BANDS = 24     # band count every input raster must have
# 0-based indices along the band axis of the loaded array (9..15 inclusive).
# NOTE(review): confirm these positions match the intended band order of the
# exported rasters.
PRED_BANDS = list(range(9, 16))  # PM2.5…AOD → 7 dynamic pollutant bands

# Output directory for the .npy splits and metadata.json (created if missing).
SAVE_DIR = "./datasets"
os.makedirs(SAVE_DIR, exist_ok=True)


In [4]:
def load_tifs_sorted(pattern):
    """Load every raster matching ``pattern`` into one stacked float32 array.

    Files are read in lexicographically sorted path order, so the first axis
    of the result follows filename order.
    NOTE(review): that order is only chronological if the filenames embed a
    sortable date — confirm against the export naming scheme.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the raster files.

    Returns
    -------
    data : np.ndarray
        float32 array of shape (n_files, ALL_BANDS, H, W).
    files : list of str
        The matched paths, in the same order as the first axis of ``data``.

    Raises
    ------
    ValueError
        If no file matches ``pattern``, if a file does not contain exactly
        ``ALL_BANDS`` bands, or (raised by ``np.stack``) if the rasters do not
        all share the same shape.
    """
    files = sorted(glob.glob(pattern))
    if not files:
        # Fail early with an actionable message instead of letting np.stack
        # complain about an empty list further down.
        raise ValueError(f"No files matched pattern: {pattern!r}")

    data_list = []
    for f in files:
        with rasterio.open(f) as src:
            arr = src.read()  # shape = (24, H, W)
        # The dataset is already closed here; validation only needs the array.
        if arr.shape[0] != ALL_BANDS:
            raise ValueError(f"Incorrect band count in {f}: got {arr.shape[0]}")
        # copy=False skips a redundant copy when the raster is already float32.
        data_list.append(arr.astype(np.float32, copy=False))

    data = np.stack(data_list, axis=0)  # (days, 24, H, W)
    return data, files

# Read the whole raster series once and report the shape of the stacked array.
data_all, file_list = load_tifs_sorted(pattern=TIFF_GLOB)
print("Loaded:", data_all.shape)


Loaded: (34, 24, 402, 402)


In [5]:
# --- Per-band percentile min-max normalisation to [0, 1] -------------------
# Replace NaN and +/-inf with 0 so the statistics and arithmetic stay finite.
# NOTE(review): the zero fill happens BEFORE the percentiles are computed, so
# filled pixels take part in the statistics; if they represent missing data,
# consider computing the percentiles on valid pixels only — confirm intent.
data_all = np.nan_to_num(data_all, nan=0.0, posinf=0.0, neginf=0.0)

# 1st / 99th percentile of every band (axis 1), pooled over axes 0, 2 and 3.
# NOTE(review): these statistics use every loaded day, including days that end
# up in the validation/test splits later in the notebook.
band_min = np.percentile(data_all, 1, axis=(0, 2, 3))
band_max = np.percentile(data_all, 99, axis=(0, 2, 3))

# Divisor of the scaling. Only a (near-)zero spread is replaced by 1 to avoid
# dividing by zero. The previous `np.maximum(spread, 1)` also forced the
# divisor to 1 for every band whose spread is below 1, which left such bands
# almost constant after scaling.
MIN_BAND_SPREAD = 1e-12
band_range = band_max - band_min
band_range[band_range <= MIN_BAND_SPREAD] = 1.0

# Broadcast the per-band statistics over (days, band, H, W), then clamp the
# tails outside the 1st-99th percentile window to exactly 0 and 1.
data_norm = ((data_all - band_min[None, :, None, None]) / band_range[None, :, None, None])
data_norm = np.clip(data_norm, 0, 1).astype(np.float32)


In [6]:
def make_sequences(data_norm, seq_len, pred_band_idx):
    """Build sliding-window (input sequence, next-step target) pairs.

    For every start index ``i`` the input is the ``seq_len`` consecutive time
    steps ``i .. i+seq_len-1`` with all channels, and the target is time step
    ``i+seq_len`` restricted to the channels in ``pred_band_idx``. Both are
    returned channels-last.

    Parameters
    ----------
    data_norm : np.ndarray
        Array of shape (D, C, H, W): time steps, channels, height, width.
    seq_len : int
        Number of time steps per input window; must be >= 1.
    pred_band_idx : sequence of int
        Channel indices kept in the target.

    Returns
    -------
    X : np.ndarray
        Shape (N, seq_len, H, W, C), same dtype as ``data_norm``.
    y : np.ndarray
        Shape (N, H, W, len(pred_band_idx)), same dtype as ``data_norm``.

    ``N = max(D - seq_len, 0)``. When there are too few time steps to form a
    single window, correctly shaped empty arrays are returned so callers can
    still unpack ``X.shape`` / ``y.shape``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    D, C, H, W = data_norm.shape
    n_samples = max(D - seq_len, 0)

    # Channels-last view of the whole series: (D, H, W, C). No data is copied.
    frames = np.transpose(data_norm, (0, 2, 3, 1))

    # Inputs: window i covers frames[i : i + seq_len].
    X = np.empty((n_samples, seq_len, H, W, C), dtype=data_norm.dtype)
    for i in range(n_samples):
        X[i] = frames[i:i + seq_len]

    # Targets: the frame right after each window, i.e. frames[seq_len:], with
    # only the requested channels. Slicing past the end yields an empty array,
    # which covers the D <= seq_len case without a special branch.
    y = np.ascontiguousarray(frames[seq_len:][..., pred_band_idx])

    return X, y

# Turn the normalised stack into (input window, next-step target) samples.
X_full, y_full = make_sequences(
    data_norm,
    seq_len=SEQ_LEN,
    pred_band_idx=PRED_BANDS,
)
print("Sequence dataset:", X_full.shape, y_full.shape)


Sequence dataset: (29, 5, 402, 402, 24) (29, 402, 402, 7)


In [7]:
def extract_patches(X, y, patch, stride):
    """Cut spatially aligned square windows out of every sample of X and y.

    Window origins are stepped by ``stride`` along both spatial axes, and only
    windows that fit completely inside the image are kept. Patches are emitted
    sample by sample, then row by row, then column by column.

    Parameters
    ----------
    X : array of shape (N, T, H, W, C)
        Input sequences.
    y : array indexed as y[sample, row, col, channel]
        Targets on the same spatial grid as ``X``.
    patch : int
        Side length of each square window.
    stride : int
        Step between consecutive window origins.

    Returns
    -------
    (np.ndarray, np.ndarray)
        float32 arrays holding the stacked X patches and the matching y patches.
    """
    n_samples, _, height, width, _ = X.shape

    # (sample, top, left) of every window, in emission order.
    corners = [
        (s, top, left)
        for s in range(n_samples)
        for top in range(0, height - patch + 1, stride)
        for left in range(0, width - patch + 1, stride)
    ]

    x_patches = [
        X[s, :, top:top + patch, left:left + patch, :]
        for s, top, left in corners
    ]
    y_patches = [
        y[s, top:top + patch, left:left + patch, :]
        for s, top, left in corners
    ]

    return (
        np.array(x_patches, dtype=np.float32),
        np.array(y_patches, dtype=np.float32),
    )

# Sample fixed-size spatial patches from every sequence/target pair.
X_all, y_all = extract_patches(
    X_full,
    y_full,
    patch=PATCH_SIZE,
    stride=PATCH_STRIDE,
)
print("Patch shapes:", X_all.shape, y_all.shape)


Patch shapes: (1421, 5, 10, 10, 24) (1421, 10, 10, 7)


In [8]:
# --- Order-preserving 70 / 15 / 15 split ------------------------------------
# The samples are overlapping sliding windows: neighbouring windows share most
# of their input days, and one window's target day is an input day of the
# following windows. A shuffled random split therefore places nearly identical
# samples (and the same target days) in train, validation and test, which
# inflates the held-out scores.
#
# X_all is ordered by window start index, because extract_patches iterates
# samples in its outermost loop and make_sequences emits windows in increasing
# start order. Splitting WITHOUT shuffling keeps the earliest windows for
# training and the latest for testing. The split sizes are the same as before.
# NOTE(review): this is chronological only if the sorted file order is
# chronological — confirm against the filename scheme. The cut points are not
# aligned to window boundaries, so one window's patches may straddle a cut.
# The returned arrays are no longer shuffled; shuffle batches during training.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all, test_size=0.30, shuffle=False)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, shuffle=False)

print("Train:", X_train.shape)
print("Val  :", X_val.shape)
print("Test :", X_test.shape)


Train: (994, 5, 10, 10, 24)
Val  : (213, 5, 10, 10, 24)
Test : (214, 5, 10, 10, 24)


In [9]:
# Write each split to its own .npy file inside SAVE_DIR.
_arrays_to_save = (
    ("X_train.npy", X_train),
    ("y_train.npy", y_train),
    ("X_val.npy", X_val),
    ("y_val.npy", y_val),
    ("X_test.npy", X_test),
    ("y_test.npy", y_test),
)
for _filename, _array in _arrays_to_save:
    np.save(os.path.join(SAVE_DIR, _filename), _array)


In [10]:
# Describe how the saved arrays were produced so they can be consumed without
# re-running this notebook.
meta = {
    "sequence_length": SEQ_LEN,
    "patch_size": PATCH_SIZE,
    "stride": PATCH_STRIDE,
    "bands_total": ALL_BANDS,
    "predict_bands": PRED_BANDS,
    # Per-band statistics of the scaling applied to the raw rasters:
    #   normalised = clip((raw - band_min) / band_range, 0, 1)
    # They are stored because they are needed to scale new inputs the same way
    # and to map predictions back to the original value range. Converted to
    # plain Python floats so they are JSON-serialisable.
    "normalization": {
        "band_min": np.asarray(band_min, dtype=float).tolist(),
        "band_range": np.asarray(band_range, dtype=float).tolist(),
    },
    "dataset_shapes": {
        "X_train": list(X_train.shape),
        "y_train": list(y_train.shape),
        "X_val":   list(X_val.shape),
        "y_val":   list(y_val.shape),
        "X_test":  list(X_test.shape),
        "y_test":  list(y_test.shape),
    }
}

with open(os.path.join(SAVE_DIR, "metadata.json"), "w") as f:
    json.dump(meta, f, indent=2)

print("Saved dataset + metadata to:", SAVE_DIR)


Saved dataset + metadata to: ./datasets
