In [13]:
import os
import numpy as np
import rasterio
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from tqdm import tqdm
# =============================================
# CONFIGURATION
# =============================================
PIXELS_PER_IMAGE = 350      # <-- reduce to 200 for even faster pipeline
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
# =============================================
# 1. Load + SAMPLE pixels from a TIFF efficiently
# =============================================
def load_sampled_pixels(path, pixels_per_image=350):
    with rasterio.open(path) as src:
        bands = src.count
        height = src.height
        width = src.width

        # total pixels
        total = height * width

        # random sample indices
        sample_indices = np.random.choice(total, pixels_per_image, replace=False)

        # convert 1D → (row, col)
        rows = sample_indices // width
        cols = sample_indices % width

        # extract spectral vector per sample
        samples = []

        for r, c in zip(rows, cols):
            pixel = src.read(window=((r, r+1), (c, c+1)))[:, 0, 0]  # shape: (13,)
            samples.append(pixel)

    return np.array(samples, dtype=np.float32)

In [17]:
# =============================================
# 2. Build dataset using sampled pixels
# =============================================
def load_dataset(root):
    X = []
    y = []
    label_map = {}

    class_folders = sorted(os.listdir(root))

    for label_id, cls in enumerate(class_folders):
        class_dir = os.path.join(root, cls)

        if not os.path.isdir(class_dir):
            continue

        print(f"Loading class: {cls}")

        label_map[label_id] = cls

        #for file in os.listdir(class_dir):
        for file in tqdm(os.listdir(class_dir), desc=f"Loading {cls}"):
            if not file.endswith(".tif"):
                continue

            img_path = os.path.join(class_dir, file)

            # sample pixels from image
            sampled_pixels = load_sampled_pixels(img_path, PIXELS_PER_IMAGE)

            X.append(sampled_pixels)
            y.append(np.full(sampled_pixels.shape[0], label_id))

    X = np.vstack(X)
    y = np.hstack(y)

    return X, y, label_map

In [18]:
# =============================================
# 3. Load dataset
# =============================================
dataset_path = r"D:\Database\EuoroSat\EuroSATallBands"
X, y, label_map = load_dataset(dataset_path)

print("Dataset shape:", X.shape)  
print("Labels shape:", y.shape)  
print("Classes:", label_map)


Loading class: AnnualCrop


Loading AnnualCrop: 100%|██████████| 3002/3002 [03:31<00:00, 14.19it/s]


Loading class: Forest


Loading Forest: 100%|██████████| 3000/3000 [03:38<00:00, 13.75it/s]


Loading class: HerbaceousVegetation


Loading HerbaceousVegetation: 100%|██████████| 3000/3000 [04:05<00:00, 12.24it/s]


Loading class: Highway


Loading Highway: 100%|██████████| 2500/2500 [03:14<00:00, 12.83it/s]


Loading class: Industrial


Loading Industrial: 100%|██████████| 2500/2500 [03:33<00:00, 11.72it/s]


Loading class: Pasture


Loading Pasture: 100%|██████████| 2000/2000 [02:36<00:00, 12.75it/s]


Loading class: PermanentCrop


Loading PermanentCrop: 100%|██████████| 2500/2500 [03:30<00:00, 11.88it/s]


Loading class: Residential


Loading Residential: 100%|██████████| 3000/3000 [04:24<00:00, 11.35it/s]


Loading class: River


Loading River: 100%|██████████| 2500/2500 [03:38<00:00, 11.44it/s]


Loading class: SeaLake


Loading SeaLake: 100%|██████████| 3597/3597 [06:39<00:00,  9.00it/s]


Dataset shape: (9658950, 13)
Labels shape: (9658950,)
Classes: {0: 'AnnualCrop', 1: 'Forest', 2: 'HerbaceousVegetation', 3: 'Highway', 4: 'Industrial', 5: 'Pasture', 6: 'PermanentCrop', 7: 'Residential', 8: 'River', 9: 'SeaLake'}


In [19]:
# =============================================
# 4. Train KMeans (unsupervised)
# =============================================
from sklearn.cluster import MiniBatchKMeans

print("\nTraining MiniBatchKMeans...")

batch_size = 1000
n_batches = X.shape[0] // batch_size

kmeans = MiniBatchKMeans(
    n_clusters=10,
    batch_size=batch_size,
    random_state=RANDOM_SEED
)

for i in tqdm(range(n_batches), desc="KMeans progress"):
    batch = X[i*batch_size:(i+1)*batch_size]
    kmeans.partial_fit(batch)

# final predictions
clusters = kmeans.predict(X)

print("Cluster labels:", np.unique(clusters))
joblib.dump(kmeans, "kmeans_landtype.pkl")


Training MiniBatchKMeans...


KMeans progress: 100%|██████████| 9658/9658 [01:02<00:00, 153.85it/s] 


Cluster labels: [0 1 2 3 4 5 6 7 8 9]


['kmeans_landtype.pkl']

In [None]:
#%pip install xgboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.5/72.0 MB 1.4 MB/s eta 0:00:51
   ---------------------------------------- 0.8/72.0 MB 1.3 MB/s eta 0:00:53
    --------------------------------------- 1.6/72.0 MB 2.0 MB/s eta 0:00:37
   - -------------------------------------- 2.1/72.0 MB 2.1 MB/s eta 0:00:34
   - -------------------------------------- 2.4/72.0 MB 2.0 MB/s eta 0:00:35
   - -------------------------------------- 2.6/72.0 MB 2.0 MB/s eta 0:00:35
   - -------------------------------------- 2.6/72.0 MB 2.0 MB/s eta 0:00:35
   - ---------------------------------

In [20]:
# =============================================
# 5. Train XGBoost (supervised)
# =============================================

import xgboost as xgb
from xgboost.callback import TrainingCallback
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import joblib
import numpy as np

# --------------------------------------------
# XGBoost Parameters (CPU Optimized)
# --------------------------------------------
params = {
    "objective": "multi:softprob",
    "num_class": len(np.unique(y)),   # number of land-type classes
    "eval_metric": "mlogloss",
    "tree_method": "hist",            # FAST CPU mode
    "max_depth": 12,
    "eta": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

# Convert to DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest  = xgb.DMatrix(X_test,  label=y_test)

# --------------------------------------------
# tqdm Progress Callback
# --------------------------------------------
class TQDMCallback(TrainingCallback):
    def __init__(self, total_rounds):
        self.pbar = tqdm(total=total_rounds, desc="XGBoost Training")

    def after_iteration(self, model, epoch, evals_log):
        self.pbar.update(1)
        return False

    def after_training(self, model):
        self.pbar.close()
        return model

# --------------------------------------------
# Train
# --------------------------------------------
num_rounds = 200
evals_result = {}  # MUST be defined before calling train

bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_rounds,
    evals=[(dtrain, "train"), (dtest, "test")],
    evals_result=evals_result,
    verbose_eval=False,
    callbacks=[TQDMCallback(num_rounds)]
)

# --------------------------------------------
# Accuracy
# --------------------------------------------
proba = bst.predict(dtest)
preds = np.argmax(proba, axis=1)

acc = accuracy_score(y_test, preds)
print("XGBoost Accuracy:", acc)

# --------------------------------------------
# Save Model
# --------------------------------------------
joblib.dump(bst, "xgb_land_classifier.pkl")
joblib.dump(label_map, "label_map.pkl")

print("Model saved.")


XGBoost Training: 100%|██████████| 200/200 [30:41<00:00,  9.21s/it]


XGBoost Accuracy: 0.8001585317244628
Model saved.
