In [2]:
import os
import numpy as np
import rasterio
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib


In [3]:
# =============================================
# CONFIGURATION
# =============================================
# Seed value reused for NumPy's global RNG and for every `random_state` argument.
RANDOM_SEED = 42

# Number of pixels sampled from each image; lower it (e.g. 200) for an even faster pipeline.
PIXELS_PER_IMAGE = 500

# Seed the legacy global NumPy RNG, which the pixel sampler draws from.
np.random.seed(RANDOM_SEED)

In [4]:
# =============================================
# 1. Load + SAMPLE pixels from a TIFF efficiently
# =============================================
def load_sampled_pixels(path, pixels_per_image=300):
    """Draw a random sample of per-pixel band vectors from one raster file.

    Pixel positions are drawn without replacement from NumPy's global random
    state (``np.random``), so seeding it beforehand makes the draw repeatable.

    Parameters
    ----------
    path : str or path-like
        Raster file opened with ``rasterio.open``.
    pixels_per_image : int, default 300
        Number of pixels requested. When the raster holds fewer pixels than
        this, every pixel is returned once instead of raising ``ValueError``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_samples, n_bands)`` where
        ``n_samples = min(pixels_per_image, height * width)``; each row holds
        the values of all bands at one sampled pixel.

    Notes
    -----
    All bands are read into memory with a single ``read()`` call and the
    sampled positions are then picked with NumPy indexing, instead of issuing
    one windowed read per sample. This assumes the raster is small enough to
    hold in memory; for very large rasters windowed reads may be preferable.
    """
    with rasterio.open(path) as src:
        width = src.width
        total = src.height * width

        # Sampling without replacement cannot return more items than exist,
        # so cap the request at the number of pixels in the raster.
        n_samples = min(pixels_per_image, total)
        sample_indices = np.random.choice(total, n_samples, replace=False)

        # Convert flat row-major indices -> (row, col) coordinates.
        rows, cols = np.divmod(sample_indices, width)

        # One read of every band: array of shape (bands, height, width).
        data = src.read()

    # Pair-wise indexing yields (bands, n_samples); transpose so each row is
    # the band vector of one pixel, and return a C-contiguous float32 array.
    return np.ascontiguousarray(data[:, rows, cols].T, dtype=np.float32)

In [6]:
# =============================================
# 2. Build dataset using sampled pixels
# =============================================
def load_dataset(root, pixels_per_image=None):
    """Build a pixel-level dataset from a directory of class folders.

    Every sub-directory of ``root`` is treated as one class. Each file in it
    whose name ends in ``.tif`` or ``.tiff`` (case-insensitive) is sampled
    with ``load_sampled_pixels`` and all of its sampled pixels receive the
    integer label of that class.

    Parameters
    ----------
    root : str
        Directory whose sub-directories are the classes.
    pixels_per_image : int, optional
        Pixels to sample per image. Defaults to the module-level
        ``PIXELS_PER_IMAGE``.

    Returns
    -------
    X : numpy.ndarray
        Sampled pixels of all images, stacked row-wise.
    y : numpy.ndarray
        Integer label for every row of ``X``.
    label_map : dict
        Maps label id -> class folder name.

    Raises
    ------
    ValueError
        If no matching image file is found in any class folder.
    """
    if pixels_per_image is None:
        pixels_per_image = PIXELS_PER_IMAGE

    # Keep only directories *before* numbering them, so a stray file in
    # `root` cannot leave a gap in the label ids.
    class_folders = sorted(
        entry for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )

    X = []
    y = []
    label_map = {}

    for label_id, cls in enumerate(class_folders):
        class_dir = os.path.join(root, cls)

        print(f"Loading class: {cls}")
        label_map[label_id] = cls

        # os.listdir returns names in arbitrary order; sorting makes the file
        # order (and therefore the seeded random draws) the same on every run.
        for file in sorted(os.listdir(class_dir)):
            if not file.lower().endswith((".tif", ".tiff")):
                continue

            img_path = os.path.join(class_dir, file)

            # sample pixels from image
            sampled_pixels = load_sampled_pixels(img_path, pixels_per_image)

            X.append(sampled_pixels)
            # One label per sampled row (the sampler may return fewer rows
            # than requested, so use the actual row count).
            y.append(np.full(sampled_pixels.shape[0], label_id))

    if not X:
        # np.vstack([]) would fail with an unhelpful message; be explicit.
        raise ValueError(f"No .tif/.tiff images found under {root!r}")

    X = np.vstack(X)
    y = np.hstack(y)

    return X, y, label_map

In [5]:
# =============================================
# 3. Load dataset
# =============================================
# Dataset root: taken from the EUROSAT_DIR environment variable when it is
# set, otherwise the original local path is used, so the notebook can run on
# another machine without editing this cell.
dataset_path = os.environ.get("EUROSAT_DIR", r"D:\Database\EuoroSat\EuroSATallBands")
X, y, label_map = load_dataset(dataset_path)

# Every sampled pixel row must have exactly one label.
assert X.shape[0] == y.shape[0], "X and y row counts differ"

print("Dataset shape:", X.shape)
print("Labels shape:", y.shape)
print("Classes:", label_map)

Loading class: AnnualCrop
Loading class: Forest
Loading class: HerbaceousVegetation
Loading class: Highway
Loading class: Industrial
Loading class: Pasture
Loading class: PermanentCrop
Loading class: Residential
Loading class: River
Loading class: SeaLake
Dataset shape: (9658950, 13)
Labels shape: (9658950,)
Classes: {0: 'AnnualCrop', 1: 'Forest', 2: 'HerbaceousVegetation', 3: 'Highway', 4: 'Industrial', 5: 'Pasture', 6: 'PermanentCrop', 7: 'Residential', 8: 'River', 9: 'SeaLake'}


In [1]:
# =============================================
# 4. Train KMeans (unsupervised)
# =============================================
print("\nTraining KMeans...")

# Unsupervised model: fit 8 clusters on the sampled pixels (the labels in `y`
# are not used here).
kmeans = KMeans(n_clusters=8, random_state=RANDOM_SEED).fit(X)

# Cluster index assigned to each row of X during fitting.
clusters = kmeans.labels_

print("Cluster labels:", np.unique(clusters))

# Persist the fitted model; left as the last expression so the cell still
# displays the list of written file names.
joblib.dump(kmeans, "kmeans_landtype.pkl")


Training KMeans...


NameError: name 'KMeans' is not defined

In [7]:

# =============================================
# 5. Train RandomForest (supervised)
# =============================================
print("\nTraining RandomForest Classifier...")

# Hold out 20% of the sampled pixel rows for evaluation.
# NOTE(review): the split is per pixel, so pixels sampled from the same image
# can end up in both the train and the test set — the reported accuracy may be
# optimistic; confirm whether an image-level split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_SEED, test_size=0.2
)

# 100 trees (reduced for speed), depth capped at 15, n_jobs=-1 for parallel
# fitting; `fit` returns the estimator itself, so it is chained here.
clf = RandomForestClassifier(
    random_state=RANDOM_SEED,
    n_jobs=-1,
    max_depth=15,
    n_estimators=100,
).fit(X_train, y_train)

# Score the fitted forest on the held-out rows.
acc = clf.score(X_test, y_test)
print("Classification Accuracy:", acc)

# Persist the classifier and the label-id -> class-name mapping; the last
# dump stays the final expression so the cell output is unchanged.
joblib.dump(clf, "rf_land_classifier.pkl")
joblib.dump(label_map, "label_map.pkl")



Training RandomForest Classifier...
Classification Accuracy: 0.7114153194705428


['label_map.pkl']