# EuroSAT Baseline Classic Models

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
import time

# Global plotting defaults for every figure in this notebook:
# matplotlib's "seaborn-v0_8-darkgrid" style sheet plus seaborn's "husl" palette
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")

%matplotlib inline

## Load dataset

In [None]:
def load_eurosat_dataset(data_dir="data"):
    """Load an image-folder dataset in which each subdirectory is one class.

    Every immediate subdirectory of ``data_dir`` is treated as a class. Its
    ``*.jpg`` and ``*.png`` files are decoded to RGB arrays and labelled with
    the position of the class in the alphabetically sorted class list.
    Files that fail to load are reported on stdout and skipped.

    Args:
        data_dir: Root directory holding one subdirectory per class.

    Returns:
        A ``(data, labels, class_names)`` tuple: the stacked image arrays,
        the integer class index of each image, and the sorted class names.
    """
    data_path = Path(data_dir)

    # Sorted names give a stable class -> index mapping across runs
    class_names = sorted(d.name for d in data_path.iterdir() if d.is_dir())

    print(f"Found {len(class_names)} classes: {class_names}")

    images = []
    labels = []

    for class_idx, class_name in enumerate(class_names):
        class_path = data_path / class_name
        # glob() order depends on the filesystem; sorting makes the sample
        # order (and therefore any seeded split downstream) reproducible
        image_files = sorted(class_path.glob("*.jpg"))
        image_files += sorted(class_path.glob("*.png"))

        print(f"Loading {len(image_files)} images from {class_name}...")

        for img_path in image_files:
            try:
                # The context manager releases the file handle right away;
                # convert("RGB") gives every image the same 3-channel layout
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert("RGB"))
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error loading {img_path}: {e}")
                continue

            images.append(img_array)
            labels.append(class_idx)

    # Convert to numpy arrays
    data = np.array(images)
    labels = np.array(labels)

    print("\nDataset loaded successfully!")
    print(f"Total images           : {len(data)}")
    print(f"Data shape             : {data.shape}")
    print(f"Labels shape           : {labels.shape}")

    return data, labels, class_names


# Read every class folder under ./data into memory:
# image arrays, integer class labels, and the sorted class names
data, labels, class_names = load_eurosat_dataset("data")

## Feature extraction

In [None]:
def extract_features(image):
    """Summarise one image as a 12-value vector of colour/intensity statistics.

    Feature layout, in order:
        * mean and std of each of the first three channels (6 values),
        * mean and std of the weighted brightness
          ``0.299*R + 0.587*G + 0.114*B`` (2 values),
        * std over all pixel values, used as a contrast measure (1 value),
        * min, max and median over all pixel values (3 values).

    Args:
        image: Array-like with channels on the last axis and at least three
            channels, e.g. shape ``(H, W, 3)``.

    Returns:
        1-D ``numpy.ndarray`` of 12 float64 values.

    Raises:
        ValueError: If ``image`` has fewer than three dimensions or fewer
            than three channels on its last axis. Without this check a 2-D
            grayscale image would be indexed by column and silently yield
            meaningless "channel" statistics.
    """
    pixels = np.asarray(image)
    if pixels.ndim < 3 or pixels.shape[-1] < 3:
        raise ValueError(
            "extract_features expects channels on the last axis with at least "
            f"3 channels, got shape {pixels.shape}"
        )

    rgb = pixels[..., :3]
    # Reduce over every axis except the channel axis
    spatial_axes = tuple(range(rgb.ndim - 1))
    channel_means = rgb.mean(axis=spatial_axes)
    channel_stds = rgb.std(axis=spatial_axes)

    # Weighted grey-level image
    brightness = 0.299 * rgb[..., 0] + 0.587 * rgb[..., 1] + 0.114 * rgb[..., 2]

    features = [
        # Interleave as R_mean, R_std, G_mean, G_std, B_mean, B_std
        *np.column_stack((channel_means, channel_stds)).ravel(),
        brightness.mean(),
        brightness.std(),
        pixels.std(),  # contrast
        pixels.min(),
        pixels.max(),
        np.median(pixels),
    ]
    return np.array(features, dtype=np.float64)


# Build the feature matrix (one row per image) and time the whole pass
print("Extracting features from images...")
start_time = time.time()

X = np.array(list(map(extract_features, data)))
y = labels

end_time = time.time()
print(
    f"Feature extraction completed in             : {end_time - start_time:.2f} seconds"
)
print(f"Feature matrix shape                        : {X.shape}")
print(f"Number of features per image                : {X.shape[1]}")

In [None]:
# Column labels for the feature matrix, in the order extract_features emits them:
# per-channel mean/std, brightness mean/std, contrast, then global min/max/median
feature_names = [f"{channel}_{stat}" for channel in "RGB" for stat in ("mean", "std")]
feature_names += [f"brightness_{stat}" for stat in ("mean", "std")]
feature_names += ["contrast"]
feature_names += [f"pixel_{stat}" for stat in ("min", "max", "median")]

# Preview the first five rows of the labelled feature table
feature_df = pd.DataFrame(X, columns=feature_names)
print("\nFeature:")
print(feature_df.head(5))

In [None]:
# Summary statistics of every feature column, as reported by DataFrame.describe()
print("\nFeature statistics:")
print(feature_df.describe())

## Data split and feature scaling

In [None]:
# Hold out 20% of the samples as the final test set, stratified by class
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Carve the validation set out of the remaining 80%: 13% of it, i.e. about
# 10.4% of the full dataset, which leaves roughly 69.6% for training
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.13, random_state=42, stratify=y_train
)

print(f"Training set size:          {X_train.shape[0]} samples")
print(f"Validation set size:        {X_val.shape[0]} samples")
print(f"Test set size:              {X_test.shape[0]} samples")

# Fit the scaler on the training split only, then apply that same transform
# to the validation and test splits so all three share one feature scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("\nFeature scaling completed")

## Model training and evaluation

In [None]:
def evaluate_model_with_gridsearch(
    model, param_grid, X_train, X_val, y_train, y_val, model_name, target_names=None
):
    """Tune ``model`` with a grid search and score the winner on the validation set.

    Runs a 5-fold ``GridSearchCV`` (weighted F1 scoring, all CPU cores) on the
    training data, predicts the validation data with the best estimator, and
    prints and returns the resulting metrics.

    Args:
        model: Unfitted scikit-learn estimator to tune.
        param_grid: Hyper-parameter grid passed to ``GridSearchCV``.
        X_train: Training feature matrix.
        X_val: Validation feature matrix.
        y_train: Training labels.
        y_val: Validation labels.
        model_name: Display name used in the printed output and the result.
        target_names: Class names shown in the classification report. When
            omitted, the module-level ``class_names`` is used, which is what
            earlier versions of this function always did.

    Returns:
        Dict with keys ``model``, ``grid_search``, ``model_name``, ``y_pred``,
        ``best_params``, ``cv_score``, ``accuracy``, ``precision``,
        ``recall``, ``f1``, ``train_time`` and ``predict_time``.
        ``train_time`` covers the entire grid search, not a single fit.
    """
    print(f"Training {model_name} with GridSearchCV...")
    print(f"Parameter grid: {param_grid}")

    # perf_counter is the right clock for measuring elapsed intervals; the
    # wall clock returned by time.time() can jump if the system time changes
    start_time = time.perf_counter()
    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring="f1_weighted", n_jobs=-1, verbose=1
    )
    grid_search.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    best_model = grid_search.best_estimator_

    print(f"\nBest parameters                     : {grid_search.best_params_}")
    print(f"Best cross-validation score         : {grid_search.best_score_:.4f}")
    print(f"Training time                       : {train_time:.2f} seconds")

    # Predict on validation set
    start_time = time.perf_counter()
    y_pred = best_model.predict(X_val)
    predict_time = time.perf_counter() - start_time

    # zero_division=0 yields the same scores as the default but without a
    # warning for every class the model never predicts
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_val, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_val, y_pred, average="weighted", zero_division=0)

    print(f"Prediction time: {predict_time:.2f} seconds")
    print("\nValidation Set Performance Metrics:")
    print(f"  Accuracy              : {accuracy:.4f}")
    print(f"  Precision             : {precision:.4f}")
    print(f"  Recall                : {recall:.4f}")
    print(f"  F1-Score              : {f1:.4f}")

    if target_names is None:
        # Backward-compatible fallback to the notebook-level class list
        target_names = class_names

    print("\nClassification Report:")
    print(
        classification_report(
            y_val, y_pred, target_names=target_names, zero_division=0
        )
    )

    return {
        "model": best_model,
        "grid_search": grid_search,
        "model_name": model_name,
        "y_pred": y_pred,
        "best_params": grid_search.best_params_,
        "cv_score": grid_search.best_score_,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "train_time": train_time,
        "predict_time": predict_time,
    }

### 1. K-Nearest Neighbors (KNN)

In [None]:
# K-Nearest Neighbors: search over neighbourhood size, vote weighting and
# distance metric (5 x 2 x 2 = 20 candidate settings)
knn = KNeighborsClassifier()
knn_param_grid = dict(
    n_neighbors=list(range(3, 12, 2)),
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

knn_results = evaluate_model_with_gridsearch(
    model=knn,
    param_grid=knn_param_grid,
    X_train=X_train_scaled,
    X_val=X_val_scaled,
    y_train=y_train,
    y_val=y_val,
    model_name="KNN",
)

### 2. Support Vector Classifier (SVC)

In [None]:
# Support Vector Classifier: search over regularisation strength, kernel and
# kernel coefficient (4 x 2 x 2 = 16 candidate settings)
svc = SVC(random_state=42)
svc_param_grid = dict(
    C=[0.1, 1, 10, 100],
    kernel=["poly", "rbf"],
    gamma=["scale", "auto"],
)

svc_results = evaluate_model_with_gridsearch(
    model=svc,
    param_grid=svc_param_grid,
    X_train=X_train_scaled,
    X_val=X_val_scaled,
    y_train=y_train,
    y_val=y_val,
    model_name="SVC",
)

### 3. Random Forest

In [None]:
# Random Forest: search over forest size, tree depth and the split/leaf size
# limits (3 x 4 x 3 x 3 = 108 candidate settings); it receives the same scaled
# features as the other two models
rf = RandomForestClassifier(random_state=42)
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_results = evaluate_model_with_gridsearch(
    model=rf,
    param_grid=rf_param_grid,
    X_train=X_train_scaled,
    X_val=X_val_scaled,
    y_train=y_train,
    y_val=y_val,
    model_name="Random Forest",
)

## Model comparison

In [None]:
# Side-by-side comparison table: one row per tuned model, built from the
# result dicts returned by evaluate_model_with_gridsearch
results_summary = pd.DataFrame(
    [
        {
            "Model": result["model_name"],
            # Stringify the params dict so it prints as a single cell
            "Best Params": str(result["best_params"]),
            "CV Score": result["cv_score"],
            "Val Accuracy": result["accuracy"],
            "Precision": result["precision"],
            "Recall": result["recall"],
            "F1-Score": result["f1"],
            "Train Time (s)": result["train_time"],
        }
        for result in (knn_results, svc_results, rf_results)
    ]
)

print(results_summary.to_string(index=False))

## Confusion matrices

In [None]:
def plot_all_confusion_matrices(y_val, predictions_dict, class_names):
    """Plot one confusion-matrix heatmap per model, side by side in one figure.

    The figure gets one column per entry of ``predictions_dict``, so any
    number of models can be compared (three models give a 24x7 inch figure).

    Args:
        y_val: True labels of the validation set.
        predictions_dict: Mapping of display name -> predicted labels for
            ``y_val``; iteration order sets the left-to-right panel order.
        class_names: Tick labels for both heatmap axes.

    Returns:
        None. The figure is shown with ``plt.show()``; nothing is drawn when
        ``predictions_dict`` is empty.
    """
    n_models = len(predictions_dict)
    if n_models == 0:
        # A figure with zero columns cannot be created
        return

    # squeeze=False keeps `axes` two-dimensional even for a single model
    _, axes = plt.subplots(1, n_models, figsize=(8 * n_models, 7), squeeze=False)

    for ax, (model_name, y_pred) in zip(axes[0], predictions_dict.items()):
        cm = confusion_matrix(y_val, y_pred)

        sns.heatmap(
            cm,
            annot=True,
            fmt="d",
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names,
            cbar_kws={"label": "Number of samples"},
            ax=ax,
        )
        ax.set_xlabel("Predicted Label", fontsize=12, fontweight="bold")
        ax.set_ylabel("True Label", fontsize=12, fontweight="bold")
        ax.set_title(
            f"Confusion Matrix - {model_name}", fontsize=14, fontweight="bold"
        )
        # Slant the x labels so long class names do not overlap
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
        ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

    plt.tight_layout()
    plt.show()

In [None]:
# Gather each tuned model's validation predictions under its plot title,
# then draw the confusion matrices together
predictions = {
    title: result["y_pred"]
    for title, result in (
        ("KNN (Best Model)", knn_results),
        ("SVC (Best Model)", svc_results),
        ("Random Forest (Best Model)", rf_results),
    )
}

plot_all_confusion_matrices(
    y_val=y_val, predictions_dict=predictions, class_names=class_names
)