# 🔧 Check Runtime Environment
Detects whether the notebook is running in Google Colab or locally, and sets the `base_dir` accordingly for file access.


In [None]:
import os
import sys

# Decide where the workshop files live: the mounted Google Drive folder when
# the Colab package is loaded, otherwise a path relative to the working dir.
running_in_colab = 'google.colab' in sys.modules

if not running_in_colab:
    base_dir = ""  # Adjust as needed
    print(f"🖥️ Running locally, base_dir set to: {base_dir}")
else:
    from google.colab import drive

    drive.mount('/content/drive')
    base_dir = "/content/drive/MyDrive/WildFire_RemoteSensing_workshop/WildFire_RemoteSensing/"
    print(f"📂 Running in Colab, base_dir set to: {base_dir}")


# 📦 Install Dependencies
Installs required Python packages (`tqdm`, `rasterio`) if not already available in the current environment.


In [None]:
!pip install tqdm rasterio 

# 📁 Define Data Directory
Specifies the directory containing the NetCDF (`.nc`) files that will be used to build the dataset.


In [None]:
# Folder holding the .nc datacubes; base_dir is used as a plain string prefix.
data_dir = f"{base_dir}datacubes_2024"  # Change this

# 🧠 Import Libraries & Define Functions

- Imports all necessary libraries
- Defines the following steps:
  - `get_nc_files()`: finds all `.nc` files in the data directory.
  - `extract_pixels_and_labels()`: extracts pixel-level features and assigns labels (0 = "before fire", 1 = "after fire").
  - `build_dataset()`: loops through files, extracts features, and stacks them into `X` and `y`.
  - `save_dataset_to_csv()`: saves the resulting dataset as a CSV for reuse.
- The process supports Sentinel-2 bands and removes invalid or missing data entries.


In [None]:
import os
import glob
import xarray as xr
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tqdm import tqdm
import pandas as pd
# --- SETTINGS ---

# Upper bound on the number of pixel rows kept from each .nc file; it is
# passed to build_dataset() in the MAIN section of this cell.
max_pixels_per_file = 1000

# --- STEP 1: Collect all .nc files ---
def get_nc_files(directory):
    """Return the paths of all ``*.nc`` files directly inside ``directory``.

    The list is sorted so that the processing order (and therefore the row
    order of the dataset built from it) does not depend on the arbitrary order
    in which the file system lists entries.

    Args:
        directory: Folder to search (not recursive).

    Returns:
        Sorted list of path strings; empty when nothing matches or the folder
        does not exist.
    """
    return sorted(glob.glob(os.path.join(directory, "*.nc")))

# --- STEP 2: Load data and assign label ---
def extract_pixels_and_labels1(file_path, label, max_pixels):
    """Sample labelled pixels using every numeric variable of a NetCDF file.

    Variant of ``extract_pixels_and_labels`` that keeps all numeric data
    variables instead of a fixed band list. ``build_dataset`` calls the other
    function, so this one is only used if invoked explicitly.

    Args:
        file_path: Path of the ``.nc`` file to read.
        label: Value assigned to every returned pixel.
        max_pixels: Maximum number of pixel rows to return.

    Returns:
        Tuple ``(pixels, labels)``: a float32 array with one row per pixel and
        one column per numeric variable, and a 1-D array of ``label`` repeated
        once per row.

    Raises:
        Whatever xarray raises when the dataset has no ``t`` dimension, since
        the first time step is always selected.
    """
    # The context manager releases the file handle once the values are loaded.
    with xr.open_dataset(file_path) as ds:
        # Filter only numeric bands
        numeric_bands = [var for var in ds.data_vars if np.issubdtype(ds[var].dtype, np.number)]

        # Convert to array, take the first time step and put variables last
        da = ds[numeric_bands].to_array().isel(t=0).transpose("y", "x", "variable")

        # Flatten to (n_pixels, n_variables) while the file is still open
        pixels = da.values.reshape(-1, da.shape[2]).astype(np.float32, copy=False)

    # Drop pixels with a NaN in any variable
    pixels = pixels[~np.isnan(pixels).any(axis=1)]

    # Shuffle rows in place (unseeded), then keep at most max_pixels of them
    np.random.shuffle(pixels)
    return pixels[:max_pixels], np.full(min(len(pixels), max_pixels), label)
def extract_pixels_and_labels(file_path, label, max_pixels):
    """Sample labelled Sentinel-2 pixels from one NetCDF file.

    Only the bands listed in ``desired_bands`` that exist in the file and are
    numeric are used; columns follow the order of that list. If the dataset
    has a ``t`` dimension, the first time step is taken.

    Args:
        file_path: Path of the ``.nc`` file to read.
        label: Value assigned to every returned pixel.
        max_pixels: Maximum number of pixel rows to return.

    Returns:
        Tuple ``(pixels, labels)``: ``pixels`` is a float32 array of shape
        ``(n, n_available_bands)`` and ``labels`` a 1-D array of ``label``
        repeated ``n`` times. When no desired band is present, a message is
        printed and empty arrays are returned (``(0, 12)`` features, ``(0,)``
        labels) with the same dtypes as the regular path, so stacking them
        with other files' results does not change the dataset's dtypes.
    """
    # Define desired Sentinel-2 bands
    desired_bands = [
        'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
        'B07', 'B08', 'B8A', 'B09', 'B11', 'B12'
    ]

    # The context manager releases the file handle once the values are loaded.
    with xr.open_dataset(file_path) as ds:
        # Filter only available & numeric desired bands
        available_bands = [
            band for band in desired_bands
            if band in ds.data_vars and np.issubdtype(ds[band].dtype, np.number)
        ]

        if not available_bands:
            print(f"⚠️ No valid bands found in {file_path}, skipping.")
            empty_pixels = np.empty((0, len(desired_bands)), dtype=np.float32)
            # Same dtype np.full() would pick for this label on the regular path.
            empty_labels = np.empty(0, dtype=np.asarray(label).dtype)
            return empty_pixels, empty_labels

        band_ds = ds[available_bands]

        # Select first time slice if needed
        if "t" in band_ds.dims:
            band_ds = band_ds.isel(t=0)

        # Convert to DataArray and reorder dimensions
        da = band_ds.to_array().transpose("y", "x", "variable")

        # Flatten to (n_pixels, n_bands) while the file is still open
        pixels = da.values.reshape(-1, da.shape[2]).astype(np.float32, copy=False)

    # Drop pixels with a NaN in any band
    pixels = pixels[~np.isnan(pixels).any(axis=1)]

    # Shuffle rows in place (unseeded), then keep at most max_pixels of them
    np.random.shuffle(pixels)
    return pixels[:max_pixels], np.full(min(len(pixels), max_pixels), label)


# --- STEP 3: Aggregate dataset ---
def build_dataset(files, max_pixels):
    """Build the feature matrix and label vector from a list of .nc files.

    A file whose *name* contains "before" (case-insensitive) is labelled 0;
    every other file is labelled 1. Pixels are extracted per file with
    ``extract_pixels_and_labels`` and stacked.

    Args:
        files: Iterable of ``.nc`` file paths.
        max_pixels: Maximum number of pixels taken from each file.

    Returns:
        Tuple ``(X, y)``: all per-file pixel arrays stacked row-wise and the
        matching labels concatenated into one 1-D array.

    Raises:
        ValueError: If ``files`` is empty.
    """
    X, y = [], []
    for f in tqdm(files, desc="Processing .nc files"):
        # Inspect only the file name: a parent folder whose name happens to
        # contain "before" must not force label 0 on every file inside it.
        label = 0 if "before" in os.path.basename(f).lower() else 1
        pixels, labels = extract_pixels_and_labels(f, label, max_pixels)
        X.append(pixels)
        y.append(labels)

    if not X:
        # np.vstack([]) would raise a ValueError too, just a less helpful one.
        raise ValueError("No .nc files to process; check the data directory.")
    return np.vstack(X), np.hstack(y)

# Save features and labels to CSV
def save_dataset_to_csv(X, y, output_path=base_dir+"DATA/pixel_dataset.csv"):
    """Write the feature matrix and labels to a CSV file.

    Feature columns are named ``band_1`` ... ``band_N`` by position and the
    labels go into a final ``label`` column. The parent folder of
    ``output_path`` is created when it does not exist yet.

    Args:
        X: 2-D array of features, one row per pixel.
        y: Labels, one per row of ``X``.
        output_path: Destination CSV path. The default is computed once, when
            this function is defined, from the value ``base_dir`` has then.
    """
    out_dir = os.path.dirname(output_path)
    if out_dir:
        # to_csv() does not create missing folders itself.
        os.makedirs(out_dir, exist_ok=True)

    df = pd.DataFrame(X, columns=[f"band_{i+1}" for i in range(X.shape[1])])
    df["label"] = y
    df.to_csv(output_path, index=False)
    print(f"✅ Dataset saved to: {output_path}")

# --- MAIN ---

# Collect the datacubes, sample pixels from each one, and persist the result
# to the default CSV location.
files = get_nc_files(data_dir)
X, y = build_dataset(files, max_pixels=max_pixels_per_file)
save_dataset_to_csv(X, y)
    




 <h1> Read from file</h1>

In [None]:
import pandas as pd

# Load the pixel table written by save_dataset_to_csv() and display it.
df = pd.read_csv(f"{base_dir}DATA/pixel_dataset.csv")
df

In [None]:
# Per-column summary statistics of the loaded pixel table.
df.describe()


# 📈 Band Scatter Plot (B1 vs B12)
Visualizes how two spectral bands (band_1 and band_12) vary across burn labels using a 2D scatter plot.

Try changing the band numbers to see how the correlation between band values changes.

In [None]:
from matplotlib import pyplot as plt

# Change these two column names to compare other bands; the axis labels and
# the title follow automatically.
x_band, y_band = 'band_1', 'band_12'

fig, ax = plt.subplots()
ax.scatter(df[x_band], df[y_band], alpha=0.5, c=df['label'])
ax.set_xlabel(x_band)
ax.set_ylabel(y_band)
ax.set_title(f'{x_band} vs {y_band} (colour = label)')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

 

 
# Lower-triangle scatter matrix of all band pairs, coloured by label.
from matplotlib.colors import ListedColormap

# Create figure with subplots
n_bands = 12
fig, axs = plt.subplots(n_bands, n_bands, figsize=(20, 20))
plt.subplots_adjust(hspace=0.3, wspace=0.3)

# Create custom colormap for burned/unburned
cmap = ListedColormap(['#1f77b4', '#ff7f0e'])  # Blue=unburned, Orange=burned

# Plot each band combination: the column index selects the x variable and the
# row index the y variable, so the labels on the bottom row and on the left
# column identify every panel of their column / row.
for i in range(n_bands):
    for j in range(n_bands):
        ax = axs[i, j]

        # Hide upper triangle and diagonal
        if i <= j:
            ax.remove()
            continue

        x_col = 'band_' + str(j + 1)
        y_col = 'band_' + str(i + 1)

        # vmin/vmax pin label 0 to the first colour and label 1 to the second
        # even if the table happens to contain a single class.
        ax.scatter(df[x_col], df[y_col], c=df['label'], cmap=cmap,
                   vmin=0, vmax=1, alpha=0.6, s=10, edgecolors='none')

        # Axis labels
        if i == n_bands - 1:  # Bottom row
            ax.set_xlabel(x_col)
        if j == 0:  # Left column
            ax.set_ylabel(y_col)

        # Remove ticks for readability
        ax.set_xticks([])
        ax.set_yticks([])

plt.suptitle('Band Relationship Scatter Matrix (Burned vs Unburned)', y=0.92)
plt.show()


# 🌿 NDVI vs 🔥 NBR Scatter
Calculate NDVI (Normalized Difference Vegetation Index) and NBR (Normalized Burn Ratio) indices, then plot them to evaluate their ability to separate burned and unburned pixels.


In [None]:
# CSV columns are numbered by position in the band list used when the dataset
# was built, so band_4 = B04, band_8 = B08, band_9 = B8A and band_12 = B12
# (assuming every file provided all 12 bands -- verify against the data).
# A zero denominator yields inf/NaN rather than an error.
ndvi = (df['band_8'] - df['band_4']) / (df['band_8'] + df['band_4'])
nbr = (df['band_9'] - df['band_12']) / (df['band_9'] + df['band_12'])

# `cmap` is the two-colour map defined in the scatter-matrix cell above.
fig, ax = plt.subplots()
ax.scatter(ndvi, nbr, c=df['label'], cmap=cmap, alpha=0.6, s=10, edgecolors='none')
ax.set_xlabel('NDVI')
ax.set_ylabel('NBR')
ax.set_title('NDVI vs NBR (colour = label)')
plt.show()

# 📉 NDVI and NBR Distributions
Plots histograms of NDVI and NBR values separately for burned and unburned classes to visualize statistical separation.


In [None]:
# Restrict both indices to [-1, 1] so outliers do not stretch the histograms.
ndvi = np.clip(ndvi, -1, 1)
nbr = np.clip(nbr, -1, 1)
labels = df['label']

# Split each index by class (1 = burned, 0 = unburned).
is_burned = labels == 1
is_unburned = labels == 0
ndvi_burned, ndvi_unburned = ndvi[is_burned], ndvi[is_unburned]
nbr_burned, nbr_unburned = nbr[is_burned], nbr[is_unburned]

# One panel per index, drawn by the same loop body.
fig, axs = plt.subplots(1, 2, figsize=(14, 6))

bins = 50

panels = [
    ('NDVI', ndvi_unburned, ndvi_burned),
    ('NBR', nbr_unburned, nbr_burned),
]
for ax, (index_name, unburned_values, burned_values) in zip(axs, panels):
    ax.hist(unburned_values, bins=bins, alpha=0.6, color='green', label='Unburned', density=True)
    ax.hist(burned_values, bins=bins, alpha=0.6, color='red', label='Burned', density=True)
    ax.set_title(f'{index_name} Distribution by Burn Status')
    ax.set_xlabel(index_name)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True)

plt.suptitle('Distributions of NDVI and NBR for Burned vs Unburned Pixels', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

<h1>Create Dataset For classification</h1>

# 🧹 Reload and Clean Dataset
- Reloads pixel data.
- Removes rows with any negative values to clean noise or invalid pixels.


In [None]:
import pandas as pd
# Read the saved pixel table again so this section starts from the file.
pixel_table = pd.read_csv(base_dir + "DATA/pixel_dataset.csv")

# Keep only the rows in which every column is non-negative.
row_is_clean = pixel_table.ge(0).all(axis=1)
df = pixel_table.loc[row_is_clean]

df.describe()

# 🧪 Prepare Train/Test Sets
- Splits data into features (`X`) and labels (`y`)
- Applies stratified train/test split
- Normalizes features using `StandardScaler` to prepare for model training


In [None]:

from sklearn.model_selection import train_test_split



# Labels come from the "label" column; every other column is a feature.
y = df["label"].to_numpy()
X = df.drop(columns="label").to_numpy()

# Hold out 20% of the rows, stratified by label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


print("✅ Data loaded and split:")
print(f"  Train shape: {X_train.shape}")
print(f"  Test shape: {X_test.shape}")


<h2> Logistic Regression</h2>

## 🔍 Logistic Regression
Evaluates a baseline linear classifier for binary classification of burned vs. unburned pixels.
# 📈 Train & Evaluate Logistic Regression
- Trains a logistic regression model on the training set.
- Outputs accuracy, confusion matrix, and classification report on test data.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Uses X_train / X_test / y_train / y_test from the train/test preparation
# cell (the feature arrays were standardised there).

# Fit the baseline linear model
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = lr.predict(X_test)

# Compute the metrics first, then report them
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)

print("Logistic Regression Evaluation:")
print("Accuracy:", lr_accuracy)
print("\nClassification Report:\n", lr_report)
print("Confusion Matrix:\n", lr_confusion)


# 🤖 Define Multiple Classification Models
Initializes several popular classifiers for comparison:
- Random Forest
- Decision Tree
- K-Nearest Neighbors
- Naive Bayes


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by the display name printed during evaluation.
models = dict([
    ("Random Forest", RandomForestClassifier(n_estimators=10, random_state=42)),
    ("Decision Tree Classifier", DecisionTreeClassifier(random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Naive Bayes", GaussianNB()),
])


# 🧪 Train & Evaluate Models
Fits each model on the training data and prints accuracy on the test set.
(*Classification report optionally included but commented out.*)


In [None]:
from sklearn.metrics import classification_report

# Fit every candidate on the training split and report its test accuracy.
for name, model in models.items():
    print(f"\nTraining and evaluating: {name}")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", test_accuracy)
    #print(classification_report(y_test, y_pred))


# 🔍 Hyperparameter Tuning with GridSearchCV
- Defines a parameter grid for `RandomForestClassifier`.
- Runs 5-fold cross-validation to find the best combination.
- Evaluates the best model on the test set.


In [None]:
from sklearn.model_selection import GridSearchCV

# Define model and parameter grid
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [10, 80],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}

# Setup GridSearch: 5-fold cross-validation over every grid combination,
# scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit on training data
grid_search.fit(X_train, y_train)

print("Best RF params:", grid_search.best_params_)
print("Best RF CV accuracy:", grid_search.best_score_)

# Use best estimator to predict test set. The accuracy must be computed from
# these predictions, not from the `y_pred` left over by an earlier cell.
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


<h2>Stacking Classifier</h2>

## 🧠 Stacking Classifier
Introduces an ensemble model that combines several base classifiers into a stronger meta-model.


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
# Base learners whose predictions feed the meta-classifier
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
svm_base = LinearSVC(random_state=42)
knn_base = KNeighborsClassifier(n_neighbors=5)
estimators = [('rf', rf_base), ('svm', svm_base), ('knn', knn_base)]

# Meta-classifier combining the base learners' outputs
meta_clf = LogisticRegression(max_iter=1000, random_state=42)

# passthrough=True also hands the original features to the meta-classifier
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_clf,
    passthrough=True,
    cv=5,
    n_jobs=-1,
)

# Fit the whole ensemble on the training split
stacking_clf.fit(X_train, y_train)

# Report performance on the held-out split
y_pred_stack = stacking_clf.predict(X_test)
stack_report = classification_report(y_test, y_pred_stack)
print("Stacking Classifier Performance:\n", stack_report)


<h2> Use model to predict </h2>

## 🗺️ Use Model to Predict on New Image
Describes application of trained model to classify all pixels in a new `.nc` image file.


In [None]:
import xarray as xr
import numpy as np
import rasterio
from rasterio.transform import from_origin
import joblib  # for loading trained model
import os

def classify_nc_to_tiff(nc_file_path, model, scaler, output_tiff_path):
    """Classify every pixel of a NetCDF image and save the map as a GeoTIFF.

    The Sentinel-2 bands listed in ``desired_bands`` that exist in the file
    are stacked per pixel (first time step if a ``t`` dimension is present),
    scaled with ``scaler`` and classified with ``model``. Pixels with a NaN in
    any band are not classified and are written as ``-1``, which is also
    declared as the raster's nodata value.

    Args:
        nc_file_path: Path of the ``.nc`` file to classify.
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        output_tiff_path: Destination path of the single-band int16 GeoTIFF.
    """
    nodata_value = -1

    # Select required bands
    desired_bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06',
                     'B07', 'B08', 'B8A', 'B09', 'B11', 'B12']

    # Read everything needed while the file is open; the context manager then
    # releases the file handle.
    with xr.open_dataset(nc_file_path) as ds:
        band_ds = ds[[band for band in desired_bands if band in ds.data_vars]]

        if "t" in band_ds.dims:
            band_ds = band_ds.isel(t=0)

        # Convert to DataArray with the band axis last
        da = band_ds.to_array().transpose("y", "x", "variable")
        height, width = da.shape[0], da.shape[1]

        # Reshape to (n_pixels, n_bands)
        pixels = da.values.reshape(-1, da.shape[2])

        # Geo-referencing from the first two x / y coordinate values.
        # NOTE(review): x[0] / y[0] are used directly as the raster origin; if
        # they are pixel-centre coordinates the output is shifted by half a
        # pixel -- confirm against the data source. Also assumes at least two
        # coordinates per axis and y decreasing along the array.
        transform = from_origin(
            float(band_ds.x[0]), float(band_ds.y[0]),
            float(band_ds.x[1] - band_ds.x[0]),
            float(band_ds.y[0] - band_ds.y[1])
        )
        crs = band_ds.rio.crs if hasattr(band_ds, "rio") else "EPSG:4326"  # fallback CRS

    nan_mask = np.isnan(pixels).any(axis=1)

    # Start from an all-nodata map and fill in predictions for valid pixels.
    full_pred = np.full((pixels.shape[0],), fill_value=nodata_value, dtype=np.int16)
    if not nan_mask.all():
        # Skipped when no pixel is valid: transforming an empty array would fail.
        valid_pixels_scaled = scaler.transform(pixels[~nan_mask])
        full_pred[~nan_mask] = model.predict(valid_pixels_scaled)
    classification_map = full_pred.reshape((height, width))

    # Save as GeoTIFF
    with rasterio.open(
        output_tiff_path,
        'w',
        driver='GTiff',
        height=height,
        width=width,
        count=1,
        dtype=rasterio.int16,
        crs=crs,
        transform=transform,
        nodata=nodata_value,
    ) as dst:
        dst.write(classification_map, 1)

    print(f"✅ Saved classification map to: {output_tiff_path}")


In [None]:
# Classify an "after fire" datacube with the stacking ensemble.
classify_nc_to_tiff(
    nc_file_path=base_dir+"datacubes_2024/fire_224248_after.nc",
    model=stacking_clf,
    scaler=scaler,
    output_tiff_path=base_dir+"classified_map.tif",
)


In [None]:
# Classify a "before fire" datacube with the stacking ensemble.
classify_nc_to_tiff(
    nc_file_path=base_dir+"datacubes_2024/fire_226116_before.nc",
    model=stacking_clf,
    scaler=scaler,
    output_tiff_path=base_dir+"classified_map_bef.tif",
)


# ✅ Classify and Save New Image
Uses the trained stacking classifier to predict land cover (burned/unburned) from a new `.nc` file and saves the result as a `.tif` image.


In [None]:
import rasterio
import matplotlib.pyplot as plt
with rasterio.open(base_dir+"classified_map_bef.tif") as src:
    image = src.read(1)  # Read the first band

# The raster holds class codes written by classify_nc_to_tiff(): -1 marks
# pixels that were skipped, other values are the model's predicted labels.
fig, ax = plt.subplots()
im = ax.imshow(image, cmap='viridis')
ax.set_title("Predicted Burn Classification")
fig.colorbar(im, ax=ax, label="Predicted class (-1 = no data)")
ax.axis('off')
plt.show()

<h2>Lazy Classifier</h2>

In [None]:

#%pip install lazypredict

from lazypredict.Supervised import LazyClassifier

# Initialize LazyClassifier
# Consider reducing predictions per model and setting ignore_warnings=True
# for faster execution on potentially smaller datasets
lazy_clf = LazyClassifier(verbose=0, ignore_warnings=False, custom_metric=None)

# Fit the models. X_train / X_test were already standardised in the train/test
# preparation cell; no separate *_scaled variables exist in this notebook.
# NOTE: this rebinds `models` (earlier a dict of classifiers) to the first
# value returned by LazyClassifier.fit().
models, predictions = lazy_clf.fit(X_train, X_test, y_train, y_test)

print("\n--- LazyClassifier Results ---")
models