<img src="https://github.com/nicholasmetherall/digital-earth-pacific-macblue-activities/blob/main/attachments/images/DE_Pacific_banner.JPG?raw=true" width="900"/>

Figure 1.1.a. Jupyter environment + Python notebooks

# Digital Earth Pacific Notebook 1 prepare postcard and load data to csv

The objective of this notebook is to prepare a geomad postcard for your AOI (masking, scaling and loading additional band ratios and spectral indices) and sampling all the datasets into a csv based on your training data geodataframe.

## Step 1.1: Configure the environment

In [13]:
from datetime import datetime
import geopandas as gpd
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [14]:
# Predefined variable for title and version

# Enter your initials
initials = "nm"

# site
site = "tongatapu"

# Date
date = datetime.now()

# Make a clean version string
version = f"{initials}-{site}-{date.strftime('%d%m%Y')}"
print(version)

nm-tongatapu-11122025


### Postcard csv

The objective of this notebook was to train the machine learning model that will allow us to classify an area with land cover classes defined through the training data.

Step 1.2. Input the training data to sample geomad data from the postcard

In [15]:
# joined_df = gpd.read_file(f"training-data/{version}-training.csv")
joined_df = joined_df.astype("float32")
# joined_df
joined_df = gpd.read_file("training-data/nm-tongatapu-11122025_postcard_4-training.csv")

In [21]:
joined_df=joined_df.drop(columns=["y", "x"])

KeyError: "['y', 'x'] not found in axis"

In [23]:
print(len(joined_df.columns))
joined_df.columns

33


Index(['LULC_code', 'nir', 'red', 'blue', 'green', 'emad', 'smad', 'bcmad',
       'nir08', 'nir09', 'swir16', 'swir22', 'coastal', 'rededge1', 'rededge2',
       'rededge3', 'mndwi', 'ndti', 'cai', 'ndvi', 'evi', 'savi', 'ndwi',
       'b_g', 'b_r', 'swir22_swir16', 'mci', 'ndci', 'nbi', 'ndmi', 'bsi',
       'awei', 'tc_wetness'],
      dtype='object')

In [24]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(joined_df, test_size=0.2, random_state=1337)

# The classes are the first column
classes = np.array(training_data)[:, 0]

# The observation data is everything after the second column
observations = np.array(training_data)[:, 1:]

# Create a model...
classifier = RandomForestClassifier(max_depth=4)

# ...and fit it to the data
model = classifier.fit(observations, classes)

In [25]:
# Dynamically create the filename with f-string
file_path = f"models/{version}-test.model"

# Save the model
joblib.dump(model, file_path)

['models/nm-tongatapu-11122025-test.model']

In [29]:
import pandas as pd
test_actual = np.array(test_data)[:, 0]

test_predicted = model.predict(np.array(test_data)[:,1:])


unique_labels = sorted(np.unique(np.concatenate([np.asarray(test_actual), np.asarray(test_predicted)])))


pd.crosstab(test_actual, test_predicted, margins=True)

col_0,1,2,3,4,5,6,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,31,7,1,0,0,0,39
2,5,54,0,3,2,0,64
3,1,2,11,0,0,0,14
4,1,4,0,30,1,0,36
5,0,4,0,9,5,0,18
6,0,0,0,0,0,6,6
All,38,71,12,42,8,6,177


In [30]:
from sklearn.metrics import accuracy_score

accuracy_score(test_actual, test_predicted)

0.7740112994350282

In [31]:
# -- Cohen's kappa (and extras) for the test set --
from sklearn.metrics import cohen_kappa_score, classification_report, confusion_matrix
import numpy as np

# Ensure these variables exist and are aligned
# test_actual = np.array(test_data)[:, 0]      # <- you already had this
# test_predicted = model.predict(np.array(test_data)[:,1:])  # <- you already had this

assert len(test_actual) == len(test_predicted), "y_true and y_pred must have same length"

# If you want to fix label ordering / include labels with zero counts, provide labels=...
unique_labels = np.unique(np.concatenate([np.asarray(test_actual), np.asarray(test_predicted)]))

# Cohen's kappa (unweighted)
kappa = cohen_kappa_score(test_actual, test_predicted, labels=unique_labels)
# Quadratic weighted kappa (useful when class order/ordinality matters)
kappa_quad = cohen_kappa_score(test_actual, test_predicted, labels=unique_labels, weights="quadratic")

print(f"Cohen's kappa: {kappa:.4f}")
print(f"Quadratic weighted kappa: {kappa_quad:.4f}")

# Optional: bootstrap 95% CI for kappa (can be slow if n_boot large)
def bootstrap_kappa(y_true, y_pred, n_boot=1000, seed=0):
    rng = np.random.default_rng(seed)
    ks = []
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n = len(y_true)
    for _ in range(n_boot):
        idx = rng.integers(0, n, n)
        ks.append(cohen_kappa_score(y_true[idx], y_pred[idx], labels=unique_labels))
    ks = np.array(ks)
    return np.mean(ks), np.percentile(ks, 2.5), np.percentile(ks, 97.5)

mean_k, ci_low, ci_high = bootstrap_kappa(test_actual, test_predicted, n_boot=1000, seed=42)
print(f"Bootstrap mean kappa: {mean_k:.4f}, 95% CI [{ci_low:.4f}, {ci_high:.4f}]")

# Print classification report and confusion matrix for context
print("\nClassification report:\n")
print(classification_report(test_actual, test_predicted, labels=unique_labels, zero_division=0))

print("Confusion matrix (rows=true, cols=pred) with labels:", unique_labels)
print(confusion_matrix(test_actual, test_predicted, labels=unique_labels))

Cohen's kappa: 0.6980
Quadratic weighted kappa: 0.8221
Bootstrap mean kappa: 0.6970, 95% CI [0.6131, 0.7785]

Classification report:

              precision    recall  f1-score   support

           1       0.82      0.79      0.81        39
           2       0.76      0.84      0.80        64
           3       0.92      0.79      0.85        14
           4       0.71      0.83      0.77        36
           5       0.62      0.28      0.38        18
           6       1.00      1.00      1.00         6

    accuracy                           0.77       177
   macro avg       0.81      0.76      0.77       177
weighted avg       0.77      0.77      0.76       177

Confusion matrix (rows=true, cols=pred) with labels: ['1' '2' '3' '4' '5' '6']
[[31  7  1  0  0  0]
 [ 5 54  0  3  2  0]
 [ 1  2 11  0  0  0]
 [ 1  4  0 30  1  0]
 [ 0  4  0  9  5  0]
 [ 0  0  0  0  0  6]]
