In [4]:
# Standard library imports
import multiprocessing
from pathlib import Path

# Third party imports
import bottleneck as bn
import geopandas as gpd
import joblib
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
from numpy.typing import NDArray
import pandas as pd
import plotly.express as px

from pyclustering.cluster.kmeans import kmeans, kmeans_visualizer
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.utils.metric import type_metric, distance_metric

import rasterio as rio
from scipy.spatial.distance import mahalanobis

# Local imports
from clustering import label_scenes, make_cluster_training_set


# Constants
# NOTE(review): absolute, machine-specific path; adjust it to your own checkout
# before running the notebook on another machine.
DATA_DIR = Path(r"C:\Users\Peter\gh\rasmussen-705.603\data\FinalProject")

# Cleanup / outlier detection params
WINDOW = 10
START_INDEX = 10
N_STDS = 1.75

# Parallelization constant: leave one core free, but never drop below one
# worker. On a single-core host `cpu_count() - 1` would be 0, which joblib
# rejects as a worker count.
CORES = max(1, multiprocessing.cpu_count() - 1)

# KMEANS params
K = 4  # number of clusters
region_sample_size = 5000  # forwarded to make_cluster_training_set

# BOCD model params
BOCD_PARAMS = dict(
    hazard=1/100,
    mean0=1,
    var0=2,
    varx=1,
)

# Define directories (everything hangs off DATA_DIR)
raw_dir = DATA_DIR / "raw"
interim_dir = DATA_DIR / "interim"
processed_dir = DATA_DIR / "processed"
cogs_dir = interim_dir / "cogs"
autoencoder_dir = processed_dir / "autoencoder"
models_dir = interim_dir / "models"
scores_dir = interim_dir / "scores"
encoded_dir = interim_dir / "encoded"
meta_dir = interim_dir / "meta"
cluster_dir = interim_dir / "cluster"

change_maps_dir = interim_dir / "change_maps"

# Only the output folders written by this notebook are created up front.
cluster_dir.mkdir(exist_ok=True, parents=True)
change_maps_dir.mkdir(exist_ok=True, parents=True)

# Worker count handed to label_scenes further down. Unlike CORES this uses
# every available core.
# NOTE(review): consider consolidating `cores` and `CORES` into one setting.
cores = multiprocessing.cpu_count()

## Train KMeans model

In [2]:
# Fixed seed for the k-means++ initialisation so that the fitted centres (and
# therefore the cluster ids assigned downstream) are repeatable across
# Restart & Run All.
# NOTE(review): if make_cluster_training_set samples randomly it needs its own
# seed for full reproducibility; confirm in clustering.py.
KMEANS_SEED = 42

print("[Cluster]: Load regions.")
regions = gpd.read_file(raw_dir / "regions.geojson").set_index("event_key")

print("[Cluster]: Prepare cluster training set.")
X = make_cluster_training_set(regions, encoded_dir, region_sample_size=region_sample_size)

print("[Cluster]: Train KMeans.")
# Inverse covariance of the columns of X (np.cov treats each row of X.T as one
# variable); this is the matrix the Mahalanobis distance requires.
# Assumes X is laid out as (n_samples, n_features); TODO confirm.
VI = np.linalg.inv(np.cov(X.T))


def _mahalanobis_distance(x, y):
    """Return the Mahalanobis distance between `x` and `y` under `VI`."""
    return mahalanobis(x, y, VI)


metric = distance_metric(type_metric.USER_DEFINED, func=_mahalanobis_distance)
initial_centers = kmeans_plusplus_initializer(X, K, random_state=KMEANS_SEED).initialize()
kmeans_instance = kmeans(X, initial_centers, metric=metric)
kmeans_instance.process()
clusters = kmeans_instance.get_clusters()
final_centers = kmeans_instance.get_centers()

[Cluster]: Load regions.
[Cluster]: Train KMeans model.
[Cluster]: Processing af-kharkamar-2022...
[Cluster]: Processing gm-kanifing-2022...
[Cluster]: Processing in-cianjur-2022...
[Cluster]: Processing mg-farafangana-2022...
[Cluster]: Processing tr-islahiye-2023...
[Cluster]: Processing us-baltimore-9999...


## Infer using trained KMeans model

In [5]:
# Run inference with the fitted model. Arguments are passed positionally in
# the order label_scenes is called with in this notebook; the logged output
# shows it labelling pixels and saving cluster labels per region.
label_scenes(
    regions,
    cluster_dir,
    encoded_dir,
    cores,
    kmeans_instance,
    cogs_dir,
)

[Cluster][af-kharkamar-2022]: Load region dataset and metadata
[Cluster][af-kharkamar-2022]: Prep inputs.
[Cluster][af-kharkamar-2022]: Labeling pixels...
[Cluster][af-kharkamar-2022]: Saving cluster labels...
[Cluster][gm-kanifing-2022]: Load region dataset and metadata
[Cluster][gm-kanifing-2022]: Prep inputs.
[Cluster][gm-kanifing-2022]: Labeling pixels...
[Cluster][gm-kanifing-2022]: Saving cluster labels...
[Cluster][in-cianjur-2022]: Load region dataset and metadata
[Cluster][in-cianjur-2022]: Prep inputs.
[Cluster][in-cianjur-2022]: Labeling pixels...
[Cluster][in-cianjur-2022]: Saving cluster labels...
[Cluster][mg-farafangana-2022]: Load region dataset and metadata
[Cluster][mg-farafangana-2022]: Prep inputs.
[Cluster][mg-farafangana-2022]: Labeling pixels...
[Cluster][mg-farafangana-2022]: Saving cluster labels...
[Cluster][tr-islahiye-2023]: Load region dataset and metadata
[Cluster][tr-islahiye-2023]: Prep inputs.
[Cluster][tr-islahiye-2023]: Labeling pixels...
[Cluster][tr