In [None]:
# Import necessary libraries
import pandas as pd               # Data manipulation and analysis
import numpy as np                # Numerical operations on arrays
import matplotlib.pyplot as plt   # Plotting library

from matplotlib.lines import Line2D                        # Custom legend creation
from tslearn.metrics import cdist_dtw                      # Compute DTW distances
from sktime.clustering.k_medoids import TimeSeriesKMedoids # K-medoids clustering for time series
from kneed import KneeLocator                              # Detect elbow point in curves

In [None]:
#--- Data import ---
# Load CSV containing Sentinel-2 indices
data = pd.read_csv('path/to/your/data.csv', delimiter=',')

#--- Data exploration ---
# Count unique polygons/areas
print('Number of areas/polygons:', data['ID'].nunique())

#--- Date extraction and conversion ---
# The acquisition date is read from the start of 'system:index'.
# NOTE(review): assumes the index starts with an 8-digit YYYYMMDD stamp followed by 'T'
# (the usual Sentinel-2 naming, e.g. '20230101T101411_...') — confirm against your export.
# Slicing 8 characters (not 9) leaves the 'T' separator out of the date string, and the
# explicit format makes parsing strict instead of depending on lenient fallback parsing.
data = data.assign(
    date=pd.to_datetime(data['system:index'].str[:8], format='%Y%m%d')
)

In [3]:
#--- Filtering ---
# Keep only rows with NDSI below 0.4; rows at or above that threshold
# (likely clouds or snow) are dropped.
ndsi_below_threshold = data['NDSI'].lt(0.4)
filtered_data = data.loc[ndsi_below_threshold]

In [None]:
#--- Prepare 3D ndarray for clustering ---
# Axis 0 of the array: one entry per polygon ID (in order of first appearance).
unique_ids = filtered_data['ID'].unique()
# Axis 1 of the array: one entry per acquisition date. The dates are sorted so the
# time axis is chronological regardless of the row order in the CSV; interpolation,
# DTW and the line plots all read this axis as a time sequence. Wrapping in pd.Index
# keeps the elements as pandas Timestamps when iterated.
unique_dates = pd.Index(filtered_data['date'].unique()).sort_values()
features = ['NDMI']  # List of features to cluster on

n_samples = len(unique_ids)
seq_length = len(unique_dates)
n_features = len(features)

In [16]:
# Initialize the output ndarray (pre-filled with NaN) and inspect its dimensions
n_samples = len(unique_ids)
seq_length = len(unique_dates)
n_features = len(features)
data_ndarray = np.full((n_samples, seq_length, n_features), np.nan)
# Only the last expression of a cell is displayed, so a bare `seq_length` line
# would be evaluated and discarded; print it explicitly instead.
print('seq_length:', seq_length)
n_samples

214

In [17]:
# Allocate the (sample, time step, feature) array; NaN marks entries with no observation
array_shape = (n_samples, seq_length, n_features)
data_ndarray = np.full(array_shape, np.nan)

# Lookup tables: polygon ID -> position on axis 0, date -> position on axis 1
id_to_idx = dict(zip(unique_ids, range(len(unique_ids))))
date_to_idx = dict(zip(unique_dates, range(len(unique_dates))))

In [18]:
# Populate ndarray with observed feature values.
# Vectorized replacement for a row-by-row iterrows() loop: every observation is
# written with one fancy-indexed assignment.
# If an (ID, date) pair occurs more than once, only its last row is kept. A sequential
# loop would end with that value too, whereas NumPy does not guarantee the write order
# for repeated indices, so the duplicates are removed explicitly first.
observed = filtered_data.drop_duplicates(subset=['ID', 'date'], keep='last')
sample_idx = observed['ID'].map(id_to_idx).to_numpy()    # position on axis 0
time_idx = observed['date'].map(date_to_idx).to_numpy()  # position on axis 1
# Left side has shape (n_rows, n_features), matching the selected feature columns.
data_ndarray[sample_idx, time_idx, :] = observed[features].to_numpy(dtype=float)

In [None]:
#--- Interpolate missing data ---
def interpolate_ts(ts):
    """Fill the NaN gaps of one series by linear interpolation.

    Parameters
    ----------
    ts : array-like
        Values of a single series; NaN marks a missing entry.

    Returns
    -------
    numpy.ndarray
        The series after linear interpolation applied in both directions
        (``limit_direction='both'``).
    """
    column = pd.DataFrame(ts, columns=['val'])['val']
    return column.interpolate(method='linear', limit_direction='both').to_numpy()

# Interpolate every (sample, feature) series along the time axis, writing the result back in place
for sample_pos, feature_pos in np.ndindex(n_samples, n_features):
    series = data_ndarray[sample_pos, :, feature_pos]
    data_ndarray[sample_pos, :, feature_pos] = interpolate_ts(series)

# Report how many NaNs remain and the final array shape
remaining_nans = np.isnan(data_ndarray).sum()
print('NaN count after interpolation:', remaining_nans)
print('Data shape:', data_ndarray.shape)

In [None]:
#--- Determine optimal number of clusters via elbow method ---
# Axis layout: data_ndarray is (n_samples, seq_length, n_features), which is the layout
# tslearn's cdist_dtw works with. sktime documents 3D numpy input as
# (n_instances, n_dimensions, series_length), so the last two axes are swapped before
# fitting. Without the swap, every date would be read as a separate dimension of a
# series with n_features time points, and DTW could not warp along time.
X_sktime = np.ascontiguousarray(np.transpose(data_ndarray, (0, 2, 1)))

Ks = range(2, 25)
distortions = []
inertias = []
for k in Ks:
    model = TimeSeriesKMedoids(
        n_clusters=k,
        metric="dtw",
        random_state=42,
        n_init=750,
        max_iter=500,
        tol=1e-6,
        verbose=True,
        init_algorithm="kmeans++"
    )
    labels = model.fit_predict(X_sktime)
    # The fitted centers follow the layout of the training data; swap the axes back
    # to (n_clusters, seq_length, n_features) before handing them to tslearn.
    centers = np.transpose(model.cluster_centers_, (0, 2, 1))
    # Calculate distortion: mean squared DTW to closest centroid
    dists = cdist_dtw(data_ndarray, centers)
    distortions.append(np.mean(np.min(dists, axis=1)**2))
    # Record inertia from model
    inertias.append(model.inertia_)

In [None]:
# Draw both elbow criteria (distortion and model inertia) against k on one axis
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 5))
elbow_ax.plot(Ks, distortions, 'bx-', label='Distortion')
elbow_ax.plot(Ks, inertias, 'ro-', label='Inertia')
elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Distortion / Inertia')
elbow_ax.set_title('Elbow Method for Time Series Clustering')
elbow_ax.legend()
plt.show()

In [None]:
# Find elbow point automatically
knee = KneeLocator(Ks, distortions, curve='convex', direction='decreasing').knee
# KneeLocator reports None when it cannot find an elbow. Stop here with a clear
# message instead of failing later in axvline / n_clusters / subplots.
if knee is None:
    raise ValueError(
        'KneeLocator found no elbow in the distortion curve; '
        'set the number of clusters (knee) manually.'
    )
knee = int(knee)  # plain Python int for n_clusters and subplot counts
print(f'Optimal number of clusters: {knee}')

# Plot with elbow point marked
plt.figure(figsize=(10,5))
plt.plot(Ks, distortions, 'bx-')
plt.axvline(x=knee, color='r', linestyle='--', label=f'Elbow: {knee}')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Distortion')
plt.title('Elbow Method with Detected Point')
plt.legend()
plt.show()


In [None]:
#--- Final clustering with chosen k ---
final_model = TimeSeriesKMedoids(
    n_clusters=knee,
    metric="dtw",
    random_state=42,
    n_init=750,
    max_iter=500,
    tol=1e-6,
    verbose=True,
    init_algorithm="kmeans++"
)
# data_ndarray is (n_samples, seq_length, n_features); sktime documents 3D numpy input
# as (n_instances, n_dimensions, series_length), so the last two axes are swapped for
# fitting. The returned labels are still one per sample, in the order of axis 0.
clusters = final_model.fit_predict(
    np.ascontiguousarray(np.transpose(data_ndarray, (0, 2, 1)))
)

In [None]:
#--- Visualize clusters ---
# Pick one color per feature from the tab10 colormap
cmap = plt.get_cmap('tab10')
feature_colors = [cmap(position / n_features) for position in range(n_features)]

# Vertical stack of subplots, one per cluster, sharing the date axis
fig, axes = plt.subplots(nrows=knee, ncols=1, figsize=(10, 2 * knee), sharex=True)

for cid, ax in enumerate(axes):
    # Rows of data_ndarray whose label equals this cluster id
    member_rows = np.flatnonzero(clusters == cid)
    print(f'Cluster {cid+1}: {len(member_rows)} series')

    # One semi-transparent line per member series and feature
    for row in member_rows:
        for feature_pos, line_color in enumerate(feature_colors):
            ax.plot(
                unique_dates,
                data_ndarray[row, :, feature_pos],
                color=line_color,
                alpha=0.5,
                linewidth=1
            )

    ax.set_title(f'Cluster {cid+1}')
    ax.grid(True)

    # Legend built from proxy lines so each feature appears once, not once per series
    ax.legend(
        handles=[
            Line2D([0], [0], color=line_color, lw=2, label=feature_name)
            for line_color, feature_name in zip(feature_colors, features)
        ],
        loc='upper right'
    )

# Axis labels shared by the whole figure
axes[-1].set_xlabel('Date')
fig.text(0.06, 0.5, 'Value', va='center', rotation='vertical')

plt.tight_layout()
plt.show()

In [None]:
#--- Export cluster assignments ---
# Pair each polygon ID with its cluster label (both follow the order of unique_ids)
cluster_map = pd.DataFrame({'ID': unique_ids, 'Cluster': clusters})
# Distinct (ID, phase) combinations present in the filtered data
id_phase_pairs = filtered_data[['ID', 'phase']].drop_duplicates()
# Attach the cluster label to every (ID, phase) pair and write the table to disk
export_data = pd.merge(id_phase_pairs, cluster_map, on='ID')
export_data.to_csv('clusters.csv', index=False)
print('Export completed: clusters.csv')