In [14]:
# Import necessary libraries
import pandas as pd                          # Data loading and manipulation
import numpy as np                           # Numerical operations
import matplotlib.pyplot as plt              # Plotting
import seaborn as sns                        # Advanced plotting (barplots)
import pickle                                # Model serialization

from sklearn.model_selection import train_test_split     # Train/test splitting
from sklearn.metrics import accuracy_score, f1_score     # Performance metrics
from sktime.classification.kernel_based import TimeSeriesSVC  # SVM for time series
from sktime.dists_kernels import AggrDist           # Aggregated distance kernel
from sklearn.gaussian_process.kernels import RBF         # Radial basis function kernel


In [None]:
#--- Data import and merging ---
# Load Sentinel-2 index data
loaded_data = pd.read_csv('path/to/your/data.csv', delimiter=',')

# Derive a 'date' column from the leading characters of 'system:index'.
# NOTE(review): the slice keeps 9 characters, while a YYYYMMDD date is 8 —
# confirm the index format so no trailing separator reaches the date parser.
loaded_data = loaded_data.assign(date=(loaded_data['system:index'].str[0:9]))

# Load precomputed cluster assignments and merge
# NOTE(review): both reads use the same placeholder path — point each one at
# its own file before running.
data_cluster=pd.read_csv('path/to/your/data.csv', delimiter=',')


# Inner merge on the shared 'ID' column
merged_data = loaded_data.merge(data_cluster, on='ID', how='inner')

# Convert date to datetime format.
# The values are taken from merged_data itself: the merge produces a new row
# index, so assigning a Series built from loaded_data would align on unrelated
# index labels and could attach dates to the wrong rows (or leave NaT) whenever
# the inner join drops or duplicates rows.
# If data_cluster also has a 'date' column, pandas suffixes the left-hand one
# as 'date_x'; fall back to it in that case.
date_source = 'date' if 'date' in merged_data.columns else 'date_x'
merged_data['date'] = pd.to_datetime(merged_data[date_source])


In [None]:
# Remove cloudy/snowy observations: keep only rows whose NDSI is below 0.4
# (rows with a missing NDSI compare as False and are dropped as well).
filtered_data = merged_data.loc[merged_data['NDSI'].lt(0.4)]

In [17]:
#--- Feature configuration ---
# Univariate features to evaluate (each one is tested on its own)
features = ['NDVI', 'EVI', 'NBR', 'NDMI','NDRE', 'MSAVI', 'FAPAR', 'LAI', 'TCW', 'WNDII']

# Define multivariate feature combinations.
# Every combination is keyed by its member names joined with '_', so a label
# can never drift from the columns it stands for (the hand-written key of the
# seven-feature set used to list MSAVI twice).
multivariate_combinations = {
    '_'.join(combo): combo
    for combo in (
        ['EVI', 'NDVI', 'MSAVI', 'NDMI', 'TCW'],
        ['NDMI', 'WNDII', 'FAPAR', 'LAI', 'MSAVI', 'NDRE', 'TCW'],
        ['NDMI', 'NDVI', 'TCW'],
        ['NDVI', 'EVI', 'FAPAR'],
        ['WNDII', 'NDMI'],
    )
}

In [None]:
#--- Model training and evaluation ---
# One row per distinct (ID, Cluster_y) pair: the split is done on series IDs,
# not on individual observations.
df = filtered_data.loc[:, ['ID', 'Cluster_y']].drop_duplicates()

# 70/30 split, stratified on 'Cluster_y', with a fixed seed for reproducibility
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['Cluster_y'],
    random_state=42,
)

# ID/label tables for each side of the split, ordered by ID with a fresh index
train_output, test_output = (
    part[['ID', 'Cluster_y']].sort_values('ID').reset_index(drop=True)
    for part in (train_df, test_df)
)

# Display both tables as the cell output
train_output, test_output

In [None]:
# ---- Define C range ----
# Ten log-spaced candidates between 10**1 and 10**3, i.e. 10 to 1000
C_values = np.logspace(1, 3, num=10)

# ---- Storage ----
# One dict per tested C value; turned into a DataFrame after the loop
results = []

# ---- Use ALL features for tuning ----
selecting = [*features, 'ID', 'date', 'Cluster_y']
selected = filtered_data[selecting]

# Split data: observations follow the train/test assignment of their ID
train = selected[selected['ID'].isin(train_output['ID'])]
test = selected[selected['ID'].isin(test_output['ID'])]

# Labels: one 'Cluster_y' row per ID, ordered and indexed by ID
y_train, y_test = (
    part[['Cluster_y', 'ID']].sort_values('ID').drop_duplicates(subset=['ID']).set_index("ID")
    for part in (train, test)
)

# Inputs: feature columns indexed by (ID, date) and sorted by that index
X_train, X_test = (
    part.drop(columns=['Cluster_y']).set_index(["ID", "date"]).sort_index()
    for part in (train, test)
)

# ---- Loop over C ----
# NOTE(review): each C is scored on the test split and the best one is picked
# from those scores, so the test split also acts as the tuning set — consider
# a separate validation split.
for C_val in C_values:
    print(f"\nTesting C = {C_val:.4f}")

    # A fresh kernel and classifier for every candidate
    kernel = AggrDist(RBF())
    clf = TimeSeriesSVC(kernel=kernel, C=C_val)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Accuracy: {acc:.3f}, F1: {f1:.3f}")

    results.append(dict(C=C_val, Accuracy=acc, F1_score=f1))

# ---- Results DataFrame ----
results_df = pd.DataFrame(results)
# Row with the highest macro F1 (the first one if several tie)
best_index = results_df['F1_score'].idxmax()
best_row = results_df.loc[best_index]

print("\nBest C value found:")
print(best_row)

# Plot F1 vs C on a logarithmic x-axis
fig, ax = plt.subplots(figsize=(8, 5))
ax.semilogx(results_df['C'], results_df['F1_score'], marker='o')
ax.set_xlabel('C value (log scale)')
ax.set_ylabel('F1 score')
ax.set_title('F1 Score vs C Value')
ax.grid(True)
plt.show()

In [None]:
#--- Evaluate univariate and multivariate feature sets ---
# C used for every model in this cell. To six decimals it matches
# np.logspace(1, 3, num=10)[6], one of the grid points of the C sweep.
BEST_C = 215.443469

# Classifier (refitted from scratch for every feature set)
mean_gaussian_tskernel = AggrDist(RBF())
clf = TimeSeriesSVC(kernel=mean_gaussian_tskernel, C=BEST_C)


def _labels_per_id(frame):
    """Return one 'Cluster_y' row per ID, ordered and indexed by ID."""
    return (
        frame[['Cluster_y', 'ID']]
        .sort_values('ID')
        .drop_duplicates(subset=['ID'])
        .set_index("ID")
    )


def _feature_panel(frame):
    """Return the feature columns indexed by (ID, date), sorted by that index."""
    return frame.drop(['Cluster_y'], axis=1).set_index(["ID", "date"]).sort_index()


def evaluate_feature_set(feature_columns, classifier):
    """Fit ``classifier`` on the training IDs and score it on the test IDs.

    Parameters
    ----------
    feature_columns : list of str
        Columns of ``filtered_data`` used as model inputs.
    classifier : estimator with ``fit`` / ``predict``
        Refitted in place on the training data.

    Returns
    -------
    tuple of (float, float)
        Accuracy and macro-averaged F1 on the test split.
    """
    selected = filtered_data[list(feature_columns) + ['ID', 'date', 'Cluster_y']]

    # Observations follow the train/test assignment of their ID
    train = selected[selected['ID'].isin(train_output['ID'])]
    test = selected[selected['ID'].isin(test_output['ID'])]

    y_test = _labels_per_id(test)
    classifier.fit(_feature_panel(train), _labels_per_id(train))
    y_pred = classifier.predict(_feature_panel(test))

    return accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='macro')


accuracies = []
f1s = []
tested_features = features + list(multivariate_combinations.keys())  # Combine univariate + multivariate names

# Univariate sets first, then the multivariate combinations, so the score
# lists line up with tested_features.
feature_sets = [('univariate', name, [name]) for name in features]
feature_sets += [
    ('multivariate', name, columns)
    for name, columns in multivariate_combinations.items()
]

for set_kind, set_name, feature_columns in feature_sets:
    print(f"\nTesting {set_kind}: {set_name}")

    acc, f1 = evaluate_feature_set(feature_columns, clf)

    print(f"Accuracy: {acc:.3f}")
    print(f"F1 score: {f1:.3f}")

    accuracies.append(acc)
    f1s.append(f1)

# Every tested feature set must have exactly one accuracy and one F1 entry
assert len(accuracies) == len(f1s) == len(tested_features)

In [None]:
# Summary: one row per evaluated feature set, univariate names first and the
# multivariate combination names after them (the order the scores were appended in).
final_features = [*features, *multivariate_combinations]
results_df = pd.DataFrame(
    dict(
        Feature_Set=final_features,
        Accuracy=accuracies,
        F1_score=f1s,
    )
)

print("\nFinal Results:")
# Bare last expression so the frame is rendered as the cell output
results_df


In [None]:
# --- Plot F1 scores ---
# One bar per feature set, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=results_df, x='Feature_Set', y='F1_score', ax=ax)

# Write each bar's value (two decimals) just above the bar
for bar_container in ax.containers:
    ax.bar_label(bar_container, fmt='%.2f', label_type='edge', padding=3, fontsize=10)

ax.set_title('F1 Score', fontsize=14)
ax.set_ylabel('F1 Score')
ax.set_xlabel('Feature')
# Slanted tick labels so the long combination names stay readable
plt.xticks(rotation=45, ha='right')
ax.set_ylim(0, 1)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
# --- Train the model that will be saved ---
# The combination to export is chosen by name here (hard-coded, not read from
# the results table); fail early if it is not a configured combination.
combination_name = "NDVI_EVI_FAPAR"
if combination_name not in multivariate_combinations:
    raise ValueError(f"Combination {combination_name} was not found.")
combination_features = multivariate_combinations[combination_name]

print("Features:", combination_features)

# Prepare the data for the model
selecting_train = [*combination_features, 'ID', 'date', 'Cluster_y']
selected_train = filtered_data[selecting_train]

# Define train/test split: observations follow the assignment of their ID
train = selected_train[selected_train['ID'].isin(train_output['ID'])]
test = selected_train[selected_train['ID'].isin(test_output['ID'])]

# Labels: one 'Cluster_y' row per ID, ordered and indexed by ID
y_train, y_test = (
    part[['Cluster_y', 'ID']].sort_values('ID').drop_duplicates(subset=['ID']).set_index("ID")
    for part in (train, test)
)

# Inputs: feature columns indexed by (ID, date) and sorted by that index
X_train_new, X_test_new = (
    part.drop(columns=['Cluster_y']).set_index(["ID", "date"]).sort_index()
    for part in (train, test)
)

# Train the model with the chosen C value (fitted on the training split only)
clf_new = TimeSeriesSVC(kernel=AggrDist(RBF()), C=215.443469)
clf_new.fit(X_train_new, y_train)



In [None]:
# Export the model and features
# Bundle the fitted classifier with the feature names it was trained on and the
# full combination table, then pickle the bundle to 'model.pkl'.
model_data = dict(
    model=clf_new,
    features=combination_features,
    multivariate_combinations=multivariate_combinations,
)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model_data, model_file)