# Libraries

In [None]:
import warnings
import numpy as np
import pandas as pd
import torch

from torchsom.core import SOM
from torchsom.visualization import SOMVisualizer, VisualizationConfig

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

In [None]:
# Seed torch's global RNG so that runs are repeatable.
random_seed = 42
torch.manual_seed(seed=random_seed)

# Everything in this notebook runs on the CPU. To pick a GPU when one is
# available, use instead:
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# Preprocessing 

In [None]:
# Load the blobs dataset used by the rest of the notebook.
#
# Previously three CSV files were read one after another into the same name,
# so the first two reads were discarded immediately (wasted I/O, and a missing
# unused file would still abort the cell). Only the dataset that was actually
# kept is read now. To experiment with another dataset, swap the path for one
# of:
#   "../data/blobs_300_4_3_1.0.csv"
#   "../data/blobs_5000_50_3_1.0.csv"
blobs_df = pd.read_csv(
    filepath_or_buffer="../data/blobs_5000_4_3_1.0.csv",
)

In [None]:
# Preview the first rows of the loaded dataset.
blobs_df.head()

In [None]:
# Summary statistics for the dataset's columns.
blobs_df.describe()

In [None]:
# Every column except the last one is treated as a feature; the last column
# is split off as the target when the tensors are built.
feature_columns = blobs_df.columns[:-1]
feature_names = feature_columns.tolist()
feature_names

In [None]:
# Dataset dimensions as (rows, columns).
blobs_df.shape

In [None]:
# 1. Build a float32 tensor from the blobs DataFrame and separate it into
#    features (all columns but the last) and integer targets (last column).
# 2. Randomly shuffle the samples.
# 3. Split the shuffled data into training and testing sets.
blobs_torch = torch.tensor(blobs_df.to_numpy(dtype=np.float32), device=device)
all_features, all_targets = blobs_torch[:, :-1], blobs_torch[:, -1].long()

# A single permutation indexes both tensors, so every sample keeps its target.
shuffled_indices = torch.randperm(len(all_features), device=device)
all_features, all_targets = all_features[shuffled_indices], all_targets[shuffled_indices]

# The first `train_ratio` share of the shuffled samples is used for training,
# the remainder for testing.
train_ratio = 0.8
train_count = int(train_ratio * len(all_features))
train_features, train_targets = all_features[:train_count], all_targets[:train_count]
test_features, test_targets = all_features[train_count:], all_targets[train_count:]

print(train_features.shape, test_features.shape)
print(train_targets.shape, test_targets.shape)

# TorchSOM

In [None]:
# Configure the self-organizing map. Keyword arguments are grouped by theme;
# all values are passed through to torchsom's SOM constructor unchanged.
som = SOM(
    # Map layout (x and y are presumably the grid dimensions - confirm in torchsom docs).
    x=25,
    y=15,
    topology="hexagonal",
    num_features=all_features.shape[1],
    # Neighborhood and distance settings.
    sigma=1.45,
    neighborhood_order=3,
    neighborhood_function="gaussian",
    distance_function="euclidean",
    # Training schedule.
    learning_rate=0.95,
    epochs=100,
    batch_size=16,  # alternative: train_features.shape[0]
    lr_decay_function="asymptotic_decay",
    sigma_decay_function="asymptotic_decay",
    # Initialization, device and reproducibility.
    initialization_mode="pca",
    device=device,
    random_seed=random_seed,
)

In [None]:
# Initialize the SOM weights from the training features, using the
# initialization mode stored on the SOM instance.
som.initialize_weights(data=train_features, mode=som.initialization_mode)

In [None]:
# Train on the training features. The two returned values are later plotted
# as the quantization errors (QE) and topographic errors (TE).
QE, TE = som.fit(data=train_features)

In [None]:
visualizer = SOMVisualizer(
    som=som,
    config=VisualizationConfig(save_format="pdf"),
)

# Where the figures are written:
#   results/clustering/blob_<rows>_<columns>_<distinct 'Species' values>/<topology>
# Set save_path to None if you want a direct plot.
# NOTE(review): blobs_df.shape[1] counts every column of the DataFrame, so it
# also includes whichever column holds the targets - confirm this is the
# intended number for the folder name (the data file names look feature-based).
save_path = (
    f"results/clustering/blob_{blobs_df.shape[0]}_{blobs_df.shape[1]}"
    f"_{len(blobs_df['Species'].unique())}/{som.topology}"
)

In [None]:
# Plot the error values returned by som.fit.
visualizer.plot_training_errors(quantization_errors=QE, topographic_errors=TE, save_path=save_path)

In [None]:
# Plot the SOM's distance map; the output goes to save_path.
visualizer.plot_distance_map(save_path=save_path)

In [None]:
# Plot the hit map for the training features.
visualizer.plot_hit_map(data=train_features, save_path=save_path)

In [None]:
# Plot the classification map from the training features and their targets.
visualizer.plot_classification_map(data=train_features, target=train_targets, save_path=save_path)

In [None]:
# Plot the component planes, labelled with the dataset's feature column names.
visualizer.plot_component_planes(component_names=feature_names, save_path=save_path)

# Clustering

In [None]:
# Number of distinct values in the 'Species' column, passed as the cluster count.
expected_cluster_count = len(blobs_df['Species'].unique())

# Cluster the trained SOM. Supported methods noted by the author:
# hdbscan, kmeans, gmm.
# NOTE(review): confirm whether n_clusters has any effect when method="hdbscan".
cluster = som.cluster(
    method="hdbscan",
    n_clusters=expected_cluster_count,
    feature_space="weights",
)

In [None]:
# Plot the clustering result computed by som.cluster.
visualizer.plot_cluster_map(cluster_result=cluster, save_path=save_path)

In [None]:
# Elbow analysis in the "weights" feature space, with max_k set to 10.
visualizer.plot_elbow_analysis(max_k=10, feature_space="weights", save_path=save_path)

In [None]:
# Run every clustering method against every feature space. Results are
# collected method by method, with the feature spaces varying fastest.
methods = ["kmeans", "gmm", "hdbscan"]
feature_spaces = ["weights", "positions", "combined"]
results = [
    som.cluster(method=method_name, feature_space=space_name)
    for method_name in methods
    for space_name in feature_spaces
]

In [None]:
# Compare the quality of all collected clustering results in one figure.
visualizer.plot_cluster_quality_comparison(results_list=results, save_path=save_path)