# Libraries

In [1]:
import warnings
import numpy as np
import pandas as pd
import torch

from torchsom.core import SOM
from torchsom.visualization import SOMVisualizer, VisualizationConfig

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

In [2]:
# Seed every global random number generator that is in scope, so that any step
# drawing from the PyTorch or NumPy global generators is repeatable after a
# kernel restart.
random_seed = 42
torch.manual_seed(random_seed)
np.random.seed(random_seed)  # NumPy's global RNG is independent of PyTorch's

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing 

In [3]:
# Read the blobs dataset from its CSV file into a DataFrame.
blobs_df = pd.read_csv("../data/blobs_300_4_3_1.0.csv")

In [4]:
# Preview the first rows to check the column layout.
blobs_df.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Species
0,0.811794,0.431383,-0.667598,0.919633,3
1,0.195979,0.995486,1.395606,-1.606078,1
2,1.168003,0.293319,-0.881263,0.86459,3
3,-1.025886,-1.278504,-0.494537,0.165862,2
4,1.411226,0.223531,-0.566074,1.009634,3


In [5]:
# Summary statistics (count, mean, std, quartiles) for each column.
blobs_df.describe()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Species
count,300.0,300.0,300.0,300.0,300.0
mean,7.105427000000001e-17,4.1448330000000005e-17,3.5527140000000005e-17,-2.842171e-16,2.0
std,1.001671,1.001671,1.001671,1.001671,0.817861
min,-1.831168,-1.594269,-1.062787,-1.904807,1.0
25%,-1.023119,-1.225516,-0.7534358,-1.068313,1.0
50%,-0.0288003,0.3172274,-0.598837,0.3038764,2.0
75%,1.028202,0.9314312,1.289972,0.8040659,3.0
max,1.79133,1.587973,1.731712,1.694317,3.0


In [6]:
# All columns except the last one are treated as features; keep both the
# pandas Index and a plain Python list of the names, then display the list.
feature_columns = blobs_df.columns[:-1]
feature_names = list(feature_columns)
feature_names

['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4']

In [7]:
# (number of rows, number of columns) of the loaded frame.
blobs_df.shape

(300, 5)

In [8]:
"""
1. Create a tensor from the blobs df and separate the features and the target
2. Randomly shuffle the data
3. Split the data into training and testing sets
"""
# Convert the whole frame to a float32 tensor. The target is the last column
# and every preceding column is a feature, so slice relative to the end rather
# than hard-coding the column index.
blobs_torch = torch.tensor(blobs_df.to_numpy(dtype=np.float32))
all_features, all_targets = blobs_torch[:, :-1], blobs_torch[:, -1].long()

# Shuffle with a dedicated generator seeded from `random_seed`: re-running this
# cell then yields the same permutation instead of advancing the global
# PyTorch RNG and silently producing a different train/test split.
shuffle_generator = torch.Generator().manual_seed(random_seed)
shuffled_indices = torch.randperm(len(all_features), generator=shuffle_generator)
all_features, all_targets = all_features[shuffled_indices], all_targets[shuffled_indices]

# The first `train_ratio` share of the shuffled samples is used for training,
# the remainder for testing.
train_ratio = 0.8
train_count = int(train_ratio * len(all_features))
train_features, train_targets = all_features[:train_count], all_targets[:train_count]
test_features, test_targets = all_features[train_count:], all_targets[train_count:]

# Sanity check: the split must neither drop nor duplicate samples.
assert len(train_features) + len(test_features) == len(blobs_df)
assert len(train_targets) + len(test_targets) == len(blobs_df)

print(train_features.shape, test_features.shape)
print(train_targets.shape, test_targets.shape)

torch.Size([240, 4]) torch.Size([60, 4])
torch.Size([240]) torch.Size([60])


# TorchSOM

In [None]:
# Build the self-organizing map. The keyword arguments are grouped by the part
# of the model their names refer to; the values themselves are unchanged.
som = SOM(
    # Map grid
    x=25,
    y=15,
    topology="rectangular",
    # Input dimensionality, read from the feature tensor
    num_features=all_features.shape[1],
    # Distance and neighborhood settings
    distance_function="euclidean",
    neighborhood_function="gaussian",
    neighborhood_order=3,
    sigma=1.45,
    sigma_decay_function="asymptotic_decay",
    # Learning schedule
    learning_rate=0.95,
    lr_decay_function="asymptotic_decay",
    epochs=100,
    batch_size=16,
    # Weight initialization, device and seed
    initialization_mode="pca",
    device=device,
    random_seed=random_seed,
)

In [10]:
# Initialize the map's weights from the training data, using the
# initialization mode stored on the SOM instance.
som.initialize_weights(data=train_features, mode=som.initialization_mode)

In [11]:
# Train on the training features. The two returned values are kept as QE and
# TE and are later passed to the visualizer as quantization / topographic errors.
QE, TE = som.fit(data=train_features)

Training SOM: 100%|██████████| 100/100 [00:17<00:00,  5.61epoch/s]


In [12]:
# Visualizer bound to the trained SOM. config=None is passed explicitly
# (presumably selects the library's default configuration; verify).
visualizer = SOMVisualizer(som=som, config=None)

# Output directory named after the dataset: blob_<samples>_<features>_<classes>/<topology>.
# The feature count is read from the feature tensor (the same value given to
# the SOM as `num_features`) rather than from `blobs_df.shape[1]`, which also
# counts the "Species" target column and would label the 4-feature dataset as 5.
save_path = (
    f"results/clustering/blob_{blobs_df.shape[0]}_{all_features.shape[1]}"
    f"_{blobs_df['Species'].nunique()}/{som.topology}"
)  # Set to None if you want a direct plot

In [13]:
# Plot the quantization and topographic errors returned by `som.fit`.
visualizer.plot_training_errors(quantization_errors=QE, topographic_errors=TE, save_path=save_path)

In [14]:
# Distance map of the trained SOM, written under `save_path`.
visualizer.plot_distance_map(save_path=save_path)

In [15]:
# Hit map computed from the training features.
visualizer.plot_hit_map(data=train_features, save_path=save_path)

In [16]:
# Classification map built from the training features and their labels.
visualizer.plot_classification_map(data=train_features, target=train_targets, save_path=save_path)

In [17]:
# Component planes, labelled with the dataset's feature names.
visualizer.plot_component_planes(component_names=feature_names, save_path=save_path)

# Clustering

In [18]:
# Run clustering on the trained SOM with feature_space="weights".
# The requested number of clusters equals the number of distinct "Species" labels.
# NOTE(review): n_clusters is passed even though the selected method is HDBSCAN;
# confirm whether the library uses or ignores it for this method.
cluster = som.cluster(
    method="hdbscan", # hdbscan, kmeans, gmm
    n_clusters=len(blobs_df['Species'].unique()),
    feature_space="weights",
)

In [19]:
# Draw the cluster assignment produced by `som.cluster` on the map.
visualizer.plot_cluster_map(cluster_result=cluster, save_path=save_path)

In [20]:
# Elbow analysis in the "weights" feature space, with max_k set to 10.
visualizer.plot_elbow_analysis(max_k=10, feature_space="weights", save_path=save_path)

In [21]:
# Cluster the SOM once for every (method, feature space) pair. Results are
# ordered method-major: all feature spaces for "kmeans", then "gmm", then "hdbscan".
methods = ["kmeans", "gmm", "hdbscan"]
feature_spaces = ["weights", "positions", "combined"]
results = [
    som.cluster(method=method, feature_space=space)
    for method in methods
    for space in feature_spaces
]

In [22]:
# Compare the quality of all clustering results collected above.
visualizer.plot_cluster_quality_comparison(results_list=results, save_path=save_path)