# Libraries

In [None]:
import time
import random
import numpy as np
import pandas as pd
import torch
import yaml
from pathlib import Path

from torchsom.core import SOM
from torchsom.visualization import SOMVisualizer, VisualizationConfig
from minisom import MiniSom

# Settings

In [None]:
# Single seed reused for Python, NumPy, Torch and both SOM implementations.
random_seed = 42

In [None]:
# Seed the Python, NumPy and Torch generators (in that order) with the shared seed.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(random_seed)

# Seed the CUDA generators as well: current device first, then all devices.
if torch.cuda.is_available():
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

# Ask cuDNN for deterministic algorithms and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Device name used to build the torch device and recorded in the results file.
device_log = "cuda" # "cpu" or "cuda"

In [None]:
# Torch device object built from the name chosen in `device_log`.
device = torch.device(device_log)

In [None]:
# Number of timed (initialise + fit) repetitions per library.
n_iter = 5

In [None]:
# Dataset selector. Alternatives noted by the author:
#   n_samples:  300 | 5000 | 20000
#   n_features: 4 | 50 | 300
n_samples, n_features = 300, 4
data_path = f"../data/benchmark/blobs_{n_samples}_{n_features}.csv"

In [None]:
# Load the selected blobs dataset from the CSV file at `data_path`.
blobs_df = pd.read_csv(data_path)

In [None]:
# Every column except the last one is a feature; the last column is used as
# the target when the tensor is split below.
feature_columns = blobs_df.columns[:-1]
feature_names = feature_columns.tolist()

In [None]:
# Display (rows, columns) of the loaded dataframe as a quick sanity check.
blobs_df.shape

In [None]:
# Steps performed in this cell:
#   1. Turn the dataframe into one float32 tensor on `device`, then separate
#      the features (all but the last column) from the targets (last column,
#      cast to long).
#   2. Shuffle the rows with a random permutation.
#   3. Split the shuffled rows into training and testing sets.
blobs_torch = torch.tensor(blobs_df.to_numpy(dtype=np.float32), device=device)
all_features = blobs_torch[:, :-1]
all_targets = blobs_torch[:, -1].long()

# The same permutation is applied to features and targets to keep rows aligned.
shuffled_indices = torch.randperm(len(all_features), device=device)
all_features = all_features[shuffled_indices]
all_targets = all_targets[shuffled_indices]

# The first `train_count` shuffled rows form the training set, the rest the test set.
train_ratio = 0.8
train_count = int(train_ratio * len(all_features))
train_features, test_features = all_features[:train_count], all_features[train_count:]
train_targets, test_targets = all_targets[:train_count], all_targets[train_count:]

print(train_features.shape, test_features.shape)
print(train_targets.shape, test_targets.shape)

In [None]:
# MiniSom is fed NumPy arrays: bring every split back to the CPU and convert it
# to a float32 array (the integer targets are cast to float32 as well).
train_features_np, train_targets_np, test_features_np, test_targets_np = (
    split.cpu().numpy().astype(np.float32)
    for split in (train_features, train_targets, test_features, test_targets)
)

# Simulation

In [None]:
# Hyperparameters handed to both SOM implementations.
x_size, y_size = 25, 15
sigma, learning_rate = 1.45, 0.95
epochs = 100
topology = "rectangular"

# ! To ensure a fair comparison with the MiniSom training mechanism, every
# ! epoch must use the full training data, so the batch is the whole train set.
batch_size = len(train_features)  # alternative tried by the author: 16

In [None]:
# Results are grouped by dataset, topology and device; the YAML record lives
# next to the figures and its directory is created up front.
save_path = f"results/blob_{n_samples}_{n_features}/{topology}/{device}"
record_file = Path(save_path) / "results.yml"
record_file.parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Run configuration written as the first YAML document of the record file.
init_results = dict(
    dataset=f"blob_{n_samples}_{n_features}",
    device=device_log,
    n_iter=n_iter,
    x_size=x_size,
    y_size=y_size,
    sigma=sigma,
    learning_rate=learning_rate,
    epochs=epochs,
    batch_size=batch_size,
    topology=topology,
)

In [None]:
# Start a fresh record file ("w" truncates any previous run) holding the
# configuration in insertion order.
with record_file.open(mode="w") as f:
    f.write(yaml.safe_dump(init_results, sort_keys=False))

## torchsom

In [None]:
# TorchSOM model; the first three groups use the same settings as the MiniSom
# model built later in the notebook.
torchsom = SOM(
    # Map geometry
    x=x_size,
    y=y_size,
    topology=topology,
    num_features=all_features.shape[1],
    # Learning schedule
    epochs=epochs,
    learning_rate=learning_rate,
    lr_decay_function="asymptotic_decay",
    sigma=sigma,
    sigma_decay_function="asymptotic_decay",
    # Neighbourhood, distance and initialisation
    neighborhood_function="gaussian",
    distance_function="euclidean",
    initialization_mode="pca",
    random_seed=random_seed,
    # TorchSOM-only parameters
    batch_size=batch_size,  # Important to ensure one pass over the data per epoch. One epoch = train_features.shape[0] samples
    neighborhood_order=3,  # Not used for the benchmark (time and learning curves)
    device=device,  # Important to specify GPU usage
)

In [None]:
# Time `n_iter` repetitions of (weight initialisation + fit) for TorchSOM.
#
# CUDA kernels are launched asynchronously, so the host clock must only be read
# once the GPU has drained its queue. Without `torch.cuda.synchronize`, pending
# GPU work could be missing from a measured interval or leak into the next one.
# On CPU no synchronisation is needed and none is performed.
sync_cuda = device.type == "cuda"

times_init = []
times_fit = []
for _ in range(n_iter):
    if sync_cuda:
        torch.cuda.synchronize(device)  # start from an idle GPU
    start = time.perf_counter()
    torchsom.initialize_weights(data=train_features, mode=torchsom.initialization_mode)
    if sync_cuda:
        torch.cuda.synchronize(device)  # wait for the initialisation kernels
    end = time.perf_counter()
    times_init.append(end - start)

    start = time.perf_counter()
    # QE / TE are overwritten on every repetition; the values of the last run
    # are the ones used by the plotting cells further down.
    QE, TE = torchsom.fit(data=train_features)
    if sync_cuda:
        torch.cuda.synchronize(device)  # wait for the training kernels
    end = time.perf_counter()
    times_fit.append(end - start)

In [None]:
# Per-repetition wall time: initialisation plus training.
total_fit = [
    init_time + fit_time
    for init_time, fit_time in zip(times_init, times_fit)
]

In [None]:
# Summary of the TorchSOM timings (mean / standard deviation over the repetitions).
print(
    f"Number of iterations: {n_iter}",
    f"Init: mean={np.mean(times_init):.2f}s, std={np.std(times_init):.2f}",
    f"Fit:  mean={np.mean(times_fit):.2f}s, std={np.std(times_fit):.2f}",
    f"Total: mean={np.mean(total_fit):.2f}s, std={np.std(total_fit):.2f}",
    sep="\n",
)

In [None]:
# Quantization error of the trained TorchSOM map on the full training set.
full_train_QE = torchsom.quantization_error(data=train_features)

In [None]:
# Topographic error of the trained TorchSOM map on the full training set.
full_train_TE = torchsom.topographic_error(data=train_features)

In [None]:
# Quantization error of the trained TorchSOM map on the held-out test set.
full_test_QE = torchsom.quantization_error(data=test_features)

In [None]:
# Topographic error of the trained TorchSOM map on the held-out test set.
full_test_TE = torchsom.topographic_error(data=test_features)

In [None]:
# TorchSOM summary: timing statistics plus final errors, all stored as strings
# rounded to two decimals.
torchsom_results = {
    "torchsom": dict(
        avg_init_time=f"{np.mean(times_init):.2f}s",
        std_init_time=f"{np.std(times_init):.2f}s",
        avg_train_time=f"{np.mean(times_fit):.2f}s",
        std_train_time=f"{np.std(times_fit):.2f}s",
        avg_total_time=f"{np.mean(total_fit):.2f}s",
        std_total_time=f"{np.std(total_fit):.2f}s",
        final_full_train_QE=f"{full_train_QE:.2f}",
        final_full_train_TE=f"{full_train_TE:.2f}",
        final_full_test_QE=f"{full_test_QE:.2f}",
        final_full_test_TE=f"{full_test_TE:.2f}",
    ),
}

In [None]:
# Append the TorchSOM summary as its own YAML document ("---" separator).
with record_file.open(mode="a") as f:
    f.write(yaml.safe_dump(torchsom_results, sort_keys=False, explicit_start=True))

### Visualization

In [None]:
# Visualizer bound to the trained TorchSOM model; the config requests "pdf"
# as the save format.
config = VisualizationConfig(save_format="pdf")
visualizer = SOMVisualizer(som=torchsom, config=config)

In [None]:
# Learning curves: QE / TE are the values returned by the last `fit` call.
visualizer.plot_training_errors(quantization_errors=QE, topographic_errors=TE, save_path=save_path)

In [None]:
# Distance map of the trained TorchSOM model, with `save_path` as output location.
visualizer.plot_distance_map(save_path=save_path)

In [None]:
# Hit map computed from the training features.
visualizer.plot_hit_map(data=train_features, save_path=save_path)

In [None]:
# Classification map built from the training features and their targets.
visualizer.plot_classification_map(data=train_features, target=train_targets, save_path=save_path)

In [None]:
# Component planes, labelled with the dataframe's feature column names.
visualizer.plot_component_planes(component_names=feature_names, save_path=save_path)

## minisom

In [None]:
# MiniSom reference model, configured with the same hyperparameters as the
# TorchSOM model so both libraries train the same map.
som = MiniSom(
    x=x_size,
    y=y_size,
    input_len=all_features.shape[1],
    sigma=sigma,
    learning_rate=learning_rate,
    decay_function="asymptotic_decay",
    sigma_decay_function="asymptotic_decay",
    neighborhood_function="gaussian",
    # Use the shared `topology` setting rather than a hard-coded literal, so
    # changing it cannot leave the two libraries benchmarking different lattices.
    topology=topology,
    activation_distance="euclidean",
    random_seed=random_seed,
)

In [None]:
# Time `n_iter` repetitions of (PCA weight initialisation + training) for MiniSom.
times_init = []
times_fit = []
for _ in range(n_iter):
    start = time.perf_counter()
    # PCA initialisation, matching initialization_mode="pca" on the TorchSOM
    # side (alternative: som.random_weights_init(data=train_features_np)).
    som.pca_weights_init(data=train_features_np)
    end = time.perf_counter()
    times_init.append(end - start)

    start = time.perf_counter()
    som.train(
        data=train_features_np,
        num_iteration=epochs,
        random_order=True,
        # Keep the timed region free of console output: MiniSom's verbose mode
        # prints training progress and reports a quantization error when
        # training ends, all of which would be billed to `times_fit`.
        verbose=False,
        use_epochs=True, # ! Important: If true: num_iterations x train_features.shape[0] in total samples , if False: num_iterations samples
    )
    end = time.perf_counter()
    times_fit.append(end - start)

In [None]:
# Per-repetition wall time for MiniSom: initialisation plus training.
total_fit = [
    init_time + fit_time
    for init_time, fit_time in zip(times_init, times_fit)
]

In [None]:
# Summary of the MiniSom timings (mean / standard deviation over the repetitions).
print(
    f"Number of iterations: {n_iter}",
    f"Init: mean={np.mean(times_init):.2f}s, std={np.std(times_init):.2f}",
    f"Fit:  mean={np.mean(times_fit):.2f}s, std={np.std(times_fit):.2f}",
    f"Total: mean={np.mean(total_fit):.2f}s, std={np.std(total_fit):.2f}",
    sep="\n",
)

In [None]:
# Quantization error of the trained MiniSom map on the full training set.
full_train_QE = som.quantization_error(data=train_features_np)

In [None]:
# Topographic error of the trained MiniSom map on the full training set.
full_train_TE = som.topographic_error(data=train_features_np)

In [None]:
# Quantization error of the trained MiniSom map on the held-out test set.
full_test_QE = som.quantization_error(data=test_features_np)

In [None]:
# Topographic error of the trained MiniSom map on the held-out test set.
full_test_TE = som.topographic_error(data=test_features_np)

In [None]:
# MiniSom summary: timing statistics plus final errors, all stored as strings
# rounded to two decimals (same layout as the TorchSOM summary).
minisom_results = {
    "minisom": dict(
        avg_init_time=f"{np.mean(times_init):.2f}s",
        std_init_time=f"{np.std(times_init):.2f}s",
        avg_train_time=f"{np.mean(times_fit):.2f}s",
        std_train_time=f"{np.std(times_fit):.2f}s",
        avg_total_time=f"{np.mean(total_fit):.2f}s",
        std_total_time=f"{np.std(total_fit):.2f}s",
        final_full_train_QE=f"{full_train_QE:.2f}",
        final_full_train_TE=f"{full_train_TE:.2f}",
        final_full_test_QE=f"{full_test_QE:.2f}",
        final_full_test_TE=f"{full_test_TE:.2f}",
    ),
}

In [None]:
# Append the MiniSom summary as its own YAML document ("---" separator).
with record_file.open(mode="a") as f:
    f.write(yaml.safe_dump(minisom_results, sort_keys=False, explicit_start=True))