# Implementation

In [7]:
import numpy as np
import argparse
from data_loader import load_data, preprocess
from model import Autoencoder, ClusteringLayer 
from trainer import train, get_embeddings
from cluster import cluster_embeddings, evaluate_cluster
import torch

def main(filepath, file_format, n_clusters, encoding_dim, epochs, learning_rate, batch_size, normalize, scale, log_transform, n_top_genes):
    """Run the pipeline end to end: load, preprocess, train, embed, cluster, evaluate.

    Args:
        filepath: Location of the dataset; forwarded to ``load_data``.
        file_format: Forwarded to ``load_data`` as its ``format`` argument.
        n_clusters: Number of clusters; forwarded to ``ClusteringLayer`` and ``train``.
        encoding_dim: Latent size; forwarded to ``Autoencoder`` and ``ClusteringLayer``.
        epochs: Forwarded to ``train``.
        learning_rate: Forwarded to ``train``.
        batch_size: Forwarded to ``train``.
        normalize: Forwarded to ``preprocess``.
        scale: Forwarded to ``preprocess``.
        log_transform: Forwarded to ``preprocess``.
        n_top_genes: Forwarded to ``preprocess``.

    Returns:
        ``(cluster_labels, embeddings, scaler)`` on success, where
        ``cluster_labels`` is a NumPy array holding the arg-max cluster index
        per row, ``embeddings`` is whatever ``get_embeddings`` returned and
        ``scaler`` is the second value returned by ``preprocess``.
        ``None`` when ``load_data`` yields no data.
    """
    # Load and preprocess data
    print("Loading and Preprocessing Data")
    data, labels = load_data(filepath, format=file_format)
    if data is None:
        print("Failed to load data. Exiting.")
        return
    processed_data, scaler = preprocess(data, normalize, scale, log_transform, n_top_genes)
    input_dim = processed_data.shape[1]

    # Resolve the device once so both modules and the inference input are
    # guaranteed to live on the same one.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Define and train model
    print("Defining and Training Model")
    autoencoder = Autoencoder(input_dim, encoding_dim).to(device)
    clustering_layer = ClusteringLayer(n_clusters, encoding_dim).to(device)

    # The return value of train() is not needed: the modules passed in are
    # the ones used below.
    # NOTE(review): presumably train() updates them in place - confirm.
    train(autoencoder, clustering_layer, processed_data, epochs, batch_size, learning_rate, n_clusters)

    # Get embeddings
    print("Obtaining Embeddings")
    embeddings = get_embeddings(autoencoder, processed_data)

    # Clustering: pure inference, so no autograd graph is required.
    print("Clustering")
    with torch.no_grad():
        assignments = clustering_layer(torch.tensor(embeddings, dtype=torch.float32).to(device))
    cluster_labels = assignments.argmax(1).cpu().numpy()

    # Evaluate only against real labels. Scoring against randomly generated
    # placeholder labels would print ARI/NMI values that carry no information
    # about clustering quality.
    if labels is not None:
        evaluation_results = evaluate_cluster(labels, cluster_labels)
        print(f"Clustering Evaluation: ARI={evaluation_results['ARI']:.4f}, NMI={evaluation_results['NMI']:.4f}")
    else:
        print("No ground-truth labels available; skipping clustering evaluation.")
    return cluster_labels, embeddings, scaler

if __name__ == "__main__":
    # Input dataset and its on-disk format.
    # Alternative dataset: 'Splatter_Sim_Data/splatter_simulate_data_1.h5'
    filepath = 'scDeepClustering_Sample_Data/mouse_bladder_cell_select_2100.h5'
    file_format = 'h5'

    # Model size and optimisation settings.
    n_clusters, encoding_dim = 3, 32
    epochs, batch_size = 100, 32
    learning_rate = 0.001

    # Preprocessing switches and gene-selection budget.
    normalize = scale = log_transform = True
    n_top_genes = 2000

    # Keyword arguments make the long call self-describing and order-proof.
    main(
        filepath=filepath,
        file_format=file_format,
        n_clusters=n_clusters,
        encoding_dim=encoding_dim,
        epochs=epochs,
        learning_rate=learning_rate,
        batch_size=batch_size,
        normalize=normalize,
        scale=scale,
        log_transform=log_transform,
        n_top_genes=n_top_genes,
    )

Loading and Preprocessing Data
inside h5 format
Keys in file: ['X', 'Y']
=== DATA: ===
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]


=== DATA AFTER: ===
[[-0.33427826 -0.35183723 -0.35727728 ...  0.08506461  0.19411842
  -0.66443265]
 [-0.33427826 -0.35183723 -0.35727728 ... -0.87155712  0.47428055
   0.60681282]
 [-0.33427826 -0.35183723 -0.35727728 ...  2.17525911 -0.87431899
  -0.66443265]
 ...
 [-0.33427826 -0.35183723 -0.35727728 ... -0.87155712  0.24852136
   0.22669642]
 [-0.33427826 -0.35183723 -0.35727728 ...  1.09703078 -0.33337379
  -0.66443265]
 [-0.33427826 -0.35183723 -0.35727728 ...  0.92421636  0.27599096
  -0.66443265]]


Defining and Training Model
Obtaining Embeddings
Clustering
Clustering Evaluation: ARI=0.3743, NMI=0.5256
