In [1]:
import numpy as np
import pandas as pd
import leader_implementation
import time

def create_gaussian_dataset(samples = 2000, dimensions=5, clusters=3, seed=42):
    """Generate a labelled synthetic dataset drawn from several Gaussian clusters.

    Each cluster gets a random mean in [0, 10) per dimension and a random
    diagonal covariance with entries in [0.1, 1.1), so every covariance
    matrix is positive definite.

    Parameters
    ----------
    samples : int
        Total number of rows to generate. When ``samples`` is not divisible
        by ``clusters`` the leftover rows are assigned one each to the first
        clusters, so exactly ``samples`` rows are always returned.
    dimensions : int
        Number of feature columns.
    clusters : int
        Number of Gaussian clusters.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy generator on every call.

    Returns
    -------
    X : pandas.DataFrame
        Shape ``(samples, dimensions)`` with columns ``Feature_0`` ...
        ``Feature_{dimensions-1}``; rows are ordered cluster by cluster.
    y : numpy.ndarray
        Integer cluster id (``0 .. clusters-1``) for each row of ``X``.
    """
    np.random.seed(seed)  # For reproducibility

    # Split the requested rows across clusters without losing the remainder:
    # the first `remainder` clusters receive one extra sample each.
    base_count, remainder = divmod(samples, clusters)
    cluster_sizes = [
        base_count + (1 if cluster_id < remainder else 0)
        for cluster_id in range(clusters)
    ]

    # Means for the Gaussian distributions (randomly chosen)
    means = [np.random.rand(dimensions) * 10 for _ in range(clusters)]

    # Covariance matrices for the Gaussian distributions
    # (diagonal with strictly positive entries, hence positive definite)
    covariances = [np.diag(np.random.rand(dimensions) + 0.1) for _ in range(clusters)]

    # Generate samples and labels for each cluster
    X = []
    y = []
    for cluster_id, cluster_size in enumerate(cluster_sizes):
        cluster_data = np.random.multivariate_normal(
            means[cluster_id], covariances[cluster_id], cluster_size
        )
        X.append(cluster_data)
        y.append(np.full(cluster_size, cluster_id))

    # Concatenate the data and labels
    X = np.vstack(X)
    y = np.concatenate(y)

    column_names = [f'Feature_{i}' for i in range(X.shape[1])]  # Create column names for features
    X = pd.DataFrame(X, columns=column_names)

    return X, y

# Run every Leader variant once on a small dataset
# (100 samples, 10 dimensions, 5 clusters).
X, y = create_gaussian_dataset(100, 10, 5)

# Variable metadata: one row per feature column, all marked 'Continuous'.
varinfo_dict = {
    position: {'name': column, 'type': 'Continuous'}
    for position, column in enumerate(X.columns)
}
varinfo = pd.DataFrame(varinfo_dict).T

leader = leader_implementation.LeaderAlgorithms(
    X, varinfo, similarity_func='gower', seed=42, verbose=0
)

# Each call overwrites k / leaders / clusters, so the values left in the
# namespace afterwards come from the last variant (Leader3_Medoid).
for variant in ('Leader', 'Leader2', 'Leader_Medoid', 'Leader2_Medoid', 'Leader3_Medoid'):
    k, leaders, clusters = getattr(leader, variant)(s_min=0.7, verbose=0)

In [5]:
# Benchmark configuration.
# Other dataset sizes listed for this sweep: 1000, 5000, 10000, 50000, 100000
# (and 500000, 1000000).
Ns = [500000]
Ms = [1, 5, 10, 25, 50, 100]  # not used in this cell; the dimensionality is fixed by M below
s_min=0.75

dataset = []  # one record per (algorithm, repetition)
M=25

# (label stored in the results, name of the LeaderAlgorithms method to time)
algorithms = [
    ('Leader', 'Leader'),
    ('Leader2', 'Leader2'),
    ('Ldr_Medoid', 'Leader_Medoid'),
    ('Ldr2_Medoid', 'Leader2_Medoid'),
    ('Ldr3_Medoid', 'Leader3_Medoid'),
]

for N in Ns:
    X, y = create_gaussian_dataset(N, M, 5)
    varinfo_dict = {i: {'name': X.columns[i], 'type': 'Continuous'} for i in range(len(X.columns))}
    varinfo = pd.DataFrame(varinfo_dict).transpose()
    leader = leader_implementation.LeaderAlgorithms(X, varinfo, similarity_func='gower', seed=42, verbose=0)

    print("start")
    for i in range(3):  # three timed repetitions per algorithm
        for label, method_name in algorithms:
            run_algorithm = getattr(leader, method_name)

            # perf_counter() is the high-resolution monotonic clock intended for
            # measuring short durations; time.time() can have coarse resolution,
            # which makes fast runs show up as 0.0 seconds.
            start_time = time.perf_counter()
            k, leaders, clusters = run_algorithm(s_min=s_min, verbose=0)
            elapsed_time = time.perf_counter() - start_time

            dataset.append({'algorithm': label, 'N': N, 'M': M, 'time': elapsed_time, 'K': k})

        print(f"iter {i}")

df = pd.DataFrame(dataset)

start


In [3]:
# Show the benchmark results table as the cell's output.
# NOTE(review): the saved output below looks like it comes from an earlier run
# (N values 1000..100000, columns 'Unnamed: 0' and 'algortithm'), not from the
# benchmark cell above, which uses Ns = [500000] and the key 'algorithm'.
# Re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,algortithm,N,M,time,K
0,Leader,1000,25,0.000000,5
1,Leader2,1000,25,0.000000,5
2,Ldr_Medoid,1000,25,0.023994,5
3,Ldr2_Medoid,1000,25,0.024024,5
4,Ldr3_Medoid,1000,25,0.056054,5
...,...,...,...,...,...
70,Leader,100000,25,0.079545,5
71,Leader2,100000,25,0.189080,5
72,Ldr_Medoid,100000,25,191.465327,5
73,Ldr2_Medoid,100000,25,176.674557,5


In [None]:
# Persist the benchmark results. index=False keeps the RangeIndex out of the
# file, so reading it back with pd.read_csv does not create a spurious
# 'Unnamed: 0' column.
df.to_csv('time_M25_500k.csv', index=False)