# Time complexity

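Timing experiments performed for the thesis: the runtime of each Leader clustering variant is measured as a function of the number of samples (N) and the number of dimensions (M).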

In [None]:
import numpy as np
import pandas as pd
import leader_implementation
import time

def create_gaussian_dataset(samples=2000, dimensions=5, clusters=3):
    """Generate a reproducible synthetic dataset of Gaussian clusters.

    Each cluster is drawn from a multivariate normal distribution with a
    mean sampled uniformly from [0, 10) per dimension and a diagonal
    covariance matrix whose variances are sampled uniformly from [0.1, 1.1).

    The global NumPy random state is re-seeded with 42 on every call, so
    identical arguments always yield identical data.

    Args:
        samples: Total number of rows to generate.
        dimensions: Number of features (columns).
        clusters: Number of Gaussian components.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a ``pandas.DataFrame`` with columns
        ``Feature_0 .. Feature_{dimensions-1}`` and ``y`` is a NumPy array
        holding the cluster id of each row.
    """
    np.random.seed(42)  # For reproducibility

    # Means for the Gaussian distributions (randomly chosen)
    means = [np.random.rand(dimensions) * 10 for _ in range(clusters)]

    # Diagonal covariance matrices; the +0.1 keeps every variance strictly
    # positive so the matrices are positive definite.
    covariances = [np.diag(np.random.rand(dimensions) + 0.1) for _ in range(clusters)]

    # Split `samples` across the clusters. The remainder is handed out one
    # row at a time to the first clusters so that exactly `samples` rows are
    # produced even when `samples` is not divisible by `clusters`.
    base_size, remainder = divmod(samples, clusters)
    cluster_sizes = [
        base_size + (1 if cluster_id < remainder else 0)
        for cluster_id in range(clusters)
    ]

    # Generate samples for each cluster
    X_parts = []
    y_parts = []
    for cluster_id, size in enumerate(cluster_sizes):
        X_parts.append(
            np.random.multivariate_normal(means[cluster_id], covariances[cluster_id], size)
        )
        y_parts.append(np.full(size, cluster_id))

    # Concatenate the data and labels
    data = np.vstack(X_parts)
    y = np.concatenate(y_parts)

    column_names = [f'Feature_{i}' for i in range(data.shape[1])]
    X = pd.DataFrame(data, columns=column_names)

    return X, y

# Small end-to-end run: build a 100-sample, 10-feature, 5-cluster dataset and
# call every Leader variant once on it (presumably a smoke test / warm-up
# before the timed experiments).
X, y = create_gaussian_dataset(samples=100, dimensions=10, clusters=5)

# One metadata row per feature, all declared as continuous variables.
varinfo_dict = {
    position: {'name': column, 'type': 'Continuous'}
    for position, column in enumerate(X.columns)
}
varinfo = pd.DataFrame(varinfo_dict).T

leader = leader_implementation.LeaderAlgorithms(
    X, varinfo, similarity_func='gower', seed=42, verbose=0
)

# Each call overwrites k / leaders / clusters; only the last result is kept.
for method_name in ('Leader', 'Leader2', 'Leader_Medoid', 'Leader2_Medoid', 'Leader3_Medoid'):
    k, leaders, clusters = getattr(leader, method_name)(s_min=0.7, verbose=0)

In [None]:
# Experiment: runtime of every Leader variant as the number of samples N
# grows, with the number of dimensions fixed at M.
Ns = [1000, 5000, 10000, 50000, 100000]

s_min=0.75

dataset_N = []
M=25

# (label stored in the results, method name on LeaderAlgorithms), listed in
# the order in which the variants are timed within each repetition.
timed_algorithms = [
    ('Leader', 'Leader'),
    ('Leader2', 'Leader2'),
    ('Ldr_Medoid', 'Leader_Medoid'),
    ('Ldr2_Medoid', 'Leader2_Medoid'),
    ('Ldr3_Medoid', 'Leader3_Medoid'),
]

for N in Ns:
    X, y = create_gaussian_dataset(N, M, 5)
    varinfo_dict = {i: {'name': name, 'type': 'Continuous'} for i, name in enumerate(X.columns)}
    varinfo = pd.DataFrame(varinfo_dict).transpose()
    leader = leader_implementation.LeaderAlgorithms(X, varinfo, similarity_func='gower', seed=42, verbose=0)

    print("start")
    # Three repetitions per configuration so run-to-run variance can be assessed.
    for i in range(3):
        for label, method_name in timed_algorithms:
            # Resolve the method before starting the clock so only the
            # clustering call itself is measured.
            run = getattr(leader, method_name)

            # perf_counter() is monotonic and high-resolution; time.time()
            # follows the wall clock and can jump if the system time changes.
            start_time = time.perf_counter()
            k, leaders, clusters = run(s_min=s_min, verbose=0)
            elapsed_time = time.perf_counter() - start_time

            dataset_N.append({'algorithm':label, 'N':N, 'M':M, 'time':elapsed_time, 'K':k})

        print(f"iter {i}")

df_N = pd.DataFrame(dataset_N)

In [None]:
# Persist the sample-size (N) timing results to 'time_N.csv'.
df_N.to_csv('time_N.csv')
# Trailing expression of the cell: previews the first rows of the results.
df_N.head()

In [None]:
# Experiment: runtime of every Leader variant as the number of dimensions M
# grows, with the number of samples fixed at N.
Ms = [1, 5, 10, 25, 50, 100]

s_min=0.75

dataset_M = []
N=50000

# (label stored in the results, method name on LeaderAlgorithms), listed in
# the order in which the variants are timed within each repetition.
timed_algorithms = [
    ('Leader', 'Leader'),
    ('Leader2', 'Leader2'),
    ('Ldr_Medoid', 'Leader_Medoid'),
    ('Ldr2_Medoid', 'Leader2_Medoid'),
    ('Ldr3_Medoid', 'Leader3_Medoid'),
]

for M in Ms:
    X, y = create_gaussian_dataset(N, M, 5)
    varinfo_dict = {i: {'name': name, 'type': 'Continuous'} for i, name in enumerate(X.columns)}
    varinfo = pd.DataFrame(varinfo_dict).transpose()
    leader = leader_implementation.LeaderAlgorithms(X, varinfo, similarity_func='gower', seed=42, verbose=0)

    print("start")
    # Three repetitions per configuration so run-to-run variance can be assessed.
    for i in range(3):
        for label, method_name in timed_algorithms:
            # Resolve the method before starting the clock so only the
            # clustering call itself is measured.
            run = getattr(leader, method_name)

            # perf_counter() is monotonic and high-resolution; time.time()
            # follows the wall clock and can jump if the system time changes.
            start_time = time.perf_counter()
            k, leaders, clusters = run(s_min=s_min, verbose=0)
            elapsed_time = time.perf_counter() - start_time

            dataset_M.append({'algorithm':label, 'N':N, 'M':M, 'time':elapsed_time, 'K':k})

        print(f"iter {i}")

df_M = pd.DataFrame(dataset_M)

In [None]:
# Persist the dimensionality (M) timing results to their own file. Writing to
# 'time_N.csv' here would overwrite the results of the sample-size experiment.
df_M.to_csv('time_M.csv')
# Trailing expression of the cell: previews the first rows of the results.
df_M.head()