In [1]:
from scipy.cluster.hierarchy import ward, dendrogram, linkage
from scipy.spatial.distance import pdist

from knn_chain import knn_chain
from py_knn_chain import py_knn_chain
from nn_chain_algorithm_standard import standard_nn_chain

import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
def generate_multivariate_gaussian_data(num_samples, num_features, num_distributions, seed=None):
    """
    Generate a multidimensional dataset drawn from several multivariate Gaussians.

    Every distribution gets its own random mean (each coordinate uniform in
    [0, 100)) and its own random covariance matrix, built as ``M @ M.T`` from a
    uniform random square matrix ``M`` so that it is symmetric positive
    semi-definite. The samples of all distributions are stacked in
    distribution order (they are NOT shuffled).

    Parameters:
    - num_samples: Total number of samples to generate.
    - num_features: Number of features (dimensions) for each sample.
    - num_distributions: Number of different Gaussian distributions.
    - seed: Optional seed for a private random generator. With the default
      ``None`` the global NumPy random state is used, exactly as before.

    Returns:
    - data: Generated dataset of shape (num_samples, num_features).

    Raises:
    - ValueError: If num_distributions is smaller than 1 or num_samples is negative.
    """
    if num_distributions < 1:
        raise ValueError("num_distributions must be a positive integer")
    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    # The np.random module and a RandomState instance expose the same
    # rand / multivariate_normal API, so one code path serves both cases.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Spread the remainder over the first distributions so the result always
    # has exactly num_samples rows (plain floor division would drop them).
    base_count, remainder = divmod(num_samples, num_distributions)

    data = []
    for i in range(num_distributions):
        samples_for_distribution = base_count + (1 if i < remainder else 0)

        mean = rng.rand(num_features) * 100
        cov = rng.rand(num_features, num_features)
        cov = np.dot(cov, cov.transpose())

        distribution_data = rng.multivariate_normal(mean, cov, samples_for_distribution)
        data.append(distribution_data)

    return np.vstack(data)

In [3]:
# Size of the synthetic benchmark dataset.
num_samples = 1000
num_features = 10
num_distributions = 5

# X is kept as a list of row vectors (one 1-D array per sample); the timing
# cells below pass it straight to the clustering functions.
X = list(generate_multivariate_gaussian_data(num_samples, num_features, num_distributions))

# A plain list has no `.shape` attribute, so query the dimensions via np.shape,
# which reports (number of rows, number of features) for the nested rows.
print(f"Generated dataset shape: {np.shape(X)}")

Generated dataset shape: (1000, 10)


In [4]:
# plt.scatter(X[:, 0], X[:, 1])
# for i, c in enumerate(X):

#     label = i, list(c)

#     plt.annotate(label,
#                  (c),
#                  textcoords="offset points",
#                  xytext=(0,10),
#                  ha='center')

# plt.grid()
# plt.show()

In [5]:
# Benchmark 1: Cython k-NN chain implementation (second argument is 3).
# time.process_time() is used, so the result is CPU time in seconds.
# TODO(FIX): the original author flagged the timed call below with "FIX";
# the intended change is not recorded in this notebook.
cpu_start = time.process_time()
A = knn_chain(list(X), 3)
cy_time_knn = time.process_time() - cpu_start

In [6]:
# Benchmark 2: pure-Python k-NN chain implementation (second argument is 3).
# time.process_time() is used, so the result is CPU time in seconds.
# TODO(FIX): the original author flagged the timed call below with "FIX";
# the intended change is not recorded in this notebook.
cpu_start = time.process_time()
B = py_knn_chain(X, 3)
py_time_knn = time.process_time() - cpu_start

<class 'numpy.ndarray'>


In [7]:
# Benchmark 3: reference implementation from SciPy (pdist followed by ward).
# The pairwise-distance computation is deliberately part of the timed region.
# TODO(FIX): the original author flagged the timed section below with "FIX";
# the intended change is not recorded in this notebook.
cpu_start = time.process_time()
Z = ward(pdist(X))
cy_time_nn = time.process_time() - cpu_start

In [8]:
# Benchmark 4: pure-Python standard NN-chain implementation.
# It receives the pdist output together with the number of samples; the
# pairwise-distance computation is part of the timed region.
cpu_start = time.process_time()
pairwise_dists = pdist(X)
C = standard_nn_chain(pairwise_dists, len(X))
py_time_nn = time.process_time() - cpu_start

## Execution Time comparison:

In [9]:
# Assemble the four CPU timings into one report. The text is built line by
# line so the exact whitespace of the printed report is explicit.
timing_report = (
    "CPU Execution time for the: \n"
    f"      1. cython knn function = {cy_time_knn} s,\n"
    f"      2. python knn function = {py_time_knn} s,\n"
    f"      3. cython standard function = {cy_time_nn} s,\n"
    f"      4. python standard function = {py_time_nn} s."
)
print(timing_report)

CPU Execution time for the: 
      1. cython knn function = 5.423497212 s,
      2. python knn function = 5.876842580000001 s,
      3. cython standard function = 0.01597536000000055 s,
      4. python standard function = 37.00958249 s.
