In [1]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys



___

### Start Timer

In [2]:
# Record the wall-clock time at the start of the notebook; the Runtime cell at
# the end subtracts this value from its own time.time() reading.
start = time.time()

___

### Define File Paths

In [3]:
# Resolve the project folders relative to the kernel's working directory.
# NOTE(review): this assumes the kernel was started from the notebooks/ folder,
# so that the parent of the working directory is the project root -- confirm.
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)

# The paths below are kept as plain strings because later cells build file
# names with string concatenation (e.g. data_path + filename).

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path = path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# Make the project modules in src/ importable. Guarded so that re-running this
# cell does not append the same entry to sys.path over and over.
if src_path not in sys.path:
    sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


____

### Import Modules

In [4]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

___

### Parameters

In [5]:
# CSV file names for the curated inputs.
# NOTE(review): neither name is read by the cells visible in this notebook;
# presumably they are meant to be appended to data_path -- confirm or remove.
target_file = "/curated/beans_target.csv"       # target vector
data_file = "/curated/trans_data_design.csv"    # transformed design matrix

____

### Load Data

In [6]:
# Read the shuffled design matrix and its target vector from data/curated.
design_matrix_shuffled = pd.read_csv(data_path + '/curated/shuffled_design.csv')
target_vector_shuffled = pd.read_csv(data_path + '/curated/shuffled_target.csv')


____

### Sample Data

In [7]:
# Optionally shrink the data to a reproducible random subset and save a copy of
# the subset next to the curated inputs.
sample = True
if sample:
    # num samples
    n = 1000

    # random seed
    rand_seed = 42

    # The two frames are sampled independently with the same seed. That only
    # selects the same row positions when both frames have the same number of
    # rows, so fail loudly instead of silently pairing features with the wrong
    # targets.
    if len(target_vector_shuffled) != len(design_matrix_shuffled):
        raise ValueError(
            'target vector and design matrix have different numbers of rows '
            f'({len(target_vector_shuffled)} vs {len(design_matrix_shuffled)}); '
            'cannot draw a row-aligned sample'
        )

    # sample target and save copy to csv
    target_vector_shuffled = target_vector_shuffled.sample(n=n, random_state=rand_seed)
    filename = '/curated/sampled_target.csv'
    target_vector_shuffled.to_csv(data_path + filename, index=False)

    # sample design matrix and save copy to csv
    design_matrix_shuffled = design_matrix_shuffled.sample(n=n, random_state=rand_seed)
    filename = '/curated/sampled_design.csv'
    design_matrix_shuffled.to_csv(data_path + filename, index=False)

___

### Create capX

In [8]:
# Feature frame: everything in the design matrix except the 'ID' column.
design_matrix_shuffled_noID = design_matrix_shuffled.drop(columns=['ID'])

# NumPy array handed to the UMAP step in the grid search below.
cap_x = design_matrix_shuffled_noID.to_numpy()
print(f'cap_x shape: {cap_x.shape}')
cap_x

cap_x shape: (1000, 16)


array([[ 0.47951119,  0.75617026,  0.9025797 , ..., -1.08640545,
        -1.03819934, -2.53011841],
       [-0.49798027, -0.60106105, -0.58065225, ...,  0.38428313,
         0.26243519,  0.25945463],
       [-0.0963504 , -0.06761403, -0.09561878, ..., -0.10319885,
         0.13672302, -0.70899572],
       ...,
       [-0.30471913, -0.47210471, -0.71409026, ...,  1.3837794 ,
         1.95995551,  0.86201627],
       [ 3.61104419,  3.02864018,  2.64390167, ..., -1.24739667,
         0.33815956, -0.56965621],
       [ 0.00623113,  0.11297058,  0.20609565, ..., -0.56222324,
        -0.50146285, -0.33099767]])

___

### Select Hyperparameters for Gridsearch

In [9]:
# select hyperparams: switch on the grid to search by setting its flag to True
params1=False
params2=True
params3=False
params4=False

# Candidate UMAP hyperparameter grids, keyed by the flag that selects them.
hyperparam_grids = {
    'params1': {
        'min_dist': [0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5],
        'n_neighbors': [10, 20, 30, 50, 100],
        'metric': ['euclidean', 'manhattan'],
        'n_components': [2, 3],
    },
    'params2': {
        'min_dist': [0.01, 0.05, 0.1],
        'n_neighbors': [5, 10, 15],
        'metric': ['euclidean', 'manhattan'],
        'n_components': [2, 3],
    },
    'params3': {
        'min_dist': np.linspace(0, 0.5, num = 40),
        'n_neighbors': np.arange(5, 35, 5),
        'metric': ['euclidean', 'manhattan'],
        'n_components': [2, 3],
    },
    'params4': {
        'min_dist': [0.01],
        'n_neighbors': [10],
        'metric': ['euclidean'],
        'n_components': [2, 3],
    },
}

hyperparam_flags = {
    'params1': params1,
    'params2': params2,
    'params3': params3,
    'params4': params4,
}
selected_grids = [name for name, enabled in hyperparam_flags.items() if enabled]

# Without this check the *_list names would be left undefined (or, worse, keep
# stale values from an earlier kernel run) when every flag is False.
if not selected_grids:
    raise ValueError(
        'No hyperparameter grid selected: set one of params1..params4 to True.'
    )

# If several flags are set, the highest-numbered grid takes effect.
selected_grid = hyperparam_grids[selected_grids[-1]]

min_dist_list = selected_grid['min_dist']
n_neighbors_list = selected_grid['n_neighbors']
metric_list = selected_grid['metric']
n_components_list = selected_grid['n_components']




___

### Gridsearch Algorithm

In [10]:
# Enumerate every hyperparameter combination up front, in the same order the
# nested loops would visit them (n_components outermost, metric innermost).
param_combinations = [
    (n_components, min_dist, n_neighbors, metric)
    for n_components in n_components_list
    for min_dist in min_dist_list
    for n_neighbors in n_neighbors_list
    for metric in metric_list
]
runs = len(param_combinations)
run = 0
df_row_dict_list = []

for run, (n_components, min_dist, n_neighbors, metric) in enumerate(param_combinations, start=1):
    print('*'*100)
    print(f'Run {run} of {runs}')
    # Reduce with UMAP, then cluster the embedding; each run yields one result row.
    results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components)
    df_row_dict = cl.clustering(results_dict)
    df_row_dict_list.append(df_row_dict)

# One row per run.
results_df = pd.DataFrame(df_row_dict_list)
results_df

****************************************************************************************************
Run 1 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.12136865457220114
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  11
Validity Index:  0.5419393459458002
****************************************************************************************************
Run 2 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.14420927107337905
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  51
Validity Index:  0.4652657691709719
****************************************************************************************************
Run 3 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.1388590656107254
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.45650660276274735
****************************************************************************************************
Run 4 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.1488173555780401
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  35
Validity Index:  0.39559532776010264
****************************************************************************************************
Run 5 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.1668643145480758
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6850850736071387
****************************************************************************************************
Run 6 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.162733471235099
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.7506538843330217
****************************************************************************************************
Run 7 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.15084884346254007
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  12
Validity Index:  0.5353157070481516
****************************************************************************************************
Run 8 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.16337638201557983
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  41
Validity Index:  0.402230056050403
****************************************************************************************************
Run 9 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.16660193653651348
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.4925624514986181
****************************************************************************************************
Run 10 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.18881118319231574
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.7497404092444798
****************************************************************************************************
Run 11 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.16419624571872124
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.5389737850435434
****************************************************************************************************
Run 12 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.18631022117304313
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6041362506238536
****************************************************************************************************
Run 13 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.17203294751387685
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  12
Validity Index:  0.3699204899554707
****************************************************************************************************
Run 14 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2031993113796081
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  13
Validity Index:  0.3417870694196792
****************************************************************************************************
Run 15 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.18543555367457018
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.572536963455768
****************************************************************************************************
Run 16 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.21624522834183962
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6140995397311425
****************************************************************************************************
Run 17 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.2087070049585756
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.5701898438349163
****************************************************************************************************
Run 18 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2381511801171793
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.4999276900836349
****************************************************************************************************
Run 19 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.06256229231664077
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  48
Validity Index:  0.4965278071686381
****************************************************************************************************
Run 20 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.06794296137747298
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  54
Validity Index:  0.5296889837059758
****************************************************************************************************
Run 21 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.0767646757481604
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  10
Validity Index:  0.5080694489640659
****************************************************************************************************
Run 22 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.08450692991598235
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6704770531174873
****************************************************************************************************
Run 23 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.09111515064428163
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6813479667625437
****************************************************************************************************
Run 24 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.08809325996195964
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.7421206195864078
****************************************************************************************************
Run 25 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.07920230369246958
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.45252966443423687
****************************************************************************************************
Run 26 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.08876394548430118
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  45
Validity Index:  0.486284039179823
****************************************************************************************************
Run 27 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.09039252552266434
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  8
Validity Index:  0.48122953362065707
****************************************************************************************************
Run 28 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.1001532509797041
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  43
Validity Index:  0.3738537733773292
****************************************************************************************************
Run 29 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1003661354683319
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6365264787085808
****************************************************************************************************
Run 30 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.09927400853592522
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.6866871897351141
****************************************************************************************************
Run 31 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.10275865594648645
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  41
Validity Index:  0.36077332004830337
****************************************************************************************************
Run 32 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.10200352585166146
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.4559106235414783
****************************************************************************************************
Run 33 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1104406914863631
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.47228274681327703
****************************************************************************************************
Run 34 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.11708537882064796
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6102890786619912
****************************************************************************************************
Run 35 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.12082531156447045
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.408608312348232
****************************************************************************************************
Run 36 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.13099289478132645
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6623422172262821


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,11,,,,,0.121369,5,0.01,euclidean,2,0.98122,0.641051,5,0.541939,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
1,dbscan,51,,,,,0.144209,5,0.01,manhattan,2,0.978543,0.266174,6,0.465266,"[0, 45, 1, 1, 8, 2, 3, 4, 0, 4, 5, 16, 6, 1, 7..."
2,dbscan,7,,,,,0.138859,10,0.01,euclidean,2,0.974305,0.473004,6,0.456507,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
3,dbscan,35,,,,,0.148817,10,0.01,manhattan,2,0.970405,0.230564,6,0.395595,"[0, 1, 2, 2, 3, 4, 5, 6, 0, 6, 7, 0, 8, 2, 5, ..."
4,dbscan,5,,,,,0.166864,15,0.01,euclidean,2,0.971605,0.448304,6,0.685085,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
5,dbscan,5,,,,,0.162733,15,0.01,manhattan,2,0.967436,0.522399,6,0.750654,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
6,dbscan,12,,,,,0.150849,5,0.05,euclidean,2,0.981524,0.572343,5,0.535316,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
7,dbscan,41,,,,,0.163376,5,0.05,manhattan,2,0.978425,0.298624,6,0.40223,"[0, -1, 1, 1, 9, 2, 3, 4, 0, 4, 5, 6, 7, 1, 8,..."
8,dbscan,7,,,,,0.166602,10,0.05,euclidean,2,0.978176,0.462389,6,0.492562,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
9,dbscan,4,,,,,0.188811,10,0.05,manhattan,2,0.970375,0.599744,6,0.74974,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."


___

## All Results

In [11]:
# Save the complete grid-search table under a timestamped name in the data folder.
# The timestamp is formatted explicitly so the file name has no ':' characters
# (invalid in Windows file names) and keeps a separator between date and time.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
results_df.to_csv(f'{data_path}/all_results_{timestamp}.csv')

# display max rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,11,,,,,0.121369,5,0.01,euclidean,2,0.98122,0.641051,5,0.541939,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
1,dbscan,51,,,,,0.144209,5,0.01,manhattan,2,0.978543,0.266174,6,0.465266,"[0, 45, 1, 1, 8, 2, 3, 4, 0, 4, 5, 16, 6, 1, 7..."
2,dbscan,7,,,,,0.138859,10,0.01,euclidean,2,0.974305,0.473004,6,0.456507,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
3,dbscan,35,,,,,0.148817,10,0.01,manhattan,2,0.970405,0.230564,6,0.395595,"[0, 1, 2, 2, 3, 4, 5, 6, 0, 6, 7, 0, 8, 2, 5, ..."
4,dbscan,5,,,,,0.166864,15,0.01,euclidean,2,0.971605,0.448304,6,0.685085,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
5,dbscan,5,,,,,0.162733,15,0.01,manhattan,2,0.967436,0.522399,6,0.750654,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
6,dbscan,12,,,,,0.150849,5,0.05,euclidean,2,0.981524,0.572343,5,0.535316,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
7,dbscan,41,,,,,0.163376,5,0.05,manhattan,2,0.978425,0.298624,6,0.40223,"[0, -1, 1, 1, 9, 2, 3, 4, 0, 4, 5, 6, 7, 1, 8,..."
8,dbscan,7,,,,,0.166602,10,0.05,euclidean,2,0.978176,0.462389,6,0.492562,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
9,dbscan,4,,,,,0.188811,10,0.05,manhattan,2,0.970375,0.599744,6,0.74974,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."


In [12]:
# Restore the pandas display limits that were lifted to show the full table.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(option_name)

____

### Get 10 Best Results (KMeans or DBSCAN)

In [13]:
# Rank every run by a single score: the silhouette score where one exists,
# otherwise the validity index.
results_df['combined_score'] = results_df['silhouette_score'].fillna(results_df['validity_index'])
results_df = results_df.sort_values(by='combined_score', ascending=False)

# Keep the ten best-ranked runs. Taking head(10) directly avoids binding a
# variable named `filter`, which would shadow the Python builtin for the rest
# of the kernel session. The helper ranking column is dropped from the result.
filtered_results_df = results_df.head(10).drop(columns=['combined_score'])

filtered_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
5,dbscan,5,,,,,0.162733,15,0.01,manhattan,2,0.967436,0.522399,6,0.750654,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
9,dbscan,4,,,,,0.188811,10,0.05,manhattan,2,0.970375,0.599744,6,0.74974,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
23,dbscan,5,,,,,0.088093,15,0.01,manhattan,3,0.96769,0.465981,6,0.742121,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
29,dbscan,5,,,,,0.099274,15,0.05,manhattan,3,0.968684,0.453951,5,0.686687,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
4,dbscan,5,,,,,0.166864,15,0.01,euclidean,2,0.971605,0.448304,6,0.685085,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
22,dbscan,5,,,,,0.091115,15,0.01,euclidean,3,0.972572,0.543401,6,0.681348,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
21,dbscan,5,,,,,0.084507,10,0.01,manhattan,3,0.970933,0.653201,6,0.670477,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
35,dbscan,4,,,,,0.130993,15,0.1,manhattan,3,0.967873,0.689014,5,0.662342,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
28,dbscan,5,,,,,0.100366,15,0.05,euclidean,3,0.97394,0.569202,6,0.636526,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
15,dbscan,4,,,,,0.216245,10,0.1,manhattan,2,0.970116,0.535237,5,0.6141,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."


____

### Save Filtered Results to CSV

In [None]:
# Save the top results under a timestamped name in data/curated.
# The timestamp is formatted explicitly so the file name has no ':' characters
# (invalid in Windows file names) and keeps a separator between date and time.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
filename = f'/curated/filtered_results_{timestamp}.csv'
filtered_results_df.to_csv(data_path + filename, index=False)

___

## Runtime

In [14]:
# Report the time elapsed since the start-timer cell, split into h / m / s.
finish = time.time()
elapsed = finish - start

whole_hours, leftover = divmod(elapsed, 3600)
whole_minutes, leftover_seconds = divmod(leftover, 60)

hours = int(whole_hours)
minutes = int(whole_minutes)
seconds = int(leftover_seconds)
print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 00:02.13
