## Final Project: Phase 3 - Clustering Pipeline Optimization
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/21/2024  
___

In [1]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys

import warnings

# Disable runtime warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)



___

### Start Timer

In [2]:
# track runtime
start = time.time()

___

### Define File Paths

In [3]:
# define path
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path= path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path
sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


____

### Import Modules

In [4]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

___

### Parameters

In [5]:
# transformed data csv file name
design_file = '/curated/shuffled_target.csv'

# target vector csv file name
target_file = "/curated/beans_target.csv"

# sampled target dataframe csv loc
target_vector_shuffled_filename = f'/sampled/sampled_target{str(datetime.now()).replace(" ", "")}.csv'

# sampled design dataframe csv loc
design_matrix_shuffled_filename = f'/sampled/sampled_design{str(datetime.now()).replace(" ", "")}.csv'

# all results dataframe csv loc
all_results_file = f'/results/all_results_{str(datetime.now()).replace(" ", "")}.csv'

# filtered results dataframe csv loc
filtered_results_file = f'/results/filtered_results_{str(datetime.now()).replace(" ", "")}.csv'


____

### Load Data

In [6]:
design_file = '/curated/shuffled_target.csv'
target_vector_shuffled = pd.read_csv(data_path + design_file)

target_file = '/curated/shuffled_design.csv'
design_matrix_shuffled = pd.read_csv(data_path + target_file)


In [7]:
#design_matrix_shuffled = design_matrix_shuffled.loc[:, :'numerical__ShapeFactor1']

____

### Sample Data

In [8]:
sample = False
if sample:
    # num samples
    n = 1000

    # random seed
    rand_seed = 42

    # sample target and save copy to csv
    target_vector_shuffled = target_vector_shuffled.sample(n=n, random_state=rand_seed)
    target_vector_shuffled.to_csv(data_path + target_vector_shuffled_filename, index=False)
    print("Sampled target dataframe saved at:", data_path + target_vector_shuffled_filename)


     # sample target and save copy to csv
    design_matrix_shuffled = design_matrix_shuffled.sample(n=n, random_state=rand_seed)
    design_matrix_shuffled.to_csv(data_path + design_matrix_shuffled_filename, index=False)
    print("Sampled design dataframe saved at:", data_path + design_matrix_shuffled_filename)


___

### Create capX

In [9]:
# drop ID col
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray
cap_x = design_matrix_shuffled_noID.to_numpy()
cap_x = design_matrix_shuffled
print(f'cap_x shape: {cap_x.shape}')
cap_x.shape

cap_x shape: (13611, 17)


(13611, 17)

___

### Select Hyperparameters for Gridsearch

In [10]:
# select hyperparams
params1=False
params2=False
params3=False
params4=True

if params1:

    min_dist_list = [0.0, 0.01, 0.1, 0.25, 0.5]
    n_neighbors_list = [10, 25, 50, 100, 200]
    metric_list =['euclidean', 'canberra', 'mahalanobis', 'correlation']
    n_components_list = range(2, 4)


if params2:

    min_dist_list = [0.5]
    n_neighbors_list = [100]
    metric_list = ['euclidean']
    n_components_list = [3]


if params3:

    min_dist_list = [0.0, 0.1, 0.5]
    n_neighbors_list = [10, 25, 100, 200]
    metric_list =['euclidean', 'canberra', 'chebyshev', 'correlation']
    n_components_list = range(2, 4)


if params4:

    min_dist_list = [0.0]
    n_neighbors_list = [50, 100, 250, 500, 1000]
    metric_list =['cosine', 'euclidean']
    n_components_list = range(2, 9)




___

### Gridsearch Algorithm

In [11]:
runs = len(n_components_list)*len(min_dist_list)*len(n_neighbors_list)*len(metric_list)
run = 0
df_row_dict_list = []
for n_components in n_components_list:
  for min_dist in min_dist_list:
    for n_neighbors in n_neighbors_list: 
      for metric in metric_list:
        run +=1
        print('*'*100)
        print(f'Run {run} of {runs}')
        results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components) 
        df_row_dict = cl.clustering(results_dict)
        df_row_dict_list.append(df_row_dict)
results_df = pd.DataFrame(df_row_dict_list)


****************************************************************************************************
Run 1 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.08096146089983365
Silhouette Score:  0.7944241
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.9074445569880059
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 2 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.08305082503668063
Silhouette Score:  0.43277013
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  -0.313180872122108
[0 1 2 3 4]
Noise Ratio:  0.0
****************************************************************************************************
Run 3 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.07189898970598273
Silhouette Score:  0.85177314
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  12
Validity Index:  0.5846677383675505
[-1  0  1  2  3  4  5  6  7  8  9 10 11]
Noise Ratio:  0.006906178825949599
****************************************************************************************************
Run 4 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.07013626861716737
Silhouette Score:  0.43679437
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  -0.317922967238405
[0 1 2 3 4]
Noise Ratio:  0.0
****************************************************************************************************
Run 5 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.05839232797330654
Silhouette Score:  0.8945332
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.9478449896497944
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 6 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.0616162333174791
Silhouette Score:  0.44502565
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  70
Validity Index:  -0.4422056285598386
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69]
Noise Ratio:  0.0016898097127323489
****************************************************************************************************
Run 7 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.05865752520020976
Silhouette Score:  0.87934715
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9913497866290429
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 8 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.056533029015206196
Silhouette Score:  0.46642193
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  119
Validity Index:  -0.22790731046046564
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118]
Noise Ratio:  0.001763279700242451
****

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.06338987145357075
Silhouette Score:  0.901399
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9996531401835571
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 10 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.05322681992326175
Silhouette Score:  0.4726586
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  33
Validity Index:  -0.5127010252675601
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32]
Noise Ratio:  0.00022040996253030638
****************************************************************************************************
Run 11 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.0531109981152839
Silhouette Score:  0.77963567
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  2
Validity Index:  0.9379094976886077
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 12 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.021387570449237437
Silhouette Score:  0.4346687
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.2134578626325106
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 13 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.044121050824282124
Silhouette Score:  0.8527895
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.8846329306425474
[-1  0  1  2  3  4]
Noise Ratio:  0.0006612298875909191
****************************************************************************************************
Run 14 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.019115042809989387
Silhouette Score:  0.4374736
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.10349916356645303
[0 1 2 3 4]
Noise Ratio:  0.0
****************************************************************************************************
Run 15 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.02684548843736292
Silhouette Score:  0.93983126
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.998973580284145
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 16 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.016829977942660827
Silhouette Score:  0.44727898
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  99
Validity Index:  -0.056695613708867436
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
 95 96 97 98]
Noise Ratio:  0.0008816398501212255
****************************************************************************************************
Run 17 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.03556065368991657
Silhouette Score:  0.9183111
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.8646206877240633
[-1  0  1  2  3  4]
Noise Ratio:  0.0016898097127323489
****************************************************************************************************
Run 18 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.015516449667064737
Silhouette Score:  0.45067775
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.22020349115012325
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 19 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.03573682684106513
Silhouette Score:  0.9175352
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9815287475995369
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 20 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.014329780329477805
Silhouette Score:  0.4725756
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.0014711304962621574
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 21 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.03793164948987385
Silhouette Score:  0.79863966
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9017497505859979
[-1  0  1  2  3  4]
Noise Ratio:  0.00036734993755051064
****************************************************************************************************
Run 22 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.010781043281379491
Silhouette Score:  0.44664207
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.4669755763533609
[0 1 2 3 4]
Noise Ratio:  0.0
****************************************************************************************************
Run 23 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.027733026855550708
Silhouette Score:  0.8844208
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9132519574915581
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 24 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.009027554272039379
Silhouette Score:  0.46342307
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.44861691098519046
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 25 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.021191440109268586
Silhouette Score:  0.8975837
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9903680289706714
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 26 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.00795152898987547
Silhouette Score:  0.47278216
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.7397638277086991
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 27 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.01852091904809092
Silhouette Score:  0.8965765
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.8662670782232207
[-1  0  1  2  3  4]
Noise Ratio:  0.0008081698626111233
****************************************************************************************************
Run 28 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.007323311045155532
Silhouette Score:  0.4734569
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.692418265677694
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 29 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.02033315228448357
Silhouette Score:  0.9314697
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9739509999875486
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 30 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.006925806647575628
Silhouette Score:  0.48501852
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.6597955124507685
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 31 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.027634821957523075
Silhouette Score:  0.808294
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.8602699850529492
[-1  0  1  2  3  4  5]
Noise Ratio:  7.346998751010212e-05
****************************************************************************************************
Run 32 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.006568036588411132
Silhouette Score:  0.46479115
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6969368254745201
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 33 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.018891788836524863
Silhouette Score:  0.88620245
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9081644632194446
[-1  0  1  2  3  4]
Noise Ratio:  7.346998751010212e-05
****************************************************************************************************
Run 34 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.005675001034685075
Silhouette Score:  0.4746773
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.5807190280848904
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 35 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.013738859350030877
Silhouette Score:  0.8954216
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.8732764775738863
[-1  0  1  2  3  4]
Noise Ratio:  0.00036734993755051064
****************************************************************************************************
Run 36 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.005015603914628562
Silhouette Score:  0.4893569
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.36720509395271844
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 37 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.016843895532952968
Silhouette Score:  0.8866074
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9987081464459185
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 38 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.0045849431007202925
Silhouette Score:  0.49429825
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.6546083235289444
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 39 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.012893199582062384
Silhouette Score:  0.928778
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9536842331059862
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 40 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.004445325268303368
Silhouette Score:  0.50527364
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.37669527133651215
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 41 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  6
Hopkin's Statistic = 0.01909476581925416
Silhouette Score:  0.81059784
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9413505517217327
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 42 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  6
Hopkin's Statistic = 0.004823490367445867
Silhouette Score:  0.4805703
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6409214186900505
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 43 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  6
Hopkin's Statistic = 0.013480771051211371
Silhouette Score:  0.8531644
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9867052001781115
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 44 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  6
Hopkin's Statistic = 0.0041942219928897675
Silhouette Score:  0.48841313
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.7044804093105688
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 45 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  6
Hopkin's Statistic = 0.012368341727587502
Silhouette Score:  0.8898228
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.8801072421034617
[-1  0  1  2  3  4]
Noise Ratio:  0.0005877599000808169
****************************************************************************************************
Run 46 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  6
Hopkin's Statistic = 0.00370797145285805
Silhouette Score:  0.49750474
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.5542443337082292
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 47 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  6
Hopkin's Statistic = 0.014711684965811937
Silhouette Score:  0.8977958
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9981120416684837
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 48 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  6
Hopkin's Statistic = 0.0035331459772637813
Silhouette Score:  0.5074825
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.635706685286514
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 49 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  6
Hopkin's Statistic = 0.009782524281112263
Silhouette Score:  0.9231059
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9632801614216652
[-1  0  1  2  3]
Noise Ratio:  0.00022040996253030638
****************************************************************************************************
Run 50 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  6
Hopkin's Statistic = 0.003441419379018216
Silhouette Score:  0.51356775
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.7597374619213992
[-1  0  1]
Noise Ratio:  7.346998751010212e-05
****************************************************************************************************
Run 51 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  7
Hopkin's Statistic = 0.017342483105245034
Silhouette Score:  0.8072114
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9282813472621673
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 52 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  7
Hopkin's Statistic = 0.003698055733621701
Silhouette Score:  0.4929566
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6026319221036831
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 53 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  7
Hopkin's Statistic = 0.010688093135570356
Silhouette Score:  0.86462444
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9277313262600867
[0 1 2 3 4]
Noise Ratio:  0.0
****************************************************************************************************
Run 54 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  7
Hopkin's Statistic = 0.003334883604723954
Silhouette Score:  0.4976354
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6100783162911588
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 55 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  7
Hopkin's Statistic = 0.009012507477445768
Silhouette Score:  0.89000225
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9845175421512209
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 56 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  7
Hopkin's Statistic = 0.0029897950886834793
Silhouette Score:  0.5105371
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  8
****************************************************************************************************
Run 57 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  7
Hopkin's Statistic = 0.010877119031700555
Silhouette Score:  0.9037089
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.990113573194426
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 58 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  7
Hopkin's Statistic = 0.002894595726594104
Silhouette Score:  0.51362556
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  8
****************************************************************************************************
Run 59 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  7
Hopkin's Statistic = 0.009410355535408211
Silhouette Score:  0.926346
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9998688356000708
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 60 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  7
Hopkin's Statistic = 0.002807245746021458
Silhouette Score:  0.5167888
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  8
****************************************************************************************************
Run 61 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  8
Hopkin's Statistic = 0.015278372888031685
Silhouette Score:  0.8055395
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.8981024275824722
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 62 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  8
Hopkin's Statistic = 0.003048160944093011
Silhouette Score:  0.5000267
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.688170411431382
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 63 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  8
Hopkin's Statistic = 0.00963599806927345
Silhouette Score:  0.8523936
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9283135544080722
[-1  0  1  2  3  4]
Noise Ratio:  0.00014693997502020423
****************************************************************************************************
Run 64 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  8
Hopkin's Statistic = 0.002782533801695725
Silhouette Score:  0.5054573
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.8357226196821922
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 65 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  cosine
n_components:  8
Hopkin's Statistic = 0.010715030062902847
Silhouette Score:  0.8970852
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.984315369284518
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 66 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  250
min_dist:  0.0
metric:  euclidean
n_components:  8
Hopkin's Statistic = 0.002672621811544844
Silhouette Score:  0.5107572
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.5067700023727475
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 67 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  cosine
n_components:  8
Hopkin's Statistic = 0.010239359651099416
Silhouette Score:  0.8927562
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9903191046690152
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 68 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  500
min_dist:  0.0
metric:  euclidean
n_components:  8
Hopkin's Statistic = 0.0026333681044570178
Silhouette Score:  0.51396453
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.5379647925988382
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 69 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  cosine
n_components:  8
Hopkin's Statistic = 0.009970730076281634
Silhouette Score:  0.92963475
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9997873503351963
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 70 of 70


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  1000
min_dist:  0.0
metric:  euclidean
n_components:  8
Hopkin's Statistic = 0.0025787904762104696
Silhouette Score:  0.5150345
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  9


___

## All Results

In [12]:
results_df.to_csv(data_path + all_results_file, index=False)
print("Filtered results dataframe saved at:", data_path + all_results_file)

# display max rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/all_results_2024-04-1722:42:11.819198.csv


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
0,dbscan,2,,,,,0.080961,50,0.0,cosine,2,0.89305,0.100099,5.0,cosine,0.907445,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,dbscan,5,,,,,0.083051,50,0.0,euclidean,2,0.999435,0.171741,5.0,euclidean,-0.313181,0.0,"[0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 3, 1, ..."
2,dbscan,12,,,,,0.071899,100,0.0,cosine,2,0.895107,0.119371,5.0,euclidean,0.584668,0.006906,"[0, 6, 0, 1, 2, 0, 3, 0, 4, 2, 2, 2, 2, 5, 2, ..."
3,dbscan,5,,,,,0.070136,100,0.0,euclidean,2,0.999229,0.145034,5.0,euclidean,-0.317923,0.0,"[0, 0, 0, 0, 1, 0, 2, 0, 0, 1, 1, 3, 1, 3, 1, ..."
4,dbscan,2,,,,,0.058392,250,0.0,cosine,2,0.897391,0.10167,5.0,cosine,0.947845,0.0,"[0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, ..."
5,dbscan,70,,,,,0.061616,250,0.0,euclidean,2,0.998591,0.074673,5.0,euclidean,-0.442206,0.00169,"[0, 1, 2, 3, 4, 5, 6, 0, 7, 4, 8, 9, 10, 11, 4..."
6,dbscan,3,,,,,0.058658,500,0.0,cosine,2,0.875679,0.135633,5.0,cosine,0.99135,0.0,"[0, 1, 0, 0, 1, 0, 1, 0, 2, 1, 1, 1, 1, 1, 1, ..."
7,dbscan,119,,,,,0.056533,500,0.0,euclidean,2,0.996734,0.066758,5.0,euclidean,-0.227907,0.001763,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
8,dbscan,4,,,,,0.06339,1000,0.0,cosine,2,0.865591,0.127078,5.0,cosine,0.999653,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ..."
9,dbscan,33,,,,,0.053227,1000,0.0,euclidean,2,0.99032,0.073935,5.0,euclidean,-0.512701,0.00022,"[0, 1, 2, 3, 4, 2, 5, 0, 6, 7, 4, 8, 9, 10, 11..."


In [13]:
# set max rows back to default
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

____

### Get n Best Results (kmeans and DBSCAN)

#### Kmeans Results

In [25]:
n=1

kmeans_results_df = results_df[results_df['algo'] == 'k_means']
kmeans_results_df = kmeans_results_df.sort_values(by='silhouette_score', ascending=False)
best_kmeans_results_df = kmeans_results_df.head(n)
best_kmeans_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels,combined_score
59,k_means,8,15.0,15.0,15.0,0.516789,0.002807,1000,0.0,euclidean,7,0.999931,,,,,,"[12, 7, 5, 5, 1, 5, 10, 12, 4, 8, 14, 11, 6, 0...",0.516789


#### DBSCAN Results

In [26]:
dbscan_results_df = results_df[results_df['algo'] == 'dbscan']
dbscan_results_df = dbscan_results_df.sort_values(by='validity_index', ascending=False)
best_dbscan_results_df = dbscan_results_df.head(n)
best_dbscan_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels,combined_score
58,dbscan,4,,,,,0.00941,1000,0.0,cosine,7,0.84725,0.265387,5.0,cosine,0.999869,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ...",0.999869


In [27]:
filtered_results_df = pd.concat([best_dbscan_results_df, best_kmeans_results_df])
filtered_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels,combined_score
58,dbscan,4,,,,,0.00941,1000,0.0,cosine,7,0.84725,0.265387,5.0,cosine,0.999869,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ...",0.999869
59,k_means,8,15.0,15.0,15.0,0.516789,0.002807,1000,0.0,euclidean,7,0.999931,,,,,,"[12, 7, 5, 5, 1, 5, 10, 12, 4, 8, 14, 11, 6, 0...",0.516789


____

### Save Filtered Results to CSV

In [28]:
filtered_results_df.to_csv(data_path + filtered_results_file, index=False)
print("Filtered results dataframe saved at:", data_path + filtered_results_file)

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/filtered_results_2024-04-1722:42:11.819218.csv


___

## Runtime

In [17]:
finish = time.time()
hours = int((finish - start) // 3600)
minutes = int(((finish - start) % 3600) // 60)
seconds = int((finish - start) % 60)
print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 07:59.23
