## Final Project: Phase 3 - Clustering Pipeline Optimization
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/21/2024  
___

In [1]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys

import warnings

# Disable runtime warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)



___

### Start Timer

In [2]:
# track runtime: wall-clock timestamp (seconds since the epoch) captured before
# any work is done; the Runtime cell at the end of the notebook subtracts it
# from its own time.time() reading to report the total run time
start = time.time()

___

### Define File Paths

In [3]:
# Resolve the project folders relative to the notebook's working directory.
# NOTE(review): assumes the kernel's working directory is the `notebooks/`
# folder, so its parent is the project root -- confirm when running elsewhere.
nb_path = Path(os.getcwd())
print(nb_path)

# project root, kept as a plain string because later cells build file
# locations by string concatenation (e.g. data_path + all_results_file)
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path = path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path: make the project modules importable. Guarded so that re-running
# this cell does not stack duplicate entries on sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


____

### Import Modules

In [4]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

___

### Parameters

In [5]:
# All locations below are relative to data_path and are joined to it by
# string concatenation, hence the leading '/'.

# design matrix csv file name (the file the Load Data cell reads the design from)
design_file = '/curated/shuffled_design.csv'

# target vector csv file name (the file the Load Data cell reads the target from)
target_file = '/curated/shuffled_target.csv'

# One timestamp for the whole run, so the sampled inputs and the result files
# written by this execution share the same suffix and can be matched up later.
# Same text format as before: str(datetime.now()) with the space removed.
run_timestamp = str(datetime.now()).replace(" ", "")

# sampled target dataframe csv loc
target_vector_shuffled_filename = f'/sampled/sampled_target{run_timestamp}.csv'

# sampled design dataframe csv loc
design_matrix_shuffled_filename = f'/sampled/sampled_design{run_timestamp}.csv'

# all results dataframe csv loc
all_results_file = f'/results/all_results_{run_timestamp}.csv'

# filtered results dataframe csv loc
filtered_results_file = f'/results/filtered_results_{run_timestamp}.csv'


____

### Load Data

In [6]:
# curated csv locations, relative to data_path. Each variable now names the
# file it actually holds (previously `design_file` pointed at the target csv
# and `target_file` at the design csv); the frames loaded are unchanged.
target_file = '/curated/shuffled_target.csv'
design_file = '/curated/shuffled_design.csv'

# target vector
target_vector_shuffled = pd.read_csv(data_path + target_file)

# design matrix
design_matrix_shuffled = pd.read_csv(data_path + design_file)


In [7]:
# keep every row, and only the columns from the first one through
# 'numerical__ShapeFactor1' (.loc label slices include the end label);
# any columns positioned after it are dropped
design_matrix_shuffled = design_matrix_shuffled.loc[:, :'numerical__ShapeFactor1']

____

### Sample Data

In [8]:
# Set to False to run on the full data set; when True, both frames are cut
# down to the same n rows and a copy of each sample is written to disk.
sample = True
if sample:
    # num samples
    n = 1000

    # random seed
    rand_seed = 42

    # Draw the sample once, on the target, then take the same rows of the
    # design matrix by index label. This keeps the two frames aligned row for
    # row without relying on two independent sample() calls happening to pick
    # the same positions.
    # NOTE(review): assumes both frames share the same index (row i of the
    # target belongs to row i of the design) -- confirm against the curation step.
    target_vector_shuffled = target_vector_shuffled.sample(n=n, random_state=rand_seed)
    design_matrix_shuffled = design_matrix_shuffled.loc[target_vector_shuffled.index]

    # save copy of sampled target to csv
    target_vector_shuffled.to_csv(data_path + target_vector_shuffled_filename, index=False)
    print("Sampled target dataframe saved at:", data_path + target_vector_shuffled_filename)

    # save copy of sampled design to csv
    design_matrix_shuffled.to_csv(data_path + design_matrix_shuffled_filename, index=False)
    print("Sampled design dataframe saved at:", data_path + design_matrix_shuffled_filename)


Sampled target dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/sampled/sampled_target2024-04-1716:37:36.600376.csv
Sampled design dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/sampled/sampled_design2024-04-1716:37:36.600396.csv


___

### Create capX

In [9]:
# drop ID col
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray
cap_x = design_matrix_shuffled_noID.to_numpy()
cap_x = design_matrix_shuffled
print(f'cap_x shape: {cap_x.shape}')
cap_x.shape

cap_x shape: (1000, 14)


(1000, 14)

___

### Select Hyperparameters for Gridsearch

In [10]:
# select hyperparams
params1=False
params2=False
params3=False
params4=True

if params1:

    min_dist_list = [0.0, 0.01, 0.1, 0.25, 0.5]
    n_neighbors_list = [10, 25, 50, 100, 200]
    metric_list =['euclidean', 'canberra', 'mahalanobis', 'correlation']
    n_components_list = range(2, 4)


if params2:

    min_dist_list = [0.5]
    n_neighbors_list = [100]
    metric_list = ['euclidean']
    n_components_list = [3]


if params3:

    min_dist_list = [0.0, 0.1, 0.5]
    n_neighbors_list = [10, 25, 100, 200]
    metric_list =['euclidean', 'canberra', 'chebyshev', 'correlation']
    n_components_list = range(2, 4)


if params4:

    min_dist_list = [0.0]
    n_neighbors_list = [10, 25, 50, 100]
    metric_list =['cosine', 'euclidean']
    n_components_list = range(2, 6)




___

### Gridsearch Algorithm

In [11]:
# Lay out every hyperparameter combination up front. The ordering matches a
# nested loop with n_components outermost and metric innermost.
search_grid = [
    (n_components, min_dist, n_neighbors, metric)
    for n_components in n_components_list
    for min_dist in min_dist_list
    for n_neighbors in n_neighbors_list
    for metric in metric_list
]
runs = len(search_grid)
run = 0
df_row_dict_list = []

for run, (n_components, min_dist, n_neighbors, metric) in enumerate(search_grid, start=1):
    # progress banner
    print('*'*100)
    print(f'Run {run} of {runs}')

    # UMAP reduction for this setting, then the clustering step on its output;
    # each combination contributes one row dict to the results table
    results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components)
    df_row_dict = cl.clustering(results_dict)
    df_row_dict_list.append(df_row_dict)

# one row per run
results_df = pd.DataFrame(df_row_dict_list)


****************************************************************************************************
Run 1 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.082702563243827
Silhouette Score:  0.8556588
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.9387943607939297
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 2 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.04896204926143304
Silhouette Score:  0.61919236
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  6
****************************************************************************************************
Run 3 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.07210219918826391
Silhouette Score:  0.9134478
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.999546438502425
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 4 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.05329845818125139
Silhouette Score:  0.62460595
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  69
Validity Index:  0.6820227211547496
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68]
Noise Ratio:  0.015
****************************************************************************************************
Run 5 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.04762904447611593
Silhouette Score:  0.93522424
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.9481019185059812
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 6 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.05389156335550949
Silhouette Score:  0.63013864
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  58
Validity Index:  0.6444453862878878
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57]
Noise Ratio:  0.016
****************************************************************************************************
Run 7 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  2
Hopkin's Statistic = 0.06475633000119725
Silhouette Score:  0.90176
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9735180959222254
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 8 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.06082587955359263
Silhouette Score:  0.62401545
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  36
Validity Index:  0.6251772166542938
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35]
Noise Ratio:  0.01
****************************************************************************************************
Run 9 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.04204574786072489
Silhouette Score:  0.8743765
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.9047418242686658
[0 1 2 3 4 5 6]
Noise Ratio:  0.0
****************************************************************************************************
Run 10 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.01839909193750253
Silhouette Score:  0.5784646
Test2 Pass: Kmeans successfully clustered.
Number of Clusters:  6
****************************************************************************************************
Run 11 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.028256154156505536
Silhouette Score:  0.9289428
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9987557995077854
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 12 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.0207989468500042
Silhouette Score:  0.6321295
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  81
Validity Index:  0.6969036537775475
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80]
Noise Ratio:  0.015
****************************************************************************************************
Run 13 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.03891001348028234
Silhouette Score:  0.9448147
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9727851997101844
[-1  0  1  2  3  4]
Noise Ratio:  0.002
****************************************************************************************************
Run 14 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.02429910380831576
Silhouette Score:  0.62100416
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  61
Validity Index:  0.675520571410346
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60]
Noise Ratio:  0.025
****************************************************************************************************
Run 15 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  3
Hopkin's Statistic = 0.03129599066055156
Silhouette Score:  0.93636954
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.985182806082262
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 16 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.02617326634360245
Silhouette Score:  0.6080383
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  0.9287036069603165
[0 1]
Noise Ratio:  0.0
****************************************************************************************************
Run 17 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.02935706123115888
Silhouette Score:  0.87881887
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.9025336570831011
[-1  0  1  2  3  4  5  6]
Noise Ratio:  0.001
****************************************************************************************************
Run 18 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.012578801973583468
Silhouette Score:  0.594573
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  91
Validity Index:  0.7535401774937186
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90]
Noise Ratio:  0.009
****************************************************************************************************
Run 19 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.026404029868453322
Silhouette Score:  0.91147083
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.9380652718050279
[0 1 2 3]
Noise Ratio:  0.0
****************************************************************************************************
Run 20 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.014922172953344245
Silhouette Score:  0.6247847
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  80
Validity Index:  0.6497930458344499
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79]
Noise Ratio:  0.028
****************************************************************************************************
Run 21 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.0395887790631245
Silhouette Score:  0.9014726
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.9296934048965504
[-1  0  1  2  3  4  5]
Noise Ratio:  0.007
****************************************************************************************************
Run 22 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.017122352947733212
Silhouette Score:  0.60324615
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  18
Validity Index:  0.6377602337850505
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]
Noise Ratio:  0.0
****************************************************************************************************
Run 23 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  4
Hopkin's Statistic = 0.02906959946755523
Silhouette Score:  0.91327465
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  3
Validity Index:  0.9931972440075587
[0 1 2]
Noise Ratio:  0.0
****************************************************************************************************
Run 24 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  4
Hopkin's Statistic = 0.016997038627907912
Silhouette Score:  0.61521477
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  49
Validity Index:  0.6195156285213614
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48]
Noise Ratio:  0.037
****************************************************************************************************
Run 25 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.029004224270937514
Silhouette Score:  0.87526715
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.927625603463215
[0 1 2 3 4 5 6]
Noise Ratio:  0.0
****************************************************************************************************
Run 26 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.009881067492758501
Silhouette Score:  0.62585044
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  86
Validity Index:  0.7228503441206184
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85]
Noise Ratio:  0.003
****************************************************************************************************
Run 27 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.025124563997532982
Silhouette Score:  0.9007248
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.9318507992924853
[0 1 2 3 4 5 6]
Noise Ratio:  0.0
****************************************************************************************************
Run 28 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  25
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.012746157623452855
Silhouette Score:  0.60875493
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  44
Validity Index:  0.6445904782653712
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43]
Noise Ratio:  0.001
****************************************************************************************************
Run 29 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.029828703956827556
Silhouette Score:  0.8916793
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9468210640848225
[-1  0  1  2  3  4]
Noise Ratio:  0.006
****************************************************************************************************
Run 30 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  50
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.013563388073868615
Silhouette Score:  0.6117172
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  48
Validity Index:  0.6787161772925193
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47]
Noise Ratio:  0.01
****************************************************************************************************
Run 31 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  cosine
n_components:  5
Hopkin's Statistic = 0.027438026045911713
Silhouette Score:  0.93842983
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.9658106336610682
[-1  0  1  2  3  4]
Noise Ratio:  0.008
****************************************************************************************************
Run 32 of 32


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  100
min_dist:  0.0
metric:  euclidean
n_components:  5
Hopkin's Statistic = 0.015994393890720705
Silhouette Score:  0.6013469
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  15
Validity Index:  0.6964188377575146
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Noise Ratio:  0.0


___

## All Results

In [12]:
# persist the complete grid-search table (every run, unfiltered)
results_df.to_csv(data_path + all_results_file, index=False)
print("All results dataframe saved at:", data_path + all_results_file)

# display max rows: lift the row/column display limits so the whole table is
# rendered below (the following cell resets both options to their defaults)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/all_results_2024-04-1716:37:36.600412.csv


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
0,dbscan,2,,,,,0.082703,10,0.0,cosine,2,0.89445,0.20982,5.0,cosine,0.938794,0.0,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
1,k_means,6,15.0,15.0,15.0,0.619192,0.048962,10,0.0,euclidean,2,0.999496,,,,,,"[0, 8, 1, 1, 10, 2, 4, 10, 13, 10, 5, 0, 3, 1,..."
2,dbscan,3,,,,,0.072102,25,0.0,cosine,2,0.902647,0.188687,5.0,cosine,0.999546,0.0,"[0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
3,dbscan,69,,,,,0.053298,25,0.0,euclidean,2,0.999059,0.191525,5.0,euclidean,0.682023,0.015,"[0, 1, 2, 3, 4, 41, 5, 6, 7, 6, 8, 9, 10, 3, 1..."
4,dbscan,2,,,,,0.047629,50,0.0,cosine,2,0.89742,0.171585,5.0,cosine,0.948102,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,dbscan,58,,,,,0.053892,50,0.0,euclidean,2,0.998653,0.178019,5.0,chebyshev,0.644445,0.016,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 7, 9, 10, 11, 3, 1..."
6,dbscan,3,,,,,0.064756,100,0.0,cosine,2,0.895656,0.193121,4.0,cosine,0.973518,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,dbscan,36,,,,,0.060826,100,0.0,euclidean,2,0.998682,0.23506,5.0,euclidean,0.625177,0.01,"[0, 1, 2, 3, 2, 4, 5, 6, 7, 6, 8, 9, 10, 3, 11..."
8,dbscan,7,,,,,0.042046,10,0.0,cosine,3,0.89952,0.301447,5.0,chebyshev,0.904742,0.0,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 3, 0, 1, 1, 1, ..."
9,k_means,6,15.0,15.0,15.0,0.578465,0.018399,10,0.0,euclidean,3,0.999485,,,,,,"[0, 13, 1, 1, 1, 3, 4, 14, 9, 14, 2, 0, 5, 1, ..."


In [13]:
# put the pandas row/column display limits back to their defaults
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(display_option)

____

### Get n Best Results (kmeans or DBSCAN)

In [17]:
# number of top-ranked runs to keep (own name, so the sample size `n` set in
# the sampling cell is not overwritten)
n_best = 20

# One ranking score per run: the silhouette score where it exists, otherwise
# the validity index for that row.
# NOTE(review): silhouette score and validity index are different measures;
# ranking them on one scale is a modelling choice -- confirm it is intended.
combined_score = results_df['silhouette_score'].fillna(results_df['validity_index'])

# index labels of the n_best highest-scoring runs, best first
best_run_index = combined_score.sort_values(ascending=False).head(n_best).index

# Select those rows without touching results_df: no helper column is added
# and the table is not re-sorted, so re-running earlier cells sees it unchanged.
filtered_results_df = results_df.loc[best_run_index]

#filtered_results_df[filtered_results_df['n_clusters_found'] == 7]
filtered_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
26,dbscan,7,,,,,0.025125,25,0.0,cosine,5,0.905225,0.354272,5.0,chebyshev,0.931851,0.0,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 3, 0, 1, 1, 1, ..."
24,dbscan,7,,,,,0.029004,10,0.0,cosine,5,0.898245,0.26618,4.0,chebyshev,0.927626,0.0,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 3, 0, 1, 1, 1, ..."
8,dbscan,7,,,,,0.042046,10,0.0,cosine,3,0.89952,0.301447,5.0,chebyshev,0.904742,0.0,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 3, 0, 1, 1, 1, ..."
16,dbscan,7,,,,,0.029357,10,0.0,cosine,4,0.897494,0.388948,5.0,euclidean,0.902534,0.001,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 3, 0, 1, 1, 1, ..."


____

### Save Filtered Results to CSV

In [15]:
# full location of this run's filtered-results csv
filtered_results_path = data_path + filtered_results_file

# write the filtered table without the index column and report where it went
filtered_results_df.to_csv(filtered_results_path, index=False)
print("Filtered results dataframe saved at:", filtered_results_path)

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/filtered_results_2024-04-1716:37:36.600428.csv


___

## Runtime

In [16]:
# elapsed wall-clock time since the timer cell at the top of the notebook
finish = time.time()
elapsed = finish - start

# split into whole hours, whole minutes and whole seconds
whole_hours, leftover = divmod(elapsed, 3600)
whole_minutes, whole_seconds = divmod(leftover, 60)
hours, minutes, seconds = int(whole_hours), int(whole_minutes), int(whole_seconds)

print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 00:03.01
