In [4]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys



In [5]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

ModuleNotFoundError: No module named 'dimensionality_reduction'

___

### Define File Paths

In [None]:
# define path
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path= path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path
sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


___

### Parameters

In [None]:
# transformed data csv file name
data_file = "/curated/trans_data_design.csv"

# target vector csv file name
target_file = "/curated/beans_target.csv"

___

### Load Data

In [None]:
# read in design matrix dataframe
design_matrix  = pd.read_csv( data_path + data_file )

# read in encoded target vector dataframe
target_vector = pd.read_csv( data_path + target_file, index_col=0)

___

### Check Dimensions

In [None]:
assert len(design_matrix) == len(target_vector)

print("Design Matrix Shape:", design_matrix.shape)
print("Target Vector Shape:", target_vector.shape)

Design Matrix Shape: (13611, 16)
Target Vector Shape: (13611, 2)


In [None]:
# relable target vector, we can go back an change this later
target_vector = target_vector.rename(columns={'id': 'ID', 'target_encoded': 'Target'})
target_vector

Unnamed: 0,ID,Target
0,0,5
1,1,5
2,2,5
3,3,5
4,4,5
...,...,...
13606,13606,3
13607,13607,3
13608,13608,3
13609,13609,3


___

### Shuffle Data

In [None]:
# Add ID columns
cols = list(design_matrix.columns)
design_matrix['ID'] = np.arange(0, len(design_matrix))
col_order = ['ID'] + cols
design_matrix = design_matrix[col_order]
design_matrix_shuffled = shuffle(design_matrix, random_state=42, n_samples=None)
design_matrix_shuffled

Unnamed: 0,ID,numerical__Area,numerical__Perimeter,numerical__MajorAxisLength,numerical__MinorAxisLength,numerical__AspectRation,numerical__Eccentricity,numerical__ConvexArea,numerical__EquivDiameter,numerical__Extent,numerical__Solidity,numerical__roundness,numerical__Compactness,numerical__ShapeFactor1,numerical__ShapeFactor2,numerical__ShapeFactor3,numerical__ShapeFactor4
1488,1488,-0.365218,-0.532278,-0.699519,0.111983,-1.330113,-1.592944,-0.372549,-0.352918,0.437396,1.064426,1.597939,1.498162,-0.370573,1.153692,1.541930,0.989265
2611,2611,0.518320,1.089549,0.586132,0.791512,-0.106143,0.169698,0.550947,0.704943,0.365695,-3.127637,-2.516723,-0.064090,-1.007778,-0.625351,-0.102226,-1.995235
749,749,-0.518101,-0.683994,-0.836862,-0.175910,-1.236665,-1.391983,-0.521807,-0.566452,-0.103759,0.648568,1.240429,1.358954,-0.000975,1.263821,1.387803,0.726306
99,99,-0.678828,-0.965544,-1.139157,-0.278714,-1.663886,-2.482813,-0.684870,-0.805076,0.412487,1.254524,1.972582,1.997703,0.133443,2.168059,2.107283,1.008149
11298,11298,-0.788571,-0.969347,-0.961919,-0.917663,-0.432975,-0.165137,-0.787511,-0.977908,0.724802,-0.043581,0.394278,0.344776,1.223497,0.858969,0.309933,0.104866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,5191,1.030512,1.224997,1.497611,0.788917,1.228794,1.054634,1.016389,1.225843,0.967838,0.807939,-0.601791,-1.196511,-1.043987,-1.330581,-1.176625,-0.170536
13418,13418,-0.449861,-0.466169,-0.426161,-0.507908,-0.010367,0.256019,-0.451343,-0.469661,-1.173051,0.242246,0.075035,-0.090740,0.489755,0.052009,-0.128644,0.485566
5390,5390,1.260297,1.406165,1.595202,1.113733,0.920507,0.898803,1.243435,1.444142,0.696877,0.901208,-0.467030,-0.953662,-1.319088,-1.295505,-0.954530,-0.235434
860,860,-0.498662,-0.671412,-0.857094,-0.082613,-1.381553,-1.711199,-0.502764,-0.538625,0.048110,0.696000,1.357839,1.567757,-0.127278,1.415513,1.619543,0.861651


In [None]:
target_vector_shuffled = shuffle(target_vector, random_state=42, n_samples=None)
target_vector_shuffled

Unnamed: 0,ID,Target
1488,1488,5
2611,2611,0
749,749,5
99,99,5
11298,11298,3
...,...,...
5191,5191,2
13418,13418,3
5390,5390,2
860,860,5


___

### Save Shuffled Data

In [None]:
filename = '/curated/shuffled_target.csv'
target_vector_shuffled.to_csv(data_path + filename, index=False)
print('Target_vector saved to: ', filename)

filename = '/curated/shuffled_design.csv'
design_matrix_shuffled.to_csv(data_path + filename, index=False)
print('Design_matrix saved to: ', filename)

Target_vector saved to:  /curated/shuffled_target.csv
Design_matrix saved to:  /curated/shuffled_design.csv


___

### Create capX

In [None]:
# drop ID col
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray
cap_x = design_matrix_shuffled_noID.to_numpy()
print(f'cap_x shape: {cap_x.shape}')
cap_x

cap_x shape: (13611, 16)


array([[-0.36521767, -0.5322778 , -0.69951932, ...,  1.15369168,  1.54192977,  0.98926454],
       [ 0.51832029,  1.08954897,  0.58613222, ..., -0.6253513 , -0.10222622, -1.99523524],
       [-0.51810098, -0.68399389, -0.83686158, ...,  1.2638209 ,  1.38780271,  0.72630639],
       ...,
       [ 1.26029711,  1.40616472,  1.595202  , ..., -1.29550493, -0.95452972, -0.2354344 ],
       [-0.49866232, -0.67141233, -0.85709382, ...,  1.41551329,  1.61954278,  0.86165138],
       [ 0.36079899,  0.6631759 ,  0.94108189, ..., -1.22107966, -1.40651082, -0.99763592]])

___

### Select Hyperparameters for Gridsearch

In [None]:
# select hyperparams
params1=False
params2=True
params3=False
params4=False

if params1:

    min_dist_list = [0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5]
    n_neighbors_list = [10, 20, 30, 50, 100]
    metric_list =['euclidean', 'manhattan']
    n_components_list = [2, 3]


if params2:

    min_dist_list = [0.01, 0.05, 0.1]
    n_neighbors_list = [5, 10, 15]
    metric_list =['euclidean', 'manhattan']
    n_components_list = [2, 3]


if params3:

    min_dist_list = np.linspace(0, 0.5, num = 40)
    n_neighbors_list = np.arange(5, 35, 5)
    metric_list =['euclidean', 'manhattan']
    n_components_list = [2, 3]


if params4:

    min_dist_list = [0.01]
    n_neighbors_list = [10]
    metric_list =['euclidean']
    n_components_list = [2, 3]




___

### Gridsearch Algorithm

In [None]:
runs = len(n_components_list)*len(min_dist_list)*len(n_neighbors_list)*len(metric_list)
run = 0
df_row_dict_list = []
for n_components in n_components_list:
  for min_dist in min_dist_list:
    for n_neighbors in n_neighbors_list: 
      for metric in metric_list:
        run +=1
        print('*'*100)
        print(f'Run {run} of {runs}')
        results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components) 
        df_row_dict = cl.clustering(results_dict)
        df_row_dict_list.append(df_row_dict)
results_df = pd.DataFrame(df_row_dict_list)
results_df

****************************************************************************************************
Run 1 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.23346598184966277
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  38
Validity Index:  -0.4569558983111058
****************************************************************************************************
Run 2 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2538683025078454
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  8
Validity Index:  -0.06666541315246687
****************************************************************************************************
Run 3 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.24392432449802504
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.10497118118632143
****************************************************************************************************
Run 4 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.26069330622120335
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  -0.023137961911798042
****************************************************************************************************
Run 5 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.24433513993535447
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.7058666483794744
****************************************************************************************************
Run 6 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.24492362755855512
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  -0.009214727020604989
****************************************************************************************************
Run 7 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.2684507968260741
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  88
Validity Index:  -0.41399928653121476
****************************************************************************************************
Run 8 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2848981449691166
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  83
Validity Index:  -0.4060622025522858
****************************************************************************************************
Run 9 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.27176018970494825
Fail: Kmeans did not successfully cluster.


  result /= distance_matrix.shape[0] - 1


zero-size array to reduction operation maximum which has no identity


  result /= distance_matrix.shape[0] - 1


zero-size array to reduction operation maximum which has no identity
DBSCAN
Number of Clusters:  5
Validity Index:  0.19635220672856085
****************************************************************************************************
Run 10 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2759538285574897
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.0010043773200816772
****************************************************************************************************
Run 11 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.25853376120981575
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.6046863503693324
****************************************************************************************************
Run 12 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2892319421762929
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.053198346969286284
****************************************************************************************************
Run 13 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.3012332731244453
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  42
Validity Index:  -0.6136598981902669
****************************************************************************************************
Run 14 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.3189012398976975
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  2
Validity Index:  -0.1586259833492345
****************************************************************************************************
Run 15 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.2941687045033104
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  -0.04283449421656064
****************************************************************************************************
Run 16 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.2959163937405051
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  -0.4122764794625967
****************************************************************************************************
Run 17 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.28783355100283
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.0556343087085267
****************************************************************************************************
Run 18 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  manhattan
n_components:  2
Hopkin's Statistic = 0.3125586182723469
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  -0.038665873113782126
****************************************************************************************************
Run 19 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.11014305397762583
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.26107024695750536
****************************************************************************************************
Run 20 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.09847335337275658
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  9
Validity Index:  0.030164895031365824
****************************************************************************************************
Run 21 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.13125200449177699
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  5
Validity Index:  0.44141595719488846
****************************************************************************************************
Run 22 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.12086161612462153
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  8
Validity Index:  0.10404890096604422
****************************************************************************************************
Run 23 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.13991653232074233
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.7564793958659264
****************************************************************************************************
Run 24 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.01
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.12551418433741093
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.5927635294721825
****************************************************************************************************
Run 25 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.13521566914525796
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.27501153352448504
****************************************************************************************************
Run 26 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.11899684689895183
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  8
Validity Index:  0.06963427480098575
****************************************************************************************************
Run 27 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1560806072987133
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  4
Validity Index:  0.645430074523605
****************************************************************************************************
Run 28 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.13851499396064673
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  8
Validity Index:  0.035136937942933516
****************************************************************************************************
Run 29 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1608773689187138
Fail: Kmeans did not successfully cluster.


  result /= distance_matrix.shape[0] - 1


zero-size array to reduction operation maximum which has no identity
DBSCAN
Number of Clusters:  4
Validity Index:  0.7267281534955983
****************************************************************************************************
Run 30 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.05
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.1435825761655511
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  6
Validity Index:  0.4961486442520901
****************************************************************************************************
Run 31 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.15950072005764043
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  -0.33916255887760005
****************************************************************************************************
Run 32 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  5
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.14778130591513228
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  6
Validity Index:  0.03125113179273738
****************************************************************************************************
Run 33 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1808378068303732
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  5
Validity Index:  0.26437637669456604
****************************************************************************************************
Run 34 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.1630096214745071
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  4
Validity Index:  0.3359237310164338
****************************************************************************************************
Run 35 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.1846103196741861
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  4
Validity Index:  0.530747835592478
****************************************************************************************************
Run 36 of 36


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  manhattan
n_components:  3
Hopkin's Statistic = 0.16613500687071706
Fail: Kmeans did not successfully cluster.
zero-size array to reduction operation minimum which has no identity
DBSCAN
Number of Clusters:  5
Validity Index:  0.20820463011903773


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,38,,,,,0.233466,5,0.01,euclidean,2,0.989639,0.194048,6,-0.456956,"[0, 1, 0, 0, 2, 0, 3, 0, 4, 5, 5, 2, 2, 6, 5, ..."
1,dbscan,8,,,,,0.253868,5,0.01,manhattan,2,0.987549,0.327054,6,-0.066665,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
2,dbscan,7,,,,,0.243924,10,0.01,euclidean,2,0.984171,0.231272,5,0.104971,"[0, 1, 0, 0, 2, 0, 3, 0, 4, 2, 2, 2, 2, 5, 2, ..."
3,dbscan,6,,,,,0.260693,10,0.01,manhattan,2,0.981372,0.312284,6,-0.023138,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
4,dbscan,5,,,,,0.244335,15,0.01,euclidean,2,0.981127,0.204592,6,0.705867,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
5,dbscan,5,,,,,0.244924,15,0.01,manhattan,2,0.978211,0.233072,6,-0.009215,"[0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 3, 0, ..."
6,dbscan,88,,,,,0.268451,5,0.05,euclidean,2,0.989238,0.157228,6,-0.413999,"[0, 1, 0, 2, 3, 0, 4, 0, 5, 6, 7, 8, 9, 10, 6,..."
7,dbscan,83,,,,,0.284898,5,0.05,manhattan,2,0.987076,0.184377,6,-0.406062,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 4, 2, 5, 5, -1, 4,..."
8,dbscan,5,,,,,0.27176,10,0.05,euclidean,2,0.979518,0.254145,6,0.196352,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
9,dbscan,5,,,,,0.275954,10,0.05,manhattan,2,0.980805,0.288727,6,0.001004,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."


___

## All Results

In [None]:
results_df.to_csv(f'../data/all_results_{str(datetime.now()).replace(" ", "")}.csv')

# display max rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,38,,,,,0.233466,5,0.01,euclidean,2,0.989639,0.194048,6,-0.456956,"[0, 1, 0, 0, 2, 0, 3, 0, 4, 5, 5, 2, 2, 6, 5, ..."
1,dbscan,8,,,,,0.253868,5,0.01,manhattan,2,0.987549,0.327054,6,-0.066665,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
2,dbscan,7,,,,,0.243924,10,0.01,euclidean,2,0.984171,0.231272,5,0.104971,"[0, 1, 0, 0, 2, 0, 3, 0, 4, 2, 2, 2, 2, 5, 2, ..."
3,dbscan,6,,,,,0.260693,10,0.01,manhattan,2,0.981372,0.312284,6,-0.023138,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
4,dbscan,5,,,,,0.244335,15,0.01,euclidean,2,0.981127,0.204592,6,0.705867,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
5,dbscan,5,,,,,0.244924,15,0.01,manhattan,2,0.978211,0.233072,6,-0.009215,"[0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 3, 0, ..."
6,dbscan,88,,,,,0.268451,5,0.05,euclidean,2,0.989238,0.157228,6,-0.413999,"[0, 1, 0, 2, 3, 0, 4, 0, 5, 6, 7, 8, 9, 10, 6,..."
7,dbscan,83,,,,,0.284898,5,0.05,manhattan,2,0.987076,0.184377,6,-0.406062,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 4, 2, 5, 5, -1, 4,..."
8,dbscan,5,,,,,0.27176,10,0.05,euclidean,2,0.979518,0.254145,6,0.196352,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
9,dbscan,5,,,,,0.275954,10,0.05,manhattan,2,0.980805,0.288727,6,0.001004,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."


In [None]:
# set max rows back to default
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

___

### Get Best Result for Each Value of n_components

In [None]:
index_list= []
for n_components in set(results_df['umap_n_components'].values):
    
    #filter df by value of n_components
    filtered_results_df = results_df[results_df['umap_n_components'] == n_components]
    # find index for row with the best internal index
    best_idx = filtered_results_df[['validity_index', 'silhouette_score']].max(axis=1).idxmax()
    # append idx to index list
    index_list.append(best_idx)

# convert dict_list to dataframe
best_results_df = results_df[results_df.index.isin(index_list)]
best_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
4,dbscan,5,,,,,0.244335,15,0.01,euclidean,2,0.981127,0.204592,6,0.705867,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 4, 2, ..."
22,dbscan,4,,,,,0.139917,15,0.01,euclidean,3,0.991852,0.439143,6,0.756479,"[0, 1, 0, 0, 2, 0, 1, 0, 3, 2, 2, 2, 2, 1, 2, ..."


___

### External Indicies

In [None]:
# add col for adj ran dscore
#best_results_df.loc[:, 'adjusted_rand_score'] = np.nan
dataframe_dict = {}
df_row_dict_list = []
for idx, row in best_results_df.iterrows():
    print('\n') 
    print('*'*100)
    print('Clustering Results')
    print('*'*100)
    print(row)
    

    # get algo
    algo = row['algo']

    # get n_components 
    n_components = row['umap_n_components']

    # create dataframe with columns for every value of n_components
    cluster_labels_df = pd.DataFrame()

    # add cluster labels to dataframe from results_df
    cluster_labels_df.loc[:, str(n_components)] = row['cluster_labels']

    # concatonat target vector dataframe with clusterlabels_df
    labels_df = pd.concat([target_vector_shuffled, cluster_labels_df], axis=1)
    
    # drop indicies with noise points
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices)
    labels_df = labels_df.astype('int64')

    dataframe_dict[f'df_{str(n_components)}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, str(n_components)]
   
    
    # get adj rand score and add to dataframe
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    best_results_df.loc[idx, 'adjusted_rand_score'] = adj_rand

    # get contigency matrices for all permutatons of cluster labels
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)


    # get optimized contingency matrix
    modes_df = get_modes(n_components, labels_df)
    print(modes_df.info())
    cluster_mapping = get_mapping(n_components, labels_df, modes_df)
    
    labels_df[str(n_components)] = labels_df.loc[:, str(n_components)].map(cluster_mapping)
    
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[str(n_components)])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # retrieve contigency matrix with highest trace
    np.set_printoptions(linewidth=200)
    print('\n') 
    print('*'*100)
    print('CONTINGENCY MATRIX')
    print('*'*100)
    print('algo: ', algo)
    print('n_components: ', n_components)
    print('Adjusted Rand Score: ', adj_rand)
    print('True Labels: ', true_labels.values)
    print('Clustering Predicted Labels: ', cluster_labels.values)
    print('Matrix Trace: ', matrix_trace)
    print('Contingency Matrix: ')
    print(cont_matrix)
    print('\n')
    print('REMAPPED CONTINGENCY MATRIX')
    print('Remapped Matrix Trace: ', remapped_matrix_trace)
    print('Contingency Matrix: ')
    print(remapped_cont_matrix)
    print('\n')
    print('Mapping: ', dict(sorted(cluster_mapping.items())))

    df_row_dict_list.append({
                        'algo: ': algo,
                        'n_components: ': n_components,
                        'adj_rand_score: ': adj_rand,
                        'true_labels': true_labels,
                        'cluster_labels: ': cluster_labels,
                        'matrix_trace': matrix_trace,
                        'contingency_matrix': cont_matrix,
                        'remapped_cont_matrix': remapped_cont_matrix,
                        'mapping': cluster_mapping
                        })
    
matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df



****************************************************************************************************
Clustering Results
****************************************************************************************************
algo                                                                             dbscan
n_clusters_found                                                                      5
n_clusters_db_score_is_min                                                          NaN
n_clusters_ch_score_is_max                                                          NaN
n_clusters_silhouette_score_is_max                                                  NaN
silhouette_score                                                                    NaN
hopkins_statistic                                                              0.244335
umap_n_neighbors                                                                     15
umap_min_dist                                                            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_results_df.loc[idx, 'adjusted_rand_score'] = adj_rand


Unnamed: 0,algo:,n_components:,adj_rand_score:,true_labels,cluster_labels:,matrix_trace,contingency_matrix,remapped_cont_matrix,mapping
0,dbscan,2,-0.000304,1488 5 2611 0 749 5 99 5 11...,1488 1 2611 2 749 2 99 2 11...,1463,"[[190, 309, 599, 58, 165], [75, 106, 252, 15, ...","[[599, 309, 190, 165, 58], [252, 106, 75, 73, ...","{2.0: 3.0, 1.0: 8, 0.0: 9, 4.0: 10, 3.0: 11}"
1,dbscan,3,-0.000332,1488 5 2611 0 749 5 99 5 11...,1488 1 2611 2 749 2 99 2 11...,1300,"[[189, 475, 600, 58], [74, 179, 254, 15], [222...","[[600, 475, 189, 58], [254, 179, 74, 15], [791...","{2.0: 3.0, 1.0: 8, 0.0: 9, 3.0: 10}"


____

### Get Combined labels_df with Mapping Applied

In [None]:
dfs = list(dataframe_dict.values())

# Merge DataFrames in the list
labels_df = dfs[0]
for df in dfs[1:]:
    labels_df = pd.merge(labels_df, df, on=['ID', 'Target'])
labels_df

Unnamed: 0,ID,Target,2,3
0,1488,5,8.0,8.0
1,2611,0,3.0,3.0
2,749,5,3.0,3.0
3,99,5,3.0,3.0
4,11298,3,3.0,3.0
...,...,...,...,...
13592,5191,2,3.0,3.0
13593,13418,3,3.0,3.0
13594,5390,2,3.0,3.0
13595,860,5,3.0,3.0
