In [2]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys



___

### Start Timer

In [3]:
# track runtime: wall-clock reference point for the whole notebook run; the
# Runtime cell at the bottom subtracts it from a second time.time() reading
start = time.time()

___

### Define File Paths

In [4]:
# define path
# Everything is resolved relative to the kernel's working directory, which is
# assumed to be the repository's notebooks/ folder (TODO confirm when running
# from another location).
nb_path = Path.cwd()
print(nb_path)

# project root = parent of the working directory; kept as a plain str because
# later cells build file names with '+' concatenation
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path = path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path: make the local modules in src/ importable. Guarded so re-running
# this cell does not keep appending duplicate entries to sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


____

### Import Modules

In [5]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

___

### Parameters

In [6]:
# design matrix csv file name (appended to data_path); this is the file the
# Load Data cell reads into design_matrix_shuffled
design_file = '/curated/shuffled_design.csv'

# target vector csv file name (appended to data_path); this is the file the
# Load Data cell reads into target_vector_shuffled
target_file = '/curated/shuffled_target.csv'

____

### Load Data

In [7]:
# target vector: read from the curated, shuffled target csv
target_file = '/curated/shuffled_target.csv'
target_vector_shuffled = pd.read_csv(data_path + target_file)

# design matrix: read from the curated, shuffled design csv
design_file = '/curated/shuffled_design.csv'
design_matrix_shuffled = pd.read_csv(data_path + design_file)


____

### Sample Data

In [8]:
# Optionally shrink both frames to a fixed-size random subset and persist the
# subset next to the curated data. Set sample = False to use every row.
sample = True
if sample:
    # num samples
    n = 1000

    # random seed
    rand_seed = 42

    # The two frames are sampled independently with the same n and seed, so
    # the same row positions are drawn from each. That only keeps each target
    # row paired with its design row when both frames have the same length.
    if len(target_vector_shuffled) != len(design_matrix_shuffled):
        raise ValueError(
            'target and design frames differ in length '
            f'({len(target_vector_shuffled)} vs {len(design_matrix_shuffled)}); '
            'same-seed sampling would misalign their rows'
        )

    # sample target and save copy to csv
    target_vector_shuffled = target_vector_shuffled.sample(n=n, random_state=rand_seed)
    filename = '/curated/sampled_target.csv'
    target_vector_shuffled.to_csv(data_path + filename, index=False)

    # sample design matrix and save copy to csv
    design_matrix_shuffled = design_matrix_shuffled.sample(n=n, random_state=rand_seed)
    filename = '/curated/sampled_design.csv'
    design_matrix_shuffled.to_csv(data_path + filename, index=False)

___

### Create capX

In [9]:
# the ID column is an identifier, not a feature: leave it out of the matrix
design_matrix_shuffled_noID = design_matrix_shuffled.drop(columns='ID')

# ndarray form of the remaining columns, handed to the grid search below
cap_x = design_matrix_shuffled_noID.to_numpy()
print(f'cap_x shape: {cap_x.shape}')
cap_x

cap_x shape: (1000, 16)


array([[ 0.47951119,  0.75617026,  0.9025797 , ..., -1.08640545,
        -1.03819934, -2.53011841],
       [-0.49798027, -0.60106105, -0.58065225, ...,  0.38428313,
         0.26243519,  0.25945463],
       [-0.0963504 , -0.06761403, -0.09561878, ..., -0.10319885,
         0.13672302, -0.70899572],
       ...,
       [-0.30471913, -0.47210471, -0.71409026, ...,  1.3837794 ,
         1.95995551,  0.86201627],
       [ 3.61104419,  3.02864018,  2.64390167, ..., -1.24739667,
         0.33815956, -0.56965621],
       [ 0.00623113,  0.11297058,  0.20609565, ..., -0.56222324,
        -0.50146285, -0.33099767]])

___

### Select Hyperparameters for Gridsearch

In [10]:
# select hyperparams: set exactly ONE of these flags to True
params1 = False
params2 = False
params3 = False
params4 = True

# candidate UMAP hyperparameter grids, keyed by the flag that selects them
param_grids = {
    'params1': {
        'min_dist_list': [0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5],
        'n_neighbors_list': [10, 20, 30, 50, 100],
        'metric_list': ['euclidean', 'manhattan'],
        'n_components_list': [2, 3],
    },
    'params2': {
        'min_dist_list': [0.01, 0.05, 0.1],
        'n_neighbors_list': [5, 10, 15],
        'metric_list': ['euclidean', 'manhattan'],
        'n_components_list': [2, 3],
    },
    'params3': {
        'min_dist_list': np.linspace(0, 0.5, num=40),
        'n_neighbors_list': np.arange(5, 35, 5),
        'metric_list': ['euclidean', 'manhattan'],
        'n_components_list': [2, 3],
    },
    'params4': {
        'min_dist_list': [0.01],
        'n_neighbors_list': [10],
        'metric_list': ['euclidean'],
        'n_components_list': [2, 3],
    },
}

# names of the flags that are currently switched on
selected_grids = [
    name
    for name, is_on in (
        ('params1', params1),
        ('params2', params2),
        ('params3', params3),
        ('params4', params4),
    )
    if is_on
]

# Fail fast instead of silently letting the last True flag win, or leaving
# the *_list names undefined (or stale from an earlier run) when none is set.
if len(selected_grids) != 1:
    raise ValueError(
        f'Exactly one of params1..params4 must be True; got {selected_grids}'
    )

# expose the chosen grid under the names the grid search cell reads
selected_grid = param_grids[selected_grids[0]]
min_dist_list = selected_grid['min_dist_list']
n_neighbors_list = selected_grid['n_neighbors_list']
metric_list = selected_grid['metric_list']
n_components_list = selected_grid['n_components_list']




___

### Gridsearch Algorithm

In [11]:
# Every hyperparameter combination, enumerated in the nesting order
# n_components > min_dist > n_neighbors > metric.
param_combinations = [
    (n_comp, dist, neighbors, dist_metric)
    for n_comp in n_components_list
    for dist in min_dist_list
    for neighbors in n_neighbors_list
    for dist_metric in metric_list
]

runs = len(param_combinations)
run = 0
df_row_dict_list = []
for n_components, min_dist, n_neighbors, metric in param_combinations:
    run += 1
    print('*'*100)
    print(f'Run {run} of {runs}')
    # reduce with UMAP, then cluster the embedding; each run yields one row
    results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components)
    df_row_dict = cl.clustering(results_dict)
    df_row_dict_list.append(df_row_dict)

# one DataFrame row per run
results_df = pd.DataFrame(df_row_dict_list)
results_df

****************************************************************************************************
Run 1 of 2


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  2
Hopkin's Statistic = 0.1388590656107254
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  7
Validity Index:  0.45650660276274735
****************************************************************************************************
Run 2 of 2


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  10
min_dist:  0.01
metric:  euclidean
n_components:  3
Hopkin's Statistic = 0.0767646757481604
Fail: Kmeans did not successfully cluster.
DBSCAN
Number of Clusters:  10
Validity Index:  0.5080694489640659


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,7,,,,,0.138859,10,0.01,euclidean,2,0.974305,0.473004,6,0.456507,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
1,dbscan,10,,,,,0.076765,10,0.01,euclidean,3,0.976942,0.436696,6,0.508069,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 4, 0, 5, 2, 1, ..."


___

## All Results

In [12]:
# Persist every grid-search row. The timestamp is formatted without spaces or
# colons so the file name is valid on every filesystem, and the file goes
# under data_path like the other outputs instead of a cwd-relative '../data'.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
results_df.to_csv(f'{data_path}/all_results_{timestamp}.csv')

# display max rows
# (both limits are lifted so the full table renders; the next cell resets them)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,7,,,,,0.138859,10,0.01,euclidean,2,0.974305,0.473004,6,0.456507,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."
1,dbscan,10,,,,,0.076765,10,0.01,euclidean,3,0.976942,0.436696,6,0.508069,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 4, 0, 5, 2, 1, ..."


In [13]:
# put the pandas row/column display limits back to their defaults
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(option_name)

____

### Get 10 Best Results (KMeans or DBSCAN)

In [14]:
# Rank every run on one score: silhouette_score where it is present,
# otherwise validity_index (the DBSCAN rows shown above only carry the latter).
# NOTE(review): this ranks two different metrics on one axis - confirm they
# are comparable enough for a joint top-10.
results_df['combined_score'] = results_df['silhouette_score'].fillna(results_df['validity_index'])
results_df = results_df.sort_values(by='combined_score', ascending=False)

# Keep the 10 best rows and drop the helper column. Taking head(10) directly
# avoids binding a variable named `filter`, which would shadow the builtin for
# the rest of the kernel session.
filtered_results_df = results_df.head(10).drop(columns=['combined_score'])

filtered_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
1,dbscan,10,,,,,0.076765,10,0.01,euclidean,3,0.976942,0.436696,6,0.508069,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 4, 0, 5, 2, 1, ..."
0,dbscan,7,,,,,0.138859,10,0.01,euclidean,2,0.974305,0.473004,6,0.456507,"[0, 1, 2, 2, 1, 3, 1, 1, 0, 1, 0, 0, 1, 2, 1, ..."


____

### Save Filtered Results to CSV

In [15]:
# Timestamp formatted without spaces or colons: the previous
# str(datetime.now()) form ran date and time together and put ':' in the
# file name, which some filesystems reject.
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
filename = f'/curated/filtered_results_{timestamp}.csv'
filtered_results_df.to_csv(data_path + filename, index=False)
print("Results saved at:", data_path + filename)

Results saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/curated/filtered_results_2024-04-1618:24:31.967695.csv


___

## Runtime

In [16]:
# elapsed wall-clock time since the Start Timer cell, split into h / m / s
finish = time.time()
elapsed = finish - start

whole_hours, past_the_hour = divmod(elapsed, 3600)
hours = int(whole_hours)
minutes = int(past_the_hour // 60)
seconds = int(elapsed % 60)

print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 00:00.10
