## Final Project: Phase 3 - Clustering Pipeline Optimization
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/21/2024  
___

In [1]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys

import warnings

# Disable runtime warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)



___

### Start Timer

In [2]:
# track runtime
start = time.time()

___

### Define File Paths

In [3]:
# define path
NB_PATH = Path(os.getcwd())
print(NB_PATH)
PATH = str(NB_PATH.parent)
print(PATH)

# path to figs folder
FIGS_PATH = PATH + '/figs'

# path to data
DATA_PATH= PATH + '/data'

# path to src folder
SRC_PATH = PATH + '/src'
print(SRC_PATH)

# sys path
sys.path.append(SRC_PATH)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


____

### Import Modules

In [4]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

___

### Parameters

In [5]:
# transformed data csv file name
DESIGN_FILE = '/curated/shuffled_design.csv'

# target vector csv file name
TARGET_FILE = "/curated/shuffled_target.csv"

# sampled target dataframe csv loc
TARGET_VECTOR_SHUFFLED_FILENAME = f'/sampled/sampled_target{str(datetime.now()).replace(" ", "")}.csv'

# sampled design dataframe csv loc
DESIGN_MATRIX_SHUFFLED_FILENAME = f'/sampled/sampled_design{str(datetime.now()).replace(" ", "")}.csv'

# all results dataframe csv loc
ALL_RESULTS_FILENAME = f'/results/all_results_{str(datetime.now()).replace(" ", "")}.csv'

# filtered results dataframe csv loc
FILTERED_RESULTS_FILENAME = f'/results/filtered_results_{str(datetime.now()).replace(" ", "")}.csv'


____

### Load Data

In [6]:

target_vector_shuffled = pd.read_csv(DATA_PATH + TARGET_FILE)

design_matrix_shuffled = pd.read_csv(DATA_PATH + DESIGN_FILE)


In [7]:
#design_matrix_shuffled = design_matrix_shuffled.loc[:, :'numerical__ShapeFactor1']

____

### Sample Data

In [8]:
sample = False
if sample:
    # num samples
    n = 1000

    # random seed
    rand_seed = 42

    # sample target and save copy to csv
    target_vector_shuffled = target_vector_shuffled.sample(n=n, random_state=rand_seed)
    target_vector_shuffled.to_csv(DATA_PATH + TARGET_VECTOR_SHUFFLED_FILENAME, index=False)
    print("Sampled target dataframe saved at:", DATA_PATH + TARGET_VECTOR_SHUFFLED_FILENAME)


     # sample target and save copy to csv
    design_matrix_shuffled = design_matrix_shuffled.sample(n=n, random_state=rand_seed)
    design_matrix_shuffled.to_csv(DATA_PATH + DESIGN_MATRIX_SHUFFLED_FILENAME, index=False)
    print("Sampled design dataframe saved at:", DATA_PATH + DESIGN_MATRIX_SHUFFLED_FILENAME)


___

### Create capX

In [9]:
# drop ID col
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray
cap_x = design_matrix_shuffled_noID.to_numpy()
cap_x = design_matrix_shuffled
print(f'cap_x shape: {cap_x.shape}')
cap_x.shape

cap_x shape: (13611, 17)


(13611, 17)

___

### Select Hyperparameters for Gridsearch

In [10]:
# select hyperparams
params1=False
params2=True
params3=False
params4=False

if params1:

    min_dist_list = [0.0, 0.01, 0.1, 0.25, 0.5]
    n_neighbors_list = [10, 25, 50, 100, 200]
    metric_list =['euclidean', 'canberra', 'mahalanobis', 'correlation']
    n_components_list = range(2, 4)


if params2:

    min_dist_list = [0.0, 0.1]
    n_neighbors_list = [100, 500, 750, 1000, 1500]
    metric_list =['cosine', 'euclidean']
    n_components_list = range(2, 9)


if params3:

    min_dist_list = [0]
    n_neighbors_list = [1000]
    metric_list =['euclidean', 'cosine']
    n_components_list = [7]


if params4:

    min_dist_list = [0.0]
    n_neighbors_list = [50, 100, 250, 500, 1000]
    metric_list =['cosine', 'euclidean']
    n_components_list = range(2, 9)




___

### Gridsearch Algorithm

In [11]:
runs = len(n_components_list)*len(min_dist_list)*len(n_neighbors_list)*len(metric_list)
run = 0
df_row_dict_list = []
for n_components in n_components_list:
  for min_dist in min_dist_list:
    for n_neighbors in n_neighbors_list: 
      for metric in metric_list:
        run +=1
        print('*'*100)
        print(f'Run {run} of {runs}')
        results_dict = dr.umap_dim_red(cap_x, n_neighbors, min_dist, metric, n_components) 
        df_row_dict = cl.clustering(results_dict)
        df_row_dict_list.append(df_row_dict)
results_df = pd.DataFrame(df_row_dict_list)


****************************************************************************************************
Run 1 of 140


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


KeyboardInterrupt: 

___

## All Results

In [None]:
results_df.to_csv(DATA_PATH + ALL_RESULTS_FILENAME, index=False)
print("Filtered results dataframe saved at:", DATA_PATH + ALL_RESULTS_FILENAME)

# display max rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
results_df

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/all_results_2024-04-1811:12:48.064435.csv


Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
0,k_means,15,15.0,15.0,15.0,0.516789,0.002807,1000,0,euclidean,7,0.999931,,,,,,"[12, 7, 5, 5, 1, 5, 10, 12, 4, 8, 14, 11, 6, 0..."
1,dbscan,4,,,,,0.00941,1000,0,cosine,7,0.84725,0.265387,5.0,cosine,0.999869,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ..."


In [None]:
# set max rows back to default
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

____

### Get n Best Results (kmeans and DBSCAN)

#### Kmeans Results

In [None]:
n=1

kmeans_results_df = results_df[results_df['algo'] == 'k_means']
kmeans_results_df = kmeans_results_df.sort_values(by='silhouette_score', ascending=False)
best_kmeans_results_df = kmeans_results_df.head(n)
best_kmeans_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
0,k_means,15,15.0,15.0,15.0,0.516789,0.002807,1000,0,euclidean,7,0.999931,,,,,,"[12, 7, 5, 5, 1, 5, 10, 12, 4, 8, 14, 11, 6, 0..."


#### DBSCAN Results

In [None]:
dbscan_results_df = results_df[results_df['algo'] == 'dbscan']
dbscan_results_df = dbscan_results_df.sort_values(by='validity_index', ascending=False)
best_dbscan_results_df = dbscan_results_df.head(n)
best_dbscan_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
1,dbscan,4,,,,,0.00941,1000,0,cosine,7,0.84725,0.265387,5.0,cosine,0.999869,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ..."


In [None]:
filtered_results_df = pd.concat([best_dbscan_results_df, best_kmeans_results_df])
filtered_results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
1,dbscan,4,,,,,0.00941,1000,0,cosine,7,0.84725,0.265387,5.0,cosine,0.999869,0.0,"[0, 1, 0, 2, 1, 0, 1, 0, 3, 1, 1, 1, 1, 1, 1, ..."
0,k_means,15,15.0,15.0,15.0,0.516789,0.002807,1000,0,euclidean,7,0.999931,,,,,,"[12, 7, 5, 5, 1, 5, 10, 12, 4, 8, 14, 11, 6, 0..."


In [None]:
filtered_results_df['cluster_labels'] = filtered_results_df['cluster_labels'].apply(lambda x: ','.join(map(str, x)))

____

### Save Filtered Results to CSV

In [None]:
filtered_results_df.to_csv(DATA_PATH + FILTERED_RESULTS_FILENAME, index=False)
print("Filtered results dataframe saved at:", DATA_PATH + FILTERED_RESULTS_FILENAME)

Filtered results dataframe saved at: /Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/data/results/filtered_results_2024-04-1811:12:48.064452.csv


___

## Runtime

In [None]:
finish = time.time()
hours = int((finish - start) // 3600)
minutes = int(((finish - start) % 3600) // 60)
seconds = int((finish - start) % 60)
print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 00:52.51
