In [48]:
# general
import os
import sys
import time
from pathlib import Path
import numpy as np
import pandas as pd

# umap
import umap
from sklearn.manifold import trustworthiness
from scipy.spatial.distance import pdist, squareform

# clustering
from sklearn.utils import shuffle


___

### Start Timer

In [49]:
# track runtime
start = time.time()

___

### Pathing

In [50]:
# define path
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path= path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path
sys.path.append(src_path)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


___

### Import Util Modules

In [51]:
from cluster_utils import *

____

## Define Functions

### UMAP

In [52]:
def umap_dim_red(cap_x):
    ''' 
    Description:
    Params:
    Returns:
    '''
    
    # create umap object
    reducer = umap.UMAP()

    # fit and embed
    reducer.fit(cap_x)
    embedding = reducer.transform(cap_x)

    # verify results
    assert(np.all(embedding == reducer.embedding_))

    # get params
    params = reducer.get_params()

    
    # trustworthiness
    cap_x_dist = squareform(pdist(cap_x))
    cap_x_dist_embed = squareform(pdist(embedding))
    trust = trustworthiness(X=cap_x_dist, 
                             X_embedded=cap_x_dist_embed, 
                             n_neighbors=params['n_neighbors'],
                             metric=params['metric'])

    results_dict = {
    'embedding' : embedding,
    'n_neighbors' : params['n_neighbors'],
    'min_dist' : params['min_dist'],
    'metric' : params['metric'],
    'n_components': params['n_components'],
    'trustworthiness' : trust
    }

    return results_dict


#### K-means

In [53]:
def cluster_kmeans(cap_x, n_clusters, df_row_dict_list):
    '''
    Description: Performs k-means clustering.

        https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

    Input:
            cap_x: embedding (ndarray)
            n_cclusters: value for n_clusters (int)
            df_row_dict_list = list for dicts of kmeans results    
    Returns:
            df_row_dict_list.append({
                                    'n_clusters': n_clusters,
                                    'inertia': inertia,
                                    'calinski_harabasz_score': indices_dict['calinski_harabasz_score'],
                                    'davies_bouldin_score': indices_dict['davies_bouldin_score'],
                                    'silhouette_score': indices_dict['silhouette_score']
                                }
    '''
    # define kmeans object and set params
    kmeans = KMeans()

    # fit k means
    kmeans.fit_predict(cap_x)


    # get lables and inertia
    labels = kmeans.labels_
    inertia = kmeans.inertia_

    # internal indices
    indices_dict = kmeans_indices(cap_x, labels)

    # add values to dict list
    df_row_dict_list.append(
        {
            'n_clusters': n_clusters,
            'inertia': inertia,
            'calinski_harabasz_score': indices_dict['calinski_harabasz_score'],
            'davies_bouldin_score': indices_dict['davies_bouldin_score'],
            'silhouette_score': indices_dict['silhouette_score'],
            'cluster_labels': labels
        }
    )
    return df_row_dict_list

In [54]:
def kmeans(results_dict):
    ''' 
    Description:
    Params:
    Returns:
    '''
    print('*'*100)
    print('*'*100)
    print('Hyperparameters:')
    print('n_neighbors: ', results_dict['n_neighbors'])
    print('min_dist: ', results_dict['min_dist'])
    print('metric: ', results_dict['metric'])
    print('n_components: ', results_dict['n_components'])


    # set umap embedding as cap_x
    cap_x = results_dict['embedding']

    # choose a range of n_clusters to try for kmeans
    n_clusters_list = np.arange(2, 16, 1)

    # init dict
    df_row_dict_list = []

    # iterate over values of n in n_clusters_list
    for n_clusters in n_clusters_list:

        # run clusters for values of n
        # results for each iteration collected in df_row_dict_list 
        cluster_kmeans(cap_x, n_clusters, df_row_dict_list)
    
    # convert results dicts to dataframe
    results_df = pd.DataFrame(df_row_dict_list)

    # determine elbow location
    n_clusters_found = find_elbow(results_df, sensitivity=1.0)

    ## hopkins statistic
    cap_h = get_hopkins(cap_x)
    print(f"Hopkin's Statistic = {cap_h}")
    
    ## testing KMEANS using internal indicies
    n_clusters_db_score_is_min = results_df.loc[results_df['davies_bouldin_score'].idxmin(), 'n_clusters']
    n_clusters_ch_score_is_max = results_df.loc[results_df['calinski_harabasz_score'].idxmax(), 'n_clusters']
    n_clusters_silhouette_score_is_max = results_df.loc[results_df['silhouette_score'].idxmax(), 'n_clusters']
    sil_score = results_df.loc[results_df['silhouette_score'].idxmax(), 'silhouette_score']
    cluster_labels = results_df.loc[results_df['n_clusters'] == n_clusters_found, 'cluster_labels']

    # will return valid results in df_row_dict
    df_row_dict = {
        'algo': 'k_means',
        'n_clusters_found' : n_clusters_found,
        'n_clusters_db_score_is_min' : n_clusters_db_score_is_min,
        'n_clusters_ch_score_is_max' : n_clusters_ch_score_is_max,
        'n_clusters_silhouette_score_is_max' : n_clusters_silhouette_score_is_max,
        'silhouette_score' : sil_score,
        'hopkins_statistic' : cap_h,
        'umap_n_neighbors' : results_dict['n_neighbors'],
        'umap_min_dist' : results_dict['min_dist'],
        'umap_metric' : results_dict['metric'],
        'umap_n_components' : results_dict['n_components'],
        'trustworthiness' : results_dict['trustworthiness'],
        'eps' : np.nan,
        'dbscan_min_samples' : np.nan,
        'validity_index' : np.nan,
        'cluster_labels': cluster_labels,
        'embedding' : cap_x
        }
    
    # test1
    if n_clusters_found == n_clusters_db_score_is_min == n_clusters_ch_score_is_max == n_clusters_silhouette_score_is_max:
        print("Test1 Pass: Kmeans successfully clustered.")
        print('Number of Clusters: ', n_clusters_found)
        return df_row_dict, True
    # test2
    if  n_clusters_db_score_is_min == n_clusters_ch_score_is_max == n_clusters_silhouette_score_is_max:
        print("Test2 Pass: Kmeans successfully clustered.")
        print('Number of Clusters: ', n_clusters_found)
        return df_row_dict, True
    else:
        print("Fail: Kmeans did not successfully cluster.")
        return df_row_dict, False

### DBSCAN

In [55]:
def dbscan(results_dict):
    ''' 
    Description:
    Params:
    Returns:
    '''
    
    # set umap embedding as cap_x
    cap_x = results_dict['embedding']

    # get eps and min_samples from knee locator
    eps, min_samples = find_eps(cap_x)

    # iterate over a range near eps to find best eps value, determined by valididty score
    eps_scan_range = [0.8, 1.8, 0.1]
    f_eps_list = factor_eps(eps, eps_scan_range)
    
    # iterate dbscan over the eps values in f_eps_list
    results_df = cluster_dbscan(cap_x, f_eps_list, min_samples)
    
    # get values where validy score is greatest
    validity_index = results_df.loc[results_df['validity_index'].idxmax(), 'validity_index']
    eps = results_df.loc[results_df['validity_index'].idxmax(), 'k_dist_eps']
    min_samples = results_df.loc[results_df['validity_index'].idxmax(), 'min_samples']
    n_clusters_found = results_df.loc[results_df['validity_index'].idxmax(), 'n_clusters']
    cluster_label = results_df.loc[results_df['validity_index'].idxmax(), 'cluster_labels']

    print('DBSCAN')
    print('Number of Clusters: ', n_clusters_found)
    print('Validity Index: ', validity_index)

    # return results in df_row_dict
    df_row_dict = {
            'algo': 'dbscan',
            'n_clusters_found' : n_clusters_found,
            'n_clusters_db_score_is_min' : np.nan,
            'n_clusters_ch_score_is_max' : np.nan,
            'n_clusters_silhouette_score_is_max' : np.nan,
            'silhouette_score' : np.nan,
            'hopkins_statistic' : results_dict['hopkins_statistic'],
            'umap_n_neighbors' : results_dict['umap_n_neighbors'],
            'umap_min_dist' : results_dict['umap_min_dist'],
            'umap_metric' : results_dict['umap_metric'],
            'umap_n_components' : results_dict['umap_n_components'],
            'trustworthiness' : results_dict['trustworthiness'],
            'eps' : eps,
            'dbscan_min_samples' : min_samples,
            'validity_index' : validity_index,
            'cluster_labels': cluster_label
            }


    return df_row_dict

# Pipeline

____

## Load Data

In [56]:
# transformed data csv file name
data_file = "/curated/trans_data_design.csv"

# read in design matrix dataframe
design_matrix  = pd.read_csv( data_path + data_file )


# target vector csv file name
target_file = "/curated/beans_target.csv"


# read in encoded target vector dataframe
target_vector = pd.read_csv( data_path + target_file, index_col=0)

___

## Check Dimensions

In [57]:
assert len(design_matrix) == len(target_vector)

print("Design Matrix Shape:", design_matrix.shape)
print("Target Vector Shape:", target_vector.shape)

Design Matrix Shape: (13611, 16)
Target Vector Shape: (13611, 2)


In [58]:
# relable target vector, we can go back an change this later
target_vector = target_vector.rename(columns={'id': 'ID', 'target_encoded': 'Target'})
target_vector

Unnamed: 0,ID,Target
0,0,5
1,1,5
2,2,5
3,3,5
4,4,5
...,...,...
13606,13606,3
13607,13607,3
13608,13608,3
13609,13609,3


___

## Shuffle Data Objects

### Feature Matrix

In [59]:
# Add ID columns
cols = list(design_matrix.columns)
design_matrix['ID'] = np.arange(0, len(design_matrix))
col_order = ['ID'] + cols
design_matrix = design_matrix[col_order]
design_matrix_shuffled = shuffle(design_matrix, random_state=42, n_samples=None)
design_matrix_shuffled

Unnamed: 0,ID,numerical__Area,numerical__Perimeter,numerical__MajorAxisLength,numerical__MinorAxisLength,numerical__AspectRation,numerical__Eccentricity,numerical__ConvexArea,numerical__EquivDiameter,numerical__Extent,numerical__Solidity,numerical__roundness,numerical__Compactness,numerical__ShapeFactor1,numerical__ShapeFactor2,numerical__ShapeFactor3,numerical__ShapeFactor4
1488,1488,-0.365218,-0.532278,-0.699519,0.111983,-1.330113,-1.592944,-0.372549,-0.352918,0.437396,1.064426,1.597939,1.498162,-0.370573,1.153692,1.541930,0.989265
2611,2611,0.518320,1.089549,0.586132,0.791512,-0.106143,0.169698,0.550947,0.704943,0.365695,-3.127637,-2.516723,-0.064090,-1.007778,-0.625351,-0.102226,-1.995235
749,749,-0.518101,-0.683994,-0.836862,-0.175910,-1.236665,-1.391983,-0.521807,-0.566452,-0.103759,0.648568,1.240429,1.358954,-0.000975,1.263821,1.387803,0.726306
99,99,-0.678828,-0.965544,-1.139157,-0.278714,-1.663886,-2.482813,-0.684870,-0.805076,0.412487,1.254524,1.972582,1.997703,0.133443,2.168059,2.107283,1.008149
11298,11298,-0.788571,-0.969347,-0.961919,-0.917663,-0.432975,-0.165137,-0.787511,-0.977908,0.724802,-0.043581,0.394278,0.344776,1.223497,0.858969,0.309933,0.104866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,5191,1.030512,1.224997,1.497611,0.788917,1.228794,1.054634,1.016389,1.225843,0.967838,0.807939,-0.601791,-1.196511,-1.043987,-1.330581,-1.176625,-0.170536
13418,13418,-0.449861,-0.466169,-0.426161,-0.507908,-0.010367,0.256019,-0.451343,-0.469661,-1.173051,0.242246,0.075035,-0.090740,0.489755,0.052009,-0.128644,0.485566
5390,5390,1.260297,1.406165,1.595202,1.113733,0.920507,0.898803,1.243435,1.444142,0.696877,0.901208,-0.467030,-0.953662,-1.319088,-1.295505,-0.954530,-0.235434
860,860,-0.498662,-0.671412,-0.857094,-0.082613,-1.381553,-1.711199,-0.502764,-0.538625,0.048110,0.696000,1.357839,1.567757,-0.127278,1.415513,1.619543,0.861651


### Target Vector:

In [60]:
target_vector_shuffled = shuffle(target_vector, random_state=42, n_samples=None)
target_vector_shuffled

Unnamed: 0,ID,Target
1488,1488,5
2611,2611,0
749,749,5
99,99,5
11298,11298,3
...,...,...
5191,5191,2
13418,13418,3
5390,5390,2
860,860,5


___

## Convert Feature Matrix to ndarray:

In [61]:
# drop ID col
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray
cap_x = design_matrix_shuffled_noID.to_numpy()
print(f'cap_x shape: {cap_x.shape}')
cap_x

cap_x shape: (13611, 16)


array([[-0.36521767, -0.5322778 , -0.69951932, ...,  1.15369168,
         1.54192977,  0.98926454],
       [ 0.51832029,  1.08954897,  0.58613222, ..., -0.6253513 ,
        -0.10222622, -1.99523524],
       [-0.51810098, -0.68399389, -0.83686158, ...,  1.2638209 ,
         1.38780271,  0.72630639],
       ...,
       [ 1.26029711,  1.40616472,  1.595202  , ..., -1.29550493,
        -0.95452972, -0.2354344 ],
       [-0.49866232, -0.67141233, -0.85709382, ...,  1.41551329,
         1.61954278,  0.86165138],
       [ 0.36079899,  0.6631759 ,  0.94108189, ..., -1.22107966,
        -1.40651082, -0.99763592]])

___

## UMAP (default params)

In [62]:
results_dict = umap_dim_red(cap_x)

___

## K-Means  (default params)

In [63]:
results_dict, kmeans_solution = kmeans(results_dict)

****************************************************************************************************
****************************************************************************************************
Hyperparameters:
n_neighbors:  15
min_dist:  0.1
metric:  euclidean
n_components:  2


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


Hopkin's Statistic = 0.28460511961584883
Fail: Kmeans did not successfully cluster.


___

## DBSCAN  (default params)

In [64]:
if kmeans_solution:
    results_df = pd.DataFrame(results_dict)
else:
    results_dict = dbscan(results_dict)
    results_df = pd.DataFrame(results_dict)


DBSCAN
Number of Clusters:  5
Validity Index:  0.19793856319553912


___

# Results

In [65]:
display(results_df)

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0
1,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,1
2,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0
3,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0
4,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,1
13607,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0
13608,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,1
13609,dbscan,5,,,,,0.284605,15,0.1,euclidean,2,0.978482,0.244055,6,0.197939,0


___

## Runtime

In [66]:
finish = time.time()
hours = int((finish - start) // 3600)
minutes = int(((finish - start) % 3600) // 60)
seconds = int((finish - start) % 60)
print(f"Total Run Time(hh:mm.ss): {hours:02d}:{minutes:02d}.{seconds:02d}")

Total Run Time(hh:mm.ss): 00:02.41
