In [46]:
### Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import ast

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix, fowlkes_mallows_score, normalized_mutual_info_score
from sklearn.metrics import jaccard_score, f1_score

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys

In [47]:
# define path
nb_path = Path(os.getcwd())
print(nb_path)
path = str(nb_path.parent)
print(path)

# path to figs folder
figs_path = path + '/figs'

# path to data
data_path = path + '/data'

# path to src folder
src_path = path + '/src'
print(src_path)

# sys path
sys.path.append(src_path)

/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src


In [48]:
import cluster_utils as cu
import external_indices_utils as exi

In [49]:
# Define the conversion function
def convert_string_to_array(string):
    string = string.strip('[]').replace('\n', '')
    num_list = string.split()
    num_array = np.array([int(num) for num in num_list])
    return num_array

In [52]:
# transformed data csv file name
results_file = "/curated/filtered_results_2024-04-1613:24:07.967152.csv"
results_df = pd.read_csv(data_path + results_file)

target_sample = "/curated/sampled_target.csv"
target_df = pd.read_csv(data_path + target_sample)

In [53]:
# convert cluster_labels elements back to numpy array
results_df['cluster_labels'] = results_df['cluster_labels'].apply(convert_string_to_array)

In [54]:
target_df

Unnamed: 0,ID,Target
0,4148,2
1,13224,3
2,9754,6
3,9159,6
4,10139,3
...,...,...
995,13402,3
996,12266,3
997,1657,5
998,3494,1


In [55]:
# get the true number of clusters
num_true_labels = len(np.unique(target_df.Target.values))

In [56]:
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,5,,,,,0.078775,5,0.05,euclidean,3,0.984051,0.847529,6,0.772864,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
1,dbscan,5,,,,,0.092285,15,0.01,manhattan,3,0.96878,0.510088,6,0.739844,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
2,dbscan,5,,,,,0.079654,15,0.01,euclidean,3,0.973456,0.534698,6,0.734985,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
3,dbscan,4,,,,,0.102622,10,0.05,manhattan,3,0.970124,0.682128,6,0.732093,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
4,dbscan,5,,,,,0.178634,15,0.01,manhattan,2,0.967995,0.410219,6,0.720937,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
5,dbscan,5,,,,,0.096094,15,0.05,euclidean,3,0.972711,0.545795,6,0.672288,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
6,dbscan,5,,,,,0.1929,10,0.1,euclidean,2,0.97471,0.511043,6,0.661111,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
7,dbscan,4,,,,,0.125957,15,0.1,manhattan,3,0.968348,0.755612,6,0.645069,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
8,dbscan,4,,,,,0.215127,10,0.1,manhattan,2,0.968369,0.590364,6,0.638967,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
9,dbscan,4,,,,,0.1197,10,0.1,manhattan,3,0.968699,0.647266,6,0.634241,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."


In [57]:
# add col for adj
#best_results_df.loc[:, 'adjusted_rand_score'] = np.nan
dataframe_dict = {}
df_row_dict_list = []
i = 1
for idx, row in results_df.iterrows():
    print('\n')
    print('*' * 100)
    print(f'Results: {i}')
    print('*'*100)

    print('-'*100)
    print(f'UMAP & CLUSTER ALGORYTHM INFORMATION:')
    print('-'*100)
    #print(row)
    
    # get algo
    algo = row['algo']

    # get hypers
    n_components = row['umap_n_components']
    min_dist = row['umap_min_dist']
    n_neighbors = row['umap_n_neighbors']
    trustworthiness = row['trustworthiness']
    n_clusters_found = row['n_clusters_found']
    validity_index = row['validity_index']
    silhouette_score = row['silhouette_score']

    print(f"Algorithm: {algo}")
    print(f'Number of Clusers Found: {n_clusters_found}')
    if algo == 'dbscan':
        print(f"Validity Index: {validity_index}")
    else:
        print(f'Silhouette Score: {silhouette_score}')
    print(f'UMAP Number of Components: {n_components}')
    print(f'UMAP Min Distance: {min_dist}')
    print(f'UMAP Number of Neighbors: {n_neighbors}')
    print(f'UMAP Trustworthiness: {trustworthiness}')

    # create dataframe with columns for every value of n_components
    cluster_labels_df = pd.DataFrame()

    # add cluster labels to dataframe from results_df
    cluster_labels_df.loc[:, str(n_components)] = row['cluster_labels']

    # concatonat target vector dataframe with clusterlabels_df
    labels_df = pd.concat([target_df, cluster_labels_df], axis=1)
    
    # drop indicies with noise points
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices)
    labels_df = labels_df.astype('int64')

    dataframe_dict[f'df_{str(n_components)}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, str(n_components)]
   
    ########################################
    ######## Get External Indices #########
    ########################################

    # get adj rand score and add to dataframe
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    results_df.loc[idx, 'adjusted_rand_score'] = adj_rand

    # get falks and mallows score
    fawlks_and_mallows_ = fowlkes_mallows_score(true_labels, cluster_labels)

    # get F-1 score
    label = np.unique(true_labels)
    f1_score_ = f1_score(true_labels, cluster_labels, average = 'macro')

    # jaccard score
    jaccard_score_ = jaccard_score(true_labels, cluster_labels, average = 'macro')

    # ge the normalized mutual info score
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # get contigency matrices for all permutatons of cluster labels
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)

    # get optimized contingency matrix
    modes_df = exi.get_modes(n_components, labels_df)

    #print(modes_df.info())
    cluster_mapping = exi.get_mapping(n_components, labels_df, modes_df)
    
    labels_df[str(n_components)] = labels_df.loc[:, str(n_components)].map(cluster_mapping)
    
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[str(n_components)])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # retrieve contigency matrix with highest trace
    np.set_printoptions(linewidth=200)
    print('-'*100)
    print('CONTINGENCY MATRIX')
    print('-'*100)
    #print('True Labels: ', true_labels.values)
    #print('Clustering Predicted Labels: ', cluster_labels.values)
    print('Matrix Trace: ', matrix_trace)
    print('Contingency Matrix: ')
    print(cont_matrix)
    print('-'*100)
    print('REMAPPED CONTINGENCY MATRIX')
    print('Best Mapping: ', dict(sorted(cluster_mapping.items())))
    print('-'*100)
    print('Remapped Matrix Trace: ', remapped_matrix_trace)
    print('Contingency Matrix: ')
    print(remapped_cont_matrix)
    print('-'*100)
    print('EXTERMAL INDICES: PERFORMANCE EVALUATION METRICS')
    print('-'*100)
    print('Adjusted Rand Score: ', adj_rand)
    print('Fawlks and Mallows Score: ', fawlks_and_mallows_)
    print('F-1 Score: ', f1_score_)
    print('Jaccard Score:', jaccard_score_)
    print('Normalized Mutual Info Score:', nmi)

    df_row_dict_list.append({
                        'algo': algo,
                        'umap_n_components': n_components,
                        'umap_min_dist': min_dist,
                        'umap_n_neighbors': n_neighbors,
                        'umap_metric': trustworthiness,
                        'trustworthiness': trustworthiness,
                        'n_clusters_found': n_clusters_found,
                        'true_num_clusters': num_true_labels,
                        'validity_index': validity_index,
                        'adj_rand_score': adj_rand,
                        'fawlks_and_mallows': fawlks_and_mallows_,
                        'nmi': nmi,
                        'jaccard_score': jaccard_score_,
                        'f1_score': f1_score_,
                        'true_labels': true_labels,
                        'cluster_labels': cluster_labels,
                        'matrix_trace': matrix_trace,
                        'contingency_matrix': cont_matrix,
                        'remapped_cont_matrix': remapped_cont_matrix,
                        'mapping': cluster_mapping
                        })
    i += 1




****************************************************************************************************
Results: 1
****************************************************************************************************
----------------------------------------------------------------------------------------------------
UMAP & CLUSTER ALGORYTHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.7728636186933401
UMAP Number of Components: 3
UMAP Min Distance: 0.05
UMAP Number of Neighbors: 5
UMAP Trustworthiness: 0.9840512096774192
----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Matrix Trace:  202
Contingency Matrix: 
[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [139   0   0   0   1]
 [  0 248   5

In [40]:
matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df

Unnamed: 0,algo,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,n_clusters_found,true_num_clusters,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score,true_labels,cluster_labels,matrix_trace,contingency_matrix,remapped_cont_matrix,mapping
0,dbscan,3,0.05,5,0.984051,0.984051,5,7,0.772864,0.56867,0.688071,0.725968,0.162821,0.201145,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [139, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 139, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
1,dbscan,3,0.01,15,0.96878,0.96878,5,7,0.739844,0.595963,0.708737,0.756915,0.183664,0.215668,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,218,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
2,dbscan,3,0.01,15,0.973456,0.973456,5,7,0.734985,0.572957,0.692753,0.735355,0.165292,0.203046,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
3,dbscan,3,0.05,10,0.970124,0.970124,4,7,0.732093,0.412312,0.612829,0.662947,0.057402,0.082261,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,100,"[[99, 2, 0, 0], [0, 0, 40, 0], [139, 0, 0, 1],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 139, 0, 1],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
4,dbscan,2,0.01,15,0.967995,0.967995,5,7,0.720937,0.594251,0.707398,0.754686,0.182827,0.215302,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,217,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [139, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 139, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
5,dbscan,3,0.05,15,0.972711,0.972711,5,7,0.672288,0.572957,0.692753,0.735355,0.165292,0.203046,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
6,dbscan,2,0.1,10,0.97471,0.97471,5,7,0.661111,0.573603,0.69259,0.734721,0.163802,0.202192,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
7,dbscan,3,0.1,15,0.968348,0.968348,4,7,0.645069,0.408982,0.611291,0.662052,0.057177,0.082035,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,100,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
8,dbscan,2,0.1,10,0.968369,0.968369,4,7,0.638967,0.407088,0.610868,0.663478,0.056799,0.081281,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,99,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
9,dbscan,3,0.1,10,0.968699,0.968699,4,7,0.634241,0.404124,0.608485,0.657665,0.056571,0.081048,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,99,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"


In [41]:
keep_cols = ['true_num_clusters', 'umap_n_components','umap_min_dist', 
             'umap_n_neighbors', 'umap_metric', 'trustworthiness', 'algo', 'n_clusters_found',
            'validity_index', 'adj_rand_score', 'fawlks_and_mallows', 'nmi', 'jaccard_score', 'f1_score']

finalized_results_frame = matrix_results_df[keep_cols]

finalized_results_frame

Unnamed: 0,true_num_clusters,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,algo,n_clusters_found,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score
0,7,3,0.05,5,0.984051,0.984051,dbscan,5,0.772864,0.56867,0.688071,0.725968,0.162821,0.201145
1,7,3,0.01,15,0.96878,0.96878,dbscan,5,0.739844,0.595963,0.708737,0.756915,0.183664,0.215668
2,7,3,0.01,15,0.973456,0.973456,dbscan,5,0.734985,0.572957,0.692753,0.735355,0.165292,0.203046
3,7,3,0.05,10,0.970124,0.970124,dbscan,4,0.732093,0.412312,0.612829,0.662947,0.057402,0.082261
4,7,2,0.01,15,0.967995,0.967995,dbscan,5,0.720937,0.594251,0.707398,0.754686,0.182827,0.215302
5,7,3,0.05,15,0.972711,0.972711,dbscan,5,0.672288,0.572957,0.692753,0.735355,0.165292,0.203046
6,7,2,0.1,10,0.97471,0.97471,dbscan,5,0.661111,0.573603,0.69259,0.734721,0.163802,0.202192
7,7,3,0.1,15,0.968348,0.968348,dbscan,4,0.645069,0.408982,0.611291,0.662052,0.057177,0.082035
8,7,2,0.1,10,0.968369,0.968369,dbscan,4,0.638967,0.407088,0.610868,0.663478,0.056799,0.081281
9,7,3,0.1,10,0.968699,0.968699,dbscan,4,0.634241,0.404124,0.608485,0.657665,0.056571,0.081048
