## Final Project: Phase 3 - Validation with External Indices
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 03/11/2024  
___

In [1]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics.cluster import (adjusted_rand_score, contingency_matrix, 
                                     fowlkes_mallows_score, normalized_mutual_info_score)
from sklearn.metrics import jaccard_score, f1_score

# pathing
from pathlib import Path
import os
import sys

### Set Paths

In [2]:
# define path
HOME = Path(os.getcwd())
print(HOME)

HOME_PARENT_STR = str(HOME.parent)
print(HOME_PARENT_STR)

# path to figs folder
PATH_TO_FIGS_FOLDER = HOME_PARENT_STR + '/figs'

# path to data
PATH_TO_DATA_FOLDER = HOME_PARENT_STR + '/data'

# path to src folder
PATH_TO_SRC = HOME_PARENT_STR + '/src'
print(PATH_TO_SRC)

# sys path
sys.path.append(PATH_TO_SRC)

/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src


### Import Functions

In [3]:
import cluster_utils as cu
import external_indices_utils as exi

### Import Results DF & Target DF

In [4]:
# transformed data csv file name
results_file = "/curated/filtered_results_2024-04-1617:37:14.353021.csv"
results_df = pd.read_csv(PATH_TO_DATA_FOLDER + results_file)

target_sample = "/curated/sampled_target.csv"
target_df = pd.read_csv(PATH_TO_DATA_FOLDER + target_sample)

In [5]:
# convert cluster_labels elements back to numpy array
results_df['cluster_labels'] = results_df['cluster_labels'].apply(exi.convert_string_to_array)

In [6]:
target_df

Unnamed: 0,ID,Target
0,4148,2
1,13224,3
2,9754,6
3,9159,6
4,10139,3
...,...,...
995,13402,3
996,12266,3
997,1657,5
998,3494,1


In [7]:
# get the true number of clusters
num_true_labels = len(np.unique(target_df.Target.values))

In [8]:
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,validity_index,cluster_labels
0,dbscan,5,,,,,0.078775,5,0.05,euclidean,3,0.984051,0.847529,6,0.772864,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
1,dbscan,5,,,,,0.092285,15,0.01,manhattan,3,0.96878,0.510088,6,0.739844,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
2,dbscan,5,,,,,0.079654,15,0.01,euclidean,3,0.973456,0.534698,6,0.734985,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
3,dbscan,4,,,,,0.102622,10,0.05,manhattan,3,0.970124,0.682128,6,0.732093,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
4,dbscan,5,,,,,0.178634,15,0.01,manhattan,2,0.967995,0.410219,6,0.720937,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
5,dbscan,5,,,,,0.096094,15,0.05,euclidean,3,0.972711,0.545795,6,0.672288,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
6,dbscan,5,,,,,0.1929,10,0.1,euclidean,2,0.97471,0.511043,6,0.661111,"[0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
7,dbscan,4,,,,,0.125957,15,0.1,manhattan,3,0.968348,0.755612,6,0.645069,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
8,dbscan,4,,,,,0.215127,10,0.1,manhattan,2,0.968369,0.590364,6,0.638967,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."
9,dbscan,4,,,,,0.1197,10,0.1,manhattan,3,0.968699,0.647266,6,0.634241,"[0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, ..."


___

### Compute External Indices
___

This cell will compute the external indices used to validate the clutering solution.

The following external indices will the used:

* ```Adjusted Rand Score``` <a href="#ref1">[1]</a>
  * The Rand index is a measure similarity between two clustering solutions by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
  * This Rand index is then adjusted for change with:
    <br><br>
    * $ARI = \dfrac{RI - \text{Expected\_RI}}{\textit{max(RI)} - \text{Expected\_RI}}$
* ```Jaccard``` <a href="#ref1">[2]</a>
  * The size of the intersection divided by the size of the union of the two labeled sets.
  * We will use ```average = macro```, this returns the mean of all the independently computed Jaccard indices for each true label. This is acceptble when the true label frequencies are similar.
  <br><br>
* ```Fawlks and Mallows Index```
  * The geometric mean between precision and recall
  <br><br>
  * $FMI = \dfrac{TP}{\sqrt{(TP + FP) \cdot (TP + FN)}}$<a href="#ref1">[3]</a>
  <br><br>
* ```NMI Measure```
* ```F-Measure``` <a href="#ref1">[4]</a>
  * The harmonic mean of the precision and recall.
  <br><br>
  * $F1 = \dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$
* ```Purity```
  * In this analysis we will compute purity as the proportuon of true labels that were labeled correctly, i.e., that after the best mapping had the same cluster label. We will compute this metric for each true label class and over the entire dataset.

<p id="ref1"><sup>[1]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html">this link</a>.</p>
<p id="ref1"><sup>[2]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html">this link</a>.</p>
<p id="ref1"><sup>[3]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fowlkes_mallows_score.html">this link</a>.</p>
<p id="ref1"><sup>[4]</sup> scikit-learn.org. More details available at <a href="hhttps://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html">this link</a>.</p>



In [12]:
dataframe_dict = {}
df_row_dict_list = []
i = 1
for idx, row in results_df.iterrows():
    print('\n')
    print('*' * 100)
    print(f'Results: {i}')
    print('*'*100)
    print()

    print(f'UMAP & CLUSTER ALGORITHM INFORMATION:')
    print('-'*100)
    
    # get algo
    algo = row['algo']

    ###################################################
    ### Get UMAP & Cluster Algorithm Information ####
    ###################################################

    n_components = row['umap_n_components']
    min_dist = row['umap_min_dist']
    n_neighbors = row['umap_n_neighbors']
    trustworthiness = row['trustworthiness']
    n_clusters_found = row['n_clusters_found']
    validity_index = row['validity_index']
    silhouette_score = row['silhouette_score']

    print(f"Algorithm: {algo}")
    print(f'Number of Clusers Found: {n_clusters_found}')
    if algo == 'dbscan':
        print(f"Validity Index: {validity_index:.5f}")
    else:
        print(f'Silhouette Score: {silhouette_score:.5f}')
    print(f'UMAP Number of Components: {n_components}')
    print(f'UMAP Min Distance: {min_dist}')
    print(f'UMAP Number of Neighbors: {n_neighbors}')
    print(f'UMAP Trustworthiness: {trustworthiness:.5f}')

    # create dataframe with columns for every value of n_components
    cluster_labels_df = pd.DataFrame()

    # add cluster labels to dataframe from results_df
    cluster_labels_df.loc[:, str(n_components)] = row['cluster_labels']

    # concatonat target vector dataframe with clusterlabels_df
    labels_df = pd.concat([target_df, cluster_labels_df], axis=1)
    
    # drop indicies with noise points
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices)
    labels_df = labels_df.astype('int64')

    dataframe_dict[f'df_{str(n_components)}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, str(n_components)]

    ########################################
    ######## Get External Indices #########
    ########################################

    # get adj rand score and add to dataframe
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    results_df.loc[idx, 'adjusted_rand_score'] = adj_rand

    # get falks and mallows score
    fawlks_and_mallows_ = fowlkes_mallows_score(true_labels, cluster_labels)

    # get F-1 score
    label = np.unique(true_labels)
    f1_score_ = f1_score(true_labels, cluster_labels, average = 'macro')

    # jaccard score
    jaccard_score_ = jaccard_score(true_labels, cluster_labels, average = 'macro')

    # ge the normalized mutual info score
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # get contigency matrices for all permutatons of cluster labels
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)

    # get optimized contingency matrix
    modes_df = exi.get_modes(n_components, labels_df)
    cluster_mapping = exi.get_mapping(n_components, labels_df, modes_df)
    labels_df[str(n_components)] = labels_df.loc[:, str(n_components)].map(cluster_mapping)
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[str(n_components)])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # get purities
    purity_df, overall_purity = exi.get_true_label_purity(remapped_cont_matrix, true_labels)

    # retrieve contigency matrix with highest trace
    np.set_printoptions(linewidth=200)
    print()
    print('-'*100)
    print('CONTINGENCY MATRIX')
    print('-'*100)
    #print('Matrix Trace: ', matrix_trace)
    print('Contingency Matrix: ')
    print()
    print(cont_matrix)
    print()
    print('-'*100)
    print('REMAPPED CONTINGENCY MATRIX')
    print('-'*100)
    print()
    print('Best Mapping: ', dict(sorted(cluster_mapping.items())))
    print()
    #print('Remapped Matrix Trace: ', remapped_matrix_trace)
    print('Contingency Matrix: ')
    print()
    print(remapped_cont_matrix)
    print()
    print('-'*100)
    print('EXTERMAL INDICES: PERFORMANCE EVALUATION METRICS')
    print('-'*100)
    print()
    print(f'Adjusted Rand Score: {adj_rand:.5f}')
    print(f'Fawlks and Mallows Score: {fawlks_and_mallows_:.5f}')
    print(f'F-1 Score: {f1_score_:.5f}')
    print(f'Jaccard Score: {jaccard_score_:.5f}')
    print(f'Normalized Mutual Info Score: {nmi:.5f}')
    print()
    print("True Label Purities:")
    print()
    display(purity_df)
    print()
    print(f'Overall Purity: {overall_purity:.5f}')

    df_row_dict_list.append({
                        'algo': algo,
                        'umap_n_components': n_components,
                        'umap_min_dist': min_dist,
                        'umap_n_neighbors': n_neighbors,
                        'umap_metric': trustworthiness,
                        'trustworthiness': trustworthiness,
                        'n_clusters_found': n_clusters_found,
                        'true_num_clusters': num_true_labels,
                        'validity_index': validity_index,
                        'adj_rand_score': adj_rand,
                        'fawlks_and_mallows': fawlks_and_mallows_,
                        'nmi': nmi,
                        'jaccard_score': jaccard_score_,
                        'f1_score': f1_score_,
                        'true_labels': true_labels,
                        'cluster_labels': cluster_labels,
                        'matrix_trace': matrix_trace,
                        'contingency_matrix': cont_matrix,
                        'remapped_cont_matrix': remapped_cont_matrix,
                        'mapping': cluster_mapping
                        })
    i += 1




****************************************************************************************************
Results: 1
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.77286
UMAP Number of Components: 3
UMAP Min Distance: 0.05
UMAP Number of Neighbors: 5
UMAP Trustworthiness: 0.98405

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [139   0   0   0   1]
 [  0 248   5   0   2]
 [ 24   2   0   0 103]
 [  1   4 138   0   0]
 [  6 181   4   0   1]]

----------------------------------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,0.992857
3,0.972549
4,0.79845
5,0.965035
6,0.0



Overall Purity: 0.66800


****************************************************************************************************
Results: 2
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.73984
UMAP Number of Components: 3
UMAP Min Distance: 0.01
UMAP Number of Neighbors: 15
UMAP Trustworthiness: 0.96878

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [140   0   0   0   0]
 [  0 248   6   0   1]
 [  6   4   0   0 119]
 [  0   5 138   0   0]
 [  2 182   4   0   4]]

--------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,0.972549
4,0.922481
5,0.965035
6,0.0



Overall Purity: 0.68500


****************************************************************************************************
Results: 3
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.73498
UMAP Number of Components: 3
UMAP Min Distance: 0.01
UMAP Number of Neighbors: 15
UMAP Trustworthiness: 0.97346

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [140   0   0   0   0]
 [  0 249   5   0   1]
 [ 22   4   0   0 103]
 [  1   4 138   0   0]
 [  3 184   4   0   1]]

--------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,0.976471
4,0.79845
5,0.965035
6,0.0



Overall Purity: 0.67000


****************************************************************************************************
Results: 4
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 4
Validity Index: 0.73209
UMAP Number of Components: 3
UMAP Min Distance: 0.05
UMAP Number of Neighbors: 10
UMAP Trustworthiness: 0.97012

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0]
 [  0   0  40   0]
 [139   0   0   1]
 [  0 254   0   1]
 [  6   2   0 121]
 [  0 143   0   0]
 [  2 185   0   5]]

------------------------------------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,0.992857
3,0.996078
4,0.937984
5,0.0
6,0.0



Overall Purity: 0.55400


****************************************************************************************************
Results: 5
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.72094
UMAP Number of Components: 2
UMAP Min Distance: 0.01
UMAP Number of Neighbors: 15
UMAP Trustworthiness: 0.96799

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [139   0   0   0   1]
 [  0 246   6   0   0]
 [  6   5   0   0 118]
 [  0   5 138   0   0]
 [  2 181   4   0   4]]

--------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,0.992857
3,0.97619
4,0.914729
5,0.965035
6,0.0



Overall Purity: 0.68373


****************************************************************************************************
Results: 6
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.67229
UMAP Number of Components: 3
UMAP Min Distance: 0.05
UMAP Number of Neighbors: 15
UMAP Trustworthiness: 0.97271

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [140   0   0   0   0]
 [  0 249   5   0   1]
 [ 22   4   0   0 103]
 [  1   4 138   0   0]
 [  3 184   4   0   1]]

--------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,0.976471
4,0.79845
5,0.965035
6,0.0



Overall Purity: 0.67000


****************************************************************************************************
Results: 7
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 5
Validity Index: 0.66111
UMAP Number of Components: 2
UMAP Min Distance: 0.1
UMAP Number of Neighbors: 10
UMAP Trustworthiness: 0.97471

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0   0]
 [  0   0   0  40   0]
 [140   0   0   0   0]
 [  0 249   5   0   1]
 [ 22   4   0   0 103]
 [  0   4 139   0   0]
 [  3 182   4   0   3]]

---------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,0.976471
4,0.79845
5,0.972028
6,0.0



Overall Purity: 0.67100


****************************************************************************************************
Results: 8
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 4
Validity Index: 0.64507
UMAP Number of Components: 3
UMAP Min Distance: 0.1
UMAP Number of Neighbors: 15
UMAP Trustworthiness: 0.96835

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0]
 [  0   0  40   0]
 [140   0   0   0]
 [  0 254   0   1]
 [  6   4   0 119]
 [  0 143   0   0]
 [  2 186   0   4]]

-------------------------------------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,0.996078
4,0.922481
5,0.0
6,0.0



Overall Purity: 0.55300


****************************************************************************************************
Results: 9
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 4
Validity Index: 0.63897
UMAP Number of Components: 2
UMAP Min Distance: 0.1
UMAP Number of Neighbors: 10
UMAP Trustworthiness: 0.96837

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0]
 [  0   0  40   0]
 [140   0   0   0]
 [  0 255   0   0]
 [  7   5   0 117]
 [  0 143   0   0]
 [  1 187   0   4]]

-------------------------------------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,1.0
4,0.906977
5,0.0
6,0.0



Overall Purity: 0.55200


****************************************************************************************************
Results: 10
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 4
Validity Index: 0.63424
UMAP Number of Components: 3
UMAP Min Distance: 0.1
UMAP Number of Neighbors: 10
UMAP Trustworthiness: 0.96870

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[ 99   2   0   0]
 [  0   0  40   0]
 [140   0   0   0]
 [  0 255   0   0]
 [  7   6   0 116]
 [  0 143   0   0]
 [  2 186   0   4]]

------------------------------------------------------------

Unnamed: 0_level_0,purity
true_label,Unnamed: 1_level_1
0,0.0
1,1.0
2,1.0
3,1.0
4,0.899225
5,0.0
6,0.0



Overall Purity: 0.55100


In [10]:
matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df

Unnamed: 0,algo,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,n_clusters_found,true_num_clusters,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score,true_labels,cluster_labels,matrix_trace,contingency_matrix,remapped_cont_matrix,mapping
0,dbscan,3,0.05,5,0.984051,0.984051,5,7,0.772864,0.56867,0.688071,0.725968,0.162821,0.201145,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [139, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 139, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
1,dbscan,3,0.01,15,0.96878,0.96878,5,7,0.739844,0.595963,0.708737,0.756915,0.183664,0.215668,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,218,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
2,dbscan,3,0.01,15,0.973456,0.973456,5,7,0.734985,0.572957,0.692753,0.735355,0.165292,0.203046,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
3,dbscan,3,0.05,10,0.970124,0.970124,4,7,0.732093,0.412312,0.612829,0.662947,0.057402,0.082261,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,100,"[[99, 2, 0, 0], [0, 0, 40, 0], [139, 0, 0, 1],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 139, 0, 1],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
4,dbscan,2,0.01,15,0.967995,0.967995,5,7,0.720937,0.594251,0.707398,0.754686,0.182827,0.215302,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,217,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [139, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 139, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
5,dbscan,3,0.05,15,0.972711,0.972711,5,7,0.672288,0.572957,0.692753,0.735355,0.165292,0.203046,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
6,dbscan,2,0.1,10,0.97471,0.97471,5,7,0.661111,0.573603,0.69259,0.734721,0.163802,0.202192,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,202,"[[99, 2, 0, 0, 0], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 99, 2, 0, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
7,dbscan,3,0.1,15,0.968348,0.968348,4,7,0.645069,0.408982,0.611291,0.662052,0.057177,0.082035,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,100,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
8,dbscan,2,0.1,10,0.968369,0.968369,4,7,0.638967,0.407088,0.610868,0.663478,0.056799,0.081281,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,99,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"
9,dbscan,3,0.1,10,0.968699,0.968699,4,7,0.634241,0.404124,0.608485,0.657665,0.056571,0.081048,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,99,"[[99, 2, 0, 0], [0, 0, 40, 0], [140, 0, 0, 0],...","[[0, 99, 2, 0], [40, 0, 0, 0], [0, 140, 0, 0],...","{3.0: 4.0, 1.0: 3.0, 0.0: 2.0, 2.0: 1.0}"


In [11]:
keep_cols = ['true_num_clusters', 'umap_n_components','umap_min_dist', 
             'umap_n_neighbors', 'umap_metric', 'trustworthiness', 'algo', 'n_clusters_found',
            'validity_index', 'adj_rand_score', 'fawlks_and_mallows', 'nmi', 'jaccard_score', 'f1_score']

finalized_results_frame = matrix_results_df[keep_cols]

finalized_results_frame

Unnamed: 0,true_num_clusters,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,algo,n_clusters_found,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score
0,7,3,0.05,5,0.984051,0.984051,dbscan,5,0.772864,0.56867,0.688071,0.725968,0.162821,0.201145
1,7,3,0.01,15,0.96878,0.96878,dbscan,5,0.739844,0.595963,0.708737,0.756915,0.183664,0.215668
2,7,3,0.01,15,0.973456,0.973456,dbscan,5,0.734985,0.572957,0.692753,0.735355,0.165292,0.203046
3,7,3,0.05,10,0.970124,0.970124,dbscan,4,0.732093,0.412312,0.612829,0.662947,0.057402,0.082261
4,7,2,0.01,15,0.967995,0.967995,dbscan,5,0.720937,0.594251,0.707398,0.754686,0.182827,0.215302
5,7,3,0.05,15,0.972711,0.972711,dbscan,5,0.672288,0.572957,0.692753,0.735355,0.165292,0.203046
6,7,2,0.1,10,0.97471,0.97471,dbscan,5,0.661111,0.573603,0.69259,0.734721,0.163802,0.202192
7,7,3,0.1,15,0.968348,0.968348,dbscan,4,0.645069,0.408982,0.611291,0.662052,0.057177,0.082035
8,7,2,0.1,10,0.968369,0.968369,dbscan,4,0.638967,0.407088,0.610868,0.663478,0.056799,0.081281
9,7,3,0.1,10,0.968699,0.968699,dbscan,4,0.634241,0.404124,0.608485,0.657665,0.056571,0.081048
