## Final Project: Phase 3 - Validation with External Indices
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/21/2024  
___

In [1]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics.cluster import (adjusted_rand_score, contingency_matrix, 
                                     fowlkes_mallows_score, normalized_mutual_info_score)
from sklearn.metrics import (jaccard_score, f1_score, homogeneity_score)

# pathing
from pathlib import Path
import os
import sys

### Set Paths

In [2]:
# define path
HOME = Path(os.getcwd())
print(HOME)

HOME_PARENT_STR = str(HOME.parent)
print(HOME_PARENT_STR)

# path to figs folder
PATH_TO_FIGS_FOLDER = HOME_PARENT_STR + '/figs'

# path to data
PATH_TO_DATA_FOLDER = HOME_PARENT_STR + '/data'

# path to src folder
PATH_TO_SRC = HOME_PARENT_STR + '/src'
print(PATH_TO_SRC)

# sys path
sys.path.append(PATH_TO_SRC)

/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/notebooks
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final
/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final/src


### Import Functions

In [3]:
import cluster_utils as cu
import external_indices_utils as exi

### Import Results DF & Target DF

In [30]:
# transformed data csv file name
results_file = "/results/filtered_results_2024-04-1722:42:11.819218.csv"
results_df = pd.read_csv(PATH_TO_DATA_FOLDER + results_file)

target_sample = "/curated/sampled_target.csv"
target_df = pd.read_csv(PATH_TO_DATA_FOLDER + target_sample)

In [32]:
results_df['cluster_labels'][0]

'[0 1 0 ... 1 0 1]'

In [31]:
# convert cluster_labels elements back to numpy array
results_df['cluster_labels'] = results_df['cluster_labels'].apply(exi.convert_string_to_array)

ValueError: invalid literal for int() with base 10: '...'

In [23]:
target_df

Unnamed: 0,ID,Target
0,4148,2
1,13224,3
2,9754,6
3,9159,6
4,10139,3
...,...,...
995,13402,3
996,12266,3
997,1657,5
998,3494,1


In [24]:
# get the true number of clusters
num_true_labels = len(np.unique(target_df.Target.values))

In [25]:
results_df = results_df[results_df.index == 0]

In [26]:
results_df

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,cluster_labels
0,k_means,6,15.0,15.0,15.0,0.619192,0.048962,10,0.0,euclidean,2,0.999496,,,,,"[0, 8, 1, 1, 10, 2, 4, 10, 13, 10, 5, 0, 3, 1,..."


___

### Compute External Indices
___

This cell will compute the external indices used to validate the clutering solution.

The following external indices will the used:

* ```Adjusted Rand Score``` <a href="#ref1">[1]</a>
  * The Rand index is a measure similarity between two clustering solutions by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
  * This Rand index is then adjusted for change with:
    <br><br>
    * $ARI = \dfrac{RI - \text{Expected\_RI}}{\textit{max(RI)} - \text{Expected\_RI}}$
* ```Jaccard``` <a href="#ref1">[2]</a>
  * The size of the intersection divided by the size of the union of the two labeled sets.
  * We will use ```average = macro```, this returns the mean of all the independently computed Jaccard indices for each true label. This is acceptble when the true label frequencies are similar.
  <br><br>
* ```Fawlks and Mallows Index```
  * The geometric mean between precision and recall
  <br><br>
  * $FMI = \dfrac{TP}{\sqrt{(TP + FP) \cdot (TP + FN)}}$<a href="#ref1">[3]</a>
  <br><br>
* ```F-Measure``` <a href="#ref1">[4]</a>
  * The harmonic mean of the precision and recall.
  <br><br>
  * $F1 = \dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$
  <br><br>
* ```Purity ~ Homogeneity``` <a href="#ref1">[5]</a>
  * Homogeneity measures the extent to which cluster labels contain only elements of single class.

<p id="ref1"><sup>[1]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html">this link</a>.</p>
<p id="ref1"><sup>[2]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html">this link</a>.</p>
<p id="ref1"><sup>[3]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fowlkes_mallows_score.html">this link</a>.</p>
<p id="ref1"><sup>[4]</sup> scikit-learn.org. More details available at <a href="hhttps://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html">this link</a>.</p>
<p id="ref1"><sup>[5]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html">this link</a>.</p>



In [29]:
dataframe_dict = {}
df_row_dict_list = []
i = 1
for idx, row in results_df.iterrows():
    print('\n')
    print('*' * 100)
    print(f'Results: {i}')
    print('*'*100)
    print()

    print(f'UMAP & CLUSTER ALGORITHM INFORMATION:')
    print('-'*100)
    
    # get algo
    algo = row['algo']

    ###################################################
    ### Get UMAP & Cluster Algorithm Information ####
    ###################################################

    n_components = row['umap_n_components']
    min_dist = row['umap_min_dist']
    n_neighbors = row['umap_n_neighbors']
    trustworthiness = row['trustworthiness']
    n_clusters_found = row['n_clusters_found']
    validity_index = row['validity_index']
    silhouette_score = row['silhouette_score']
    umap_metric = row['umap_metric']

    print(f"Algorithm: {algo}")
    print(f'Number of Clusers Found: {n_clusters_found}')
    if algo == 'dbscan':
        print(f"Validity Index: {validity_index:.5f}")
    else:
        print(f'Silhouette Score: {silhouette_score:.5f}')
    print(f'UMAP Number of Components: {n_components}')
    print(f'UMAP Min Distance: {min_dist}')
    print(f'UMAP Number of Neighbors: {n_neighbors}')
    print(f'UMAP Metric: {umap_metric}')
    print(f'UMAP Trustworthiness: {trustworthiness:.5f}')

    # create dataframe with columns for every value of n_components
    cluster_labels_df = pd.DataFrame()

    # add cluster labels to dataframe from results_df
    cluster_labels_df.loc[:, str(n_components)] = row['cluster_labels']

    # concatonat target vector dataframe with clusterlabels_df
    labels_df = pd.concat([target_df, cluster_labels_df], axis=1)
    
    # drop indicies with noise points
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices)
    labels_df = labels_df.astype('int64')

    dataframe_dict[f'df_{str(n_components)}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, str(n_components)]

    ########################################
    ######## Get External Indices #########
    ########################################

    # get adj rand score and add to dataframe
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    results_df.loc[idx, 'adjusted_rand_score'] = adj_rand

    # get falks and mallows score
    fawlks_and_mallows_ = fowlkes_mallows_score(true_labels, cluster_labels)

    # completeness
    homogeneity_score_ = homogeneity_score(true_labels, cluster_labels)

    # get F-1 score
    label = np.unique(true_labels)
    f1_score_ = f1_score(true_labels, cluster_labels, average = 'macro')

    # jaccard score
    jaccard_score_ = jaccard_score(true_labels, cluster_labels, average = 'macro')

    # ge the normalized mutual info score
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # get contigency matrices for all permutatons of cluster labels
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)

    # get optimized contingency matrix
    modes_df = exi.get_modes(n_components, labels_df)
    cluster_mapping = exi.get_mapping(n_components, labels_df, modes_df)
    labels_df[str(n_components)] = labels_df.loc[:, str(n_components)].map(cluster_mapping)
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[str(n_components)])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # retrieve contigency matrix with highest trace
    np.set_printoptions(linewidth=200)
    print()
    print('-'*100)
    print('CONTINGENCY MATRIX')
    print('-'*100)
    print('Contingency Matrix: ')
    print()
    print(cont_matrix)
    print()
    print('-'*100)
    print('REMAPPED CONTINGENCY MATRIX')
    print('-'*100)
    print()
    print('Best Mapping: ', dict(sorted(cluster_mapping.items())))
    print()
    print('Contingency Matrix: ')
    print()
    print(remapped_cont_matrix)
    print()
    print('-'*100)
    print('EXTERMAL INDICES: PERFORMANCE EVALUATION METRICS')
    print('-'*100)
    print()
    print(f'Adjusted Rand Score: {adj_rand:.5f}')
    print(f'Fawlks and Mallows Score: {fawlks_and_mallows_:.5f}')
    print(f'F-1 Score: {f1_score_:.5f}')
    print(f'Jaccard Score: {jaccard_score_:.5f}')
    print(f'Normalized Mutual Info Score: {nmi:.5f}')
    print(f'Homogeneity Score (Purity): {homogeneity_score_:.5f}')
    print()

    df_row_dict_list.append({
                        'algo': algo,
                        'umap_n_components': n_components,
                        'umap_min_dist': min_dist,
                        'umap_n_neighbors': n_neighbors,
                        'umap_metric': umap_metric,
                        'trustworthiness': trustworthiness,
                        'n_clusters_found': n_clusters_found,
                        'true_num_clusters': num_true_labels,
                        'validity_index': validity_index,
                        'adj_rand_score': adj_rand,
                        'fawlks_and_mallows': fawlks_and_mallows_,
                        'nmi': nmi,
                        'jaccard_score': jaccard_score_,
                        'f1_score': f1_score_,
                        'Overall Purity': overall_purity,
                        'true_labels': true_labels,
                        'cluster_labels': cluster_labels,
                        'matrix_trace': matrix_trace,
                        'contingency_matrix': cont_matrix,
                        'remapped_cont_matrix': remapped_cont_matrix,
                        'mapping': cluster_mapping
                        })
    i += 1




****************************************************************************************************
Results: 1
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: k_means
Number of Clusers Found: 6
Silhouette Score: 0.61919
UMAP Number of Components: 2
UMAP Min Distance: 0.0
UMAP Number of Neighbors: 10
UMAP Metric: euclidean
UMAP Trustworthiness: 0.99950

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[  0   0   0   0   0  64  37   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   6   0   0   0   0   0   0  34   0   0]
 [ 91   0   0   0   0   0   0   0   0   0   0   0  15  34   0]
 [  0 

In [151]:
matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df.sort_values(by='adj_rand_score', ascending=False).head(10)

Unnamed: 0,algo,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,n_clusters_found,true_num_clusters,validity_index,adj_rand_score,...,nmi,jaccard_score,f1_score,Overall Purity,true_labels,cluster_labels,matrix_trace,contingency_matrix,remapped_cont_matrix,mapping
67,dbscan,3,0.1,10,correlation,0.902435,6,7,0.783354,0.668531,...,0.832471,0.00758,0.014396,0.776,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,13,"[[13, 0, 0, 83, 0, 5], [0, 0, 0, 0, 40, 0], [1...","[[83, 0, 13, 0, 5, 0], [0, 40, 0, 0, 0, 0], [0...","{2.0: 5.0, 5.0: 4.0, 1.0: 3.0, 0.0: 2.0, 4.0: ..."
51,dbscan,3,0.0,10,correlation,0.903423,7,7,0.834033,0.633828,...,0.809283,0.00758,0.014396,0.775551,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,13,"[[13, 0, 0, 83, 0, 5, 0], [0, 0, 0, 0, 40, 0, ...","[[83, 0, 13, 0, 5, 0, 0], [0, 40, 0, 0, 0, 0, ...","{6.0: 5.0, 2.0: 8, 5.0: 4.0, 1.0: 3.0, 0.0: 2...."
45,dbscan,2,0.5,200,canberra,0.826707,14,7,0.141427,0.629044,...,0.753252,0.055828,0.070123,0.848624,0 2 1 3 2 6 4 3 5 5  ...,0 2 1 0 2 0 4 0 5 1  ...,145,"[[0, 0, 52, 1, 35, 0, 0, 0, 0, 0, 0, 0, 0, 0],...","[[35, 0, 52, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],...","{9.0: 6.0, 6.0: 8, 3.0: 9, 1.0: 5.0, 8.0: 4.0,..."
23,dbscan,2,0.1,25,correlation,0.898561,5,7,0.779947,0.622249,...,0.798171,0.182508,0.21442,0.692693,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,216,"[[96, 0, 0, 0, 4], [0, 0, 0, 40, 0], [138, 0, ...","[[0, 96, 0, 4, 0], [40, 0, 0, 0, 0], [0, 138, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
19,dbscan,2,0.1,10,correlation,0.901673,5,7,0.81325,0.620845,...,0.796476,0.180523,0.2133,0.692,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,215,"[[96, 0, 0, 0, 5], [0, 0, 0, 40, 0], [138, 0, ...","[[0, 96, 0, 5, 0], [40, 0, 0, 0, 0], [0, 138, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
47,dbscan,2,0.5,200,correlation,0.846848,5,7,0.52632,0.616344,...,0.80398,0.21267,0.240619,0.688912,1 3 2 6 3 6 4 3 5 5  ...,1 0 2 0 3 0 4 0 5 1  ...,246,"[[1, 0, 97, 0, 0], [0, 0, 0, 40, 0], [0, 0, 13...","[[0, 97, 1, 0, 0], [40, 0, 0, 0, 0], [0, 135, ...","{1.0: 5.0, 4.0: 4.0, 0.0: 3.0, 2.0: 2.0, 3.0: ..."
11,dbscan,2,0.0,100,correlation,0.90044,5,7,0.849916,0.614524,...,0.790805,0.185931,0.217091,0.69,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,217,"[[98, 2, 0, 0, 1], [0, 0, 0, 40, 0], [136, 2, ...","[[0, 98, 2, 1, 0], [40, 0, 0, 0, 0], [0, 136, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
3,dbscan,2,0.0,10,correlation,0.900346,6,7,0.839903,0.58951,...,0.775746,0.254501,0.310456,0.693,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,289,"[[96, 0, 0, 0, 5, 0], [0, 0, 0, 40, 0, 0], [13...","[[0, 96, 0, 5, 0, 0], [40, 0, 0, 0, 0, 0], [0,...","{5.0: 5.0, 2.0: 8, 4.0: 4.0, 1.0: 3.0, 0.0: 2...."
61,dbscan,3,0.0,200,canberra,0.831473,5,7,0.734023,0.571522,...,0.719546,0.154306,0.195688,0.67968,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,200,"[[96, 3, 0, 0, 2], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 96, 3, 2, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."
13,dbscan,2,0.0,200,canberra,0.827794,5,7,0.740203,0.563951,...,0.714212,0.151246,0.193774,0.676,0 2 1 3 2 6 3 6 4 3  ...,0 0 1 1 2 1 3 1 4 1  ...,198,"[[97, 3, 0, 0, 1], [0, 0, 0, 40, 0], [140, 0, ...","[[0, 97, 3, 1, 0], [40, 0, 0, 0, 0], [0, 140, ...","{2.0: 5.0, 4.0: 4.0, 1.0: 3.0, 0.0: 2.0, 3.0: ..."


In [152]:
keep_cols = ['true_num_clusters', 'umap_n_components','umap_min_dist', 
             'umap_n_neighbors', 'umap_metric', 'trustworthiness', 'algo', 'n_clusters_found',
            'validity_index', 'adj_rand_score', 'fawlks_and_mallows', 'nmi', 'jaccard_score', 'f1_score']

finalized_results_frame = matrix_results_df[keep_cols]

finalized_results_frame

Unnamed: 0,true_num_clusters,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,algo,n_clusters_found,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score
0,7,2,0.0,10,euclidean,0.999496,k_means,6,,0.468801,0.571041,0.728136,0.000000,0.000000
1,7,2,0.0,10,canberra,0.901121,dbscan,2,0.995394,0.033437,0.428567,0.167015,0.157887,0.170055
2,7,2,0.0,10,chebyshev,0.999475,k_means,6,,0.467099,0.571623,0.738766,0.000000,0.000000
3,7,2,0.0,10,correlation,0.900346,dbscan,6,0.839903,0.589510,0.701371,0.775746,0.254501,0.310456
4,7,2,0.0,25,euclidean,0.999059,dbscan,24,0.688936,0.384459,0.497744,0.689202,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,7,3,0.5,100,correlation,0.886426,dbscan,4,0.670634,0.520151,0.676342,0.732198,0.039032,0.061312
92,7,3,0.5,200,euclidean,0.995012,dbscan,54,0.418418,0.189713,0.345328,0.622867,0.000747,0.001436
93,7,3,0.5,200,canberra,0.833280,dbscan,3,0.203035,0.163993,0.490860,0.452209,0.017658,0.031432
94,7,3,0.5,200,chebyshev,0.987360,dbscan,71,0.368757,0.128756,0.281549,0.598843,0.001137,0.002105
