## Final Project: Phase 3 - Validation with External Indices
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/21/2024  
___

In [1]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics.cluster import (adjusted_rand_score, contingency_matrix, 
                                     fowlkes_mallows_score, normalized_mutual_info_score)
from sklearn.metrics import (jaccard_score, f1_score, homogeneity_score)

# pathing
from pathlib import Path
import os
import sys

### Set Paths

In [2]:
# define path
HOME = Path(os.getcwd())
print(HOME)

HOME_PARENT_STR = str(HOME.parent)
print(HOME_PARENT_STR)

# path to figs folder
PATH_TO_FIGS_FOLDER = HOME_PARENT_STR + '/figs'

# path to data
PATH_TO_DATA_FOLDER = HOME_PARENT_STR + '/data'

# path to src folder
PATH_TO_SRC = HOME_PARENT_STR + '/src'
print(PATH_TO_SRC)

# sys path
sys.path.append(PATH_TO_SRC)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


### Import Functions

In [3]:
import cluster_utils as cu
import external_indices_utils as exi

### Import Results DF & Target DF

In [4]:
# transformed data csv file name
results_file = "/results/filtered_results_2024-04-1811:12:48.064452.csv"
results_df = pd.read_csv(PATH_TO_DATA_FOLDER + results_file)

target_sample = "/curated/shuffled_target.csv"
target_df = pd.read_csv(PATH_TO_DATA_FOLDER + target_sample)

In [5]:
# convert cluster_labels elements back to numpy array
results_df['cluster_labels'] = results_df['cluster_labels'].apply(exi.convert_string_to_array)

In [6]:
target_df

Unnamed: 0,ID,Target
0,1488,5
1,2611,0
2,749,5
3,99,5
4,11298,3
...,...,...
13606,5191,2
13607,13418,3
13608,5390,2
13609,860,5


In [7]:
# get the true number of clusters
num_true_labels = len(np.unique(target_df.Target.values))

___

### Compute External Indices
___

This cell will compute the external indices used to validate the clutering solution.

The following external indices will the used:

* ```Adjusted Rand Score``` <a href="#ref1">[1]</a>
  * The Rand index is a measure similarity between two clustering solutions by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted and true clusterings.
  * This Rand index is then adjusted for change with:
    <br><br>
    * $ARI = \dfrac{RI - \text{Expected\_RI}}{\textit{max(RI)} - \text{Expected\_RI}}$
* ```Jaccard``` <a href="#ref1">[2]</a>
  * The size of the intersection divided by the size of the union of the two labeled sets.
  * We will use ```average = macro```, this returns the mean of all the independently computed Jaccard indices for each true label. This is acceptble when the true label frequencies are similar.
  <br><br>
* ```Fawlks and Mallows Index```
  * The geometric mean between precision and recall
  <br><br>
  * $FMI = \dfrac{TP}{\sqrt{(TP + FP) \cdot (TP + FN)}}$<a href="#ref1">[3]</a>
  <br><br>
* ```F-Measure``` <a href="#ref1">[4]</a>
  * The harmonic mean of the precision and recall.
  <br><br>
  * $F1 = \dfrac{2 \cdot TP}{2 \cdot TP + FP + FN}$
  <br><br>
* ```Purity ~ Homogeneity``` <a href="#ref1">[5]</a>
  * Homogeneity measures the extent to which cluster labels contain only elements of single class.

<p id="ref1"><sup>[1]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html">this link</a>.</p>
<p id="ref1"><sup>[2]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.jaccard_score.html">this link</a>.</p>
<p id="ref1"><sup>[3]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.fowlkes_mallows_score.html">this link</a>.</p>
<p id="ref1"><sup>[4]</sup> scikit-learn.org. More details available at <a href="hhttps://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html">this link</a>.</p>
<p id="ref1"><sup>[5]</sup> scikit-learn.org. More details available at <a href="https://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html">this link</a>.</p>



In [8]:
dataframe_dict = {}
df_row_dict_list = []
i = 1
for idx, row in results_df.iterrows():
    print('\n')
    print('*' * 100)
    print(f'Results: {i}')
    print('*'*100)
    print()

    print(f'UMAP & CLUSTER ALGORITHM INFORMATION:')
    print('-'*100)
    
    # get algo
    algo = row['algo']

    ###################################################
    ### Get UMAP & Cluster Algorithm Information ####
    ###################################################

    n_components = row['umap_n_components']
    min_dist = row['umap_min_dist']
    n_neighbors = row['umap_n_neighbors']
    trustworthiness = row['trustworthiness']
    n_clusters_found = row['n_clusters_found']
    validity_index = row['validity_index']
    silhouette_score = row['silhouette_score']
    umap_metric = row['umap_metric']

    print(f"Algorithm: {algo}")
    print(f'Number of Clusers Found: {n_clusters_found}')
    if algo == 'dbscan':
        print(f"Validity Index: {validity_index:.5f}")
    else:
        print(f'Silhouette Score: {silhouette_score:.5f}')
    print(f'UMAP Number of Components: {n_components}')
    print(f'UMAP Min Distance: {min_dist}')
    print(f'UMAP Number of Neighbors: {n_neighbors}')
    print(f'UMAP Metric: {umap_metric}')
    print(f'UMAP Trustworthiness: {trustworthiness:.5f}')

    # create dataframe with columns for every value of n_components
    cluster_labels_df = pd.DataFrame()

    # add cluster labels to dataframe from results_df
    cluster_labels_df.loc[:, str(n_components)] = row['cluster_labels']

    # concatonat target vector dataframe with clusterlabels_df
    labels_df = pd.concat([target_df, cluster_labels_df], axis=1)
    
    # drop indicies with noise points
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices)
    labels_df = labels_df.astype('int64')

    dataframe_dict[f'df_{str(n_components)}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, str(n_components)]

    ########################################
    ######## Get External Indices #########
    ########################################

    # get adj rand score and add to dataframe
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    results_df.loc[idx, 'adjusted_rand_score'] = adj_rand

    # get falks and mallows score
    fawlks_and_mallows_ = fowlkes_mallows_score(true_labels, cluster_labels)

    # completeness
    homogeneity_score_ = homogeneity_score(true_labels, cluster_labels)

    # get F-1 score
    label = np.unique(true_labels)
    f1_score_ = f1_score(true_labels, cluster_labels, average = 'macro')

    # jaccard score
    jaccard_score_ = jaccard_score(true_labels, cluster_labels, average = 'macro')

    # ge the normalized mutual info score
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # get contigency matrices for all permutatons of cluster labels
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)

    # get optimized contingency matrix
    modes_df = exi.get_modes(n_components, labels_df)
    cluster_mapping = exi.get_mapping(n_components, labels_df, modes_df)
    labels_df[str(n_components)] = labels_df.loc[:, str(n_components)].map(cluster_mapping)
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[str(n_components)])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # retrieve contigency matrix with highest trace
    np.set_printoptions(linewidth=200)
    print()
    print('-'*100)
    print('CONTINGENCY MATRIX')
    print('-'*100)
    print('Contingency Matrix: ')
    print()
    print(cont_matrix)
    print()
    print('-'*100)
    print('REMAPPED CONTINGENCY MATRIX')
    print('-'*100)
    print()
    print('Best Mapping: ', dict(sorted(cluster_mapping.items())))
    print()
    print('Contingency Matrix: ')
    print()
    print(remapped_cont_matrix)
    print()
    print('-'*100)
    print('EXTERMAL INDICES: PERFORMANCE EVALUATION METRICS')
    print('-'*100)
    print()
    print(f'Adjusted Rand Score: {adj_rand:.5f}')
    print(f'Fawlks and Mallows Score: {fawlks_and_mallows_:.5f}')
    print(f'F-1 Score: {f1_score_:.5f}')
    print(f'Jaccard Score: {jaccard_score_:.5f}')
    print(f'Normalized Mutual Info Score: {nmi:.5f}')
    print(f'Homogeneity Score (Purity): {homogeneity_score_:.5f}')
    print()

    df_row_dict_list.append({
                        'algo': algo,
                        'umap_n_components': n_components,
                        'umap_min_dist': min_dist,
                        'umap_n_neighbors': n_neighbors,
                        'umap_metric': umap_metric,
                        'trustworthiness': trustworthiness,
                        'n_clusters_found': n_clusters_found,
                        'true_num_clusters': num_true_labels,
                        'validity_index': validity_index,
                        'adj_rand_score': adj_rand,
                        'fawlks_and_mallows': fawlks_and_mallows_,
                        'nmi': nmi,
                        'jaccard_score': jaccard_score_,
                        'f1_score': f1_score_,
                        'Homogeneity': homogeneity_score_,
                        'true_labels': true_labels,
                        'cluster_labels': cluster_labels,
                        'matrix_trace': matrix_trace,
                        'contingency_matrix': cont_matrix,
                        'remapped_cont_matrix': remapped_cont_matrix,
                        'mapping': cluster_mapping
                        })
    i += 1




****************************************************************************************************
Results: 1
****************************************************************************************************

UMAP & CLUSTER ALGORITHM INFORMATION:
----------------------------------------------------------------------------------------------------
Algorithm: dbscan
Number of Clusers Found: 4
Validity Index: 0.99987
UMAP Number of Components: 7
UMAP Min Distance: 0
UMAP Number of Neighbors: 1000
UMAP Metric: cosine
UMAP Trustworthiness: 0.84725

----------------------------------------------------------------------------------------------------
CONTINGENCY MATRIX
----------------------------------------------------------------------------------------------------
Contingency Matrix: 

[[   7 1315    0    0]
 [   0    0    0  522]
 [   0 1630    0    0]
 [   0 3546    0    0]
 [   0 1928    0    0]
 [1188   63  776    0]
 [   0 2636    0    0]]

--------------------------------------

In [9]:
matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df.sort_values(by='adj_rand_score', ascending=False).head(10)

Unnamed: 0,algo,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,n_clusters_found,true_num_clusters,validity_index,adj_rand_score,...,nmi,jaccard_score,f1_score,Homogeneity,true_labels,cluster_labels,matrix_trace,contingency_matrix,remapped_cont_matrix,mapping
1,k_means,7,0,1000,euclidean,0.999931,15,7,,0.428273,...,0.719847,0.028679,0.040106,0.89077,0 5 1 0 2 5 3 5 4 ...,0 12 1 7 2 5 3 ...,872,"[[0, 0, 0, 602, 0, 0, 0, 720, 0, 0, 0, 0, 0, 0...","[[720, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","{9.0: 6.0, 2.0: 8, 11.0: 9, 12.0: 5.0, 5.0: 10..."
0,dbscan,7,0,1000,cosine,0.84725,4,7,0.999869,0.142414,...,0.436957,0.000398,0.000795,0.297941,0 5 1 0 2 5 3 5 4 ...,0 0 1 1 2 0 3 2 4 ...,7,"[[7, 1315, 0, 0], [0, 0, 0, 522], [0, 1630, 0,...","[[0, 1315, 7, 0], [522, 0, 0, 0], [0, 1630, 0,...","{0.0: 5.0, 2.0: 8, 1.0: 3.0, 3.0: 1.0}"


In [10]:
keep_cols = ['true_num_clusters', 'umap_n_components','umap_min_dist', 
             'umap_n_neighbors', 'umap_metric', 'trustworthiness', 'algo', 'n_clusters_found',
            'validity_index', 'adj_rand_score', 'fawlks_and_mallows', 'nmi', 'jaccard_score', 'f1_score']

finalized_results_frame = matrix_results_df[keep_cols]

finalized_results_frame

Unnamed: 0,true_num_clusters,umap_n_components,umap_min_dist,umap_n_neighbors,umap_metric,trustworthiness,algo,n_clusters_found,validity_index,adj_rand_score,fawlks_and_mallows,nmi,jaccard_score,f1_score
0,7,7,0,1000,cosine,0.84725,dbscan,4,0.999869,0.142414,0.471018,0.436957,0.000398,0.000795
1,7,7,0,1000,euclidean,0.999931,k_means,15,,0.428273,0.538846,0.719847,0.028679,0.040106
