In [6]:
## Libraries

# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import random


# preprocessing
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler

# data
from sklearn.datasets import load_digits

# clustering
from sklearn.cluster import DBSCAN, KMeans

#external indices
from sklearn.metrics.cluster import adjusted_rand_score, contingency_matrix

# runtime and run tracking
import time
from datetime import datetime

# pathing
from pathlib import Path
import os
import sys

In [7]:
# Parent of the notebook's current working directory, kept as a plain string
# because the module path is later built from it by string concatenation.
path = str(Path.cwd().parent)

/Users/nelsonfarrell/Documents/Northeastern/5230/final_project/DS5230-final


In [10]:
# Make the `src` directory under `path` importable; it is appended to
# sys.path only when not already present, so re-running the cell is harmless.
module_path = '/'.join((path, 'src'))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [11]:
# modules and util
import dimensionality_reduction as dr
import clustering as cl
from cluster_utils import *
from external_indices_utils import *

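# import results
# TODO: load `best_results_df` and `target_vector_shuffled` here -- the analysis cell below reads both, but no visible cell defines them.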


In [None]:
# Score every clustering run in `best_results_df` against the target labels.
#
# For each row this cell:
#   1. pairs the row's cluster labels with `target_vector_shuffled`,
#   2. drops every record in which any column equals -1 (noise points),
#   3. computes the adjusted Rand score and the raw contingency matrix,
#   4. relabels the clusters through `get_modes` / `get_mapping` and rebuilds
#      the contingency matrix from the relabelled clusters,
#   5. prints a report and appends one record to `df_row_dict_list`.
#
# Results:
#   dataframe_dict      -- 'df_<n_components>' -> labels frame of that run
#   best_results_df     -- gains/overwrites the 'adjusted_rand_score' column
#   matrix_results_df   -- one row per run with scores, matrices and mapping


def _print_banner(title):
    """Print a blank gap followed by `title` framed by two rows of 100 asterisks."""
    print('\n')
    print('*'*100)
    print(title)
    print('*'*100)


# Loop-invariant: widen numpy's print width once so matrix rows do not wrap.
np.set_printoptions(linewidth=200)

dataframe_dict = {}
df_row_dict_list = []
adj_rand_scores = []  # collected here; written to best_results_df after the loop

for idx, row in best_results_df.iterrows():
    _print_banner('Clustering Results')
    print(row)

    # get algo and n_components for this run
    algo = row['algo']
    n_components = row['umap_n_components']
    cluster_col = str(n_components)  # column name used for this run's cluster labels

    # single-column frame holding this run's cluster labels
    cluster_labels_df = pd.DataFrame({cluster_col: row['cluster_labels']})

    # concatenate the target vector with the cluster labels (aligned on index)
    # NOTE(review): assumes `target_vector_shuffled` shares the index of the
    # cluster labels; a mismatch would introduce NaNs and make astype() fail.
    labels_df = pd.concat([target_vector_shuffled, cluster_labels_df], axis=1)

    # drop rows containing noise points (-1 in any column), then force ints
    noise_points_row_indices = labels_df.index[(labels_df == -1).any(axis=1)]
    labels_df = labels_df.drop(noise_points_row_indices).astype('int64')

    # The stored frame is the same object that is relabelled further down, so
    # the dict entry ends up holding the remapped cluster labels.
    # NOTE(review): the key depends on n_components only; two runs with the
    # same n_components (e.g. different algos) would overwrite each other.
    dataframe_dict[f'df_{cluster_col}'] = labels_df

    # get labels
    true_labels = labels_df.loc[:, 'Target']
    cluster_labels = labels_df.loc[:, cluster_col]

    # adjusted Rand score for this run
    adj_rand = adjusted_rand_score(true_labels, cluster_labels)
    adj_rand_scores.append(adj_rand)

    # contingency matrix of the raw cluster ids; its trace sums the diagonal
    cont_matrix = contingency_matrix(true_labels, cluster_labels)
    matrix_trace = np.trace(cont_matrix)

    # relabel clusters using the project helpers
    modes_df = get_modes(n_components, labels_df)
    modes_df.info()  # info() prints by itself and returns None -- do not wrap in print()
    cluster_mapping = get_mapping(n_components, labels_df, modes_df)

    labels_df[cluster_col] = labels_df.loc[:, cluster_col].map(cluster_mapping)

    # NOTE(review): the trace only counts matches when the remapped labels and
    # the target labels cover the same set of values -- confirm for runs where
    # the number of clusters differs from the number of classes.
    remapped_cont_matrix = contingency_matrix(labels_df['Target'], labels_df[cluster_col])
    remapped_matrix_trace = np.trace(remapped_cont_matrix)

    # report
    _print_banner('CONTINGENCY MATRIX')
    print('algo: ', algo)
    print('n_components: ', n_components)
    print('Adjusted Rand Score: ', adj_rand)
    print('True Labels: ', true_labels.values)
    print('Clustering Predicted Labels: ', cluster_labels.values)
    print('Matrix Trace: ', matrix_trace)
    print('Contingency Matrix: ')
    print(cont_matrix)
    print('\n')
    print('REMAPPED CONTINGENCY MATRIX')
    print('Remapped Matrix Trace: ', remapped_matrix_trace)
    print('Contingency Matrix: ')
    print(remapped_cont_matrix)
    print('\n')
    print('Mapping: ', dict(sorted(cluster_mapping.items())))

    # Keys become the column names of matrix_results_df, so they are plain
    # identifiers (no trailing ': ' carried over from the print labels).
    df_row_dict_list.append({
        'algo': algo,
        'n_components': n_components,
        'adj_rand_score': adj_rand,
        'true_labels': true_labels,
        'cluster_labels': cluster_labels,
        'matrix_trace': matrix_trace,
        'remapped_matrix_trace': remapped_matrix_trace,
        'contingency_matrix': cont_matrix,
        'remapped_cont_matrix': remapped_cont_matrix,
        'mapping': cluster_mapping
    })

# Written once, in row order, instead of mutating the frame while iterating it.
best_results_df['adjusted_rand_score'] = adj_rand_scores

matrix_results_df = pd.DataFrame(df_row_dict_list)

matrix_results_df