In [None]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier

from pathlib import Path
import os
import sys

from scipy.spatial.distance import euclidean

import hdbscan.validity as dbcv_hdbscan

# Set up imports of the local .py modules by adding the working directory to sys.path

In [None]:
# Current working directory as a plain string (normalised through pathlib).
path = str(Path(os.getcwd()))
print(path)

# Register it at position 1 of the module search path so the .py modules
# living next to this notebook can be imported.
sys.path.insert(1, path)

In [None]:
import utils.make_clustering_data_sets as mcds
import utils.clustering_flows as cf
import utils.external_indices as ei

# helpful functions

# make some data sets

In [None]:
# Flags forwarded to the data set generator; each one is switched on here.
data_set_flags = dict(
    noisy_moons=True,
    well_separated=True,
    center_based=True,
)

# The result is used below as a mapping: data set name -> per-data-set dict.
data_set_dicts = mcds.make_data_sets_to_demo_cluster_validation(**data_set_flags)

In [None]:
# Number of generated data sets, then their names.
print(len(data_set_dicts))
data_set_dicts.keys()

In [None]:
# Name of the first generated data set.
list(data_set_dicts)[0]

In [None]:
# Fields stored for a single data set (inspected on the first one).
first_data_set_name = list(data_set_dicts)[0]
data_set_dicts[first_data_set_name].keys()

In [None]:
# Full contents of the first data set's dictionary.
first_data_set_name = list(data_set_dicts)[0]
data_set_dicts[first_data_set_name]

# cluster the data sets

In [None]:
# Run the clustering flow over the generated data sets. The returned frame is
# read below through its `data_set_name`, `cluster_labels` and `algo` columns.
results_df = cf.clustering_flow_1_test_1(
    data_set_dicts,
    enhanced_k_means=True,
)

In [None]:
# Display the clustering results table.
results_df

# cluster validation using external indices

In [None]:
# One dict of external-index results per data set; turned into a frame at the end.
df_row_dict_list = []
banner = '*' * 80

for data_set_name, data_set_dict in data_set_dicts.items():

    print('\n' + banner)
    print(f'data_set_name: {data_set_name}')

    cf.plot_the_data_sets(data_set_dict['cap_x'], data_set_dict['y'], data_set_name)

    # Rows of the clustering results that belong to this data set; only the
    # first matching row is used for the predicted labels and the algorithm name.
    is_current_data_set = results_df.data_set_name == data_set_name
    labels_true = data_set_dict['y']
    labels_pred = results_df.loc[is_current_data_set, 'cluster_labels'].values[0]
    algo = results_df.loc[is_current_data_set, 'algo'].values[0]

    print(algo)
    if algo == 'dbscan':
        # For dbscan results, both label vectors are replaced by the versions
        # returned from the noise-removal helper before scoring.
        denoised = ei.remove_noise_data_objects_from_labels(labels_pred, labels_true)
        labels_pred, labels_true = denoised['labels_pred'], denoised['labels_true']

    results_dict = ei.external_indices_taught_spring_24(labels_pred, labels_true, print_out=True)

    # Tag the row with its data set name so it can be joined back to results_df.
    df_row_dict = results_dict['df_row_dict']
    df_row_dict['data_set_name'] = data_set_name
    df_row_dict_list.append(df_row_dict)

ei_results_df = pd.DataFrame(df_row_dict_list)

In [None]:
# Display the external-index results, one row per data set.
ei_results_df

# join the external indices results to the results data frame from clustering

In [None]:
# Join the external-index results onto the clustering results by data set name.
#
# This cell rebinds `results_df` to a merge of itself, so running it a second
# time would merge an already-merged frame: pandas would then rename the
# overlapping columns with `_x` / `_y` suffixes and the column selections in
# the cells below would fail. To keep the cell safe to re-run, columns left
# behind by a previous merge are dropped first. On a fresh kernel the guard is
# a no-op and the merge behaves exactly as before.
ei_columns = [col for col in ei_results_df.columns if col != 'data_set_name']

if ei_columns and all(col in results_df.columns for col in ei_columns):
    results_df = results_df.drop(columns=ei_columns)

results_df = pd.merge(results_df, ei_results_df, on='data_set_name')
results_df

# let's check out the results

### moons

In [None]:
data_set_type = 'moons'

# Columns that identify a clustering run, and the score columns to stack.
id_columns = ['data_set_name', 'algo', 'spec_type', 'spec_value', 'contingency_matrix']
score_columns = ['validity_index', 'silhouette_score', 'rand_score', 'adjusted_rand_score']

# Keep this data set type only, reshape to long format (one row per run and
# score) and drop the scores that are missing.
# NOTE(review): the long-format label column is called 'external_index' even
# though it also holds silhouette_score / validity_index - confirm the naming.
temp_results_df = (
    results_df
    .loc[results_df.data_set_type == data_set_type, id_columns + score_columns]
    .melt(id_vars=id_columns, value_vars=score_columns, var_name='external_index')
    .dropna(subset=['value'])
)
temp_results_df.head()

In [None]:
banner = '*' * 80

# Print the contingency matrix once per data set (first matching row).
for data_set_name in temp_results_df.data_set_name.unique():
    is_current_data_set = temp_results_df.data_set_name == data_set_name
    contingency_matrix = temp_results_df.loc[is_current_data_set, 'contingency_matrix'].values[0]
    print('\n' + banner)
    print(f'data_set_name: {data_set_name}\n')
    print('\nclasses - form the rows of the contingency matrix')
    print('clusters - form the columns of the contingency matrix\n')
    print(contingency_matrix)

# Score against spec_value: colour encodes the index, marker the algorithm.
ax = sns.scatterplot(data=temp_results_df, x='spec_value', y='value', hue='external_index', style='algo')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside, right of the axes
spec_type = temp_results_df.spec_type.values[0]
ax.set_title(f'data_set_type: {data_set_type}\nspec_type: {spec_type}')
ax.grid()
plt.show()

### well_separated

In [None]:
data_set_type = 'well_separated'

# Columns that identify a clustering run, and the score columns to stack.
id_columns = ['data_set_name', 'algo', 'spec_type', 'spec_value', 'contingency_matrix']
score_columns = ['validity_index', 'silhouette_score', 'rand_score', 'adjusted_rand_score']

# Keep this data set type only, reshape to long format (one row per run and
# score) and drop the scores that are missing.
# NOTE(review): the long-format label column is called 'external_index' even
# though it also holds silhouette_score / validity_index - confirm the naming.
temp_results_df = (
    results_df
    .loc[results_df.data_set_type == data_set_type, id_columns + score_columns]
    .melt(id_vars=id_columns, value_vars=score_columns, var_name='external_index')
    .dropna(subset=['value'])
)
temp_results_df.head()

In [None]:
# Order the rows from the largest spec_value to the smallest, so the data sets
# are reported in that order below.
temp_results_df = temp_results_df.sort_values('spec_value', ascending=False)

banner = '*' * 80

# Print the contingency matrix once per data set (first matching row).
for data_set_name in temp_results_df.data_set_name.unique():
    is_current_data_set = temp_results_df.data_set_name == data_set_name
    contingency_matrix = temp_results_df.loc[is_current_data_set, 'contingency_matrix'].values[0]
    print('\n' + banner)
    print(f'data_set_name: {data_set_name}\n')
    print('\nclasses - form the rows of the contingency matrix')
    print('clusters - form the columns of the contingency matrix\n')
    print(contingency_matrix)

# Score against spec_value: colour encodes the index, marker the algorithm.
ax = sns.scatterplot(data=temp_results_df, x='spec_value', y='value', hue='external_index', style='algo')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend outside, right of the axes
spec_type = temp_results_df.spec_type.values[0]
ax.set_title(f'data_set_type: {data_set_type}\nspec_type: {spec_type}')
ax.grid()
plt.show()