## Final Project: Phase 3 - Avoiding False Discoveries
Spring 2024  
Group: Michael Massone and Joseph Nelson Farrell   
DS 5230 Unsupervised Machine Learning  
Professor Steven Morin, PhD  
Due: 04/17/2024  
___

### Import Libraries

In [1]:
# base
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


# pathing
from pathlib import Path
import os
import sys

import umap

### Set Paths

In [2]:
# working directory of the notebook and, as a string, its parent folder
HOME = Path.cwd()
print(HOME)

HOME_PARENT_STR = str(HOME.parent)
print(HOME_PARENT_STR)

# sub-folders of the parent directory; kept as plain strings because later
# cells build file names by concatenating '/'-prefixed relative paths onto them
PATH_TO_FIGS_FOLDER = f'{HOME_PARENT_STR}/figs'
PATH_TO_DATA_FOLDER = f'{HOME_PARENT_STR}/data'
PATH_TO_SRC = f'{HOME_PARENT_STR}/src'
print(PATH_TO_SRC)

# put the src folder on the module search path so its modules can be imported
sys.path.append(PATH_TO_SRC)

/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/notebooks
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final
/Users/mikey/LocalFiles/DS5230/final_project/DS5230-final/src


### Import Functions

In [3]:
import afd_utils
import cluster_utils

### Import Data

In [4]:
# results file, given relative to the data folder (note the leading '/')
results_frame_file = '/results/filtered_results_2024-04-1811:12:48.064452.csv'

# read the results into a DataFrame
results_frame = pd.read_csv(f'{PATH_TO_DATA_FOLDER}{results_frame_file}')

### Check for Kmeans

In [5]:
# report whether any row of the results frame was produced by k-means;
# NOTE: this only prints a message - it does not stop execution by itself
print(
    "There is a Kmeans Solution. Proceed"
    if results_frame['algo'].eq('k_means').any()
    else 'There is not Kmeans solution to check. Exit notebook.'
)

There is a Kmeans Solution. Proceed


### Filter for Kmeans

In [6]:
# keep only the rows whose algorithm is k_means, then display them
results_frame = results_frame.loc[
    results_frame["algo"].eq("k_means")
]
results_frame

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
1,k_means,15,15.0,15.0,15.0,0.516789,0.002807,1000,0,euclidean,7,0.999931,,,,,,"12,7,5,5,1,5,10,12,4,8,14,11,6,0,8,3,14,2,7,11..."


### Get Best Kmeans

In [7]:
# order the solutions from highest to lowest silhouette score and renumber
# the rows, so the best-scoring solution sits at index label 0
results_frame = (
    results_frame
    .sort_values('silhouette_score', ascending=False)
    .reset_index(drop=True)
)

# extract the top row as a one-row frame and display it
best_result = results_frame.loc[0, :].to_frame().T
best_result

Unnamed: 0,algo,n_clusters_found,n_clusters_db_score_is_min,n_clusters_ch_score_is_max,n_clusters_silhouette_score_is_max,silhouette_score,hopkins_statistic,umap_n_neighbors,umap_min_dist,umap_metric,umap_n_components,trustworthiness,eps,dbscan_min_samples,dbscan_metric,validity_index,noise_ratio,cluster_labels
0,k_means,15,15.0,15.0,15.0,0.516789,0.002807,1000,0,euclidean,7,0.999931,,,,,,"12,7,5,5,1,5,10,12,4,8,14,11,6,0,8,3,14,2,7,11..."


### Get Cap X

In [8]:
# get the design matrix
design_file = '/curated/shuffled_target.csv'
design_matrix_shuffled = pd.read_csv(PATH_TO_DATA_FOLDER + design_file)

# drop ID col so the identifier is not treated as a feature
design_matrix_shuffled_noID = design_matrix_shuffled.drop('ID', axis=1)

# convert to ndarray; this ID-free array is what gets embedded.
# (cap_x was previously re-bound to the full DataFrame right after this line,
# which silently fed the ID column into UMAP.)
cap_x = design_matrix_shuffled_noID.to_numpy()

# sanity check: exactly one column (ID) was removed from the design matrix
assert cap_x.shape == (
    design_matrix_shuffled.shape[0],
    design_matrix_shuffled.shape[1] - 1,
), "cap_x should contain every column of the design matrix except ID"

# get umap hypers from the top row (index label 0) of the results frame
n_neighbors = results_frame.loc[0, 'umap_n_neighbors']
n_components = results_frame.loc[0, 'umap_n_components']
metric = results_frame.loc[0, 'umap_metric']
min_dist = results_frame.loc[0, 'umap_min_dist']

# rebuild the reducer with the hyperparameters recorded in the results frame;
# random_state is fixed so repeated runs use the same seed
reducer = umap.UMAP(
        n_neighbors = n_neighbors,
        n_components = n_components,
        metric = metric,
        min_dist = min_dist,
        spread = 1.0,
        random_state = 42
        )

# fit and embed
reducer.fit(cap_x)
embedding = reducer.transform(cap_x)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
# number of clusters recorded for the top row of the results frame
n_clusters = results_frame.loc[0, 'n_clusters_found']

# cluster the real embedding; the returned value is stored as its
# silhouette score (presumably what afd_utils.cluster_kmeans returns - confirm)
silhouette_score_cap_x = afd_utils.cluster_kmeans(embedding, n_clusters)

# start the list of records with the real-data score
results_list = [dict(data='cap_x', silhouette_score=silhouette_score_cap_x)]

# generate 15 random data sets (seeds 0..14) from the embedding and
# cluster each one with the same number of clusters
for i in range(15):
    random_data = afd_utils.get_randomly_distributed_data(embedding, seed=i)
    silhouette_score_random = afd_utils.cluster_kmeans(random_data, n_clusters)
    results_list.append(dict(data='random', silhouette_score=silhouette_score_random))

# one row per clustered data set
results_df = pd.DataFrame(results_list)
print(results_df)

In [None]:
# histogram of the silhouette scores, coloured by data source
# ('cap_x' vs 'random'), drawn on an explicit Axes so the figure can be
# labelled and reused
fig, ax = plt.subplots()
sns.histplot(data=results_df, x='silhouette_score', hue='data', ax=ax)

# pad the x-range slightly beyond the observed scores
sil_max = results_df['silhouette_score'].max()
sil_min = results_df['silhouette_score'].min()
ax.set_xlim(sil_min - 0.05, sil_max + 0.05)

# label the figure so it can be read on its own
ax.set(
    title='Silhouette scores: cap_x vs. random data',
    xlabel='Silhouette score',
)
ax.grid()
plt.show()