In [2]:
import pandas as pd
import numpy as np
import os
from random import uniform,seed
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
import re
from sklearn.preprocessing import StandardScaler


In [3]:
# Move the working directory up one level so the relative 'data/...' paths used
# by later cells resolve from the parent directory (presumably the project root,
# which would also make the `mc_hammer` package importable — TODO confirm).
# NOTE: not idempotent — re-running this cell moves up one more level each time.
os.chdir('../')


In [4]:
from mc_hammer.mchammer import mchammer
from mc_hammer.null_distributions import pca_trans, random_order, min_max
from mc_hammer.clustering_algorithms import k_means
from mc_hammer.similarity_functions import huberts_gamma, norm_gamma, sillhouette_euclidean,sillhouette_cosine, CH, DB
from mc_hammer.similarity_functions import dunn,S_Dbw,SD_score,IGP,BWC,CVNN
from mc_hammer.hypothesis_test import hypothesis_test

## Functions

In [5]:
def mc_hammer_test(x,null_method,cluster_method,k = None,eps = None,min_samples = None,repeats = 100):
    """Run one mchammer test on a single dataset and return its q scores.

    Parameters
    ----------
    x : dataset handed unchanged to ``mchammer.get_null_distributions``.
    null_method : name of the null-distribution method (e.g. ``'pca_trans'``).
    cluster_method : name of the clustering method (e.g. ``'K_Means'``).
    k, eps, min_samples : clustering parameters forwarded to ``get_q_scores``.
    repeats : number of null distributions to generate (default 100, the value
        that used to be hard-coded).

    Returns
    -------
    Whatever ``mchammer.get_q_scores`` returns; ``big_mc_hammer`` treats it as
    a dict mapping score name to a value that is compared against 0.05.
    """
    mch = mchammer()
    mch.get_null_distributions(x, null_method, repeats = repeats)
    return mch.get_q_scores(cluster_method, k = k, eps = eps, min_samples = min_samples)

In [6]:
def get_true_list(test_list,true_k_ind):
    """Return True when the entry at ``true_k_ind`` is the minimum of ``test_list``.

    Ties count as a success: if another entry shares the minimum value the
    result is still True.

    Parameters
    ----------
    test_list : sequence of comparable values (one per candidate k).
    true_k_ind : index of the entry belonging to the true number of clusters.

    Raises
    ------
    ValueError if ``test_list`` is empty; IndexError if ``true_k_ind`` is out
    of range.
    """
    # bool() keeps the return a plain Python bool even for NumPy scalars.
    return bool(min(test_list) == test_list[true_k_ind])
def get_true_dict(res_list,true_k_ind):
    """Apply ``get_true_list`` score-by-score across a list of result dicts.

    ``res_list`` holds one dict per candidate k, all sharing the keys of the
    first dict. For every key the values are gathered in list order and passed
    to ``get_true_list`` together with ``true_k_ind``.

    Returns a dict mapping each key to that boolean verdict.
    """
    verdicts = {}
    for score_name in res_list[0].keys():
        per_k_values = [result[score_name] for result in res_list]
        verdicts[score_name] = get_true_list(per_k_values, true_k_ind)
    return verdicts

In [7]:
def gouss_dist(seed_n):
    """Draw a reproducible (100, 3) array of Gaussian noise centred on 1.

    The standard deviation is drawn uniformly from [0.1, 0.3] using Python's
    ``random`` module seeded with ``seed_n``; all three columns share it.

    The samples are drawn from a NumPy generator whose seed is derived from
    the same seeded stream. Previously they came from NumPy's unseeded global
    RNG, so the array differed on every call even for the same ``seed_n``.

    Side effect (unchanged): re-seeds Python's global ``random`` state.
    """
    seed(seed_n)
    std = uniform(0.1,0.3)
    # Derive the NumPy seed from the seeded Python stream so that every value
    # accepted by random.seed (negative ints, strings, ...) keeps working.
    np_seed = int(uniform(0, 2**32 - 1))
    rng = np.random.default_rng(np_seed)
    return rng.normal(1, std, size=(100, 3))

In [8]:
def big_mc_hammer(
    x_list,
    null_method,
    cluster_method,
    k_list = [2,4,5],
    eps_list = [0.5],
    min_samples_list = [4,5,7],
    spef = False,
    true_k_ind = None
):
    """Run ``mc_hammer_test`` over many datasets and report success rates.

    Parameters
    ----------
    x_list : iterable of datasets.
    null_method, cluster_method : forwarded to ``mc_hammer_test``.
    k_list : cluster counts tried for every dataset (non-DBSCAN methods).
    eps_list, min_samples_list : parameter grid used when ``cluster_method``
        is ``'DBSCAN'``.
    spef : when True (non-DBSCAN only) a dataset counts as a success for a
        score if the entry at ``true_k_ind`` is the minimum across ``k_list``
        (see ``get_true_dict``). Otherwise every individual run counts as a
        success when its value is below 0.05.
    true_k_ind : index into ``k_list`` of the true cluster count (``spef`` only).

    Returns
    -------
    dict mapping score name to the percentage (0-100) of successes. An empty
    ``x_list`` (or empty parameter grid) yields ``{}``.

    Notes
    -----
    The aggregation used to be nested inside the non-DBSCAN branch, so the
    DBSCAN path always failed with UnboundLocalError at the return statement.
    The default lists are never mutated here.
    """
    if cluster_method == 'DBSCAN':
        run_results = [mc_hammer_test(
            x,
            null_method,
            cluster_method,
            k = None,
            eps = eps,
            min_samples = min_samples
        ) for x in x_list for min_samples in min_samples_list for eps in eps_list]
        # DBSCAN has no "true k" comparison, so runs are always thresholded.
        apply_threshold = True
    elif spef:
        per_dataset = [[mc_hammer_test(
            x,
            null_method,
            cluster_method,
            k = k,
            eps = None,
            min_samples = None
        ) for k in k_list] for x in x_list]
        # One boolean verdict per score and dataset.
        run_results = [get_true_dict(runs, true_k_ind) for runs in per_dataset]
        apply_threshold = False
    else:
        run_results = [mc_hammer_test(
            x,
            null_method,
            cluster_method,
            k = k,
            eps = None,
            min_samples = None
        ) for x in x_list for k in k_list]
        apply_threshold = True

    if not run_results:
        return {}

    # Regroup: score name -> list of values, one per run.
    per_score = {name: [run[name] for run in run_results] for name in run_results[0].keys()}
    if apply_threshold:
        per_score = {name: [bool(value < 0.05) for value in values] for name, values in per_score.items()}
    return {name: values.count(True)/len(values)*100 for name, values in per_score.items()}

In [9]:
def test_blobs(k,noise,seperation,test_true,spef = False,pca = False):
    print(str(k) + str(noise) + str(seperation))
    x_list =[make_classification(
        n_samples = 100,
        n_features = 10,
        n_informative = int((10-10*noise)),
        n_redundant = int(10*noise),
        n_classes = k,
        n_clusters_per_class = 1,
        class_sep = seperation,
        random_state = i
    )[0] for i in range(50)]
    null_method_list = ['pca_trans','random_order','min_max']
    cluster_method_list = ['K_Means']
    if pca: 
        x_list = [PCA(n_components = 0.9).fit_transform(i) for i in x_list]
    if test_true:
        k_range = [k]
    else:
        k_range = [2,4,5]
    
    if spef: 
        true_k_ind = k_range.index(k)
    else:
        true_k_ind = None
    results_dict = {i +'_' + j:big_mc_hammer(
        x_list = x_list,
        null_method = i,
        cluster_method = j,
        k_list = k_range,
        spef = spef,
        true_k_ind = true_k_ind
    ) for i in null_method_list for j in cluster_method_list}
    
    results_df = pd.DataFrame(results_dict).T
    return(results_df)

In [10]:
def cvis(df,labels,q_methods = 'All'):
    """Evaluate cluster-validity scores on a list of datasets.

    Parameters
    ----------
    df : indexable collection of datasets; ``df[j]`` is scored with
        ``labels[j]``.
    labels : one entry per dataset, indexable as ``(cluster_labels, centres)``
        — the pair ``k_means`` returns in ``real_data_test``.
    q_methods : ``'All'`` for the twelve built-in scores, or a list of score
        function names available at module level.

    Returns
    -------
    dict mapping each score name to a list with one value per dataset.
    ``'BWC'`` and ``'dunn'`` are called as ``f(data, labels, centres)``, every
    other score as ``f(data, labels)``.

    Raises
    ------
    NameError if a requested score name is not defined at module level.
    """
    if isinstance(q_methods, str) and q_methods == 'All':
        q_methods = ['huberts_gamma', 'norm_gamma', 'sillhouette_euclidean','sillhouette_cosine', 'CH', 'DB',
                     'dunn','S_Dbw','SD_score','IGP','BWC','CVNN']
    centre_based = ('BWC', 'dunn')
    # Resolve score functions by name instead of building source text for eval().
    namespace = globals()
    q_dict = {}
    for name in q_methods:
        try:
            score_fn = namespace[name]
        except KeyError:
            # Same exception type the former eval()-based lookup produced.
            raise NameError("name '" + str(name) + "' is not defined") from None
        if name in centre_based:
            q_dict[name] = [score_fn(df[j], labels[j][0], labels[j][1]) for j in range(len(labels))]
        else:
            q_dict[name] = [score_fn(df[j], labels[j][0]) for j in range(len(labels))]
    return q_dict



In [11]:
def get_dists(data,cluster_n,repeats):
    """Score K-Means clusterings of null distributions generated from ``data``.

    For each null method (``pca_trans``, ``random_order``, ``min_max``)
    ``repeats`` null datasets are generated — the repeat index is passed as the
    second argument (presumably a seed; confirm in ``mc_hammer``) — clustered
    with ``k_means(..., cluster_n)`` and scored with ``cvis``.

    Returns
    -------
    DataFrame with one row per repeat and one column per
    ``'<cluster_n>_<null name>_<score name>'``. The ``random_order`` results
    are stored under the name ``'random_shuffle'``.

    Notes
    -----
    Fixes two defects: the null distributions were built from a global ``x``
    instead of the ``data`` argument, and the frame was built from the
    undefined name ``full_dict2``.
    """
    null_dist_dict = {
        'pca_trans': [pca_trans(data, i) for i in range(repeats)],
        'random_shuffle': [random_order(data, i) for i in range(repeats)],
        'min_max': [min_max(data, i) for i in range(repeats)],
    }
    columns = {}
    for null_name, null_sets in null_dist_dict.items():
        labels = [k_means(null_set, cluster_n) for null_set in null_sets]
        for score_name, scores in cvis(null_sets, labels).items():
            columns[str(cluster_n) + '_' + null_name + '_' + score_name] = scores
    return pd.DataFrame(columns)

In [12]:
def clean_files(df):
    """Turn a raw labelled table into a scaled, PCA-reduced feature matrix.

    The ``'Y'`` column is dropped, the remaining columns are passed through
    ``StandardScaler().fit_transform`` and the result through
    ``PCA(n_components=0.9).fit_transform``. The input frame is not modified.
    """
    features = df.drop(columns = 'Y')
    scaled = StandardScaler().fit_transform(features)
    reducer = PCA(n_components = 0.9)
    return reducer.fit_transform(scaled)

In [14]:
def real_data_test(x,cluster_n,repeats):
    """Score a K-Means clustering of ``x`` and test it against null data.

    Steps:
    1. Cluster ``x`` into ``cluster_n`` groups and compute all twelve validity
       scores on the result (each stored as a one-element list).
    2. Build ``repeats`` null datasets with ``random_order`` and ``min_max``,
       cluster each, and compute four scores (``sillhouette_euclidean``,
       ``CH``, ``DB``, ``BWC``) on them via ``cvis``.
    3. For every null method and score, append the observed score to the end
       of the null scores and pass that list and the score name to
       ``hypothesis_test``.

    Returns
    -------
    dict with the twelve observed scores plus one
    ``'<null method>_<score>'`` entry per hypothesis test, every value being a
    one-element list.
    """
    lab,cen = k_means(x,cluster_n)

    # Explicit dispatch table (insertion order = output key order); replaces
    # building call strings for eval().
    scorers = {
        'huberts_gamma': huberts_gamma,
        'norm_gamma': norm_gamma,
        'sillhouette_euclidean': sillhouette_euclidean,
        'sillhouette_cosine': sillhouette_cosine,
        'CH': CH,
        'DB': DB,
        'dunn': dunn,
        'S_Dbw': S_Dbw,
        'SD_score': SD_score,
        'IGP': IGP,
        'BWC': BWC,
        'CVNN': CVNN,
    }
    centre_based = ('BWC', 'dunn')  # these also take the cluster centres
    q_dict = {}
    for name, score_fn in scorers.items():
        if name in centre_based:
            q_dict[name] = [score_fn(x, lab, cen)]
        else:
            q_dict[name] = [score_fn(x, lab)]

    null_dist = {'random_order':[random_order(x, i) for i in range(repeats)],
                 'min_max':[min_max(x,i) for i in range(repeats)]}

    labels_dict = {k:[k_means(i,cluster_n) for i in v] for k,v in null_dist.items()}
    null_scores = {k:cvis(null_dist[k],v,['sillhouette_euclidean','CH','DB','BWC']) for k,v in labels_dict.items()}
    # Observed score goes last, after the null scores.
    combined = {k:{k2:v2 + q_dict[k2] for k2,v2 in v.items()} for k,v in null_scores.items()}
    hyp_dict = {k:{k + '_' + k2:[hypothesis_test(v2,k2)] for k2,v2 in v.items()} for k,v in combined.items()}
    for tests in hyp_dict.values():
        q_dict.update(tests)

    return q_dict
        

In [15]:
def pick_best(method,q_list,k_list):
    """Return the entry of ``k_list`` whose score in ``q_list`` is best.

    For the scores listed below the largest value wins; for any other
    ``method`` name (including the prefixed hypothesis-test keys that
    ``full_real_test`` passes in) the smallest value wins. On ties the first
    occurrence is chosen.
    """
    maximised = ('huberts_gamma', 'norm_gamma', 'sillhouette_euclidean',
                 'sillhouette_cosine', 'CH', 'dunn', 'IGP', 'BWC')
    select = max if method in maximised else min
    best_position = q_list.index(select(q_list))
    return k_list[best_position]

In [20]:
def full_real_test(x,k_range,repeats):
    """Run ``real_data_test`` for every k and summarise the outcome.

    Returns
    -------
    opt_k : dict mapping every result key to the k chosen by ``pick_best``.
    found_clusters : dict covering only the keys containing ``'random_order'``
        or ``'min_max'`` (the hypothesis-test entries); each value is a list of
        booleans, one per k, that is True where the test value is >= 0.05.

    NOTE(review): ``big_mc_hammer`` counts values < 0.05 as a success, the
    opposite direction of the >= 0.05 used here — confirm what
    ``hypothesis_test`` returns.
    """
    per_k_results = [real_data_test(x, k, repeats) for k in k_range]

    # key -> flat list with the values of every k, in k_range order
    merged = {}
    for key in per_k_results[0].keys():
        flat = []
        for result in per_k_results:
            flat.extend(result[key])
        merged[key] = flat

    opt_k = {key: pick_best(key, values, k_range) for key, values in merged.items()}

    found_clusters = {}
    for key, values in merged.items():
        if 'random_order' in key or 'min_max' in key:
            found_clusters[key] = [bool(value >= 0.05) for value in values]

    return opt_k, found_clusters

## Experiment 1: no clusters

In [None]:
# Experiment 1a: 100 datasets of uniform noise (100 points, 3 features), i.e.
# data without cluster structure, tested with the PCA-based null distribution.
null_method_list = ['pca_trans']
cluster_method_list = ['K_Means']
# Seeded generator so the same datasets are produced on every run.
noise_rng = np.random.default_rng(0)
x_list = [noise_rng.random((100,3)) for i in range(100)]
null_results_dict = {i +'_' + j:big_mc_hammer(
    x_list = x_list,
    null_method = i,
    cluster_method = j
) for i in null_method_list for j in cluster_method_list}

In [None]:
# Per-score percentage of runs with a value below 0.05 on the uniform-noise data.
null_results_dict

In [None]:
# One row per '<null method>_<cluster method>', one column per score.
null_df = pd.DataFrame(null_results_dict).T
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs('data/processed/fullex', exist_ok=True)
null_df.to_csv('data/processed/fullex/null_results.csv')

In [None]:
# Experiment 1b: 100 Gaussian datasets from gouss_dist (seeds 0-99), tested
# with the min-max and random-order null distributions.
null_method_list = ['min_max', 'random_order']
cluster_method_list = ['K_Means']
x_list = list(map(gouss_dist, range(100)))
null_gous_results_dict = {
    f"{null_name}_{cluster_name}": big_mc_hammer(
        x_list = x_list,
        null_method = null_name,
        cluster_method = cluster_name
    )
    for null_name in null_method_list
    for cluster_name in cluster_method_list
}

In [None]:
# Per-score percentage of runs with a value below 0.05 on the Gaussian data.
null_gous_results_dict

In [None]:
# One row per '<null method>_<cluster method>', one column per score.
null_gous_df = pd.DataFrame(null_gous_results_dict).T
# Create the output directory on a fresh checkout instead of failing in to_csv.
os.makedirs('data/processed/fullex', exist_ok=True)
null_gous_df.to_csv('data/processed/fullex/null_gouss_results.csv')

## Experiment 2: Finding Clusters

In [9]:
# Parameter grid for the synthetic-blob experiments; every combination is
# passed to test_blobs as (k, noise, seperation).
k_list = [2,4,5]  # number of classes
noise_list = [0.1,0.3,0.5]  # share of the 10 features generated as redundant
sep_list = [0.5,1,3]  # class_sep values handed to make_classification

In [None]:
# Experiment 2: for every grid point test only the true k (test_true=True).
# Keys look like 'k-2_noise-0.1_sep-0.5'; values are test_blobs result frames.
blob_test = {
    f"k-{n_classes}_noise-{noise}_sep-{sep}": test_blobs(
        k = n_classes,
        noise = noise,
        seperation = sep,
        test_true = True
    )
    for n_classes in k_list
    for noise in noise_list
    for sep in sep_list
}

In [None]:
# Dict of result frames, keyed by 'k-<k>_noise-<noise>_sep-<sep>'.
blob_test

## Experiment 3: Identify cluster number

In [None]:
# Save the Experiment 2 results, one CSV per parameter combination.
pos_out_dir = 'data/processed/fullex/k_means_pos_test'
# makedirs also creates missing parent directories and tolerates an existing one.
os.makedirs(pos_out_dir, exist_ok=True)
for key, result_df in blob_test.items():
    # Drop the dots of the float parameters so the name has a single extension.
    file_name = key.replace('.', '')
    result_df.to_csv(pos_out_dir + '/' + file_name + '.csv')

In [None]:
# Experiment 3: test k in [2, 4, 5] on every grid point and record whether the
# true k gives the minimum value (spef=True). Results are written after each
# (k, noise) pair so that a long run leaves partial output behind.
sens_out_dir = 'data/processed/fullex/k_means_sens_test'
os.makedirs(sens_out_dir, exist_ok=True)
for i in k_list:
    for j in noise_list:
        # NOTE: rebound on every iteration — after the loop this holds only
        # the results of the last (k, noise) pair.
        blob_test_sens = {'k-' + str(i) +'_noise-' +str(j) + '_sep-' + str(l):test_blobs(
            k = i,
            noise = j,
            seperation = l,
            test_true = False,
            spef = True
        ) for l in sep_list}
        for key, result_df in blob_test_sens.items():
            file_name = key.replace('.', '')
            result_df.to_csv(sens_out_dir + '/' + file_name + '.csv')

20.10.5
20.11
20.13


  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]


20.30.5
20.31


  sep_range = [np.mean(i)/max(i) for i in sep_range]


20.33


  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_

  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]
  sep_range = [np.mean(i)/max(i) for i in sep_range]


In [None]:
# Save the sensitivity results currently held in blob_test_sens.
# NOTE(review): the loop cell above already writes every batch and leaves only
# the last (k, noise) pair in blob_test_sens, so this cell looks redundant.
sens_out_dir = 'data/processed/fullex/k_means_sens_test'
os.makedirs(sens_out_dir, exist_ok=True)
for key, result_df in blob_test_sens.items():
    file_name = key.replace('.', '')
    result_df.to_csv(sens_out_dir + '/' + file_name + '.csv')

## Experiment 4: PCA

In [None]:
# Experiment 4: same sensitivity test as Experiment 3, but every dataset is
# first reduced with PCA inside test_blobs (pca=True).
blob_test_pca = {
    f"k-{n_classes}_noise-{noise}_sep-{sep}": test_blobs(
        k = n_classes,
        noise = noise,
        seperation = sep,
        test_true = False,
        spef = True,
        pca = True
    )
    for n_classes in k_list
    for noise in noise_list
    for sep in sep_list
}

In [None]:
# Save the PCA experiment results, one CSV per parameter combination.
# (This cell used to iterate blob_test_sens, so the PCA directory received
# copies of the Experiment 3 results instead.)
pca_out_dir = 'data/processed/fullex/k_means_pca_test'
os.makedirs(pca_out_dir, exist_ok=True)
for key, result_df in blob_test_pca.items():
    file_name = key.replace('.', '')
    result_df.to_csv(pca_out_dir + '/' + file_name + '.csv')

## Experiment 5a: Testing parameters 

In [None]:
# Experiment 5a: null-distribution score tables for uniform random data, one
# CSV per cluster count. The get_dists result used to be discarded and the
# unrelated blob_test_sens frames were written to the same file repeatedly.
# NOTE(review): `cluster_list` is not defined in any visible cell — define it
# (for example as the cluster counts to test) before running this cell.
dist_rng = np.random.default_rng(0)  # seeded so the input data is reproducible
data = dist_rng.random((100,3))
dist_out_dir = 'data/processed/fullex/k_means_dist_test'
os.makedirs(dist_out_dir, exist_ok=True)
for i in cluster_list:
    dist_df = get_dists(data,i,1000)
    file_name = 'cluster_n-' + str(i)
    dist_df.to_csv(dist_out_dir + '/' + file_name + '.csv')
    

## Experiment 5b: Testing size effect

## Experiment 6: real life datasets 

In [None]:
# Load every CSV in the raw test-data folder and build the cleaned matrices.
raw_dir = 'data/raw/test_data'
# sorted(): os.listdir returns names in arbitrary order; endswith(): only real
# '.csv' files (a substring test would also accept e.g. 'x.csv.bak').
csv_list = sorted(name for name in os.listdir(raw_dir) if name.endswith('.csv'))
df_list = [pd.read_csv(os.path.join(raw_dir, name)) for name in csv_list]
# csv_list, df_list and clean_list stay index-aligned.
clean_list = [clean_files(raw_df) for raw_df in df_list]

In [None]:
# Summary of the real datasets, index-aligned with csv_list / df_list.
df_table = pd.DataFrame({
    # splitext strips only the final extension; the old pattern '.csv' had an
    # unescaped dot and no anchor, so it could also cut text inside a name.
    'Name':[os.path.splitext(name)[0] for name in csv_list],
    # every column except the 'Y' label column
    'Features':[raw_df.shape[1] - 1 for raw_df in df_list],
    'Observations':[len(raw_df) for raw_df in df_list],
    'Number of clusters':[len(raw_df['Y'].unique()) for raw_df in df_list]
})

In [None]:
# Overview of the real datasets: name, feature count, rows and distinct 'Y' labels.
df_table

## Experiment 6: real life datasets 

In [17]:
# Load every CSV in the raw test-data folder and build the cleaned matrices.
raw_dir = 'data/raw/test_data'
# sorted(): os.listdir returns names in arbitrary order; endswith(): only real
# '.csv' files (a substring test would also accept e.g. 'x.csv.bak').
csv_list = sorted(name for name in os.listdir(raw_dir) if name.endswith('.csv'))
df_list = [pd.read_csv(os.path.join(raw_dir, name)) for name in csv_list]
# csv_list, df_list and clean_list stay index-aligned.
clean_list = [clean_files(raw_df) for raw_df in df_list]

In [18]:
# Summary of the real datasets, index-aligned with csv_list / df_list.
df_table = pd.DataFrame({
    # splitext strips only the final extension; the old pattern '.csv' had an
    # unescaped dot and no anchor, so it could also cut text inside a name.
    'Name':[os.path.splitext(name)[0] for name in csv_list],
    # every column except the 'Y' label column
    'Features':[raw_df.shape[1] - 1 for raw_df in df_list],
    'Observations':[len(raw_df) for raw_df in df_list],
    'Number of clusters':[len(raw_df['Y'].unique()) for raw_df in df_list]
})

In [70]:
# Overview of the real datasets: name, feature count, rows and distinct 'Y' labels.
df_table

Unnamed: 0,Name,Features,Observations,Number of clusters
0,breast_cancer,30,569,2
1,ecoli,7,336,8
2,glass,9,214,6
3,iris,4,150,3
4,wine,13,178,3
5,yeast,8,1484,10


In [None]:
# Experiment 6: for every real dataset test k = 2..12 with 500 null repeats,
# then keep (a) the k preferred by each score and (b) the hypothesis-test
# flags at the dataset's true number of clusters.
found_cluster_list = []
opt_k_list = []
k_range = list(range(2,13))
for idx, x_clean in enumerate(clean_list):
    opt_k, found_clusters = full_real_test(x_clean,k_range,500)
    opt_k_list.append(opt_k)

    # Rows of df_table are index-aligned with clean_list.
    # k_range.index raises ValueError if the true count is outside 2..12.
    clust_n = df_table.loc[idx,'Number of clusters']
    k_ind = k_range.index(clust_n)
    found_cluster2 = {k:v[k_ind] for k,v in found_clusters.items()}
    found_cluster_list.append(found_cluster2)
f_clust_df = pd.DataFrame(found_cluster_list)
opt_df = pd.DataFrame(opt_k_list)
# The output directory was never created, so to_csv failed on a fresh checkout.
# NOTE(review): this path is not under 'data/processed/fullex' like the other
# experiments — confirm that is intended.
real_out_dir = 'data/processed/k_means_real_test'
os.makedirs(real_out_dir, exist_ok=True)
f_clust_df.to_csv(real_out_dir + '/found_clusters.csv')
opt_df.to_csv(real_out_dir + '/opt_clusters.csv')

In [31]:
# Scratch check: hypothesis-test flags at the true cluster count of dataset `idx`.
# The original referenced the undefined name `found_cluster` and stored the
# result under the misspelled `fount_cluster2`.
# NOTE(review): `found_clusters` is whatever the last full_real_test call
# returned; after the Experiment 6 loop that is the LAST dataset, not
# dataset 0 — re-run full_real_test for `idx` if that is what is wanted.
idx = 0
clust_n = df_table.loc[idx,'Number of clusters']
k_range = list(range(2,13))
k_ind = k_range.index(clust_n)
found_cluster2 = {k:v[k_ind] for k,v in found_clusters.items()}
found_cluster2

{'random_order_sillhouette_euclidean': False,
 'random_order_CH': True,
 'random_order_DB': True,
 'random_order_BWC': True,
 'min_max_sillhouette_euclidean': False,
 'min_max_CH': False,
 'min_max_DB': True,
 'min_max_BWC': False}