In [1]:
import numpy as np
import pandas as pd
import os
import tqdm
import time
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import KMeans

from algorithm import *
from source import *

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



In [3]:
def test_all(data, config=None, show=False):
    """Run the 'paper' and 'kmeans' algorithms on one dataset and pickle the joined results.

    Both runs are delegated to ``apply`` (imported from the project modules);
    ``data``, ``config`` and ``show`` are forwarded to it unchanged.

    Parameters
    ----------
    data : str
        Dataset kind. ``'real'`` gets the fixed file name ``real.pkl``; any
        other value is combined with ``config`` as ``'<data> <config>.pkl'``.
    config : str or None
        Configuration passed to ``apply`` as ``configuration`` and embedded in
        the output file name for non-real data.
    show : bool
        Forwarded to ``apply``.

    Returns
    -------
    None
        The combined table is written to ``./results/<file_name>``.
    """
    # Create the output directory up front so a missing folder is detected
    # before the (potentially hours-long) runs rather than at save time.
    results_dir = './results'
    os.makedirs(results_dir, exist_ok=True)

    results_paper = apply(data=data, configuration=config, algorithm='paper', show=show)
    results_kmeans = apply(data=data, configuration=config, algorithm='kmeans', show=show)

    if data == 'real':
        file_name = '{}.pkl'.format(data)
    else:
        file_name = '{} {}.pkl'.format(data, config)

    # The first k-means column is dropped before joining side by side;
    # presumably it duplicates the first column of the 'paper' results
    # (the true cluster count) -- TODO confirm against `apply`.
    results = pd.concat(
        [
            results_paper,
            results_kmeans.iloc[:, 1:]
        ], axis=1)
    results.to_pickle('{}/{}'.format(results_dir, file_name))

Real data:

In [6]:
# Run both algorithms on the real datasets; test_all pickles the joined table to ./results/real.pkl.
test_all('real')

100%|██████████| 6/6 [00:09<00:00,  1.58s/it]
100%|██████████| 6/6 [00:13<00:00,  2.30s/it]


In [7]:
# Load the pickled real-data results and display them as the cell output.
df = pd.read_pickle('./results/real.pkl')
df

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
breast cancer,2.0,241.0,0.000223,119.5,47.0,0.004995,22.5,2.0,0.000788,0.0
ecoli,8.0,6.0,0.09893,0.25,36.0,0.133538,3.5,3.0,0.667074,0.625
glass,6.0,3.0,0.008207,0.5,39.0,0.091648,5.5,2.0,0.200783,0.666667
iris,3.0,5.0,0.513593,0.666667,39.0,0.09947,12.0,3.0,0.730238,0.0
wine,3.0,178.0,0.0,58.333333,40.0,0.045587,12.333333,48.0,0.043334,15.0
zoo,7.0,58.0,0.130615,7.285714,4.0,0.644479,0.428571,49.0,0.147872,6.0
mean,4.83333,81.8333,0.125261,31.089286,34.1667,0.169953,9.376984,17.8333,0.298348,3.715278
std,2.48328,102.975,0.198267,48.862264,15.2239,0.236761,7.96841,23.7606,0.318818,5.983566


Synthetic data:

In [8]:
# Values substituted as the last field of every configuration string.
sigmas = [3, 4, 5]
# One tuple per data shape; presumably (n_features, n_clusters, n_samples) --
# TODO confirm against the synthetic data generator.
feature_cluster_size = [(6, 3, 1000)]

all_configs = []

for shape in feature_cluster_size:
    n_f, n_c, sz = shape
    for s in sigmas:
        # Three configuration strings per (shape, sigma): the plain one,
        # the "+<n_f // 2>NF" variant and the "50%N" variant.
        plain_cfg = '{}x{}-{} {}'.format(sz, n_f, n_c, s)
        nf_cfg = '{}x{}-{} +{}NF {}'.format(sz, n_f, n_c, n_f // 2, s)
        n50_cfg = '{}x{}-{} 50%N {}'.format(sz, n_f, n_c, s)
        configs = [plain_cfg, nf_cfg, n50_cfg]
        for c in configs:
            test_all('synthetic', c)
        


100%|██████████| 20/20 [09:47<00:00, 29.36s/it]
100%|██████████| 20/20 [1:44:43<00:00, 314.18s/it]
100%|██████████| 20/20 [13:35<00:00, 40.77s/it]
100%|██████████| 20/20 [1:49:05<00:00, 327.26s/it]
100%|██████████| 20/20 [12:29<00:00, 37.45s/it]
100%|██████████| 20/20 [1:50:51<00:00, 332.59s/it]
100%|██████████| 20/20 [14:03<00:00, 42.18s/it]
100%|██████████| 20/20 [1:37:54<00:00, 293.70s/it]
100%|██████████| 20/20 [12:10<00:00, 36.54s/it]
100%|██████████| 20/20 [1:50:15<00:00, 330.78s/it]
100%|██████████| 20/20 [12:14<00:00, 36.71s/it]
100%|██████████| 20/20 [1:39:34<00:00, 298.71s/it]
100%|██████████| 20/20 [11:03<00:00, 33.18s/it]
100%|██████████| 20/20 [1:43:09<00:00, 309.45s/it]
100%|██████████| 20/20 [11:58<00:00, 35.93s/it]
100%|██████████| 20/20 [1:53:44<00:00, 341.20s/it]
100%|██████████| 20/20 [10:36<00:00, 31.80s/it]
100%|██████████| 20/20 [1:56:03<00:00, 348.17s/it]


In [9]:
# Values substituted as the last field of every configuration string.
sigmas = [1, 2, 3, 4, 5]
# One tuple per data shape; presumably (n_features, n_clusters, n_samples) --
# TODO confirm against the synthetic data generator.
feature_cluster_size = [(12, 6, 1000)]

all_configs = []

for shape in feature_cluster_size:
    n_f, n_c, sz = shape
    print(n_f, n_c, sz)
    for s in sigmas:
        print(s)
        # Three configuration strings per (shape, sigma): the plain one,
        # the "+<n_f // 2>NF" variant and the "50%N" variant.
        plain_cfg = '{}x{}-{} {}'.format(sz, n_f, n_c, s)
        nf_cfg = '{}x{}-{} +{}NF {}'.format(sz, n_f, n_c, n_f // 2, s)
        n50_cfg = '{}x{}-{} 50%N {}'.format(sz, n_f, n_c, s)
        configs = [plain_cfg, nf_cfg, n50_cfg]
        for c in configs:
            test_all('synthetic', c)
        


  0%|          | 0/20 [00:00<?, ?it/s]

12 6 1000
1


100%|██████████| 20/20 [11:16<00:00, 33.83s/it]
100%|██████████| 20/20 [1:46:24<00:00, 319.24s/it]
100%|██████████| 20/20 [11:19<00:00, 34.00s/it]
100%|██████████| 20/20 [2:03:41<00:00, 371.09s/it]  
100%|██████████| 20/20 [11:16<00:00, 33.84s/it]
100%|██████████| 20/20 [2:05:51<00:00, 377.56s/it]  
  0%|          | 0/20 [00:00<?, ?it/s]

2


100%|██████████| 20/20 [12:07<00:00, 36.36s/it]
100%|██████████| 20/20 [1:32:17<00:00, 276.90s/it]
100%|██████████| 20/20 [11:23<00:00, 34.16s/it]
100%|██████████| 20/20 [2:03:55<00:00, 371.79s/it]  
100%|██████████| 20/20 [12:17<00:00, 36.86s/it]
100%|██████████| 20/20 [1:46:17<00:00, 318.88s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

3


100%|██████████| 20/20 [12:07<00:00, 36.39s/it]
100%|██████████| 20/20 [1:29:00<00:00, 267.01s/it]
100%|██████████| 20/20 [11:46<00:00, 35.31s/it]
100%|██████████| 20/20 [2:04:16<00:00, 372.85s/it]  
100%|██████████| 20/20 [11:34<00:00, 34.70s/it]
100%|██████████| 20/20 [1:44:45<00:00, 314.26s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

4


100%|██████████| 20/20 [10:02<00:00, 30.15s/it]
100%|██████████| 20/20 [1:30:17<00:00, 270.87s/it]
100%|██████████| 20/20 [11:22<00:00, 34.13s/it]
100%|██████████| 20/20 [2:04:00<00:00, 372.04s/it]  
100%|██████████| 20/20 [11:24<00:00, 34.21s/it]
100%|██████████| 20/20 [1:51:49<00:00, 335.48s/it]
  0%|          | 0/20 [00:00<?, ?it/s]

5


100%|██████████| 20/20 [09:06<00:00, 27.33s/it]
100%|██████████| 20/20 [1:30:46<00:00, 272.33s/it]
100%|██████████| 20/20 [11:18<00:00, 33.93s/it]
100%|██████████| 20/20 [1:56:59<00:00, 350.99s/it]
100%|██████████| 20/20 [13:07<00:00, 39.38s/it]
100%|██████████| 20/20 [1:53:15<00:00, 339.76s/it]


In [None]:
# Values substituted as the last field of every configuration string.
sigmas = [1, 2, 3, 4, 5]
# One tuple per data shape; presumably (n_features, n_clusters, n_samples) --
# TODO confirm against the synthetic data generator.
feature_cluster_size = [(20, 10, 1000)]

all_configs = []

for shape in feature_cluster_size:
    n_f, n_c, sz = shape
    print(n_f, n_c, sz)
    for s in sigmas:
        print(s)
        # Three configuration strings per (shape, sigma): the plain one,
        # the "+<n_f // 2>NF" variant and the "50%N" variant.
        plain_cfg = '{}x{}-{} {}'.format(sz, n_f, n_c, s)
        nf_cfg = '{}x{}-{} +{}NF {}'.format(sz, n_f, n_c, n_f // 2, s)
        n50_cfg = '{}x{}-{} 50%N {}'.format(sz, n_f, n_c, s)
        configs = [plain_cfg, nf_cfg, n50_cfg]
        for c in configs:
            test_all('synthetic', c)
        


In [3]:
# Per-configuration synthetic result pickles, in sorted order.
# The aggregated 'synthetic summary.pkl' is written into the same directory and
# shares the 'synthetic' prefix, so it is excluded explicitly; otherwise a
# re-run would feed the previous summary back into the new one.
files = [
    x for x in sorted(os.listdir('./results/'))
    if x.startswith('synthetic')
    and x.endswith('.pkl')
    and x != 'synthetic summary.pkl'
]
len(files), files

(46,
 ['synthetic 1000x12-6 +6NF 1.pkl',
  'synthetic 1000x12-6 +6NF 2.pkl',
  'synthetic 1000x12-6 +6NF 3.pkl',
  'synthetic 1000x12-6 +6NF 4.pkl',
  'synthetic 1000x12-6 +6NF 5.pkl',
  'synthetic 1000x12-6 1.pkl',
  'synthetic 1000x12-6 2.pkl',
  'synthetic 1000x12-6 3.pkl',
  'synthetic 1000x12-6 4.pkl',
  'synthetic 1000x12-6 5.pkl',
  'synthetic 1000x12-6 50%N 1.pkl',
  'synthetic 1000x12-6 50%N 2.pkl',
  'synthetic 1000x12-6 50%N 3.pkl',
  'synthetic 1000x12-6 50%N 4.pkl',
  'synthetic 1000x12-6 50%N 5.pkl',
  'synthetic 1000x6-3 +3NF 1.pkl',
  'synthetic 1000x6-3 +3NF 2.pkl',
  'synthetic 1000x6-3 +3NF 3.pkl',
  'synthetic 1000x6-3 +3NF 4.pkl',
  'synthetic 1000x6-3 +3NF 5.pkl',
  'synthetic 1000x6-3 1.pkl',
  'synthetic 1000x6-3 2.pkl',
  'synthetic 1000x6-3 3.pkl',
  'synthetic 1000x6-3 4.pkl',
  'synthetic 1000x6-3 5.pkl',
  'synthetic 1000x6-3 50%N 1.pkl',
  'synthetic 1000x6-3 50%N 2.pkl',
  'synthetic 1000x6-3 50%N 3.pkl',
  'synthetic 1000x6-3 50%N 4.pkl',
  'synthetic 10

In [25]:
# Build one table from the last two rows of every result file listed in
# `files`, relabelling the 'mean' / 'std' rows with the file's base name so
# rows from different configurations stay distinguishable.
summary_frames = []

for f in files:
    # Positional slice: keep only the final two rows of the stored frame.
    df = pd.read_pickle('./results/{}'.format(f)).iloc[-2:]
    name = f.split('.')[0]
    df = df.rename(index={
        'mean' : '{}:mean'.format(name),
        'std' : '{}:std'.format(name)
    })
    summary_frames.append(df)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied
# the whole table on every iteration. An empty file list still yields an
# empty DataFrame, as before.
synthetic_summary_df = pd.concat(summary_frames) if summary_frames else pd.DataFrame()


In [26]:
# Display the combined synthetic summary table as the cell output.
synthetic_summary_df

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 1:mean,6,1000,0.000000,1.656667e+02,39.75,0.026856,5.625000,2,0.000623,6.666667e-01
synthetic 1000x12-6 +6NF 1:std,0,0,0.000000,2.916006e-14,5.29026,0.011556,0.881710,0,0.001385,1.139065e-16
synthetic 1000x12-6 +6NF 2:mean,6,1000,0.000000,1.656667e+02,37.3,0.120107,5.266667,2.1,0.139466,6.500000e-01
synthetic 1000x12-6 +6NF 2:std,0,0,0.000000,2.916006e-14,11.0601,0.043160,1.687068,0.307794,0.203017,5.129892e-02
synthetic 1000x12-6 +6NF 3:mean,6,1000,0.000000,1.656667e+02,38.8,0.154391,5.466667,2,0.206861,6.666667e-01
...,...,...,...,...,...,...,...,...,...,...
synthetic 500x2-3 50%N 3:std,0,11.9212,0.167125,3.973744e+00,16.7033,0.191735,5.567764,11.9273,0.169235,3.830128e+00
synthetic 500x2-3 50%N 4:mean,3,18.55,0.278628,5.183333e+00,28.25,0.181577,8.416667,4.8,0.376149,7.666667e-01
synthetic 500x2-3 50%N 4:std,0,13.516,0.243189,4.505325e+00,15.9138,0.268265,5.304610,3.28634,0.258477,9.799153e-01
synthetic 500x2-3 50%N 5:mean,3,20.6,0.273060,5.866667e+00,23.85,0.252348,6.950000,9.55,0.431807,2.316667e+00


In [27]:
# Persist the combined synthetic summary table in the results directory.
synthetic_summary_df.to_pickle('./results/synthetic summary.pkl')

In [6]:
# Reload the saved summary so the following cells work without rebuilding it.
synthetic_summary_df = pd.read_pickle('./results/synthetic summary.pkl')
# Render floats with three decimals and never truncate the displayed rows.
pd.set_option('display.float_format', '{:.3f}'.format)
pd.set_option('display.max_rows', None)


In [21]:
# Display the reloaded summary table as the cell output.
synthetic_summary_df

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 1:mean,6.0,1000.0,0.0,165.667,39.75,0.027,5.625,2.0,0.001,0.667
synthetic 1000x12-6 +6NF 1:std,0.0,0.0,0.0,0.0,5.29,0.012,0.882,0.0,0.001,0.0
synthetic 1000x12-6 +6NF 2:mean,6.0,1000.0,0.0,165.667,37.3,0.12,5.267,2.1,0.139,0.65
synthetic 1000x12-6 +6NF 2:std,0.0,0.0,0.0,0.0,11.06,0.043,1.687,0.308,0.203,0.051
synthetic 1000x12-6 +6NF 3:mean,6.0,1000.0,0.0,165.667,38.8,0.154,5.467,2.0,0.207,0.667
synthetic 1000x12-6 +6NF 3:std,0.0,0.0,0.0,0.0,4.753,0.047,0.792,0.0,0.164,0.0
synthetic 1000x12-6 +6NF 4:mean,6.0,1000.0,0.0,165.667,37.05,0.194,5.225,2.05,0.272,0.658
synthetic 1000x12-6 +6NF 4:std,0.0,0.0,0.0,0.0,10.385,0.101,1.565,0.224,0.13,0.037
synthetic 1000x12-6 +6NF 5:mean,6.0,1000.0,0.0,165.667,39.0,0.202,5.55,2.45,0.411,0.592
synthetic 1000x12-6 +6NF 5:std,0.0,0.0,0.0,0.0,10.543,0.103,1.583,0.686,0.172,0.114


In [12]:
# Group the per-file 'mean' / 'std' rows by the trailing number of the file
# name (the sigma value embedded in the configuration string), keeping only
# files whose name starts with 'synthetic 1'.
synthetic_summaries_df = {}
for s in range(1, 6):
    frames = []
    for f in files:
        name = f.split('.')[0]
        if not name.startswith('synthetic 1'):
            continue
        # Parse the whole last space-separated token rather than only the last
        # character, so multi-digit values are compared correctly; names
        # without a numeric suffix are skipped.
        suffix = name.rsplit(' ', 1)[-1]
        if suffix.isdigit() and int(suffix) == s:
            df = pd.read_pickle('./results/{}'.format(f)).iloc[-2:]
            df = df.rename(index={
                'mean' : '{}:mean'.format(name),
                'std' : '{}:std'.format(name)
            })
            frames.append(df)
    # Concatenate once per group: DataFrame.append was removed in pandas 2.0.
    # A group with no matching files still maps to an empty DataFrame.
    ss_df = pd.concat(frames) if frames else pd.DataFrame()
    synthetic_summaries_df[s] = ss_df


In [13]:
# Display the summary rows of the result files whose name ends in 1.
synthetic_summaries_df[1]

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 1:mean,6.0,1000.0,0.0,165.667,39.75,0.027,5.625,2.0,0.001,0.667
synthetic 1000x12-6 +6NF 1:std,0.0,0.0,0.0,0.0,5.29,0.012,0.882,0.0,0.001,0.0
synthetic 1000x12-6 1:mean,6.0,1000.0,0.0,165.667,31.1,0.371,4.233,2.75,0.414,0.542
synthetic 1000x12-6 1:std,0.0,0.0,0.0,0.0,17.758,0.315,2.884,1.164,0.244,0.194
synthetic 1000x12-6 50%N 1:mean,6.0,1000.0,0.0,165.667,37.15,0.087,5.192,2.0,0.085,0.667
synthetic 1000x12-6 50%N 1:std,0.0,0.0,0.0,0.0,6.302,0.032,1.05,0.0,0.066,0.0
synthetic 1000x6-3 +3NF 1:mean,3.0,999.75,0.0,332.25,41.2,0.025,12.733,2.1,-0.0,0.333
synthetic 1000x6-3 +3NF 1:std,0.0,0.444,0.0,0.148,4.927,0.013,1.642,0.447,0.001,0.0
synthetic 1000x6-3 1:mean,3.0,934.7,0.0,310.567,22.55,0.34,6.517,2.0,0.502,0.333
synthetic 1000x6-3 1:std,0.0,25.52,0.0,8.507,20.098,0.366,6.699,0.0,0.223,0.0


In [14]:
# Display the summary rows of the result files whose name ends in 2.
synthetic_summaries_df[2]

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 2:mean,6.0,1000.0,0.0,165.667,37.3,0.12,5.267,2.1,0.139,0.65
synthetic 1000x12-6 +6NF 2:std,0.0,0.0,0.0,0.0,11.06,0.043,1.687,0.308,0.203,0.051
synthetic 1000x12-6 2:mean,6.0,998.8,0.0,165.467,10.3,0.888,0.717,5.8,0.973,0.033
synthetic 1000x12-6 2:std,0.0,1.642,0.0,0.274,11.193,0.269,1.865,0.523,0.058,0.087
synthetic 1000x12-6 50%N 2:mean,6.0,999.95,0.0,165.658,38.4,0.131,5.45,2.1,0.155,0.65
synthetic 1000x12-6 50%N 2:std,0.0,0.224,0.0,0.037,9.495,0.041,1.39,0.308,0.072,0.051
synthetic 1000x6-3 +3NF 2:mean,3.0,996.95,0.0,331.317,38.05,0.119,11.683,2.0,0.358,0.333
synthetic 1000x6-3 +3NF 2:std,0.0,2.837,0.0,0.946,9.682,0.123,3.227,0.0,0.329,0.0
synthetic 1000x6-3 2:mean,3.0,398.45,0.37,131.817,3.0,0.984,0.0,2.8,0.929,0.067
synthetic 1000x6-3 2:std,0.0,371.567,0.39,123.856,0.0,0.026,0.0,0.41,0.128,0.137


In [15]:
# Display the summary rows of the result files whose name ends in 3.
synthetic_summaries_df[3]

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 3:mean,6.0,1000.0,0.0,165.667,38.8,0.154,5.467,2.0,0.207,0.667
synthetic 1000x12-6 +6NF 3:std,0.0,0.0,0.0,0.0,4.753,0.047,0.792,0.0,0.164,0.0
synthetic 1000x12-6 3:mean,6.0,348.4,0.317,57.533,11.5,0.861,0.917,6.0,0.976,0.0
synthetic 1000x12-6 3:std,0.0,480.682,0.352,79.762,13.497,0.289,2.249,0.0,0.064,0.0
synthetic 1000x12-6 50%N 3:mean,6.0,998.9,0.0,165.483,32.65,0.165,4.642,2.05,0.174,0.658
synthetic 1000x12-6 50%N 3:std,0.0,3.007,0.0,0.501,15.776,0.078,2.236,0.224,0.1,0.037
synthetic 1000x6-3 +3NF 3:mean,3.0,991.9,0.0,329.633,40.25,0.092,12.417,2.05,0.282,0.317
synthetic 1000x6-3 +3NF 3:std,0.0,13.924,0.0,4.641,5.928,0.019,1.976,0.224,0.36,0.075
synthetic 1000x6-3 3:mean,3.0,348.1,0.431,115.033,3.0,0.998,0.0,2.95,0.979,0.017
synthetic 1000x6-3 3:std,0.0,344.15,0.467,114.717,0.0,0.004,0.0,0.224,0.088,0.075


In [16]:
# Display the summary rows of the result files whose name ends in 4.
synthetic_summaries_df[4]

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 4:mean,6.0,1000.0,0.0,165.667,37.05,0.194,5.225,2.05,0.272,0.658
synthetic 1000x12-6 +6NF 4:std,0.0,0.0,0.0,0.0,10.385,0.101,1.565,0.224,0.13,0.037
synthetic 1000x12-6 4:mean,6.0,7.15,0.812,0.358,6.05,0.975,0.008,6.05,0.984,0.008
synthetic 1000x12-6 4:std,0.0,2.368,0.22,0.243,0.224,0.068,0.037,0.224,0.051,0.037
synthetic 1000x12-6 50%N 4:mean,6.0,998.65,0.0,165.442,35.5,0.178,4.967,2.1,0.225,0.65
synthetic 1000x12-6 50%N 4:std,0.0,4.32,0.0,0.72,10.164,0.053,1.533,0.308,0.134,0.051
synthetic 1000x6-3 +3NF 4:mean,3.0,933.9,0.001,310.3,29.45,0.269,8.817,2.2,0.609,0.267
synthetic 1000x6-3 +3NF 4:std,0.0,62.074,0.001,20.691,16.048,0.331,5.349,0.41,0.268,0.137
synthetic 1000x6-3 4:mean,3.0,88.65,0.724,28.55,3.0,0.993,0.0,2.95,0.977,0.017
synthetic 1000x6-3 4:std,0.0,205.193,0.338,68.398,0.0,0.03,0.0,0.224,0.103,0.075


In [17]:
# Display the summary rows of the result files whose name ends in 5.
synthetic_summaries_df[5]

Unnamed: 0,clusters,paper pred. clusters,paper ARI,paper MARE,kmeans elbow2 pred. clusters,kmeans elbow2 ARI,kmeans elbow2 MARE,kmeans c-h pred. clusters,kmeans c-h ARI,kmeans c-h MARE
synthetic 1000x12-6 +6NF 5:mean,6.0,1000.0,0.0,165.667,39.0,0.202,5.55,2.45,0.411,0.592
synthetic 1000x12-6 +6NF 5:std,0.0,0.0,0.0,0.0,10.543,0.103,1.583,0.686,0.172,0.114
synthetic 1000x12-6 5:mean,6.0,8.85,0.829,0.558,7.2,0.943,0.2,6.15,0.955,0.025
synthetic 1000x12-6 5:std,0.0,3.528,0.161,0.505,4.467,0.166,0.745,0.366,0.085,0.061
synthetic 1000x12-6 50%N 5:mean,6.0,990.8,0.0,164.133,40.0,0.158,5.667,2.2,0.254,0.633
synthetic 1000x12-6 50%N 5:std,0.0,22.182,0.001,3.697,6.139,0.039,1.023,0.696,0.175,0.116
synthetic 1000x6-3 +3NF 5:mean,3.0,898.05,0.001,298.35,35.65,0.105,10.883,2.4,0.679,0.233
synthetic 1000x6-3 +3NF 5:std,0.0,100.866,0.001,33.622,10.908,0.084,3.636,0.598,0.252,0.157
synthetic 1000x6-3 5:mean,3.0,7.8,0.86,1.6,3.0,1.0,0.0,3.0,1.0,0.0
synthetic 1000x6-3 5:std,0.0,14.916,0.187,4.972,0.0,0.0,0.0,0.0,0.0,0.0
