In [1]:
from Algs.weights import A0, A1, A2
from Algs.BANCOIn import BANCOIn
from Algs.IKM import IKM
from Algs.KM import KM
from sklearn.metrics import adjusted_rand_score 
from sklearn.metrics import normalized_mutual_info_score 
import os
import numpy as np 
import copy

np.random.seed(42)

In [2]:
datasets = []
for dataset_name in os.listdir("subdatasets/")[:-2]:
    data = np.load("subdatasets/" + dataset_name + '/data.npy')
    real_clusters = []
    with open('subdatasets/{}/real_clusters.txt'.format(dataset_name), 'r') as fp:
        for line in fp:
            x = line[:-1]
            real_clusters.append(set(map(int, x.split(", "))))
    datasets.append({"data": data, "real clusters": real_clusters, "name": dataset_name})

In [3]:
def get_scores(real_clusters, algo_clusters, data):
  r_labels = np.full(data.shape[0], 0)
  for i in range(len(real_clusters)):
    r_labels[list(real_clusters[i])] = i

  alg_labels = np.full(data.shape[0], 0)
  for i in range(len(algo_clusters)):
    alg_labels[list(algo_clusters[i])] = i

  return [
    adjusted_rand_score(r_labels, alg_labels),
    normalized_mutual_info_score(r_labels, alg_labels),
  ]

In [4]:
def test_alg(datasets, test_num, algs=[], beta1 = 1, beta2=1, er=1e-10):
  # algs format:
  # list of dicts like: { 'BANCO preamble':           true/false; 
  #                       'KM/IKM':                   'KM'/'IKM'; 
  #                       'BANCO weights':            'A0'/'A1'/'A2';
  #                       'KM/IKM weights':           'A0'/'A1'/'A2';
  #                       'cluster specific weights': true/false;} 

  w_funcs = {'A0': A0, 'A1': A1, 'A2': A2}
  alg_types = {'KM': KM, 'IKM': IKM}
  
  scores = []
  for alg in algs:
    name = 'B' + ('+ ' + alg['BANCO weights'] + " " if alg['BANCO preamble'] else '- ' + alg['KM/IKM weights'] + " ") +\
          alg['KM/IKM'] +\
          (' ' + alg['KM/IKM weights'] if alg['KM/IKM weights'] != alg['BANCO weights'] and alg['BANCO preamble'] else '') +\
          (' 3d' if alg['cluster specific weights'] else '')

    if (alg['BANCO weights'] == 'A0' and beta1 != 1.) or (alg['KM/IKM weights'] == 'A0' and beta2 != 1.) or\
       (alg['BANCO weights'] == 'A1' and beta1 == 1) or (alg['KM/IKM weights'] == 'A1' and beta2 == 1) or \
       (not alg['BANCO preamble'] and beta1 != beta2) or \
       (alg['KM/IKM weights'] == 'A0' and alg['cluster specific weights']) or \
       (alg['KM/IKM'] == 'KM' and alg['KM/IKM weights'] == 'A2'):
      #print('continue: ', name)
      continue
    
    for dataset in datasets:
      data, real_clusters, dataset_name = dataset.values()
      l, u = data.shape[1] // 2, data.shape[1]
      # idxs = [np.arange(l), np.arange(l, u)]
      idxs = [np.arange(l), np.arange(l, u)] if alg['BANCO weights'] == 'A2' else [np.arange(u)]
      # idxs2 = [np.arange(l), np.arange(l, u)] if alg['KM/IKM'] == 'IKM' else [np.arange(u)]
      idxs2 = [np.arange(l), np.arange(l, u)] if alg['KM/IKM weights'] == 'A2' else [np.arange(u)]
      real_len = len(real_clusters)
    
      for i in range(test_num):
        hyp_clusters = None
        if alg['BANCO preamble']:
          hyp_clusters = BANCOIn(w_funcs[alg['BANCO weights']], '3d',
                                 center_data = True,
                                 minmax_normilize = True,
                                 scale_by_var = True).fit(copy.deepcopy(data),
                                                                          idxs, real_len,
                                                                          beta_degree=beta1,
                                                                          er=er)
          #print(len(hyp_clusters), real_len)
        alg_method = alg_types[alg['KM/IKM']](w_funcs[alg['KM/IKM weights']], '3d' if alg['cluster specific weights'] else '2d',
                                 center_data = True,
                                 minmax_normilize = True,
                                 scale_by_var = True)
        alg_clusters = alg_method.fit(copy.deepcopy(data),
                                      idxs2, real_len,
                                      beta_degree=beta2,
                                      hyp_clusters=hyp_clusters)
        
        scores.append([name, dataset_name, i, beta1, beta2, *get_scores(copy.deepcopy(real_clusters), 
                                                                        alg_clusters, data)])
  scores = np.array(scores)
  return scores

In [5]:
import itertools

keys = ['BANCO preamble', 'KM/IKM', 'BANCO weights', 'KM/IKM weights', 'cluster specific weights']


B =       [True, False]
alg =     ['KM', 'IKM']
weights = ['A0', 'A1', 'A2']
cs =      [True, False]

# B =       [True]
# alg =     ['IKM']
# weights = ['A2']
# cs =      [False]



B_algs = list(itertools.product([B[0]], alg, weights, weights, cs))
B_test_algs = [{keys[id]:item for id, item in enumerate(alg)} for alg in B_algs]

algs = list(itertools.product([B[1]], alg, weights, weights, cs))
test_algs = [{keys[id]:item for id, item in enumerate(alg)} for alg in algs]

In [6]:
res = np.empty(shape=(0, 7))
for b1 in np.arange(1, 2.9, 0.1):
  b1 = round(b1, 2)
  res_p = test_alg(datasets, 100, algs=test_algs, beta1=b1, beta2=b1)
  if len(res_p):
    res = np.vstack([res, res_p])
  
  for b2 in np.arange(1., 2.9, 0.1):
    b2 = round(b2, 2)
    print(f'b1: {b1}, b2: {b2}')
    
    res_p = test_alg(datasets, 1, algs=B_test_algs, beta1=b1, beta2=b2)
    if len(res_p):
      res = np.vstack([res, res_p])
res

b1: 1.0, b2: 1.0
b1: 1.0, b2: 1.1
b1: 1.0, b2: 1.2
b1: 1.0, b2: 1.3
b1: 1.0, b2: 1.4
b1: 1.0, b2: 1.5
b1: 1.0, b2: 1.6
b1: 1.0, b2: 1.7
b1: 1.0, b2: 1.8
b1: 1.0, b2: 1.9
b1: 1.0, b2: 2.0
b1: 1.0, b2: 2.1
b1: 1.0, b2: 2.2
b1: 1.0, b2: 2.3
b1: 1.0, b2: 2.4
b1: 1.0, b2: 2.5
b1: 1.0, b2: 2.6
b1: 1.0, b2: 2.7
b1: 1.0, b2: 2.8
b1: 1.1, b2: 1.0
b1: 1.1, b2: 1.1
b1: 1.1, b2: 1.2
b1: 1.1, b2: 1.3
b1: 1.1, b2: 1.4
b1: 1.1, b2: 1.5
b1: 1.1, b2: 1.6
b1: 1.1, b2: 1.7
b1: 1.1, b2: 1.8
b1: 1.1, b2: 1.9
b1: 1.1, b2: 2.0
b1: 1.1, b2: 2.1
b1: 1.1, b2: 2.2
b1: 1.1, b2: 2.3
b1: 1.1, b2: 2.4
b1: 1.1, b2: 2.5
b1: 1.1, b2: 2.6
b1: 1.1, b2: 2.7
b1: 1.1, b2: 2.8
b1: 1.2, b2: 1.0
b1: 1.2, b2: 1.1
b1: 1.2, b2: 1.2
b1: 1.2, b2: 1.3
b1: 1.2, b2: 1.4
b1: 1.2, b2: 1.5
b1: 1.2, b2: 1.6
b1: 1.2, b2: 1.7
b1: 1.2, b2: 1.8
b1: 1.2, b2: 1.9
b1: 1.2, b2: 2.0
b1: 1.2, b2: 2.1
b1: 1.2, b2: 2.2
b1: 1.2, b2: 2.3
b1: 1.2, b2: 2.4
b1: 1.2, b2: 2.5
b1: 1.2, b2: 2.6
b1: 1.2, b2: 2.7
b1: 1.2, b2: 2.8
b1: 1.3, b2: 1.0
b1: 1.3, b2: 1

array([['B- A0 KM', 'BSP_GAP_3', '0', ..., '1.0', '0.41373315949587136',
        '0.5812685165317657'],
       ['B- A0 KM', 'BSP_GAP_3', '1', ..., '1.0', '0.8500986193293886',
        '0.8354351965419832'],
       ['B- A0 KM', 'BSP_GAP_3', '2', ..., '1.0', '0.45999125491910803',
        '0.6010937702602511'],
       ...,
       ['B+ A2 IKM', 'Fungi_genera_3', '0', ..., '2.8',
        '0.8084388005605954', '0.7574683948076006'],
       ['B+ A2 IKM', 'Fungi_genera_4', '0', ..., '2.8',
        '0.2152793283578918', '0.36518336805407875'],
       ['B+ A2 IKM', 'Fungi_genera_5', '0', ..., '2.8',
        '0.3572128743912013', '0.47955212667874114']], dtype='<U32')

In [7]:
import pandas as pd 
res_df = pd.DataFrame(res, columns=["name", "dataset", "iter", "beta1", "beta2", "ARI", "NMI"])
res_df

Unnamed: 0,name,dataset,iter,beta1,beta2,ARI,NMI
0,B- A0 KM,BSP_GAP_3,0,1.0,1.0,0.41373315949587136,0.5812685165317657
1,B- A0 KM,BSP_GAP_3,1,1.0,1.0,0.8500986193293886,0.8354351965419832
2,B- A0 KM,BSP_GAP_3,2,1.0,1.0,0.45999125491910803,0.6010937702602511
3,B- A0 KM,BSP_GAP_3,3,1.0,1.0,0.3272562904926925,0.42825659819840883
4,B- A0 KM,BSP_GAP_3,4,1.0,1.0,0.7264397905759162,0.749303853962197
...,...,...,...,...,...,...,...
159931,B+ A2 IKM,BSP_GAP_4,0,2.8,2.8,0.7489104927924908,0.7862045528080539
159932,B+ A2 IKM,BSP_GAP_5,0,2.8,2.8,0.14169223371328335,0.3191393793932266
159933,B+ A2 IKM,Fungi_genera_3,0,2.8,2.8,0.8084388005605954,0.7574683948076006
159934,B+ A2 IKM,Fungi_genera_4,0,2.8,2.8,0.2152793283578918,0.36518336805407875


In [11]:
res_df.to_csv("results.csv")