In [1]:
from Algs.weights import A0, A1, A2
from Algs.BANCOIn import BANCOIn
from Algs.IKM import IKM
from Algs.KM import KM
from sklearn.metrics import adjusted_rand_score # type: ignore
from sklearn.metrics import normalized_mutual_info_score # type: ignore
import os
import numpy as np # type: ignore
import copy

# Seed NumPy's legacy global generator so that anything drawing from
# np.random.* produces the same sequence on every fresh run.
SEED = 42
np.random.seed(SEED)

In [2]:
# Load every dataset stored as a sub-directory of "subdatasets/".
# Each directory provides:
#   data.npy           - array loaded with np.load
#   real_clusters.txt  - one cluster per line, as comma-separated integer indices
datasets = []
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the sequence of runs (and therefore RNG consumption) the same on every machine.
for dataset_name in sorted(os.listdir("subdatasets/")):
    dataset_dir = os.path.join("subdatasets", dataset_name)
    if not os.path.isdir(dataset_dir):
        # Ignore stray files (e.g. editor/OS metadata) next to the dataset folders.
        continue

    data = np.load(os.path.join(dataset_dir, "data.npy"))

    real_clusters = []
    with open(os.path.join(dataset_dir, "real_clusters.txt"), "r") as fp:
        for line in fp:
            # strip() rather than line[:-1]: the last line may have no trailing
            # newline, in which case slicing would cut off the final digit.
            members = line.strip()
            if not members:
                # A blank line carries no indices; int('') would raise.
                continue
            # int() tolerates surrounding whitespace, so "1, 2" and "1,2" both parse.
            real_clusters.append({int(token) for token in members.split(",")})

    # Key order matters to consumers that unpack .values() positionally.
    datasets.append({"data": data, "real clusters": real_clusters, "name": dataset_name})

In [3]:
def get_scores(real_clusters, algo_clusters, data):
    """Compare a clustering with the reference partition.

    Both partitions are sequences of index collections; the position of a
    collection in the sequence is used as its label.

    Args:
        real_clusters: reference partition.
        algo_clusters: partition produced by an algorithm.
        data: array whose first axis length gives the number of points.

    Returns:
        A two-element list ``[adjusted_rand_score, normalized_mutual_info_score]``.
    """
    n_points = data.shape[0]

    def as_labels(clusters):
        # Every point starts with label 0, so a point that appears in no
        # cluster ends up sharing the label of the first cluster.
        labels = np.zeros(n_points, dtype=int)
        for cluster_id, members in enumerate(clusters):
            labels[list(members)] = cluster_id
        return labels

    truth = as_labels(real_clusters)
    predicted = as_labels(algo_clusters)

    ari = adjusted_rand_score(truth, predicted)
    nmi = normalized_mutual_info_score(truth, predicted)
    return [ari, nmi]

In [4]:
def test_alg(datasets, test_num, alg_names=[], beta=1, er=1e-10):
  bancoA0_required = ['B+ A0 IKM', 'B+ A0 IKM 3d',
                      'B+ A0 KM', 'B+ A0 KM 3d',]
  bancoA1_required = ['B+ A1 IKM', 'B+ A1 IKM 3d', 'B+ A1 IKM A2', 'B+ A1 IKM A2 3d',
                      'B+ A1 KM', 'B+ A1 KM 3d', 'B+ A1 KM A2', 'B+ A1 KM A2 3d']
  bancoA2_required = ['B+ A2 IKM', 'B+ A2 IKM 3d',
                      'B+ A2 KM', 'B+ A2 KM 3d']

  scores = []
  for dataset in datasets:
    data, real_clusters, dataset_name = dataset.values()

    for i in range(test_num):
      l, u = data.shape[1] // 2, data.shape[1]
      idxs = [np.arange(l), np.arange(l, u)]
      real_len = len(real_clusters)

      if len(set(alg_names) & set(bancoA0_required)):
        method = BANCOIn(A0, '3d')
        bancoA0 = method.fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, er=er)

      if len(set(alg_names) & set(bancoA1_required)):
        method = BANCOIn(A1, '3d')
        bancoA1 = method.fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, er=er)

      if len(set(alg_names) & set(bancoA2_required)):
        method = BANCOIn(A2, '3d')
        bancoA2 = method.fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, er=er)

      algs = {}
      algs.update({
              'B- A0 IKM':        lambda data: IKM(A0, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A0 IKM':        lambda data: IKM(A0, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA0),

              'B- A1 IKM':        lambda data: IKM(A1, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B- A1 IKM 3d':     lambda data: IKM(A1, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A1 IKM':        lambda data: IKM(A1, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
              'B+ A1 IKM 3d':     lambda data: IKM(A1, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),

              'B- A2 IKM':        lambda data: IKM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B- A2 IKM 3d':     lambda data: IKM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A2 IKM':        lambda data: IKM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA2),
              'B+ A2 IKM 3d':     lambda data: IKM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA2),

              'B+ A1 IKM A2':     lambda data: IKM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
              'B+ A1 IKM A2 3d':  lambda data: IKM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
              
              'B- A0 KM':         lambda data:  KM(A0, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A0 KM':         lambda data:  KM(A0, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA0),

              'B- A1 KM':         lambda data:  KM(A1, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B- A1 KM 3d':      lambda data:  KM(A1, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A1 KM':         lambda data:  KM(A1, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
              'B+ A1 KM 3d':      lambda data:  KM(A1, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),

              'B- A2 KM':         lambda data:  KM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B- A2 KM 3d':      lambda data:  KM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta),
              'B+ A2 KM':         lambda data:  KM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA2),
              'B+ A2 KM 3d':      lambda data:  KM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA2),

              'B+ A1 KM A2':      lambda data:  KM(A2, '2d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
              'B+ A1 KM A2 3d':   lambda data:  KM(A2, '3d').fit(copy.deepcopy(data), idxs, real_len, beta_degree=beta, hyp_clusters=bancoA1),
      })

      for name in alg_names:
        scores.append([name, dataset_name, i, beta, *get_scores(copy.deepcopy(real_clusters), algs[name](copy.deepcopy(data)), data)])
  scores = np.array(scores)
  return scores

In [8]:
# Repetitions used for the "B-" variants; the "B+" variants are run once.
# NOTE(review): presumably the "B+" variants need no repetition - confirm.
N_REPEATS = 100

# Collect the per-call score arrays under their own name so that `res` is
# only ever bound to the final stacked array.
res_parts = []
res_parts.append(test_alg(datasets, N_REPEATS, alg_names=['B- A0 IKM', 'B- A2 IKM', 'B- A2 IKM 3d',
                                                          'B- A0 KM', 'B- A2 KM', 'B- A2 KM 3d']))
res_parts.append(test_alg(datasets, 1, alg_names=['B+ A0 IKM', 'B+ A2 IKM', 'B+ A2 IKM 3d',
                                                  'B+ A0 KM', 'B+ A2 KM', 'B+ A2 KM 3d']))

# Beta grid 1.1, 1.2, ..., 2.7 (17 values). linspace with an explicit count is
# used instead of arange with a float step, whose endpoint handling depends on
# rounding error.
for b in np.linspace(1.1, 2.7, 17):
    beta = round(b, 2)
    print('b: ', beta)
    res_parts.append(test_alg(datasets, N_REPEATS,
                              alg_names=['B- A1 IKM', 'B- A1 IKM 3d', 'B- A1 KM', 'B- A1 KM 3d'],
                              beta=beta))
    res_parts.append(test_alg(datasets, 1,
                              alg_names=['B+ A1 IKM', 'B+ A1 IKM 3d', 'B+ A1 IKM A2', 'B+ A1 IKM A2 3d',
                                         'B+ A1 KM', 'B+ A1 KM 3d', 'B+ A1 KM A2', 'B+ A1 KM A2 3d'],
                              beta=beta))

res = np.vstack(res_parts)
res

b:  1.1
b:  1.2
b:  1.3
b:  1.4
b:  1.5
b:  1.6
b:  1.7
b:  1.8
b:  1.9
b:  2.0
b:  2.1
b:  2.2
b:  2.3
b:  2.4
b:  2.5
b:  2.6
b:  2.7


array([['B- A0 IKM', 'BSP_GAP_3', '0', '1', '0.4046057426031191',
        '0.5472340823396795'],
       ['B- A2 IKM', 'BSP_GAP_3', '0', '1', '0.4555951532430506',
        '0.5967820527332923'],
       ['B- A2 IKM 3d', 'BSP_GAP_3', '0', '1', '0.8412776412776413',
        '0.832362697858983'],
       ...,
       ['B+ A1 KM 3d', 'Fungi_genera_7', '0', '2.7',
        '0.31635760649714306', '0.4858116788526714'],
       ['B+ A1 KM A2', 'Fungi_genera_7', '0', '2.7',
        '0.3060291065802397', '0.4751409756027246'],
       ['B+ A1 KM A2 3d', 'Fungi_genera_7', '0', '2.7',
        '0.21904604224352708', '0.4153313496965748']], dtype='<U32')

In [9]:
import pandas as pd

# `res` is a string array (names and numbers share one dtype), so every column
# would otherwise be text and comparisons such as .max() would be
# lexicographic. Cast the numeric columns back to numbers.
res_df = pd.DataFrame(
    res, columns=["name", "dataset", "iter", "beta", "ARI", "NMI"]
).astype({"iter": int, "beta": float, "ARI": float, "NMI": float})
res_df

Unnamed: 0,name,dataset,iter,beta,ARI,NMI
0,B- A0 IKM,BSP_GAP_3,0,1,0.4046057426031191,0.5472340823396795
1,B- A2 IKM,BSP_GAP_3,0,1,0.4555951532430506,0.5967820527332923
2,B- A2 IKM 3d,BSP_GAP_3,0,1,0.8412776412776413,0.832362697858983
3,B- A0 KM,BSP_GAP_3,0,1,0.41373315949587136,0.5812685165317657
4,B- A2 KM,BSP_GAP_3,0,1,0.9220160893120998,0.916744228830706
...,...,...,...,...,...,...
60331,B+ A1 IKM A2 3d,Fungi_genera_7,0,2.7,0.1696221371964646,0.3740346163545042
60332,B+ A1 KM,Fungi_genera_7,0,2.7,0.3596678843877428,0.5232813652021078
60333,B+ A1 KM 3d,Fungi_genera_7,0,2.7,0.31635760649714306,0.4858116788526714
60334,B+ A1 KM A2,Fungi_genera_7,0,2.7,0.3060291065802397,0.4751409756027246


In [13]:
# Rows of one (dataset, algorithm) pair.
test = res_df[(res_df['dataset'] == 'Fungi_genera_3') & (res_df['name'] == 'B+ A2 IKM')]
# Cast the scores to float before taking the maximum: on text columns .max()
# compares strings, which misorders negative or scientific-notation values.
# Each column's maximum is taken independently of the others.
test[['name', 'dataset', 'ARI', 'NMI']].astype({'ARI': float, 'NMI': float}).max()

name                B+ A2 IKM
dataset        Fungi_genera_3
ARI        0.5505893909626719
NMI        0.4958854950812797
dtype: object

In [27]:
# Rows of one (dataset, algorithm) pair.
test = res_df[(res_df['dataset'] == 'Fungi_genera_3') & (res_df['name'] == 'B+ A0 IKM')]
# Cast the scores to float before taking the maximum: on text columns .max()
# compares strings, which misorders negative or scientific-notation values.
# Each column's maximum is taken independently of the others.
test[['name', 'dataset', 'ARI', 'NMI']].astype({'ARI': float, 'NMI': float}).max()

name                B+ A0 IKM
dataset        Fungi_genera_3
ARI        0.5521248372475501
NMI        0.5501696039374915
dtype: object

In [14]:
# Write the full score table to disk; the row index is written as the first,
# unnamed column.
res_df.to_csv(path_or_buf="results.csv")