In [16]:
import numpy as np
import pandas as pd
import re
import os

In [17]:
# Load both raw tables; the first CSV column becomes the row index.
species_df = pd.read_csv(
  'data/species_180.csv',
  index_col=0,
)
BSP_df = pd.read_csv(
  'data/BrazilianScientificProduction.csv',
  index_col=0,
)

In [18]:
# Interval-valued feature columns of the species table.
cols = ['spores 1d', 'spores 2d', 'pileus width', 'stipes long', 'stipes thick']
# One companion column per feature, named by appending ' measure'; these are
# passed as multipliers (m_cols) to generate_dataset later in the notebook.
measure_cols = list(map(lambda feature: feature + ' measure', cols))
# Every BSP column from the third one onward is treated as an interval feature.
bsp_cols = BSP_df.columns[2:]

In [19]:
def _parse_range(value):
  """Convert one raw cell into a two-element float array ``[low, high]``.

  Steps applied to string cells:
    1. drop ``[``, ``]`` and ``'`` characters (presumably the cells are
       stringified lists -- TODO confirm against the CSV files);
    2. drop numbers glued to a parenthesis, i.e. ``(<number>`` and
       ``<number>)``, then drop any remaining parentheses;
    3. collect the remaining runs of digits/dots; commas, dashes and spaces
       all act as separators because only digit/dot runs are picked up.

  Example: ``"(3-)4-5(-6)"`` -> ``array([4., 5.])``.

  Returns:
    ``[first, second]`` when at least two numbers remain, ``[n, n]`` when only
    one remains, and ``[0., 0.]`` when none remain or the cell is a float.

  Raises:
    ValueError: if a collected token is not a valid float (e.g. a lone ``.``).
    TypeError: if the cell is neither a float, an ndarray nor a string.
  """
  if isinstance(value, np.ndarray):
    # Already parsed (the cell was re-run): pass through so the cell is idempotent.
    return value
  if isinstance(value, float):
    # Missing cells arrive as float NaN and become the [0, 0] placeholder.
    # NOTE(review): any other float is mapped to [0, 0] as well -- confirm that
    # purely numeric cells never occur in these columns.
    return np.array([0., 0.])

  # Raw strings keep the regex escapes valid (no invalid-escape warnings).
  text = re.sub(r"[\[\]']", '', value)
  text = re.sub(r'\([0-9.]+|[0-9.]+\)', '', text)
  text = re.sub(r'[()]', '', text)
  numbers = re.findall(r'[0-9.]+', text)

  if not numbers:
    return np.array([0., 0.])
  low = float(numbers[0])
  high = float(numbers[1]) if len(numbers) >= 2 else low
  return np.array([low, high])


# Parse the interval-valued columns of both tables with the same routine.
for col in cols:
  species_df[col] = species_df[col].apply(_parse_range)

for col in bsp_cols:
  BSP_df[col] = BSP_df[col].apply(_parse_range)

In [20]:
def generate_dataset(dataset, d_cols, top_most_group_num, group_col, top_most_col, m_cols=None):
  """Build an interval data matrix restricted to the most populated groups.

  Args:
    dataset: DataFrame whose ``d_cols`` cells are indexable pairs
      (element 0 = first bound, element 1 = second bound). It is not modified.
    d_cols: column labels of the interval features (list or pandas Index).
    top_most_group_num: number of largest groups to keep.
    group_col: column used to compute group sizes (non-null count of the
      first feature column per group).
    top_most_col: column used to filter rows and to build the ground-truth
      clusters. NOTE(review): group sizes come from ``group_col`` while the
      filtering uses ``top_most_col``; the visible callers pass the same
      column for both.
    m_cols: optional column labels (list, Index or array) whose values are
      multiplied element-wise into both bounds. ``None`` or an empty
      collection disables the scaling.

  Returns:
    ``[data, real_clusters, top_most_col]`` where ``data`` is the horizontal
    stack of first bounds and second bounds (``2 * len(d_cols)`` columns, one
    row per kept record) and ``real_clusters`` is a list of sets of row
    positions, one set per distinct ``top_most_col`` value in order of first
    appearance.
  """
  bounds = dataset[d_cols]
  dataset_a, dataset_b = dataset.copy(), dataset.copy()
  # Column-wise element extraction; same values as a row-by-row apply.
  dataset_a[d_cols] = bounds.apply(lambda column: column.map(lambda pair: pair[0])).to_numpy()
  dataset_b[d_cols] = bounds.apply(lambda column: column.map(lambda pair: pair[1])).to_numpy()

  # Explicit emptiness test: `if m_cols:` raises for a pandas Index / ndarray.
  if m_cols is not None and len(m_cols) > 0:
    scale = dataset[m_cols].to_numpy()
    dataset_a[d_cols] *= scale
    dataset_b[d_cols] *= scale

  # Rank groups by their number of non-null values in the first feature column.
  group_sizes = dataset_a.groupby(by=group_col)[d_cols[0]].count()
  clusters = group_sizes.sort_values(ascending=False).iloc[:top_most_group_num].index.to_list()

  # Only d_cols differ between the two copies, so one mask serves both.
  keep = dataset_a[top_most_col].isin(clusters)
  dataset_a = dataset_a[keep].reset_index(drop=True)
  dataset_b = dataset_b[keep].reset_index(drop=True)

  labels = dataset_a[top_most_col]
  real_clusters = [set(labels[labels == target].index) for target in labels.unique()]

  data = np.hstack([dataset_a[d_cols].to_numpy(), dataset_b[d_cols].to_numpy()])
  return [data, real_clusters, top_most_col]

In [21]:
# Each entry is [data, real_clusters, label_column, name].
# Fungi subsets: the 3 to 7 most populated genera, bounds scaled by the measure columns.
datasets = [
  [*generate_dataset(species_df, cols, i, 'genera', 'genera', m_cols=measure_cols), 'Fungi_genera_' + str(i)]
  for i in range(3, 8)
]

# BSP subsets: the 3 to 5 most populated predominant areas, no scaling.
datasets += [
  [*generate_dataset(BSP_df, bsp_cols, i, 'GRANDE-AREA-PREDOMINANTE', 'GRANDE-AREA-PREDOMINANTE'), 'BSP_GAP_' + str(i)]
  for i in range(3, 6)
]

In [22]:
# Persist every generated subset under subdatasets/<name>/.
for dataset in datasets:
    # The last element of each entry is the subset name, used as directory name.
    out_dir = "subdatasets/" + dataset[-1]
    # makedirs also creates the parent "subdatasets" folder on a fresh checkout
    # and does not fail when the directory already exists.
    os.makedirs(out_dir, exist_ok=True)
    # np.save appends the ".npy" extension to the given path.
    np.save(out_dir + '/data', dataset[0])
    with open(out_dir + '/real_clusters.txt', 'w') as fp:
        for cluster in dataset[1]:
            # write each cluster on a new line
            fp.write(", ".join(map(str, cluster)) + "\n")