#Imports

In [22]:
import pandas as pd
import numpy as np

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from sklearn.cluster import KMeans

In [23]:
from google.colab import drive

# Mount the drive under /content/gdrive; force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/gdrive', force_remount=True)
# Placeholder for the dataset directory. read_imb_base() concatenates this string
# directly with the CSV file name, so the real value must end with a path separator.
path = 'PATH_TO_DATASET_IN_DRIVE'

Mounted at /content/gdrive


#Functions

In [24]:
def read_imb_base(base_type): # reading imbalanced base
  """Read the pre-processed imbalanced dataset for the requested base type.

  Args:
    base_type: 'base_total' or 'base_reduzida'; selects which CSV is read.

  Returns:
    pandas.DataFrame with the content of the selected CSV, located under the
    module-level `path`.

  Raises:
    ValueError: if `base_type` is not one of the supported names.
  """
  # File-name suffix used by each supported base type.
  complements = {
    'base_total': 'NORMALIZADO',
    'base_reduzida': 'RECORTE_FEITO_NORMALIZADO',
  }

  if base_type not in complements:
    # Fail with a clear message instead of an UnboundLocalError further down.
    raise ValueError(
      f'Unknown base_type {base_type!r}; expected one of {sorted(complements)}'
    )

  complement = complements[base_type]
  base_desb = pd.read_csv(path+'base_pre_proc_desb_imp_moda_'+complement+'.csv', sep=',')

  return base_desb

In [25]:
def checking_if_everything_is_ok(imb_base, undersampler, method=None):
  """Check that dropping the selected samples really removed them from the base.

  Args:
    imb_base: DataFrame holding the full imbalanced base.
    undersampler: for method == 'sbc', an iterable of index labels of the
      selected samples; otherwise a fitted sampler exposing `sample_indices_`.
    method: 'sbc' when `undersampler` is already the list of labels; any other
      value makes the function read `undersampler.sample_indices_`.

  Returns:
    True when none of the selected indexes is still present in the base after
    dropping them, False otherwise.
  """
  if method == 'sbc':
    selected_indexes = undersampler
  else:
    # NOTE(review): drop() below treats these values as index labels, so this
    # presumably relies on imb_base having a default 0..n-1 index - confirm.
    selected_indexes = undersampler.sample_indices_

  # remaining_base only keeps the samples that were not selected by the undersampling
  remaining_base = imb_base.drop(selected_indexes, axis=0, inplace=False)

  # Set lookup instead of scanning a list once per selected index.
  return set(remaining_base.index).isdisjoint(selected_indexes)

In [27]:
def run_sbc(base, base_t):
  """Undersample the negative class with cluster-based undersampling (SBC).

  The whole `base` (target column included) is clustered with KMeans. Each
  cluster then contributes a number of randomly drawn negative samples
  proportional to its negatives/positives ratio, so that the total is close to
  the number of positive samples. All positive samples are kept.

  Args:
    base: DataFrame with the features and the binary target 'mc_cri_vdrl'
      (0 = negative/majority, 1 = positive/minority).
    base_t: 'base_reduzida' (4 clusters) or 'base_total' (6 clusters).

  Returns:
    Tuple (X_res_sbc, y_res_sbc, selected_indexes):
      X_res_sbc: features of the undersampled base.
      y_res_sbc: target of the undersampled base.
      selected_indexes: index labels of `base` that were KEPT by the
        undersampling, in the order they appear in `base`. Callers drop these
        labels from the full base to obtain the unselected samples.

  Raises:
    ValueError: if `base_t` is not a supported base type.
  """
  clusters_per_base = {'base_reduzida': 4, 'base_total': 6}
  if base_t not in clusters_per_base:
    # Fail early instead of hitting an unbound N_CLUSTERS later on.
    raise ValueError(
      f'Unknown base_t {base_t!r}; expected one of {sorted(clusters_per_base)}'
    )
  N_CLUSTERS = clusters_per_base[base_t]
  target = 'mc_cri_vdrl'

  print(f'{Counter(base[target])}')

  # NOTE(review): the target column is part of the data given to KMeans.
  kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
  kmeans.fit(base)

  print(f'KMeans labels: {kmeans.labels_} Size: {len(kmeans.labels_)}') # this is where each cluster labels are at

  # kmeans.labels_[k] is the cluster of the k-th row of base; a boolean mask
  # keeps the original index labels, row order and column dtypes.
  df_clusters = [base[kmeans.labels_ == i] for i in range(N_CLUSTERS)]

  # Counting by comparison does not fail when a cluster lacks one of the classes.
  negatives = [(df_clusters[i][target] == 0).sum() for i in range(N_CLUSTERS)]
  positives = [(df_clusters[i][target] == 1).sum() for i in range(N_CLUSTERS)]

  for i in range(N_CLUSTERS):
    print(f'Cluster {i} with {df_clusters[i].shape[0]} samples. {negatives[i]} negatives',
          f'{positives[i]} positives')

  total_minority = (base[target] == 1).sum() # full quantity of positive cases
  ratio_clusters = []
  sum_ratio_clusters = 0

  for i in range(N_CLUSTERS):
    # A cluster without positives is counted as having one so the ratio stays finite.
    ratio_clusters.append(negatives[i]/max(positives[i], 1))
    sum_ratio_clusters = sum_ratio_clusters + ratio_clusters[i]
    print(f'Ratio do Cluster {i}: {ratio_clusters[i]:.2f} (majority/minority)')

  print(f'Sum of ratios: {sum_ratio_clusters:.2f}')

  # How many negative samples each cluster contributes (SBC formula with ratio 1).
  # Each value is rounded on its own, so the total can differ slightly from total_minority.
  selected_clusters = []

  for i in range(N_CLUSTERS):
    share = ratio_clusters[i]/sum_ratio_clusters if sum_ratio_clusters else 0.0
    selected_clusters.append((1 * total_minority) * share)
    print(f'Cluster {i} {round(selected_clusters[i])} samples negatives to remove') # quantity of negative samples taken from each cluster

  samples_cluster = []

  for i in range(N_CLUSTERS): # random draw per cluster (random_state makes it replicable)
    cluster_negatives = df_clusters[i][df_clusters[i][target] == 0.0]
    # Never ask for more negatives than the cluster actually holds.
    n_to_draw = min(round(selected_clusters[i]), cluster_negatives.shape[0])
    samples_cluster.append(cluster_negatives.sample(n_to_draw, random_state=42))
    print(f'Cluster {i} {samples_cluster[i].shape[0]} negative samples removed in random')

  # Single concat instead of growing a frame inside a loop.
  base_negatives_sbc = pd.concat(samples_cluster)

  print(f'Shape {base_negatives_sbc.shape[0]}')

  base_positive = base[base[target] == 1.0] # every positive case is kept

  base_undersampled = pd.concat([base_positive, base_negatives_sbc]) # putting in a unique base
  print(f'{base_undersampled[target].value_counts()}')

  print(len(base_undersampled.index))

  # Index labels kept by the undersampling, reported in the order of `base`.
  kept_labels = set(base_undersampled.index)
  selected_indexes = [label for label in base.index if label in kept_labels]

  X_res_sbc = base_undersampled.drop(labels=[target], axis=1, inplace=False)
  y_res_sbc = base_undersampled[target]

  return X_res_sbc, y_res_sbc, selected_indexes

In [28]:
def run_undersampling(method, X, y, imb_base, type_nm=None, qt_neighbors=None, qt_neighbors_2=None, base_t=None):
  """Apply the requested undersampling method and report the resulting class counts.

  Args:
    method: 'random_under', 'sbc' or 'near_miss'.
    X: feature matrix given to the imblearn samplers.
    y: target vector given to the imblearn samplers.
    imb_base: full DataFrame (features + target); only used by 'sbc'.
    type_nm: NearMiss version as a string ('1', '2' or '3'); required for 'near_miss'.
    qt_neighbors: `n_neighbors` passed to NearMiss.
    qt_neighbors_2: `n_neighbors_ver3` passed to NearMiss when type_nm == '3'.
    base_t: base type forwarded to run_sbc.

  Returns:
    Tuple (X_undersampling, y_undersampling, undersampler). For 'random_under'
    and 'near_miss' the third item is the fitted sampler object; for 'sbc' it is
    the list of index labels that run_sbc selected (not a sampler object).

  Raises:
    ValueError: if `method` is unknown, or if 'near_miss' is requested without `type_nm`.
  """
  if method == 'random_under':
    undersampler = RandomUnderSampler(
        sampling_strategy='majority',
        random_state=42,
        replacement=False
    )

    X_undersampling, y_undersampling = undersampler.fit_resample(X, y)
    title = f'Classes {Counter(y_undersampling)}, undersampling {method}'
  elif method == 'sbc':
    # Here "undersampler" is the list of selected index labels returned by run_sbc.
    X_undersampling, y_undersampling, undersampler = run_sbc(imb_base, base_t)
    title = f'Classes {Counter(y_undersampling)}, undersampling {method}'
  elif method == 'near_miss':
    if type_nm is None:
      raise ValueError("type_nm is required when method is 'near_miss'")

    nm_params = dict(
      sampling_strategy='majority',
      version=int(type_nm),
      n_neighbors=qt_neighbors,
      n_jobs=-1
    )
    if type_nm == '3':
      # Only version 3 receives the second neighbourhood size.
      nm_params['n_neighbors_ver3'] = qt_neighbors_2

    undersampler = NearMiss(**nm_params)
    X_undersampling, y_undersampling = undersampler.fit_resample(X, y)

    title = f'Classes {Counter(y_undersampling)}, undersampling {method}-{type_nm} neighbors {qt_neighbors}'
    if type_nm == '3':
      title += f' neighbors_2 {qt_neighbors_2}'
  else:
    # Previously an unknown method crashed at print(title) with an unbound variable.
    raise ValueError(
      f"Unknown undersampling method {method!r}; expected 'random_under', 'sbc' or 'near_miss'"
    )

  print(title)

  return X_undersampling, y_undersampling, undersampler

In [34]:
def generate_base_with_remaining_samples(method, X, y, imb_base, base_t, type_nm=None, qt_neighbors=None, qt_neighbors_2=None):
  """Build and report the base made of the samples NOT selected by an undersampling method.

  Runs the undersampling, drops the selected samples from `imb_base`, prints a
  sanity check, the sizes, the class counts of the remaining base and the title
  that identifies this configuration. Saving to CSV is currently disabled.

  Args:
    method: undersampling method name forwarded to run_undersampling.
    X: feature matrix forwarded to run_undersampling.
    y: target vector forwarded to run_undersampling.
    imb_base: full DataFrame (features + target 'mc_cri_vdrl').
    base_t: base type; forwarded to run_undersampling and used in the title.
    type_nm: NearMiss version as a string, or None for the other methods.
    qt_neighbors: first NearMiss neighbourhood size.
    qt_neighbors_2: second NearMiss neighbourhood size (version '3' only).

  Returns:
    None.
  """
  _, _, undersampler = run_undersampling(method, X, y, imb_base, type_nm, qt_neighbors, qt_neighbors_2, base_t)

  if method == 'sbc':
    # For sbc the third return value is already the list of selected index labels.
    selected_indexes = undersampler
    worked = checking_if_everything_is_ok(imb_base, undersampler, method="sbc")
  else:
    selected_indexes = undersampler.sample_indices_
    worked = checking_if_everything_is_ok(imb_base, undersampler)

  # selecting the data that were not selected through the undersampling method
  remaining_base = imb_base.drop(selected_indexes, axis=0, inplace=False)
  print(f'Worked? {worked}')

  print(f'Total base: {imb_base.shape[0]} \nBase without the data selected by {method}: {remaining_base.shape[0]}')
  # reset_index returns a new frame; the result has to be assigned for the reset to take effect.
  remaining_base = remaining_base.reset_index(drop=True)
  print(f'Congenital syphilis case in the base with the discarded data\n{remaining_base["mc_cri_vdrl"].value_counts()}')
  print('')

  if type_nm is not None:
    if type_nm == '3':
      title = f'{base_t}_{method}-{type_nm}_k{qt_neighbors}_k2{qt_neighbors_2}_unselected_samples'
    else:
      title = f'{base_t}_{method}-{type_nm}_k{qt_neighbors}_unselected_samples'
  else:
    title = f'{base_t}_{method}_unselected_samples'

  print(title)
  #remaining_base.to_csv(path+'dados-nao-selecionados-pelo-undersampling/'+title+'.csv', index=False)

In [33]:
def generate_unselected_samples(method_undersampling=('sbc',), base_type=('base_reduzida',),
                                type_near_miss=('1', '2'), n_neighbors=(1, 3, 5, 7, 9, 11)):
  """Run every requested undersampling configuration on every requested base.

  The defaults reproduce the configuration that used to be hard-coded in the
  body; pass other values instead of editing the function.

  Args:
    method_undersampling: iterable of methods to run; supported values are
      'random_under', 'sbc' and 'near_miss'.
    base_type: iterable of base types to read ('base_total', 'base_reduzida').
    type_near_miss: iterable of NearMiss versions as strings ('1', '2', '3');
      only used when 'near_miss' is among the methods.
    n_neighbors: neighbourhood sizes tried for NearMiss. For version '3' every
      pair (n, n_2) with n_2 > n is tried.

  Returns:
    None. Results are printed by generate_base_with_remaining_samples.
  """
  for base_t in base_type:
    imb_base = read_imb_base(base_t) # reading the imbalanced data
    class_counts = Counter(imb_base["mc_cri_vdrl"])
    print(f'{base_t} negatives: {class_counts[0]} positives: {class_counts[1]}')

    X = imb_base.drop(labels=['mc_cri_vdrl'], axis=1, inplace=False) # X and y for undersampling
    y = imb_base['mc_cri_vdrl']

    for method in method_undersampling: # applying undersampling
      if method != 'near_miss':
        generate_base_with_remaining_samples(method, X, y, imb_base, base_t)
        continue

      for type_nm in type_near_miss:
        for n in n_neighbors:
          if type_nm == '3':
            # Version 3 needs a second neighbourhood size larger than the first.
            for n_2 in n_neighbors:
              if n_2 > n:
                generate_base_with_remaining_samples(method, X, y, imb_base, base_t, type_nm, n, n_2)
          else:
            generate_base_with_remaining_samples(method, X, y, imb_base, base_t, type_nm, n)

In [35]:
# Run the undersampling pipeline.
# NOTE(review): the saved output below uses labels (e.g. "Tamanho", "amostras") that
# differ from the current print strings, so it was presumably produced by an earlier
# version of these cells - re-run to refresh it.
generate_unselected_samples()

base_reduzida negativos: 33938 positivos: 862
Counter({0.0: 33938, 1.0: 862})




KMeans labels: [1 3 0 ... 0 1 3] Tamanho: 34800
Cluster 0 com 10064 amostras. 9781 negativos 283 positivos
Cluster 1 com 14053 amostras. 13770 negativos 283 positivos
Cluster 2 com 5987 amostras. 5829 negativos 158 positivos
Cluster 3 com 4696 amostras. 4558 negativos 138 positivos
Ratio do Cluster 0: 34.56 (majoritária/minoritária)
Ratio do Cluster 1: 48.66 (majoritária/minoritária)
Ratio do Cluster 2: 36.89 (majoritária/minoritária)
Ratio do Cluster 3: 33.03 (majoritária/minoritária)
Soma dos ratios: 153.14
Cluster 0 195 amostras negativas para retirar
Cluster 1 274 amostras negativas para retirar
Cluster 2 208 amostras negativas para retirar
Cluster 3 186 amostras negativas para retirar
Cluster 0 195 amostras negativas retiradas de forma aleatória
Cluster 1 274 amostras negativas retiradas de forma aleatória
Cluster 2 208 amostras negativas retiradas de forma aleatória
Cluster 3 186 amostras negativas retiradas de forma aleatória
Shape 863
0.0    863
1.0    862
Name: mc_cri_vdrl, dt