# Package imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive/ so the dataset files can be read
# from this Colab session (force_remount=True is passed so a stale mount is
# replaced rather than reused).
drive.mount('/content/gdrive/', force_remount=True)
# Placeholder: set this to the Drive folder that holds the CSV files.
# read_imb_dataset() concatenates this string directly with the file name,
# so it must end with a path separator (e.g. '/content/gdrive/MyDrive/data/').
path = 'PATH_TO_DATASET_IN_DRIVE'

Mounted at /content/gdrive/


# Action

In [None]:
def read_imb_dataset(dataset_type):
  """Read the imbalanced (pre-processed) dataset from Drive.

  Args:
    dataset_type: name of the base to load. Only 'base_total' is mapped to a
      file at the moment.

  Returns:
    A pandas DataFrame read from
    ``path + 'base_pre_proc_desb_imp_moda_<complement>.csv'``, where ``path``
    is the module-level dataset folder.

  Raises:
    ValueError: if ``dataset_type`` has no known file suffix. (Previously this
      surfaced as an opaque UnboundLocalError on ``complement``.)
  """
  # Suffix used in the Drive file names to differentiate the bases.
  complements = {'base_total': 'NORMALIZADO'}

  if dataset_type not in complements:
    raise ValueError(
        f'Unknown dataset_type {dataset_type!r}; expected one of {sorted(complements)}')

  complement = complements[dataset_type]
  return pd.read_csv(path+'base_pre_proc_desb_imp_moda_'+complement+'.csv', sep=',')

In [None]:
def analisar_k_value(base_t):
  """Load the dataset identified by `base_t` and plot its silhouette analysis.

  The frame returned by read_imb_dataset(base_t) is handed straight to
  analyse_siloette; nothing is returned.
  """
  analyse_siloette(read_imb_dataset(base_t))

In [None]:
def apply_kmeans_distortions(base, k_max=30):
  """Plot the KMeans distortion (inertia) for K = 1..k_max (elbow plot).

  Args:
    base: DataFrame containing the features plus the 'mc_cri_vdrl' target
      column; the target is dropped before clustering. `base` is not modified.
    k_max: largest number of clusters to try (inclusive). Defaults to 30,
      the range that was previously hard-coded.

  Returns:
    None. The curve is shown with matplotlib.
  """
  # Cluster on the features only; the target column is excluded.
  X = base.drop(columns=['mc_cri_vdrl'])

  K = range(1, k_max + 1)
  # One fit per K; inertia_ is the value KMeans exposes after fitting.
  distortions = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_ for k in K]

  plt.figure()
  plt.plot(K, distortions)
  plt.xlabel('K')
  plt.xticks(np.arange(1, k_max + 1, 1.0))  # one tick per tested K
  plt.ylabel('Distortion')
  plt.title('Values of K x Distortions')
  plt.show()

In [None]:
def analyse_siloette(base, kmax=30):
  """Plot the silhouette score of KMeans for K = 2..kmax.

  Args:
    base: DataFrame containing the features plus the 'mc_cri_vdrl' target
      column; the target is dropped before clustering. `base` is not modified.
    kmax: largest number of clusters to try (inclusive). Defaults to 30,
      the value that was previously hard-coded.

  Returns:
    None. The curve is shown with matplotlib.
  """
  # Cluster on the features only; the target column is excluded.
  X = base.drop(columns=['mc_cri_vdrl'])

  # The loop starts at K=2 (as in the original code). The plotted x values
  # and the scores come from this single range so their lengths always match.
  ks = range(2, kmax + 1)

  sil = []
  for k in ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.labels_, metric='euclidean'))

  plt.figure(figsize=(16, 8))
  plt.plot(ks, sil)
  plt.xlabel('K')
  plt.xticks(np.arange(1, kmax + 1, 1.0))  # ticks start at 1, as before
  plt.ylabel('Silhouettes')
  plt.title('Silhouette values x K values')
  plt.show()

In [None]:
def apply_sbc(base_t, save=False):
  """Under-sample the majority class with cluster-based under-sampling (SBC).

  The dataset is clustered with KMeans; for every cluster the
  majority/minority ratio is computed, and the number of negative samples
  drawn from each cluster is proportional to that ratio, so that the total
  number of negatives drawn is about the number of positives (proportion 1).
  All positive samples are kept.

  Args:
    base_t: 'base_total' (6 clusters) or 'base_reduzida' (4 clusters, an old
      approach). The dataset itself is loaded through read_imb_dataset.
    save: when True, write the under-sampled base to
      ``path + 'bases-pre-proc/<name>'``. Defaults to False, matching the
      previous behaviour where the write was commented out.

  Returns:
    The under-sampled DataFrame (all positives followed by the sampled
    negatives, with a fresh 0..n-1 index).

  Raises:
    ValueError: if ``base_t`` is not one of the known bases.
  """
  target = 'mc_cri_vdrl'
  clusters_per_base = {'base_reduzida': 4, 'base_total': 6}
  if base_t not in clusters_per_base:
    raise ValueError(
        f'Unknown base_t {base_t!r}; expected one of {sorted(clusters_per_base)}')
  N_CLUSTERS = clusters_per_base[base_t]

  base = read_imb_dataset(base_t)
  print(f'{Counter(base["mc_cri_vdrl"])}')

  # NOTE(review): the model is fitted on the whole frame, target column
  # included, whereas the K-selection helpers drop the target first. Kept
  # as-is so the resulting clusters do not change; confirm this is intended.
  kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42)
  kmeans.fit(base)
  labels = kmeans.labels_  # labels[i] is the cluster of row i of `base`

  print(f'KMeans labels: {labels} Size: {len(labels)}')

  # One DataFrame per cluster, selected with a boolean mask (keeps row order,
  # index and dtypes of `base`; no row-by-row copying).
  df_clusters = [base[labels == i] for i in range(N_CLUSTERS)]

  # Class counts per cluster; comparing against 0/1 yields 0 for an absent
  # class instead of a KeyError.
  negatives = [int((cluster[target] == 0).sum()) for cluster in df_clusters]
  positives = [int((cluster[target] == 1).sum()) for cluster in df_clusters]

  for i in range(N_CLUSTERS):
    print(f'Cluster {i} with {df_clusters[i].shape[0]} samples. {negatives[i]} negatives',
          f'{positives[i]} positives')

  total_minority = int((base[target] == 1).sum())  # total of positive cases

  ratio_clusters = []
  for i in range(N_CLUSTERS):
    # A cluster without positives is treated as having one, which avoids a
    # division by zero. NOTE(review): believed to match the SBC paper's
    # convention; confirm against the reference.
    ratio_clusters.append(negatives[i] / max(positives[i], 1))
    print(f'Cluster Ratio {i}: {ratio_clusters[i]:.2f} (majority/minority)')

  sum_ratio_clusters = sum(ratio_clusters)
  print(f'Ratios sum: {sum_ratio_clusters:.2f}')

  # SBC formula with proportion 1: negatives to draw from each cluster.
  # The printed wording says "to remove"; these are the negatives taken out
  # of the cluster and kept in the under-sampled base.
  selected_clusters = []
  for i in range(N_CLUSTERS):
    share = ratio_clusters[i] / sum_ratio_clusters if sum_ratio_clusters else 0.0
    selected_clusters.append((1 * total_minority) * share)
    print(f'Cluster {i} {round(selected_clusters[i])} negative samples to remove')

  samples_cluster = []
  for i in range(N_CLUSTERS):
    cluster_negatives = df_clusters[i][df_clusters[i][target] == 0.0]
    # Never ask for more rows than the cluster holds (sample() would raise).
    n_draw = min(round(selected_clusters[i]), len(cluster_negatives))
    if n_draw > 0:
      # random_state fixed so the draw can be replicated
      drawn = cluster_negatives.sample(n_draw, random_state=42)
    else:
      drawn = cluster_negatives.iloc[:0]
    samples_cluster.append(drawn)
    print(f'Cluster {i} {samples_cluster[i].shape[0]} negative samples removed randomly')

  # Single concat: avoids growing a frame in a loop and the object-dtype
  # columns produced by starting from an empty DataFrame.
  base_negatives_sbc = pd.concat(samples_cluster, ignore_index=True)
  print(f'Shape {base_negatives_sbc.shape[0]}')

  base_positive = base[base[target] == 1.0]  # every positive case is kept

  base_undersampled = pd.concat([base_positive, base_negatives_sbc], ignore_index=True)
  print(f'{base_undersampled["mc_cri_vdrl"].value_counts()}')

  if save:
    if base_t == 'base_reduzida':
      nome = 'base_moda_sbc_RECORTE_FEITO_NORMALIZADO.csv'
    else:
      nome = 'base_moda_sbc.csv'
    base_undersampled.to_csv(path+f'bases-pre-proc/{nome}', sep=',', index=False)

  return base_undersampled

In [None]:
# Run the SBC under-sampling on the full ('base_total') dataset; the per-cluster
# statistics are reported through print().
apply_sbc('base_total')

Counter({0.0: 46493, 1.0: 1023})




KMeans labels: [0 5 1 ... 1 0 5] Tamanho: 47516
Cluster 0 com 17868 amostras. 17559 negativos 309 positivos
Cluster 1 com 13149 amostras. 12841 negativos 308 positivos
Cluster 2 com 2694 amostras. 2629 negativos 65 positivos
Cluster 3 com 6998 amostras. 6833 negativos 165 positivos
Cluster 4 com 1428 amostras. 1396 negativos 32 positivos
Cluster 5 com 5379 amostras. 5235 negativos 144 positivos
Ratio do Cluster 0: 56.83 (majoritária/minoritária)
Ratio do Cluster 1: 41.69 (majoritária/minoritária)
Ratio do Cluster 2: 40.45 (majoritária/minoritária)
Ratio do Cluster 3: 41.41 (majoritária/minoritária)
Ratio do Cluster 4: 43.62 (majoritária/minoritária)
Ratio do Cluster 5: 36.35 (majoritária/minoritária)
Soma dos ratios: 260.35
Cluster 0 223 amostras negativas para retirar
Cluster 1 164 amostras negativas para retirar
Cluster 2 159 amostras negativas para retirar
Cluster 3 163 amostras negativas para retirar
Cluster 4 171 amostras negativas para retirar
Cluster 5 143 amostras negativas par