## Aula 2 - Obtendo os clusters

In [1]:
import pandas as pd

In [2]:
# Load the credit-card dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='CC GENERAL.CSV')

In [3]:
# Remove the customer identifier and tenure columns before clustering.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the two columns are already gone.
data = data.drop(columns=['CUST_ID', 'TENURE'], errors='ignore')

In [4]:
# Preview the first five rows of the remaining columns.
data.head(n=5)

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0


In [5]:
# Count the missing values in each column.
data.isna().sum(axis=0)

BALANCE                               0
BALANCE_FREQUENCY                     0
PURCHASES                             0
ONEOFF_PURCHASES                      0
INSTALLMENTS_PURCHASES                0
CASH_ADVANCE                          0
PURCHASES_FREQUENCY                   0
ONEOFF_PURCHASES_FREQUENCY            0
PURCHASES_INSTALLMENTS_FREQUENCY      0
CASH_ADVANCE_FREQUENCY                0
CASH_ADVANCE_TRX                      0
PURCHASES_TRX                         0
CREDIT_LIMIT                          1
PAYMENTS                              0
MINIMUM_PAYMENTS                    313
PRC_FULL_PAYMENT                      0
dtype: int64

In [6]:
# Impute every missing entry with the median of its own column.
column_medians = data.median()
data.fillna(value=column_medians, inplace=True)

In [7]:
from sklearn.preprocessing import Normalizer

# Transform the raw feature matrix with scikit-learn's Normalizer; the
# resulting array is what every clustering step below operates on.
normalizer = Normalizer()
values = normalizer.fit_transform(data.values)

In [8]:
from sklearn.cluster import KMeans

# K-Means with 5 clusters, 10 initialisations and at most 300 iterations.
# A fixed random_state makes the centroid initialisation reproducible, so the
# labels (and every validation score derived from them) are the same on each
# Restart & Run All.
kmeans = KMeans(n_clusters=5, n_init=10, max_iter=300, random_state=42)
y_pred = kmeans.fit_predict(values)

## Aula 3 - Entendendo os critérios e métricas de validação

In [9]:
# Silhouette coefficient of the fitted K-Means partition.
from sklearn.metrics import silhouette_score

# `labels` is reused by the metric cells that follow.
labels = kmeans.labels_
silhouette = silhouette_score(X=values, labels=labels, metric='euclidean')
print(silhouette)

0.36454411415175675


## Aula 4 - Entendendo e calculando o índice Davies-Bouldin

In [10]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index for the same data and cluster labels.
dbs = davies_bouldin_score(X=values, labels=labels)
print(dbs)

1.0759640420985468


## Aula 5 - Entendendo e calculando o índice Calinski-Harabasz

In [12]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz index for the same data and cluster labels.
chs = calinski_harabasz_score(X=values, labels=labels)
print(chs)

3431.7970107150363


## Aula 6 - Validando os clusters

In [18]:
def clustering_algorithm(n_clusters, dataset, random_state=None):
    """Fit K-Means on ``dataset`` and score the resulting partition.

    Parameters
    ----------
    n_clusters : int
        Number of clusters requested from K-Means.
    dataset : array-like
        Samples to cluster. It is passed unchanged to ``KMeans.fit_predict``
        and to the three metric functions.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``KMeans``. Pass a fixed value to make repeated calls
        return identical scores; the default ``None`` keeps the original
        unseeded behaviour, where scores vary slightly from run to run.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, calinski_harabasz)`` computed from the
        labels assigned by K-Means.
    """
    kmeans = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        max_iter=300,
        random_state=random_state,
    )
    labels = kmeans.fit_predict(dataset)

    # All three metrics are evaluated on the same data and the same labels.
    s = silhouette_score(dataset, labels, metric='euclidean')
    dbs = davies_bouldin_score(dataset, labels)
    chs = calinski_harabasz_score(dataset, labels)

    return s, dbs, chs

In [26]:
# Validation scores for 3 clusters on the normalised data.
clustering_algorithm(n_clusters=3, dataset=values)

(0.3272203126696238, 1.3096073640088426, 3526.440519908274)

In [20]:
# Validation scores for 5 clusters on the normalised data.
clustering_algorithm(n_clusters=5, dataset=values)

(0.3645412082353538, 1.0759109484969387, 3431.7941358665316)

In [22]:
# Validation scores for 10 clusters on the normalised data.
clustering_algorithm(n_clusters=10, dataset=values)

(0.35162480221559056, 1.1134907706123962, 3019.1197975312843)

In [23]:
# Validation scores for 20 clusters on the normalised data.
clustering_algorithm(n_clusters=20, dataset=values)

(0.2714740435125655, 1.231195637544065, 2401.77689024112)

In [24]:
# Validation scores for 50 clusters on the normalised data.
clustering_algorithm(n_clusters=50, dataset=values)

(0.25431510321729117, 1.1987204456261005, 1614.7499167315657)

In [29]:
import numpy as np

# Uniform-noise baseline with the same shape as the real feature matrix.
# The shape is taken from `values` instead of being hard-coded, and the draw
# comes from a seeded generator so the baseline is reproducible.
rng = np.random.default_rng(42)
random_data = rng.random(values.shape)

# Same clustering setup on the real data and on the noise, for comparison.
print(clustering_algorithm(5, values))
print(clustering_algorithm(5, random_data))

(0.36454314826693845, 1.075812734747931, 3431.800540734652)
(0.03924799512344281, 3.514320955210715, 300.1625541800183)


In [30]:
# Split the normalised samples into three consecutive parts and score each
# part separately with the same 5-cluster setup.
set1, set2, set3 = np.array_split(values, 3)
for subset in (set1, set2, set3):
    print(clustering_algorithm(5, subset))

(0.3692455604046628, 1.0550439720459237, 1203.9722444846886)
(0.35416642754504835, 1.1382306445993162, 1194.951986504888)
(0.36704573758244236, 1.0990426578873274, 1167.5322875560996)
