In [1]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np
import random

In [2]:
import dask.dataframe as dd

In [3]:
#get K value for K Means
def getK(x):
    scaled_x = x
    distortions = {}
    i = 1
    while True:
        #fit k means clustering according to i
        km = KMeans(
            n_clusters= i, init='random',
            n_init=10, max_iter=300, 
            tol=1e-04, random_state=0
        ).fit(scaled_x)
        #get distortion of actual k, which is the sum distance between clusters and their centroid
        current_distortion = sum(np.min(cdist(scaled_x, km.cluster_centers_,'euclidean'), axis=1)) / scaled_x.shape[0] 
        distortions[i] = current_distortion
        #getting 3 iterations
        if i >= 3:
            #get slope between i -1 and i - 2, i and i - 1
            m1 = distortions[i - 2] - distortions[i - 1]
            m2 = distortions[i - 1] - distortions[i]
            #get the differential between slopes and addition
            m_dif = m1 - m2
            m_sum = m1 + m2
            #get the percentage representation of differential, since 100% equals to the sum of slope values
            dif_percentage = (m_dif * 100) / m_sum
            #if this percentage is less than 25%, it means that  distortion will have a linear behaviour as more k iterations
            #so we can say that a correct k value for optimal clustering is i - 2.
            if dif_percentage < 25.0:
                break
        i += 1
    return i - 2

In [4]:
def makeClusters(n):
    km = KMeans(
        n_clusters= n, init='random',
        n_init=10, max_iter=300, 
        tol=1e-04, random_state=0
    )
    y_km = km.fit_predict(x)
    cluster_labels = km.labels_
    return km, y_km, cluster_labels

In [5]:
# def plotClusters(km,y_km, n):
#     # plot the clusters
#     for i in range(n):
#         plt.scatter(
#         x[y_km == i, 0], x[y_km == i, 1],
#         c = '#' + str(random.randint(100000,999999)),
#         label = 'cluster ' + str(i)
#     )
#     # plot the centroids
#     plt.scatter(
#         km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
#         s=250, marker='*',
#         c='red', edgecolor='black',
#         label='centroids'
#     )
#     plt.legend(scatterpoints=1)
#     plt.grid()
#     plt.show()

In [6]:
df = dd.read_csv('/home/alt9193/Documents/IA/DeepLearningBackend/examples/telecom_pca.csv')
df = df.drop(columns=["Unnamed: 0"])
df

Unnamed: 0_level_0,TARGET,YEAR_JOINED,CURRENT_YEAR,BILL_AMOUNT,PAID_AMOUNT,PAYMENT_TRANSACTIONS,PARTY_REV,PREPAID_LINES,POSTPAID_LINES,OTHER_LINES,LINE_REV,MOUS_TO_LOCAL_MOBILES,MOUS_FROM_LOCAL_MOBILES,MOUS_TO_LOCAL_LANDLINES,MOUS_FROM_LOCAL_LANDLINES,MOUS_TO_INT_NUMBER,MOUS_FROM_INT_NUMBER,DATA_IN_BNDL,DATA_OUT_BNDL
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,int64,int64,int64,float64,float64,int64,float64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [7]:
x = np.array(df.drop(columns=['TARGET']))

In [8]:
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(1, 10)
scaled_x = x 
for k in K:
    # Building and fitting the model
    kmeanModel = KMeans(n_clusters=k).fit(scaled_x)
  
    distortions.append(sum(np.min(cdist(scaled_x, kmeanModel.cluster_centers_,
                                        'euclidean'), axis=1)) / scaled_x.shape[0])
    inertias.append(kmeanModel.inertia_)
  
    mapping1[k] = sum(np.min(cdist(scaled_x, kmeanModel.cluster_centers_,
                                   'euclidean'), axis=1)) / scaled_x.shape[0]
    mapping2[k] = kmeanModel.inertia_


n = getK(x)
km, y_km, km_labels = makeClusters(n)

In [9]:
n

2

In [10]:
clusters = pd.DataFrame(data = {'cluster': y_km})

In [11]:
# Une la clasificacion con los datos del dataset
df_clusters = dd.merge(clusters, df.drop(columns=['TARGET']), left_index=True, right_index=True)
df_clusters

Unnamed: 0_level_0,cluster,YEAR_JOINED,CURRENT_YEAR,BILL_AMOUNT,PAID_AMOUNT,PAYMENT_TRANSACTIONS,PARTY_REV,PREPAID_LINES,POSTPAID_LINES,OTHER_LINES,LINE_REV,MOUS_TO_LOCAL_MOBILES,MOUS_FROM_LOCAL_MOBILES,MOUS_TO_LOCAL_LANDLINES,MOUS_FROM_LOCAL_LANDLINES,MOUS_TO_INT_NUMBER,MOUS_FROM_INT_NUMBER,DATA_IN_BNDL,DATA_OUT_BNDL
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,int32,int64,int64,float64,float64,int64,float64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [12]:
df_clusters = dd.merge(df_clusters, df[['TARGET']], left_index=True, right_index=True)
df_clusters

Unnamed: 0_level_0,cluster,YEAR_JOINED,CURRENT_YEAR,BILL_AMOUNT,PAID_AMOUNT,PAYMENT_TRANSACTIONS,PARTY_REV,PREPAID_LINES,POSTPAID_LINES,OTHER_LINES,LINE_REV,MOUS_TO_LOCAL_MOBILES,MOUS_FROM_LOCAL_MOBILES,MOUS_TO_LOCAL_LANDLINES,MOUS_FROM_LOCAL_LANDLINES,MOUS_TO_INT_NUMBER,MOUS_FROM_INT_NUMBER,DATA_IN_BNDL,DATA_OUT_BNDL,TARGET
npartitions=2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,int32,int64,int64,float64,float64,int64,float64,int64,int64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [13]:
# sort the dataframe
df_clusters = df_clusters.sort_values(by=['cluster'])
df_clusters.compute()

Unnamed: 0,cluster,YEAR_JOINED,CURRENT_YEAR,BILL_AMOUNT,PAID_AMOUNT,PAYMENT_TRANSACTIONS,PARTY_REV,PREPAID_LINES,POSTPAID_LINES,OTHER_LINES,LINE_REV,MOUS_TO_LOCAL_MOBILES,MOUS_FROM_LOCAL_MOBILES,MOUS_TO_LOCAL_LANDLINES,MOUS_FROM_LOCAL_LANDLINES,MOUS_TO_INT_NUMBER,MOUS_FROM_INT_NUMBER,DATA_IN_BNDL,DATA_OUT_BNDL,TARGET
1,0,2018,2019,497.764401,400.000000,1,612.706667,0,1,1,515.773333,115.390,27.235,4.075,18.820,78.740,4.525,3536.632324,0.0,0
186480,0,2009,2019,498.502344,407.780000,1,1019.213333,2,4,0,292.806667,76.830,14.135,4.850,22.850,13.600,16.260,16.201172,0.0,0
186480,0,2015,2019,346.235975,325.000000,2,302.575000,0,1,0,302.575000,120.110,81.785,11.695,25.585,41.295,7.425,37.104492,0.0,0
186480,0,2015,2019,346.235975,325.000000,2,302.575000,0,1,0,302.575000,120.110,81.785,11.695,25.585,41.295,7.425,37.104492,0.0,0
186479,0,2009,2019,729.627085,848.303333,2,1253.678333,0,1,1,819.100000,464.965,183.525,75.210,49.095,7.890,0.000,3520.781738,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536003,1,2017,2018,896.861660,1089.333333,1,1299.838333,0,1,0,1299.838333,66.350,13.135,130.950,103.815,144.620,88.010,19274.636230,0.0,1
536003,1,2017,2018,896.861660,1089.333333,1,1299.838333,0,1,0,1299.838333,66.350,13.135,130.950,103.815,144.620,88.010,19274.636230,0.0,0
536003,1,2018,2019,300.219587,306.216667,1,346.640000,1,1,1,299.000000,0.000,0.000,0.000,0.000,0.000,0.000,272977.441406,0.0,1
452004,1,2017,2019,290.592737,313.846667,1,343.135000,2,1,1,299.000000,0.000,0.000,0.000,0.000,0.000,0.000,346641.752930,0.0,0


In [14]:
df_to_csv = df_clusters[df_clusters['cluster'] == 0]
df_to_csv
for i in range (n):
    df_to_csv = df_clusters[df_clusters['cluster'] == i]
    df_to_csv.to_csv('/home/alt9193/Documents/IA/DeepLearningBackend/examples/telecom_pca_cluster_'+str(i)+'.csv', single_file=True)