In [25]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np
import random

In [26]:
import dask.dataframe as dd

In [27]:
#get K value for K Means
def getK(x, threshold=25.0, max_k=None):
    """Estimate the number of K-Means clusters with an automated elbow rule.

    K-Means is fitted for k = 1, 2, 3, ... and the distortion (mean Euclidean
    distance from each sample to its closest centroid) is recorded for each k.
    Once three consecutive distortions are available, the two successive drops
    ``m1 = d[k-2] - d[k-1]`` and ``m2 = d[k-1] - d[k]`` are compared. When
    ``(m1 - m2) * 100 / (m1 + m2)`` falls below ``threshold`` the curve is
    treated as linear from there on and ``k - 2`` is returned.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Data to cluster. It is used as-is (no scaling is applied here).
    threshold : float, default 25.0
        Percentage below which the change in slope is considered negligible.
    max_k : int or None, default None
        Largest k to try. ``None`` means "up to the number of samples", which
        is the largest value K-Means accepts.

    Returns
    -------
    int
        The selected number of clusters (always >= 1). If the stopping rule is
        never met before the search limit, ``max(1, limit - 2)`` is returned,
        following the same ``k - 2`` convention as the regular exit.

    Raises
    ------
    ValueError
        If ``x`` contains no samples.
    """
    scaled_x = np.asarray(x)
    n_samples = scaled_x.shape[0]
    if n_samples == 0:
        raise ValueError("getK requires at least one sample")

    # K-Means cannot build more clusters than there are samples, so the search
    # is bounded instead of looping until the estimator raises.
    limit = n_samples if max_k is None else max(1, min(max_k, n_samples))

    distortions = {}
    for i in range(1, limit + 1):
        # Fit K-Means for the current candidate k.
        km = KMeans(
            n_clusters=i, init='random',
            n_init=10, max_iter=300,
            tol=1e-04, random_state=0
        ).fit(scaled_x)
        # Distortion: mean distance between every sample and its nearest centroid.
        distortions[i] = cdist(scaled_x, km.cluster_centers_, 'euclidean').min(axis=1).mean()

        # The slope comparison needs three consecutive measurements.
        if i < 3:
            continue

        # Successive drops in distortion: (k-2 -> k-1) and (k-1 -> k).
        m1 = distortions[i - 2] - distortions[i - 1]
        m2 = distortions[i - 1] - distortions[i]
        m_sum = m1 + m2
        if m_sum == 0:
            # Perfectly flat curve: the ratio below would be 0/0 (NaN) and the
            # comparison would never succeed, yet flat is as linear as it gets.
            return i - 2

        # Change in slope expressed as a percentage of the combined drop.
        dif_percentage = ((m1 - m2) * 100) / m_sum
        if dif_percentage < threshold:
            return i - 2

    # Search limit reached without meeting the stopping rule.
    return max(1, limit - 2)

In [28]:
def makeClusters(n, data=None):
    """Fit K-Means with ``n`` clusters and return the model and its labels.

    Parameters
    ----------
    n : int
        Number of clusters to build.
    data : array-like of shape (n_samples, n_features), optional
        Samples to cluster. Pass this explicitly whenever possible. When it is
        omitted the function falls back to the notebook-global ``x``, which is
        what the original single-argument version always did; that path raises
        ``NameError`` if no such global exists.

    Returns
    -------
    tuple
        ``(km, y_km, cluster_labels)``: the fitted estimator, the value
        returned by ``fit_predict`` and the estimator's ``labels_`` attribute.
    """
    if data is None:
        # Legacy behaviour kept for existing one-argument callers: cluster
        # whatever the global `x` currently holds.
        data = x
    km = KMeans(
        n_clusters=n, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0
    )
    y_km = km.fit_predict(data)
    cluster_labels = km.labels_
    return km, y_km, cluster_labels

In [31]:
def make_clusters(file_path, file_name):
    """Cluster a CSV dataset with K-Means and write one CSV per cluster.

    The file ``<file_path>/<file_name>.csv`` is loaded, the ``TARGET`` column
    is set aside, the number of clusters is chosen with ``getK`` and K-Means
    is fitted on the remaining columns. Every cluster is written to
    ``<file_path>/cluster/cluster_<i>/cluster.csv`` (``i`` starting at 0) with
    the columns ``cluster``, the feature columns and ``TARGET``.

    Parameters
    ----------
    file_path : str
        Directory that contains the input CSV and receives the output folders.
    file_name : str
        Input file name without the ``.csv`` extension.

    Returns
    -------
    list of dict
        One entry per cluster with the keys ``"name"`` (``cluster_<i + 1>``)
        and ``"percentage"`` (the fraction of rows in that cluster, in [0, 1]).
    """
    import os  # local import: only needed to create the output directories

    # Parse with dask, then materialise as pandas. K-Means needs the whole
    # matrix in memory anyway, and dask's read_csv restarts the index at 0 in
    # every partition, so attaching labels by index on the lazy frame would
    # misalign rows for multi-partition files. A fresh RangeIndex makes row
    # position and index label coincide.
    data = dd.read_csv(file_path + "/" + file_name + ".csv").compute()
    data = data.reset_index(drop=True)
    # Index column left behind by a previous to_csv; tolerate its absence.
    data = data.drop(columns=["Unnamed: 0"], errors="ignore")

    features = data.drop(columns=['TARGET'])
    x = np.array(features)

    # Choose k, then fit on the data loaded above (not on any notebook global).
    n = getK(x)
    km = KMeans(
        n_clusters=n, init='random',
        n_init=10, max_iter=300,
        tol=1e-04, random_state=0
    )
    y_km = km.fit_predict(x)

    # Join the cluster assignment with the dataset: cluster, features, TARGET.
    df_clusters = pd.concat(
        [pd.DataFrame({'cluster': y_km}, index=data.index), features, data[['TARGET']]],
        axis=1,
    )

    total_rows = len(df_clusters)
    info = []
    for i in range(n):
        members = df_clusters[df_clusters['cluster'] == i]
        cluster_dir = file_path + "/cluster/cluster_" + str(i)
        os.makedirs(cluster_dir, exist_ok=True)
        members.to_csv(cluster_dir + "/cluster.csv")
        # NOTE(review): the reported name is 1-based while the directory is
        # 0-based; kept as-is because callers may rely on either.
        info.append({"name": "cluster_" + str(i + 1), "percentage": len(members) / total_rows})

    return info

In [32]:
# Run the clustering pipeline on the "telecom_pca" example dataset.
# NOTE(review): the directory is a hard-coded absolute local path, so this cell
# only runs on the original author's machine; consider a configurable data dir.
make_clusters("/home/alt9193/Documents/IA/DeepLearningBackend/examples/", "telecom_pca")

[{'name': 'cluster_1', 'percentage': 0.9922587129012082},
 {'name': 'cluster_2', 'percentage': 0.007741287098791895}]