In [9]:
import pandas as pd
import prince
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

In [10]:
class Categoricer:

    def __init__(self):
        self.hierarchicalClustering = HierarchicalClustering()
    
    def generate_from_dataframe(self,dataframe):
        print("Categoricer - Generating categories from data")
        data = dataframe.copy(deep=True)
        for columnName in data.columns:
            dataType = data[columnName].dtype
            if dataType == "int64" or dataType == "float64":
                self.__transformNumericColumnToSegments(data,columnName)
            else:
                self.__transformCategoricalColumnToSegments(data,columnName)
        return data
        
    def __transformNumericColumnToSegments(self,data,columnName):
        print("Categoricer - Numeric - generate segments for column %s"%(columnName))
        X = np.array(data[[columnName]])
        labels = self.hierarchicalClustering.agglomerativeSegmentate( data, columnName )
        for index in range(len(labels)):
            label = labels[index]
            char_label = self.__getIndexCharacter(label)
            data.iloc[index, data.columns.get_loc(columnName)] = char_label

    def __transformCategoricalColumnToSegments(self,data,columnName):
        print("Categoricer - Categorical - generate segments for column %s"%(columnName))
        for index,value in enumerate(data[columnName].unique()):
            char_label = self.__getIndexCharacter(index)
            data.loc[data[columnName] == value, columnName] = char_label
            
    def __getIndexCharacter(self,number):
        result = ""
        while True:
            mod = number%26
            result = str(chr(65+mod)) + result
            number = int(number/26) - 1
            if number < 0:
                break
        return result
    
    def __proposedAmount(self,data_size):
        segmentsAmount = 1 + 3.3 * math.log10(data_size)
        return math.ceil(segmentsAmount)

    def __select_bestK_by_silhouette(self,scores):
        OFFSET = 2
        max_values = np.where(scores == np.max(scores))
        first_max_possition_value = max_values[0][0]+OFFSET
        return first_max_possition_value

In [11]:
import pandas as pd
import numpy as np
import math

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import preprocessing

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import scale

class HierarchicalClustering:

    def __init__(self):
        self.AFFINITY = "euclidean"

    def agglomerativeSegmentate(self,data,columnName):
        X = np.array(data[columnName])
        X = scale(X)
        _ , intra_variance = self.__get_variances_distributions(X)
        K = self.get_optimal_k(intra_variance)
        model = AgglomerativeClustering(
                n_clusters=K,
                affinity=self.AFFINITY,
                linkage="single",
                compute_full_tree=True
            )
        model.fit_predict( [[x] for x in X] )
        return model.labels_

    def __get_variances_distributions(self,X):
        MAX_K_ITERATIONS = self.__proposedAmount( len(X) )
        inter_variances = []
        intra_variance = []
        for i in range( 1, min( len(X),MAX_K_ITERATIONS )+1 ):
            model = AgglomerativeClustering(
                n_clusters=i,
                affinity=self.AFFINITY,
                linkage="ward",
                compute_full_tree=True
            )
            model.fit_predict( [[x] for x in X] )         
            inter_variances.append( self.__calculate_varianze_inter( X, model.labels_, np.unique(model.labels_) ) )
            intra_variance.append( self.__calculate_variance_intra( X, model.labels_, np.unique(model.labels_) ) )
        return inter_variances,intra_variance

    def __calculate_varianze_inter(self,values,labels,targets):
        mean_total = np.mean( values )
        sumatory = 0
        for target in targets:
            filter_group_values = [ values[i] for i in range(len(values)) if labels[i]==target ]
            mean_group = np.mean( filter_group_values )
            sumatory += (( mean_group-mean_total )**2)*len(filter_group_values)
        return sumatory / len(values)
    
    def __calculate_variance_intra(self,values,labels,targets):
        sumatory = 0
        for target in targets:
            filter_group_values = [ values[i] for i in range(len(values)) if labels[i]==target ]
            variance_group = np.var( filter_group_values )
            sumatory += (variance_group)*len(filter_group_values)
        return sumatory / len(values)

    def __proposedAmount(self,data_size):
        segmentsAmount = 1 + 3.3 * math.log10(data_size)
        return math.ceil(segmentsAmount)

    def get_optimal_k(self,distortions):
        OFF_SET = 1
        # top line formula
        x1,y1 = 1,distortions[0]
        x2,y2 = len(distortions),distortions[-1]
        m = (y2 - y1) / (x2 - x1)
        b = y1 - m * x1
        # find each point variables
        m_inverted = -1/m

        distances = []  
        for i in range(OFF_SET,len(distortions)+OFF_SET):
            # find variables
            xData, yData = i,distortions[i-OFF_SET]
            b_intersection = yData - m_inverted*xData
            # distance points
            px = ( b_intersection - b ) / ( m - m_inverted)
            py = m_inverted*px + b_intersection
            #find distance
            distance = math.sqrt( math.pow(px-xData,2) + math.pow(py-yData,2) )
            distances.append(distance)
            
        maxDistanceResults = np.where(distances == np.max(distances))
        return maxDistanceResults[0][0]+OFF_SET

In [12]:
data = pd.read_csv(r"data\data.csv")
data = data.iloc[:,0:6]
data

Unnamed: 0,ingresos,egresos,activos,pasivos,vol_trans,a_econ
0,20000000,6000000,380000000,48000000,6000000,A2
1,5593385,1185000,43000000,14000000,1678015,A2
2,1800000,730000,123000000,20000000,540000,A2
3,3474000,1300000,52000000,10000000,1042200,A2
4,5421293,825000,75000000,500000,1626387,A2
...,...,...,...,...,...,...
6995,8000000,1420000,83000000,10000000,2400000,A4
6996,2400000,600000,20000000,1,720000,A2
6997,2792832,800000,123000000,800000,837849,A6
6998,1600000,600000,270000000,1,480000,A6


In [14]:
categoricer = Categoricer()
categoricer.generate_from_dataframe( data )

Categoricer - Generating categories from data
Categoricer - Numeric - generate segments for column ingresos


KeyboardInterrupt: 

In [31]:
def transformNumericColumnToSegments(data,columnName):
    print("Categoricer - Numeric - generate segments for column %s"%(columnName))
    X = np.array(data[[columnName]])
    labels = agglomerativeSegmentate( data, columnName )
    for index in range(len(labels)):
        label = labels[index]
        char_label = self.__getIndexCharacter(label)
        data.iloc[index, data.columns.get_loc(columnName)] = char_label
        
def agglomerativeSegmentate(data,columnName):
    print("here")
    X = np.array(data[columnName])
    X = scale(X)
    _ , intra_variance = self.__get_variances_distributions(X)
    K = self.get_optimal_k(intra_variance)
    model = AgglomerativeClustering(
            n_clusters=K,
            affinity=self.AFFINITY,
            linkage="single",
            compute_full_tree=True
    )
    model.fit_predict( [[x] for x in X] )
    return model.labels_
            
transformNumericColumnToSegments(data,"ingresos")

Categoricer - Numeric - generate segments for column ingresos
here


AttributeError: 'numpy.ndarray' object has no attribute 'to_list'

In [35]:
X = data["ingresos"].to_list()
X = scale(X)
model = AgglomerativeClustering(
    n_clusters=1,
    affinity="euclidean",
    linkage="ward",
    compute_full_tree=True)
model.fit(model.fit_predict( [[0,x] for x in X] ) )

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [34]:
[[0,x] for x in X]

[[0, 2.735458789029594],
 [0, 0.13992734012474822],
 [0, -0.5434982990905642],
 [0, -0.24190631197445736],
 [0, 0.10892281906932107],
 [0, 0.2899783618388799],
 [0, -0.4534170604059445],
 [0, 0.5990993388842473],
 [0, -0.3273033262474769],
 [0, -0.4534170604059445],
 [0, -0.6285349884088453],
 [0, -0.5074658036167163],
 [0, 0.19891415667773354],
 [0, -0.5885389184328741],
 [0, -0.5434982990905642],
 [0, 0.5194603173879486],
 [0, -0.3273033262474769],
 [0, -0.5615145468274881],
 [0, -0.4218886268663276],
 [0, -0.597547042301336],
 [0, -0.23722208756285712],
 [0, -0.4795406196244842],
 [0, 0.10508661943869786],
 [0, -0.5903405432065665],
 [0, -0.16515709661516134],
 [0, -0.8677907583551953],
 [0, 0.03302162849100207],
 [0, -0.324186515388989],
 [0, -0.5434982990905642],
 [0, -0.162634821931992],
 [0, 0.393346583229481],
 [0, -0.6291475408319007],
 [0, 0.6798027602968433],
 [0, -0.30568382896316815],
 [0, -0.4173845649320966],
 [0, -0.4534170604059445],
 [0, -0.6347053730962643],
 [0, -0.

In [46]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

X = np.array([[2], [4], [0],[2], [4], [0]])

#X = data["ingresos"].to_list()
#X = np.array(X).reshape(-1, 1)

clustering = AgglomerativeClustering()
clustering.fit(X)

clustering.children_

array([[0, 3],
       [1, 4],
       [2, 5],
       [6, 7],
       [8, 9]], dtype=int64)