In [1]:
# from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, adjusted_rand_score, calinski_harabasz_score, davies_bouldin_score
# from sklearn.neighbors import DistanceMetric
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans
# from sklearn.manifold import TSNE
# from sklearn import preprocessing


# from kmodes.kprototypes import KPrototypes
# from kmodes.kmodes import KModes
# from sklearn_extra.cluster import KMedoids

# import umap

# import io
# import math
# import pandas as pd
# import numpy as np
# import math

# from community import community_louvain
# import networkx as nx

# from matplotlib import gridspec
# import matplotlib.pyplot as plt


import numpy as np
import pandas as pd

from mixclu import *

In [None]:
def evaluate_clusters(scores, preds, labels, name='', X=None):
    '''
    Score one clustering result and add it to ``scores`` as a new row.

    scores : DataFrame that collects one row per evaluated algorithm
    preds  : predicted cluster label for every sample
    labels : reference label for every sample
    name   : value stored in the 'Algorithm' column
    X      : optional feature matrix; when given, silhouette,
             Calinski-Harabasz and Davies-Bouldin are computed on it,
             otherwise those three columns are NaN

    Returns a new DataFrame; ``scores`` itself is not modified.
    '''
    if X is not None:
        silhouette = silhouette_score(X, preds, metric='euclidean')
        cal_har = calinski_harabasz_score(X, preds)
        dav_bould = davies_bouldin_score(X, preds)
    else:
        # internal metrics need the feature matrix
        silhouette = cal_har = dav_bould = np.nan

    adj_mut_info = adjusted_mutual_info_score(labels, preds, average_method='arithmetic')
    adj_rand = adjusted_rand_score(labels, preds)

    content = {'Algorithm': name,
               'Silhouette': silhouette,
               'Calinski_Harabasz': cal_har,
               'Davis Bouldin': dav_bould,
               'Adjusted_Mutual_Info': adj_mut_info,
               'Adjusted_Rand_Score': adj_rand}

    # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
    return pd.concat([scores, pd.DataFrame([content])], ignore_index=True)



def calculate_gower_distance(df, cat_columns=None):
    '''
    Pairwise Gower distance matrix between the rows of ``df``.

    cat_columns is only tested for truthiness: when truthy, every
    object-dtype column of ``df`` is flagged as categorical for
    gower.gower_matrix; when falsy (the default), gower.gower_matrix is
    called without explicit flags.

    NaN entries of the resulting matrix are replaced by 0.
    '''
    if cat_columns:
        # the `np.object` alias was removed in NumPy 1.24; the builtin
        # `object` compares equal to the same dtype
        is_categorical = [bool(df[k].dtypes == object) for k in df.columns]
        variable_distances = gower.gower_matrix(df, cat_features=is_categorical)
    else:
        variable_distances = gower.gower_matrix(df)

    variable_distances[np.isnan(variable_distances)] = 0
    return variable_distances


def calculate_zscore(df, columns):
    '''
    Return a copy of ``df`` in which every column listed in ``columns``
    is standardised: mean subtracted, then divided by the population
    standard deviation (ddof=0). The input frame is left untouched.
    '''
    scaled = df.copy()
    for name in columns:
        series = scaled[name]
        centred = series - series.mean()
        scaled[name] = centred / series.std(ddof=0)
    return scaled



def one_hot_encode(df, columns):
    '''
    One-hot encode the given columns of ``df``.

    Each column is encoded with pd.get_dummies(drop_first=True) using
    the column name as prefix, and the resulting dummy frames are
    concatenated side by side. The original columns of ``df`` are NOT
    part of the result.

    Returns (dummy_frame, dummy_column_names). When ``columns`` is empty
    an empty frame sharing ``df``'s index is returned.
    '''
    columns = list(columns)
    if not columns:
        # pd.concat raises on an empty list; return a zero-column frame instead
        empty = pd.DataFrame(index=df.index)
        return empty, empty.columns

    concat_df = pd.concat(
        [pd.get_dummies(df[col], drop_first=True, prefix=col) for col in columns],
        axis=1)
    one_hot_cols = concat_df.columns

    return concat_df, one_hot_cols



def normalize_column_modality(df, columns):
    '''
    Divide each listed one-hot column by sqrt(mu_m), where mu_m is the
    column sum divided by the number of rows.

    The frame is modified in place and also returned. A column whose
    weight is not positive (e.g. all zeros) is left unchanged, because
    dividing it would only produce NaN/inf values.
    '''
    length = len(df)
    for col in columns:
        weight = math.sqrt(df[col].sum() / length)
        if not weight > 0:
            # nothing to rescale; avoids 0/0 -> NaN
            continue
        df[col] = df[col] / weight

    return df



def center_columns(df, columns):
    '''
    Subtract the column mean from every column named in ``columns``.
    The frame is updated in place and returned.
    '''
    for name in columns:
        column_mean = df[name].mean()
        df[name] = df[name] - column_mean
    return df



def FAMD_2(df, n_components=2):
    '''
    Factorial Analysis of Mixed Data (FAMD),
    which generalizes the Principal Component Analysis (PCA)
    algorithm to datasets containing numerical and categorical variables

    a) For the numerical variables
    - Standard scale (= get the z-score)

    b) For the categorical (object-dtype) variables:
    - Get the one-hot encoded columns
    - Divide each column by the square root of its probability sqrt(mu_m)
    - Center the columns

    c) Apply a PCA algorithm over the table obtained

    Returns the array produced by PCA.fit_transform with ``n_components``
    components. Raises ValueError when ``df`` has neither numeric nor
    object columns.
    '''
    # column names are taken from the `df` argument (not from a notebook global)
    numeric_cols = list(df.select_dtypes(include=np.number).columns)
    cat_cols = list(df.select_dtypes(include='object').columns)

    parts = []

    # numeric process
    if numeric_cols:
        normalized_df = calculate_zscore(df, numeric_cols)
        parts.append(normalized_df[numeric_cols])

    # categorical process
    if cat_cols:
        cat_one_hot_df, one_hot_cols = one_hot_encode(df, cat_cols)
        cat_one_hot_norm_df = normalize_column_modality(cat_one_hot_df, one_hot_cols)
        cat_one_hot_norm_center_df = center_columns(cat_one_hot_norm_df, one_hot_cols)
        parts.append(cat_one_hot_norm_center_df)

    if not parts:
        raise ValueError("FAMD_2 needs at least one numeric or object column")

    # Merge DataFrames
    processed_df = pd.concat(parts, axis=1)

    # Perform (PCA)
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(processed_df)

    return principalComponents


# use Umap to do embedding then cluster on that
def umap_reduce(df, intersection=False):
    '''
    Embed the non-object and the object columns of ``df`` separately with
    UMAP and combine the two fitted models.

    Non-object columns are z-scored (ddof=0) before fitting; object
    columns are one-hot encoded and fitted with the 'dice' metric.

    intersection : combine the two models with ``*`` when True,
                   with ``+`` otherwise.

    Returns (results, umap_embedding): a DataFrame with columns 'x' and
    'y' built from the first two embedding dimensions, and the raw
    embedding array.
    '''
    # copy first so the scaling below does not assign into a selection of `df`
    numerical = df.select_dtypes(exclude='object').copy()
    for c in numerical.columns:
        numerical[c] = (numerical[c] - numerical[c].mean())/numerical[c].std(ddof=0)

    ##preprocessing categorical
    categorical = pd.get_dummies(df.select_dtypes(include='object'))

    #Embedding numerical & categorical
    fit1 = umap.UMAP(random_state=12).fit(numerical)
    # NOTE(review): unlike fit1 this model has no random_state, so the
    # combined embedding is presumably not reproducible run to run - confirm.
    fit2 = umap.UMAP(metric='dice', n_neighbors=250).fit(categorical)

    # intersection will resemble the numerical embedding more.
    if intersection:
        embedding = fit1 * fit2

    # union will resemble the categorical embedding more.
    else:
        embedding = fit1 + fit2

    umap_embedding = embedding.embedding_

    results = pd.DataFrame(
                        {'x': umap_embedding[:,0],
                         'y':  umap_embedding[:,1],
                        })

    return results, umap_embedding




def elbow_method_kmeans(df, space=(2,11)):
    '''
    Fit KMeans on ``df`` for every k in range(space[0], space[1]) and
    return a p9 plot of the inertia ('cost') against 'n_clusters',
    titled 'Elbow Plot'.
    '''
    first_k = space[0]
    last_k = space[1]

    ks = list(range(first_k, last_k))
    # one fit per k, in increasing order of k
    inertias = [KMeans(n_clusters=k, verbose=0).fit(df).inertia_ for k in ks]

    elbow_df = pd.DataFrame({'n_clusters': ks, 'cost': inertias})

    plot = p9.ggplot(elbow_df, p9.aes(x='n_clusters', y='cost'))
    plot = plot + p9.geom_point()
    plot = plot + p9.geom_line()
    plot = plot + p9.ggtitle('Elbow Plot')
    return plot




def get_knn_bins(df, cols,
                 bins=5,
                 drop_cols=True,
                 encode=True):
    '''
    Discretise each column in ``cols`` with a one-dimensional KMeans
    (despite the "knn" in the name) and add a string column
    ``<col>_value`` holding ``<col>_<centroid truncated to int>`` of the
    cluster each row falls into.

    df        : input frame (not modified; a re-indexed copy is used)
    cols      : names of the columns to bin
    bins      : number of KMeans clusters per column
    drop_cols : drop the original ``cols`` from the result
    encode    : pass the new columns through encode_columns

    All columns not listed in ``cols`` are passed to cobj before the
    optional drop/encode steps. Returns the transformed frame.
    '''
    k_columns = []

    for col in cols:
        # Positional index so the per-row labels line up. drop=True avoids
        # the temporary 'index' column, which broke on a named index or an
        # existing 'index' column.
        df = df.reset_index(drop=True)

        kmeans = KMeans(n_clusters=bins).fit(df[col].to_numpy().reshape(-1, 1))

        # centroid of each cluster, truncated to int, looked up per row
        centroids = kmeans.cluster_centers_[:, 0].astype(int)
        value_col = col + '_value'
        df[value_col] = [col + '_' + str(c) for c in centroids[kmeans.labels_]]
        k_columns.append(value_col)

    cat_columns = [k for k in df.columns if k not in cols]
    print("New cat columns ", ",".join(k_columns))
    df = cobj(df, cat_columns)

    if drop_cols:
        df = df.drop(cols, axis=1)

    if encode:
        df = encode_columns(df, k_columns)

    return df



def convert_df_to_sgraph_network(df):
    '''
    Convert a dataframe into an edge list and then into a network graph.

    Every row gets a node named 'row <i>' (1-based). For every column an
    edge is created between the row node and the cell value. The edge
    attribute 'weight' is the number of times the same (row, value) pair
    occurs across the columns. Pairs whose value is missing are dropped
    by the groupby.

    Returns the graph built by nx.from_pandas_edgelist; an empty
    nx.Graph when ``df`` has no columns.
    '''
    df = df.copy()
    original_cols = list(df.columns)
    if not original_cols:
        return nx.Graph()

    # create a name for each row
    df['row_name'] = ['row ' + str(i) for i in range(1, len(df) + 1)]

    # one (row_name, to) frame per column, concatenated once instead of
    # growing a frame inside the loop
    edge_frames = [df[['row_name', col]].rename(columns={col: 'to'})
                   for col in original_cols]
    edges_df = pd.concat(edge_frames, axis=0)

    # weight = multiplicity of each (row, value) pair
    edges_df = (edges_df.groupby(['row_name', 'to'])
                        .size()
                        .reset_index(name='weight')
                        .rename(columns={'row_name': 'from'}))

    graph = nx.from_pandas_edgelist(edges_df, source='from',
                                    target='to', edge_attr=['weight'])

    return graph

In [None]:
# Synthetic dataset for the Gower experiments below.
# NOTE(review): the three positional arguments are presumably
# (rows, categorical features, continuous features) - confirm against
# mixclu.get_dummy_data.
df, cat_columns, con_feats, y = get_dummy_data(
    50, 8, 7,
    centers=3,
    missing_values=None,
    id_cols=None,
)
# run the categorical columns through cobj
df = cobj(df, cat_columns)

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric


def gower_distance(X):
    """
    This function expects a pandas dataframe as input
    The data frame is to contain the features along the columns. Based on these features a
    distance matrix will be returned which will contain the pairwise gower distance between the rows
    All variables of object type will be treated as nominal variables and the others will be treated as
    numeric variables.
    Distance metrics used for:
    Nominal variables: Dice distance (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    Numeric variables: Manhattan distance normalized by the range of the variable (https://en.wikipedia.org/wiki/Taxicab_geometry)

    The per-feature distance matrices are averaged with equal weight.
    """
    # DistanceMetric is exposed from sklearn.metrics in current scikit-learn;
    # fall back to the older sklearn.neighbors location when that is missing.
    try:
        from sklearn.metrics import DistanceMetric
    except ImportError:
        from sklearn.neighbors import DistanceMetric

    individual_variable_distances = []

    for i in range(X.shape[1]):
        feature = X.iloc[:, [i]]
        # .iloc[0]: positional access that does not depend on the column label;
        # builtin `object` replaces the removed `np.object` alias
        if feature.dtypes.iloc[0] == object:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            # range is floored at 1 so constant / narrow features do not blow up
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / max(np.ptp(feature.values), 1)

        individual_variable_distances.append(feature_dist)

    return np.array(individual_variable_distances).mean(0)

In [None]:
# Gower distance matrix of the current frame. When cat_columns is non-empty
# the helper flags the object-dtype columns as categorical.
calculate_gower_distance(df, cat_columns)

In [None]:
# Small mixed-type example frame; the last row is entirely missing (None).
Xd=pd.DataFrame({'age':[21,21,19, 30,21,21,19,30,None],
'gender':['M','M','N','M','F','F','F','F',None],
'civil_status':['MARRIED','SINGLE','SINGLE','SINGLE','MARRIED','SINGLE','WIDOW','DIVORCED',None],
'salary':[3000.0,1200.0 ,32000.0,1800.0 ,2900.0 ,1100.0 ,10000.0,1500.0,None],
'has_children':[1,0,1,1,1,0,0,1,None],
'available_credit':[2200,100,22000,1100,2000,100,6000,2200,None]})

# NOTE: calculate_gower_distance only tests cat_columns for truthiness and
# derives the categorical flags from the column dtypes, so the individual
# True/False values in this list are not used.
calculate_gower_distance(Xd, cat_columns = [True, True, True, False, True, False])


In [None]:
# Print, per column, whether its dtype is object.
# (`np.object` was removed in NumPy 1.24; the builtin `object` is equivalent.)
for k in df.columns:
    print(df[k].dtypes == object)

In [None]:
# Display the current frame.
df

In [None]:
import gower
# Same frame as a plain ndarray, passed to gower without explicit
# categorical flags.
X = np.asarray(df)
gower.gower_matrix(X)

In [None]:


import gower
# Gower matrix with object-dtype columns flagged as categorical
# (`np.object` was removed in NumPy 1.24; use the builtin `object`).
gower.gower_matrix(df, cat_features=[bool(df[k].dtypes == object) for k in df.columns])

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric

# Three-row toy frame: two numeric columns and one categorical column.
df = pd.DataFrame(
    [[1, 2.6, 'A'], [12, 5, 'X'], [4, 7, 'A']],
    columns=['Num_1', 'Num_2', 'Cat_1'],
)

In [None]:
# Inspect dtypes and the first rows, then list which columns are object-dtype.
print(df.dtypes)
print(df.head(5))

# `np.object` was removed in NumPy 1.24; the builtin `object` is equivalent
[bool(df[k].dtypes == object) for k in df.columns]

In [None]:
# Hand-computed Gower distance for the toy frame.
# `dummy_df` and the weights were not defined anywhere before this cell, so it
# failed on a fresh kernel; they are defined here.
# NOTE(review): dummy_df is assumed to be the one-hot encoding of 'Cat_1' - confirm.
dummy_df = pd.get_dummies(df['Cat_1'])
w1 = w2 = w3 = 1

# numeric features: Manhattan distance scaled by the feature range (floored at 1)
s1 = DistanceMetric.get_metric('manhattan').pairwise(df[['Num_1']])
s1 = s1/max(np.ptp(df['Num_1'].to_numpy()),1)
s2 = DistanceMetric.get_metric('manhattan').pairwise(df[['Num_2']])/max(np.ptp(df['Num_2'].to_numpy()),1)
# categorical feature: Dice distance on the one-hot columns
s3 = DistanceMetric.get_metric('dice').pairwise(dummy_df)
# weighted mean of the per-feature distance matrices
Gowers_Distance = (s1*w1 + s2*w2 + s3*w3)/(w1 + w2 + w3)
Gowers_Distance

In [None]:
# Dice distance on the dummy columns that remain after drop_first=True.
# NaN entries are zeroed - presumably they come from pairs of all-zero rows
# (0/0); confirm.
s3 = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(df['Cat_1'], drop_first=True))
s3[np.isnan(s3)] = 0
s3

In [None]:
# Equal weights for the three per-feature distance matrices, then their
# weighted mean.
w1 = w2 = w3 = 1
Gowers_Distance = (s1*w1 + s2*w2 + s3*w3)/(w1 + w2 + w3) 
Gowers_Distance

In [None]:
import gower
# NOTE(review): eight cat_features flags are passed here, but the `df` built
# in the cells just above has three columns - confirm which frame this cell
# is meant to run on.
gower.gower_matrix(df,cat_features= [False, True, False, True, False, False, False, False])

In [None]:
# cat_columns is a required argument of calculate_gower_distance as defined
# above; the bare call raised TypeError. Any truthy value makes the helper
# flag object-dtype columns as categorical.
# NOTE(review): assumes `df` is the toy frame with the 'Cat_1' column - confirm.
calculate_gower_distance(df, cat_columns=['Cat_1'])

In [None]:
def get_knn_bins(df, cols,
                 bins=5,
                 drop_cols=True,
                 encode=True):
    '''
    Discretise each column in ``cols`` with a one-dimensional KMeans
    (despite the "knn" in the name) and add a string column
    ``<col>_value`` holding ``<col>_<centroid truncated to int>`` of the
    cluster each row falls into.

    NOTE: this cell re-defines the get_knn_bins from the helper cell
    earlier in the notebook and shadows it; keep only one definition.

    df        : input frame (not modified; a re-indexed copy is used)
    cols      : names of the columns to bin
    bins      : number of KMeans clusters per column
    drop_cols : drop the original ``cols`` from the result
    encode    : pass the new columns through encode_columns

    All columns not listed in ``cols`` are passed to cobj before the
    optional drop/encode steps. Returns the transformed frame.
    '''
    k_columns = []

    for col in cols:
        # Positional index so the per-row labels line up. drop=True avoids
        # the temporary 'index' column, which broke on a named index or an
        # existing 'index' column.
        df = df.reset_index(drop=True)

        kmeans = KMeans(n_clusters=bins).fit(df[col].to_numpy().reshape(-1, 1))

        # centroid of each cluster, truncated to int, looked up per row
        centroids = kmeans.cluster_centers_[:, 0].astype(int)
        value_col = col + '_value'
        df[value_col] = [col + '_' + str(c) for c in centroids[kmeans.labels_]]
        k_columns.append(value_col)

    cat_columns = [k for k in df.columns if k not in cols]
    print("New cat columns ", ",".join(k_columns))
    df = cobj(df, cat_columns)

    if drop_cols:
        df = df.drop(cols, axis=1)

    if encode:
        df = encode_columns(df, k_columns)

    return df
    
        
        
        
        
        
        
        

# Fresh synthetic dataset for the binning experiments.
# NOTE(review): the three positional arguments are presumably
# (rows, categorical features, continuous features) - confirm against
# mixclu.get_dummy_data.
df, cat_columns, con_feats, y = get_dummy_data(
    50, 8, 3,
    centers=3,
    missing_values=None,
    id_cols=None,
)
# run the categorical columns through cobj
df = cobj(df, cat_columns)

In [None]:
# Third value returned by get_dummy_data; used below as the list of columns to bin.
con_feats

In [None]:
# Bin every column in con_feats into 5 KMeans clusters, drop the originals
# and encode the new '<col>_value' columns; preview the first rows.
rt = get_knn_bins(df, con_feats, bins=5, drop_cols = True, encode = True)
rt.head(10)

In [None]:
# Build the row/value graph from the binned frame.
convert_df_to_sgraph_network(rt)

In [None]:
# Preview one of the binned columns.
# NOTE(review): assumes con_feats contains a column named 'X1' - confirm.
rt['X1_value'].head(10)

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
# scikit-learn's KBinsDiscretizer with the kmeans strategy, applied to the
# same con_feats columns (ordinal bin ids instead of '<col>_<centroid>' strings).
est = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')

# available strategies: 'uniform', 'quantile', 'kmeans'
# print(est.fit_transform(df))
dfrt = pd.DataFrame(est.fit_transform(df[con_feats]))
dfrt.head(10)

In [None]:
# NOTE(review): `ty` is not defined in any visible cell, so this raises
# NameError on a fresh kernel - possibly `rt` or `dfrt` was intended; confirm.
ty.head(5)

In [2]:
# Greeting text ending in an upwards arrow (U+2191); the `u` prefix is
# redundant in Python 3.
info = "Hello, Welcome\nPlease follow the instructions \u2191"

In [3]:
# Show the repr of the string (the newline stays escaped, the arrow is rendered).
info

'Hello, Welcome\nPlease follow the instructions ↑'