# k-prototypes


In [2]:
!pip install kmodes

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [3]:
!pip install plotnine

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [4]:
pip install umap-learn

Keyring is skipped due to an exception: 'keyring.backends'
[0mNote: you may need to restart the kernel to use updated packages.


In [12]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
from kmodes.kprototypes import KPrototypes
import constants 
import pandas as pd
import plotnine 
from plotnine import *
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
import umap

In [6]:
# Load Metadata 
metadata = constants.load_training_metadata()

# Explore metadata 
metadata

Unnamed: 0,image,age_approx,anatom_site_general,lesion_id,sex
0,ISIC_0000000,55.0,anterior torso,,female
1,ISIC_0000001,30.0,anterior torso,,female
2,ISIC_0000002,60.0,upper extremity,,female
3,ISIC_0000003,30.0,upper extremity,,male
4,ISIC_0000004,80.0,posterior torso,,male
...,...,...,...,...,...
25326,ISIC_0073247,85.0,head/neck,BCN_0003925,female
25327,ISIC_0073248,65.0,anterior torso,BCN_0001819,male
25328,ISIC_0073249,70.0,lower extremity,BCN_0001085,male
25329,ISIC_0073251,55.0,palms/soles,BCN_0002083,female


In [7]:
# Data cleanup 

# 1. drop N/A
print("Age N/As count: " + str(metadata['age_approx'].isna().sum()))
print("Sex N/As count: " + str(metadata['sex'].isna().sum()))

metadata.dropna(inplace=True)

# 2. Save Y 
Y = metadata.image 

# 3. Drop columns 
metadata = metadata.drop(columns=['image', 'lesion_id'])

# 4. Rename age_approx -> Age 
metadata = metadata.rename(columns={"age_approx": "age", "anatom_site_general":"site"})
metadata

Age N/As count: 437
Sex N/As count: 384


Unnamed: 0,age,site,sex
1459,50.0,posterior torso,female
1460,30.0,lower extremity,female
1461,35.0,upper extremity,female
1462,45.0,posterior torso,male
1463,20.0,upper extremity,female
...,...,...,...
25326,85.0,head/neck,female
25327,65.0,anterior torso,male
25328,70.0,lower extremity,male
25329,55.0,palms/soles,female


In [13]:
#Preprocessing numerical
numerical = metadata.select_dtypes(exclude='object')

for c in numerical.columns:
    pt = PowerTransformer()
    numerical.loc[:, c] = pt.fit_transform(np.array(numerical[c]).reshape(-1, 1))
    
##preprocessing categorical
categorical = metadata.select_dtypes(include='object')
categorical = pd.get_dummies(categorical)

#Percentage of columns which are categorical is used as weight parameter in embeddings later
categorical_weight = len(metadata.select_dtypes(include='object').columns) / metadata.shape[1]

#Embedding numerical & categorical
fit1 = umap.UMAP(metric='l2').fit(numerical)
fit2 = umap.UMAP(metric='dice').fit(categorical)

#Augmenting the numerical embedding with categorical
intersection = umap.umap_.general_simplicial_set_intersection(fit1.graph_, fit2.graph_, weight=categorical_weight)
intersection = umap.umap_.reset_local_connectivity(intersection)
embedding = umap.umap_.simplicial_set_embedding(fit1._raw_data, intersection, fit1.n_components, 
                                                fit1._initial_alpha, fit1._a, fit1._b, 
                                                fit1.repulsion_strength, fit1.negative_sample_rate, 
                                                200, 'random', np.random, fit1.metric, 
                                                fit1._metric_kwds, False)

plt.figure(figsize=(20, 10))
plt.scatter(*embedding.T, s=2, cmap='Spectral', alpha=1.0)
plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


TypeError: simplicial_set_embedding() missing 2 required positional arguments: 'densmap_kwds' and 'output_dens'

In [None]:
break

In [None]:
# Expirment with 8 clusters 
df = metadata
categorical_features_idx = [1, 2]
df_array = df.to_numpy()
names = df.columns

# kproto = KPrototypes(n_clusters=8, verbose=2, max_iter=20)
# clusters = kproto.fit(df_array, categorical=categorical_features_idx)
# print (clusters.cluster_centroids_.shape)
# print (clusters.cluster_centroids_)

In [None]:
def ApplyKPrototypes (n_clusters, df = df):
    
    print ("Number of Clusters: %d" % (n_clusters))    
    categorical_features_idx = [1, 2]
    df_array = df.to_numpy()
    names = df.columns

    kproto = KPrototypes(n_clusters=n_clusters, verbose=0, max_iter=20)
    clusters = kproto.fit(df_array, categorical=categorical_features_idx)

    fig, axes = plt.subplots(nrows=1, ncols=3, sharex=False, sharey=False, figsize = (16, 4))
    for ii in np.arange (0, df.shape[1]):
        ax = axes[ii]
        ax.hist(df_array[:,ii], bins=30) ## We have 150 values for each feature; bins = 30 means we have 5 points per bin
        ax.scatter (x = clusters.cluster_centroids_[:,ii], y = np.zeros(n_clusters), c = 'red', s = 100)
        ax.set_title(names[ii])

    plt.show()

    print (clusters.cluster_centroids_)

    return clusters

for nn in [1,2, 3, 5]:
   clusters = ApplyKPrototypes(nn)

In [None]:
# Choose optimal K using Elbow method
cost = []
limit = 10
for n_clusters in range(1, limit):
    try:
        kprototype = KPrototypes(n_jobs = -1, n_clusters = n_clusters, init = 'Huang', random_state = 0)
        kprototype.fit_predict(df_array, categorical = categorical_features_idx)
        cost.append(kprototype.cost_)
        print('Cluster initiation: {}'.format(n_clusters))
    except Exception as e:
        print(e)
        break
print(len(cost))
# Converting the results into a dataframe and plotting them
df_cost = pd.DataFrame({'Cluster':range(1, limit), 'Cost':cost})
# Data viz
plotnine.options.figure_size = (8, 4.8)
(
    ggplot(data = df_cost)+
    geom_line(aes(x = 'Cluster',
                  y = 'Cost'))+
    geom_point(aes(x = 'Cluster',
                   y = 'Cost'))+
    geom_label(aes(x = 'Cluster',
                   y = 'Cost',
                   label = 'Cluster'),
               size = 10,
               nudge_y = 1000) +
    labs(title = 'Optimal number of cluster with Elbow Method')+
    xlab('Number of Clusters k')+
    ylab('Cost')+
    theme_minimal()
)

## K = 3

In [None]:
# running with 3 clusters 
kproto = KPrototypes(n_clusters=3, verbose=0, max_iter=20)
clusters = kproto.fit(df, categorical=categorical_features_idx)
print (clusters.cluster_centroids_)

In [None]:
# Plotting k = 3
sns.scatterplot(data=df, x="age", y="site", hue=clusters.labels_, style=clusters.labels_)

In [None]:
#Megan plotting attempts 
labels = 
for i in labels:
    plt.scatter(df[site == i , 0] , df[label == i , 1] , label = i)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'k)
plt.legend()
plt.show()

## K = 4

In [None]:
# running with 4 clusters 
kproto = KPrototypes(n_clusters=4, verbose=0, max_iter=20)
clusters = kproto.fit(df, categorical=categorical_features_idx)
print (clusters.cluster_centroids_)

In [None]:
# Plotting K = 4
sns.scatterplot(data=df, x="age", y="site", hue=clusters.labels_, style=clusters.labels_)

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches((20, 10))
scatter = ax.scatter(embedding[:, 0], embedding[:, 1], s=2, c=clusters, cmap='tab20b', alpha=1.0)

# produce a legend with the unique colors from the scatter
legend1 = ax.legend(*scatter.legend_elements(num=15),
                    loc="lower left", title="Classes")
ax.add_artist(legend1)