# Visualising Clustering Algorithms

In the code below, we interactively demonstrate some common clustering algorithms and visualise their results.

First, we will initialise all the libraries needed. Mainly, we are using **NumPy** and **Pandas** for data storage and manipulation, **Bokeh** for visualising, **IPython Widgets** for adding interactivity, and **SKLearn** (scikit-learn) for the clustering algorithms.

In [1]:
import numpy as np
import pandas as pd
from bokeh.layouts import column, row
from bokeh.io import output_notebook, push_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, CDSView, GroupFilter, Select, CustomJS, CustomJSFilter, BooleanFilter, HoverTool, Range1d, FactorRange
from bokeh.palettes import Category20

from ipywidgets import interact, Dropdown, HBox, VBox, SelectMultiple, FloatSlider, IntSlider, Button, FloatText
from IPython.display import display

from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering

output_notebook()

Now, we will read in the data and perform some very basic pre-processing

In [2]:
# LOAD THE DATASET AND STORE THE TWO NOMINAL COLUMNS AS STRINGS
all_data = pd.read_csv("Datasets/Wholesale_customers_data.csv")
for nominal_col in ('Channel', 'Region'):
    all_data[nominal_col] = all_data[nominal_col].astype(str)

# RECORD THE TABLE DIMENSIONS; EVERY COLUMN AFTER THE FIRST TWO IS TREATED AS NUMERICAL
num_rows, num_cols = all_data.shape
usable_column_names = list(all_data.columns[2:])

# STORE THE NUMERICAL COLUMNS AS 32-BIT FLOATS, SO THAT THEY CAN BE EASILY USED IN CALCULATIONS
for col_name in usable_column_names:
    all_data[col_name] = all_data[col_name].astype(np.float32)

# 'UniqueGroup' IS THE CONCATENATION OF THE TWO NOMINAL COLUMNS (E.G. CHANNEL "1" + REGION "3" -> "13")
all_data['UniqueGroup'] = all_data['Channel'] + all_data['Region']
# all_data.sort_values(by="UniqueGroup",inplace=True)

# ONE FIXED COLOR PER GROUP, TAKEN FROM THE Category20 PALETTE BY INDEX
ug_colors = {
    group: Category20[20][palette_index]
    for group, palette_index in (
        ("11", 0),
        ("12", 2),
        ("13", 4),
        ("21", 8),
        ("22", 6),
        ("23", 16),
    )
}

# COLOR EVERY ROW BY ITS GROUP (A GROUP MISSING FROM ug_colors RAISES KeyError)
all_data['colors'] = [ug_colors[group] for group in all_data['UniqueGroup']]

## Visualising the data as-is

We will first visualise the data as-is, to get a general overview of the dataset

In [3]:
# BUILD THE DATA SOURCE FOR THE SCATTER PLOT: A COPY OF THE DATA PLUS THE TWO
# PLOTTED COLUMNS, WHICH START OUT AS THE FIRST TWO NUMERICAL VARIABLES
scatter_data = all_data.assign(
    x_data=all_data[usable_column_names[0]],
    y_data=all_data[usable_column_names[1]],
)

scatter_data_CDS = ColumnDataSource(scatter_data)

In [4]:
# TOOLS FOR IN-GRAPH INTERACTIVITY; THE HOVER TOOLTIP LISTS EVERY NUMERICAL COLUMN
scatter_tools = "lasso_select,box_select,box_zoom,wheel_zoom,reset,help"
hover_tool = HoverTool(tooltips=[(name, "@" + name) for name in usable_column_names])

# DRAW THE SCATTER PLOT, WITH BOTH AXES RUNNING FROM 0 TO THE COLUMN MAXIMUM
scatter_figure = figure(
    title='Scatter plot of the data (Group-coloring)',
    plot_width=450,
    plot_height=400,
    x_range=[0, max(all_data[usable_column_names[0]])],
    y_range=[0, max(all_data[usable_column_names[1]])],
    tools=[hover_tool, scatter_tools],
)
scatter_figure.circle(
    x='x_data',
    y='y_data',
    fill_color='colors',
    line_color='colors',
    legend='UniqueGroup',
    size=10,
    hover_fill_color="#000000",
    source=scatter_data_CDS,
)

In [5]:
# CREATE 'SELECT' WIDGETS TO ACCOMMODATE INTERACTIVITY IN CHOOSING WHAT DATA IS TO BE PLOTTED ON THE AXES
scatter_x_select = Dropdown(options=usable_column_names,value=usable_column_names[0],description="X Data")
scatter_y_select = Dropdown(options=usable_column_names,value=usable_column_names[1],description="Y Data")

# CREATE 'MULTI-SELECT' WIDGET TO LET THE USER DECIDE WHAT SUBSECTION OF THE DATA IS TO BE VIEWED
scatter_mult_select = SelectMultiple(options=list(all_data.UniqueGroup.unique()),value=list(all_data.UniqueGroup.unique()),description="Selected Groups")

# MERGE THE WIDGETS INTO A SINGLE WIDGET BOX
scatter_box = HBox([VBox([scatter_x_select,scatter_y_select]) , scatter_mult_select])

# DEFINE THE WIDGET CALLBACK FUNCTION
def scatterSelectCallback(change):
    """Redraw the overview scatter plot after any of its widgets changes.

    Copies the columns chosen in the X/Y dropdowns into 'x_data'/'y_data',
    replaces the rows whose 'UniqueGroup' is not selected with NaN, rescales
    both axes to 1.1 times the chosen column's maximum and pushes the update
    to the notebook.

    change: the ipywidgets change dictionary; only 'change' events are handled.
    """
    if change['type'] != 'change':
        return

    selected_groups = scatter_mult_select.value
    hide_mask = np.array(
        [ug not in selected_groups for ug in scatter_data_CDS.data['UniqueGroup']],
        dtype=bool,
    )

    # Mask the copy BEFORE handing it to the data source: Bokeh only notices
    # whole-column assignment, not later in-place edits of the assigned array.
    for target, select in (('x_data', scatter_x_select), ('y_data', scatter_y_select)):
        values = np.copy(scatter_data_CDS.data[select.value])
        values[hide_mask] = np.nan
        scatter_data_CDS.data[target] = values

    scatter_figure.x_range.end = max(all_data[scatter_x_select.value])*1.1
    scatter_figure.y_range.end = max(all_data[scatter_y_select.value])*1.1

    push_notebook()


# ATTACH THE CALLBACK FUNCTION TO THE INTERACTIVE WIDGETS
scatter_x_select.observe(scatterSelectCallback,names="value")
scatter_y_select.observe(scatterSelectCallback,names="value")

scatter_mult_select.observe(scatterSelectCallback,names="value")

The plot below shows all the given records, displaying 2 variables at a time. This allows for a quick look at the data, to see whether any obvious patterns exist.

The colors for the markers are based on the two nominal variables in the data, which together give rise to 6 individual groups. The interactivity in this plot allows us to look at any subset of these groups, plotted with any two variables out of the 6 numerical variables.

In [6]:
# SHOW THE SCATTER PLOT AND KEEP THE VALUE RETURNED BY show() IN scatter_handle
scatter_handle = show(scatter_figure , notebook_handle=True)

In [7]:
# DISPLAY THE WIDGET BOX (X/Y DROPDOWNS AND THE GROUP MULTI-SELECT)
display(scatter_box)

In [8]:
# ONE COLOR PER CLUSTER LABEL ("0".."5"): EVERY SECOND Category20 ENTRY, COUNTING DOWN FROM INDEX 18 TO INDEX 8
cluster_colors = {
    str(cluster_index): Category20[20][18 - 2 * cluster_index]
    for cluster_index in range(6)
}



In [9]:
def scatterViewCallback(scatter_figure,data_CDS,x_select,y_select,group_mult_select,label_mult_select):
    """Refresh a cluster scatter plot from the current state of its view widgets.

    scatter_figure:    figure whose axis ranges and axis labels are updated.
    data_CDS:          data source holding the numerical columns plus
                       'UniqueGroup', 'ClusterLabel', 'x_data' and 'y_data'.
    x_select/y_select: widgets whose .value names the column to plot on each axis.
    group_mult_select: widget whose .value holds the groups to keep visible.
    label_mult_select: widget whose .value holds the cluster labels to keep visible.

    A row stays visible only when BOTH its group and its cluster label are
    selected; every other row is replaced by NaN in 'x_data'/'y_data'.
    Returns nothing.
    """
    shown_groups = group_mult_select.value
    shown_labels = label_mult_select.value

    hidden = np.array(
        [not (ug in shown_groups and lab in shown_labels)
         for ug, lab in zip(data_CDS.data['UniqueGroup'], data_CDS.data['ClusterLabel'])],
        dtype=bool,
    )

    # Mask the copy BEFORE assigning it: the data source only notices
    # whole-column assignment, not later in-place edits of the assigned array.
    for target, source_col in (('x_data', x_select.value), ('y_data', y_select.value)):
        masked = np.copy(data_CDS.data[source_col])
        masked[hidden] = np.nan
        data_CDS.data[target] = masked

    # The ranges follow the full (unmasked) column, so hiding rows never rescales the axes
    scatter_figure.x_range.end = max(data_CDS.data[x_select.value])*1.1
    scatter_figure.y_range.end = max(data_CDS.data[y_select.value])*1.1

    scatter_figure.xaxis.axis_label = x_select.value
    scatter_figure.yaxis.axis_label = y_select.value
        


In [10]:
def makeClusterColBoxData(home_data_CDS):
    """Summarise every numerical column per cluster for the box-style plot.

    home_data_CDS: data source whose .data holds 'ClusterLabel' and every
                   column named in usable_column_names.

    Returns a dict with:
      'colors'   - the first len(clusters) values of cluster_colors,
      'x_range'  - the sorted unique cluster labels,
      '<col>_min', '<col>_max', '<col>_upperq', '<col>_lowerq' - float32
                   vectors (one entry per cluster) holding the minimum, the
                   maximum and the 90th / 10th percentile ('nearest') of <col>,
      'min_val', 'max_val', 'upperq_val', 'lowerq_val' - copies of the four
                   vectors of the first numerical column (the one displayed
                   initially).
    """
    label_col = np.asarray(home_data_CDS.data['ClusterLabel'])
    cluster_labels = list(np.unique(label_col))
    n_clusters = len(cluster_labels)

    temp_dict = {
        'colors': np.array(list(cluster_colors.values()))[:n_clusters],
        'x_range': cluster_labels,
    }

    # The membership masks do not depend on the column, so build them once
    cluster_masks = [label_col == cluster for cluster in cluster_labels]

    for col in usable_column_names:
        col_values = np.asarray(home_data_CDS.data[col])
        per_cluster = [col_values[mask] for mask in cluster_masks]

        temp_dict[col+"_min"] = np.array([np.min(vals) for vals in per_cluster],dtype=np.float32)
        temp_dict[col+"_max"] = np.array([np.max(vals) for vals in per_cluster],dtype=np.float32)
        temp_dict[col+"_upperq"] = np.array([np.percentile(vals,90,interpolation='nearest') for vals in per_cluster],dtype=np.float32)
        temp_dict[col+"_lowerq"] = np.array([np.percentile(vals,10,interpolation='nearest') for vals in per_cluster],dtype=np.float32)

    init_col = usable_column_names[0]

    # Each displayed vector must mirror its own statistic; 'max_val' used to be
    # filled from '<col>_min', which collapsed every whisker onto the minimum.
    for stat in ("min", "max", "upperq", "lowerq"):
        temp_dict[stat+"_val"] = np.copy(temp_dict[init_col+"_"+stat])

    return temp_dict


def processClusterColBox(home_data_CDS,plot_data_CDS,box_figure):
    """Rebuild the box-plot data from home_data_CDS and reset the figure's categorical x-range.

    The x-range factors become the sorted unique cluster labels currently in
    home_data_CDS.
    """
    refreshed_columns = makeClusterColBoxData(home_data_CDS)
    plot_data_CDS.data = refreshed_columns

    present_labels = np.unique(home_data_CDS.data['ClusterLabel'])
    box_figure.x_range = FactorRange(factors=list(present_labels))
    
    
    
def clusterColBoxViewCallback(box_figure,plot_data_CDS,home_data_CDS,col_select):
    """Switch the box-style plot to the column chosen in col_select.

    Copies the '<col>_min/_max/_upperq/_lowerq' vectors of plot_data_CDS into
    the displayed 'min_val/max_val/upperq_val/lowerq_val' columns, relabels the
    y-axis and sets the top of the y-range to 1.1 times the column maximum
    found in home_data_CDS.
    """
    chosen = col_select.value
    columns = plot_data_CDS.data

    for stat in ("min", "max", "upperq", "lowerq"):
        columns[stat + "_val"] = np.copy(columns[chosen + "_" + stat])

    box_figure.yaxis.axis_label = chosen
    box_figure.y_range.end = np.max(home_data_CDS.data[chosen]) * 1.1
    
            
    
    

In [11]:
def makeGroupToClusterData(home_data_CDS):
    """Build the stacked-bar data showing how each group splits across clusters.

    home_data_CDS: data source whose .data holds 'UniqueGroup' and 'ClusterLabel'.

    One bar segment is produced per (group, cluster) pair, groups outermost.
    Returns a dict with:
      'x_range'            - the group each segment belongs to,
      'count_bottom_val' / 'count_top_val' - segment bounds as running row counts,
      'prop_bottom_val'  / 'prop_top_val'  - the same bounds divided by the
                             group's total row count,
      'colors'             - the cluster's color from cluster_colors,
      'top_val' / 'bottom_val' - copies of the proportion bounds (the initial view).
    """
    label_col = np.asarray(home_data_CDS.data['ClusterLabel'])
    group_col = np.asarray(home_data_CDS.data['UniqueGroup'])

    temp_dict = {}
    cluster_labels = list(np.unique(label_col))
    groups = list(np.unique(group_col))

    total_entries = len(cluster_labels)*len(groups)

    x_val_vec = np.empty((total_entries),dtype=object)
    count_top_val_vec = np.empty((total_entries),dtype=np.float32)
    prop_top_val_vec = np.empty((total_entries),dtype=np.float32)
    count_bottom_val_vec = np.empty((total_entries),dtype=np.float32)
    prop_bottom_val_vec = np.empty((total_entries),dtype=np.float32)
    color_vec = np.empty((total_entries),dtype=object)

    # The cluster membership masks do not depend on the group, so build them once
    cluster_masks = [label_col == cluster for cluster in cluster_labels]

    ind_count = 0

    for ug in groups:
        ug_boolean = group_col == ug
        # builtin float() replaces np.float, which NumPy 1.24 removed
        ug_total_count = float(np.sum(ug_boolean))

        # Running tops of the stack so far, within this group
        count_ug_curr_count = 0.0
        prop_ug_curr_count = 0.0

        for cluster, cluster_boolean in zip(cluster_labels, cluster_masks):
            ug_cluster_comb_count = float(np.sum(np.logical_and(cluster_boolean,ug_boolean)))

            x_val_vec[ind_count] = ug

            count_bottom_val_vec[ind_count] = count_ug_curr_count
            prop_bottom_val_vec[ind_count] = prop_ug_curr_count

            count_top_val_vec[ind_count] = count_ug_curr_count + ug_cluster_comb_count
            prop_top_val_vec[ind_count] = prop_ug_curr_count + ug_cluster_comb_count/ug_total_count

            color_vec[ind_count] = cluster_colors[cluster]

            # The next segment starts where this one ended
            count_ug_curr_count = count_top_val_vec[ind_count]
            prop_ug_curr_count = prop_top_val_vec[ind_count]
            ind_count += 1

    temp_dict['x_range'] = x_val_vec
    temp_dict['count_top_val'] = count_top_val_vec
    temp_dict['prop_top_val'] = prop_top_val_vec
    temp_dict['count_bottom_val'] = count_bottom_val_vec
    temp_dict['prop_bottom_val'] = prop_bottom_val_vec
    temp_dict['colors'] = color_vec

    temp_dict['top_val'] = np.copy(temp_dict['prop_top_val'])
    temp_dict['bottom_val'] = np.copy(temp_dict['prop_bottom_val'])

    return temp_dict
    
    
    
def processGroupToCluster(home_data_CDS , plot_data_CDS):
    """Recompute the clusters-within-groups bar data from home_data_CDS and store it in plot_data_CDS."""
    refreshed_columns = makeGroupToClusterData(home_data_CDS)
    plot_data_CDS.data = refreshed_columns
    
    
def groupClusterViewCallback(plot_data_CDS,algo,plot_figure):
    """Switch a stacked-bar plot between its proportion view and its count view.

    plot_data_CDS: data source holding 'prop_*' and 'count_*' top/bottom columns.
    algo:          "Proportion" selects the proportion columns; any other value
                   selects the count columns.
    plot_figure:   figure whose y-axis label and y-range end are updated.
    """
    if algo == "Proportion":
        prefix, axis_title = "prop", "Proportion"
    else:
        prefix, axis_title = "count", "Count"
    plot_figure.yaxis.axis_label = axis_title

    columns = plot_data_CDS.data
    for edge in ("top", "bottom"):
        columns[edge + "_val"] = np.copy(columns[prefix + "_" + edge + "_val"])

    # The tallest stack defines the top of the y-axis
    plot_figure.y_range.end = np.max(columns['top_val'])
            

In [12]:
def makeClusterToGroupData(home_data_CDS):
    """Build the stacked-bar data showing how each cluster splits across groups.

    home_data_CDS: data source whose .data holds 'UniqueGroup' and 'ClusterLabel'.

    One bar segment is produced per (cluster, group) pair, clusters outermost.
    Returns a dict with:
      'x_range'            - the cluster each segment belongs to,
      'count_bottom_val' / 'count_top_val' - segment bounds as running row counts,
      'prop_bottom_val'  / 'prop_top_val'  - the same bounds divided by the
                             cluster's total row count,
      'colors'             - the group's color from ug_colors,
      'top_val' / 'bottom_val' - copies of the proportion bounds (the initial view).
    """
    label_col = np.asarray(home_data_CDS.data['ClusterLabel'])
    group_col = np.asarray(home_data_CDS.data['UniqueGroup'])

    temp_dict = {}
    cluster_labels = list(np.unique(label_col))
    groups = list(np.unique(group_col))

    total_entries = len(cluster_labels)*len(groups)

    x_val_vec = np.empty((total_entries),dtype=object)
    count_top_val_vec = np.empty((total_entries),dtype=np.float32)
    prop_top_val_vec = np.empty((total_entries),dtype=np.float32)
    count_bottom_val_vec = np.empty((total_entries),dtype=np.float32)
    prop_bottom_val_vec = np.empty((total_entries),dtype=np.float32)
    color_vec = np.empty((total_entries),dtype=object)

    # The group membership masks do not depend on the cluster, so build them once
    group_masks = [group_col == ug for ug in groups]

    ind_count = 0

    for cluster in cluster_labels:
        cluster_boolean = label_col == cluster
        # builtin float() replaces np.float, which NumPy 1.24 removed
        cluster_total_count = float(np.sum(cluster_boolean))

        # Running tops of the stack so far, within this cluster
        count_cluster_curr_count = 0.0
        prop_cluster_curr_count = 0.0

        for ug, ug_boolean in zip(groups, group_masks):
            ug_cluster_comb_count = float(np.sum(np.logical_and(cluster_boolean,ug_boolean)))

            x_val_vec[ind_count] = cluster

            count_bottom_val_vec[ind_count] = count_cluster_curr_count
            prop_bottom_val_vec[ind_count] = prop_cluster_curr_count

            count_top_val_vec[ind_count] = count_cluster_curr_count + ug_cluster_comb_count
            prop_top_val_vec[ind_count] = prop_cluster_curr_count + ug_cluster_comb_count/cluster_total_count

            color_vec[ind_count] = ug_colors[ug]

            # The next segment starts where this one ended
            count_cluster_curr_count = count_top_val_vec[ind_count]
            prop_cluster_curr_count = prop_top_val_vec[ind_count]
            ind_count += 1

    temp_dict['x_range'] = x_val_vec
    temp_dict['count_top_val'] = count_top_val_vec
    temp_dict['prop_top_val'] = prop_top_val_vec
    temp_dict['count_bottom_val'] = count_bottom_val_vec
    temp_dict['prop_bottom_val'] = prop_bottom_val_vec
    temp_dict['colors'] = color_vec

    temp_dict['top_val'] = np.copy(temp_dict['prop_top_val'])
    temp_dict['bottom_val'] = np.copy(temp_dict['prop_bottom_val'])

    return temp_dict
    
    
    
def processClusterToGroup(home_data_CDS , plot_data_CDS, plot_figure):
    """Recompute the groups-within-clusters bar data and reset the figure's categorical x-range.

    The x-range factors become the sorted unique cluster labels currently in
    home_data_CDS.
    """
    refreshed_columns = makeClusterToGroupData(home_data_CDS)
    plot_data_CDS.data = refreshed_columns

    present_labels = np.unique(home_data_CDS.data['ClusterLabel'])
    plot_figure.x_range = FactorRange(factors=list(present_labels))

In [13]:
def makeClusteringData(data_CDS,col_names):
    """Stack the named columns of data_CDS side by side.

    Returns an array with one row per record and one column per entry of
    col_names, in the order given.
    """
    columns = data_CDS.data
    return np.stack([columns[name] for name in col_names], axis=1)


def getLabels(algo,data_CDS,col_names,params):
    """Cluster the selected columns of data_CDS and return the fitted labels.

    algo:      "kmeans", "agglo" or "spectral"; any other value returns None.
    col_names: columns to cluster on; when empty a message is printed and an
               empty list is returned.
    params:    dict of estimator settings - "n_clusters" for every algorithm,
               plus "affinity" and "linkage" for "agglo", or "affinity",
               "gamma", "degree" and "zerocoeff" for "spectral".
    """
    if not len(col_names):
        print("PLEASE SELECT AT LEAST ONE COLUMN")
        return []

    curr_data = makeClusteringData(data_CDS,col_names)

    if algo == "kmeans":
        estimator = KMeans(n_clusters=params["n_clusters"])
    elif algo == "agglo":
        estimator = AgglomerativeClustering(
            n_clusters=params["n_clusters"],
            affinity=params["affinity"],
            linkage=params["linkage"],
        )
    elif algo == "spectral":
        estimator = SpectralClustering(
            n_clusters=params["n_clusters"],
            affinity=params["affinity"],
            gamma=params["gamma"],
            degree=params["degree"],
            coef0=params["zerocoeff"],
        )
    else:
        # Unknown algorithm name: nothing is fitted
        return None

    return estimator.fit(curr_data).labels_
        
    

def assignLabels(labels,data_CDS,label_mult_select):
    """Store freshly computed cluster labels and refresh everything derived from them.

    labels:            per-row cluster labels (array-like of integers).
    data_CDS:          data source whose 'ClusterLabel' and 'colors' columns
                       are overwritten.
    label_mult_select: widget whose options and value are reset to the labels
                       now present.

    When labels is None or empty (getLabels returns [] if no column was
    selected, and None for an unknown algorithm) nothing is modified, so the
    previous clustering stays on display instead of the call failing.
    """
    if labels is None or np.size(labels) == 0:
        return

    label_strings = np.asarray(labels).astype(str)

    data_CDS.data['ClusterLabel'] = label_strings
    data_CDS.data['colors'] = [ cluster_colors[i] for i in label_strings]

    present_labels = list(np.unique(label_strings))
    label_mult_select.options = present_labels
    # A separate list, so options and value never share one object
    label_mult_select.value = list(present_labels)

    return

## K-Means Clustering

In [14]:
# CLUSTER A PRIVATE COPY OF THE DATA WITH K-MEANS (6 CLUSTERS, ALL NUMERICAL COLUMNS)
kmeans_data = all_data.copy()

kmeans_results = KMeans(n_clusters=6).fit(kmeans_data[usable_column_names])
kmeans_data['ClusterLabel'] = kmeans_results.labels_.astype(str)
kmeans_data['colors'] = [cluster_colors[label] for label in kmeans_data['ClusterLabel']]

# THE PLOTTED COLUMNS START OUT AS COPIES OF THE FIRST TWO NUMERICAL VARIABLES
kmeans_data = kmeans_data.assign(
    x_data=kmeans_data[usable_column_names[0]].copy(),
    y_data=kmeans_data[usable_column_names[1]].copy(),
)

kmeans_data_CDS = ColumnDataSource(kmeans_data)

# DERIVED DATA SOURCES, ONE PER SUMMARY PLOT
kmeans_colbox_data_CDS = ColumnDataSource(makeClusterColBoxData(kmeans_data_CDS))
kmeans_group_to_cluster_data_CDS = ColumnDataSource(makeGroupToClusterData(kmeans_data_CDS))
kmeans_cluster_to_group_data_CDS = ColumnDataSource(makeClusterToGroupData(kmeans_data_CDS))

In [15]:
# WIDGETS CONTROLLING WHAT THE K-MEANS SCATTER PLOT DISPLAYS
kmeans_scatter_view_x_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="X Data",
)
kmeans_scatter_view_y_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[1],
    description="Y Data",
)

kmeans_scatter_group_mult_select = SelectMultiple(
    options=list(np.unique(kmeans_data_CDS.data['UniqueGroup'])),
    value=list(np.unique(kmeans_data_CDS.data['UniqueGroup'])),
    description="(VIEW) Selected Groups",
)
kmeans_scatter_label_mult_select = SelectMultiple(
    options=list(np.unique(kmeans_data_CDS.data['ClusterLabel'])),
    value=list(np.unique(kmeans_data_CDS.data['ClusterLabel'])),
    description="(VIEW) Selected Labels",
)

kmeans_scatter_view_box = VBox([
    kmeans_scatter_view_x_select,
    kmeans_scatter_view_y_select,
    kmeans_scatter_group_mult_select,
    kmeans_scatter_label_mult_select,
])


def kmeansScatterViewCallback(change):
    """Redraw the K-Means scatter plot when one of its view widgets changes."""
    if change['type'] != 'change':
        return
    scatterViewCallback(
        kmeans_scatter_figure,
        kmeans_data_CDS,
        kmeans_scatter_view_x_select,
        kmeans_scatter_view_y_select,
        kmeans_scatter_group_mult_select,
        kmeans_scatter_label_mult_select,
    )
    push_notebook()


# EVERY VIEW WIDGET TRIGGERS THE SAME REDRAW
for _view_widget in (
    kmeans_scatter_view_x_select,
    kmeans_scatter_view_y_select,
    kmeans_scatter_group_mult_select,
    kmeans_scatter_label_mult_select,
):
    _view_widget.observe(kmeansScatterViewCallback, names="value")

In [16]:
# WIDGETS CONTROLLING HOW THE K-MEANS CLUSTERING IS (RE-)COMPUTED
kmeans_scatter_cluster_col_mult_select = SelectMultiple(
    options=usable_column_names,
    value=usable_column_names,
    description="(CLUSTER) Selected Columns",
)
kmeans_scatter_cluster_nc_slider = IntSlider(
    value=6,
    min=1,
    max=6,
    step=1,
    description="No. of Clusters",
)
kmeans_scatter_cluster_button = Button(description="Perform Clustering")

kmeans_scatter_cluster_box = VBox([
    kmeans_scatter_cluster_button,
    kmeans_scatter_cluster_col_mult_select,
    kmeans_scatter_cluster_nc_slider,
])


def kmeansClusterCallback(b):
    """Re-run K-Means with the current widget settings and refresh every dependent data source."""
    chosen_columns = list(kmeans_scatter_cluster_col_mult_select.value)
    settings = {"n_clusters": int(kmeans_scatter_cluster_nc_slider.value)}

    new_labels = getLabels("kmeans", kmeans_data_CDS, chosen_columns, settings)
    assignLabels(new_labels, kmeans_data_CDS, kmeans_scatter_label_mult_select)

    # The three summary plots are all derived from the labels just stored
    processClusterColBox(kmeans_data_CDS, kmeans_colbox_data_CDS, kmeans_colbox_figure)
    processGroupToCluster(kmeans_data_CDS, kmeans_group_to_cluster_data_CDS)
    processClusterToGroup(kmeans_data_CDS, kmeans_cluster_to_group_data_CDS, kmeans_cluster_to_group_figure)

    push_notebook()


kmeans_scatter_cluster_button.on_click(kmeansClusterCallback)

In [17]:
# DROPDOWN CHOOSING WHICH VARIABLE THE PER-CLUSTER BOX PLOT SUMMARISES
kmeans_colbox_col_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="Boxplot Variable",
)

kmeans_colbox_view_box = kmeans_colbox_col_select


def kmeansColBoxViewCallback(change):
    """Switch the K-Means box plot to the newly chosen variable."""
    if change['type'] != 'change':
        return
    clusterColBoxViewCallback(
        kmeans_colbox_figure,
        kmeans_colbox_data_CDS,
        kmeans_data_CDS,
        kmeans_colbox_col_select,
    )
    push_notebook()


kmeans_colbox_col_select.observe(kmeansColBoxViewCallback, names="value")

In [18]:
# DROPDOWNS SWITCHING EACH STACKED-BAR PLOT BETWEEN PROPORTIONS AND RAW COUNTS
kmeans_group_to_cluster_type_select = Dropdown(
    options=["Proportion","Count"],
    value="Proportion",
    description="Clusters in Groups: Type",
)
kmeans_cluster_to_group_type_select = Dropdown(
    options=["Proportion","Count"],
    value="Proportion",
    description="Groups in Clusters: Type",
)

kmeans_group_cluster_type_box = VBox([
    kmeans_group_to_cluster_type_select,
    kmeans_cluster_to_group_type_select,
])


def kmeansGroupToClusterViewCallback(change):
    """Apply the chosen view type to the clusters-in-groups plot."""
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        kmeans_group_to_cluster_data_CDS,
        kmeans_group_to_cluster_type_select.value,
        kmeans_group_to_cluster_figure,
    )
    push_notebook()


def kmeansClusterToGroupViewCallback(change):
    """Apply the chosen view type to the groups-in-clusters plot."""
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        kmeans_cluster_to_group_data_CDS,
        kmeans_cluster_to_group_type_select.value,
        kmeans_cluster_to_group_figure,
    )
    push_notebook()


kmeans_group_to_cluster_type_select.observe(kmeansGroupToClusterViewCallback, names="value")
kmeans_cluster_to_group_type_select.observe(kmeansClusterToGroupViewCallback, names="value")
        

In [19]:
# SCATTER PLOT OF TWO VARIABLES, COLORED BY CLUSTER; BOTH AXES START AT 0 AND END AT 1.1x THE COLUMN MAXIMUM
kmeans_scatter_figure = figure(title="Scatter Plot (Cluster-coloring, 2-variables)",
                        plot_width=500,plot_height=300,
                        x_range=[0,max(all_data[usable_column_names[0]])*1.1],
                        y_range=[0,max(all_data[usable_column_names[1]])*1.1],
                        x_axis_label = usable_column_names[0], y_axis_label = usable_column_names[1],
                        tools=[hover_tool,scatter_tools])
kmeans_scatter_figure.circle(x='x_data',y='y_data',fill_color='colors',line_color='colors',legend='ClusterLabel',size=10,hover_fill_color="#000000",source=kmeans_data_CDS)

# BOX-STYLE PLOT PER CLUSTER: THE BAR SPANS 'lowerq_val'..'upperq_val', THE SEGMENT SPANS 'min_val'..'max_val'
kmeans_colbox_figure = figure(title="Distribution of data in Clusters (1-variable)",
                              plot_width=500,plot_height=300,
                              x_range=list(np.unique(kmeans_data_CDS.data['ClusterLabel'])),
                              y_range=[min(kmeans_data_CDS.data[usable_column_names[0]])*0.9,max(kmeans_data_CDS.data[usable_column_names[0]])*1.1],
                              x_axis_label = "Cluster Labels" , y_axis_label = usable_column_names[0])
kmeans_colbox_figure.vbar(x='x_range' , width=0.7 , top='upperq_val' , bottom='lowerq_val' , fill_color='colors' , line_color='colors' , source=kmeans_colbox_data_CDS)
kmeans_colbox_figure.segment(x0='x_range' , y0='min_val' , x1='x_range' , y1='max_val' , line_color='colors' , line_width=3 , source=kmeans_colbox_data_CDS)

# STACKED BARS, ONE PER GROUP; EACH SEGMENT RUNS FROM 'bottom_val' TO 'top_val' AND IS COLORED FROM THE 'colors' COLUMN
kmeans_group_to_cluster_figure = figure(title="(Proportion/Count) distribution of Clusters in Groups",
                                        plot_width=500,plot_height=300,
                                        x_range=list(np.unique(kmeans_data_CDS.data['UniqueGroup'])),
                                        y_range=[0,1],
                                        x_axis_label="Group Labels" , y_axis_label="Proportion of Cluster")
kmeans_group_to_cluster_figure.vbar(x='x_range' , width=0.7 , top='top_val' , bottom='bottom_val' , fill_color='colors' , line_color='colors' , source=kmeans_group_to_cluster_data_CDS)

# STACKED BARS, ONE PER CLUSTER; EACH SEGMENT RUNS FROM 'bottom_val' TO 'top_val' AND IS COLORED FROM THE 'colors' COLUMN
kmeans_cluster_to_group_figure = figure(title="(Proportion/Count) distribution of Groups in Clusters",
                                        plot_width=500,plot_height=300,
                                        x_range=list(np.unique(kmeans_data_CDS.data['ClusterLabel'])),
                                        y_range=[0,1],
                                        x_axis_label="Cluster Labels" , y_axis_label="Proportion of Group")
kmeans_cluster_to_group_figure.vbar(x='x_range' , width=0.7 , top='top_val' , bottom='bottom_val' , fill_color='colors' , line_color='colors' , source=kmeans_cluster_to_group_data_CDS)


In [20]:
# SHOW THE FOUR K-MEANS FIGURES IN A 2x2 LAYOUT: SCATTER AND BOX PLOT ON TOP, THE TWO STACKED-BAR PLOTS BELOW
show(column(row(kmeans_scatter_figure , kmeans_colbox_figure) , row(kmeans_group_to_cluster_figure , kmeans_cluster_to_group_figure)), notebook_handle=True)

In [21]:
# DISPLAY THE K-MEANS WIDGETS SIDE BY SIDE: CLUSTERING CONTROLS, SCATTER-VIEW CONTROLS, THEN THE BOX-PLOT AND BAR-PLOT CONTROLS
display(HBox([kmeans_scatter_cluster_box,kmeans_scatter_view_box,VBox([kmeans_colbox_view_box,kmeans_group_cluster_type_box])]))

## Agglomerative Clustering

In [22]:
# CLUSTER A PRIVATE COPY OF THE DATA AGGLOMERATIVELY (6 CLUSTERS, EUCLIDEAN AFFINITY, WARD LINKAGE)
agglo_data = all_data.copy()

agglo_results = AgglomerativeClustering(
    n_clusters=6,
    affinity="euclidean",
    linkage="ward",
).fit(agglo_data[usable_column_names])
agglo_data['ClusterLabel'] = agglo_results.labels_.astype(str)
agglo_data['colors'] = [cluster_colors[label] for label in agglo_data['ClusterLabel']]

# THE PLOTTED COLUMNS START OUT AS COPIES OF THE FIRST TWO NUMERICAL VARIABLES
agglo_data = agglo_data.assign(
    x_data=agglo_data[usable_column_names[0]].copy(),
    y_data=agglo_data[usable_column_names[1]].copy(),
)

agglo_data_CDS = ColumnDataSource(agglo_data)

# DERIVED DATA SOURCES, ONE PER SUMMARY PLOT
agglo_colbox_data_CDS = ColumnDataSource(makeClusterColBoxData(agglo_data_CDS))
agglo_group_to_cluster_data_CDS = ColumnDataSource(makeGroupToClusterData(agglo_data_CDS))
agglo_cluster_to_group_data_CDS = ColumnDataSource(makeClusterToGroupData(agglo_data_CDS))

In [23]:
# WIDGETS CONTROLLING WHAT THE AGGLOMERATIVE SCATTER PLOT DISPLAYS
agglo_scatter_view_x_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="X Data",
)
agglo_scatter_view_y_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[1],
    description="Y Data",
)

agglo_scatter_group_mult_select = SelectMultiple(
    options=list(np.unique(agglo_data_CDS.data['UniqueGroup'])),
    value=list(np.unique(agglo_data_CDS.data['UniqueGroup'])),
    description="(VIEW) Selected Groups",
)
agglo_scatter_label_mult_select = SelectMultiple(
    options=list(np.unique(agglo_data_CDS.data['ClusterLabel'])),
    value=list(np.unique(agglo_data_CDS.data['ClusterLabel'])),
    description="(VIEW) Selected Labels",
)

agglo_scatter_view_box = VBox([
    agglo_scatter_view_x_select,
    agglo_scatter_view_y_select,
    agglo_scatter_group_mult_select,
    agglo_scatter_label_mult_select,
])


def aggloScatterViewCallback(change):
    """Redraw the agglomerative scatter plot when one of its view widgets changes."""
    if change['type'] != 'change':
        return
    scatterViewCallback(
        agglo_scatter_figure,
        agglo_data_CDS,
        agglo_scatter_view_x_select,
        agglo_scatter_view_y_select,
        agglo_scatter_group_mult_select,
        agglo_scatter_label_mult_select,
    )
    push_notebook()


# EVERY VIEW WIDGET TRIGGERS THE SAME REDRAW
for _view_widget in (
    agglo_scatter_view_x_select,
    agglo_scatter_view_y_select,
    agglo_scatter_group_mult_select,
    agglo_scatter_label_mult_select,
):
    _view_widget.observe(aggloScatterViewCallback, names="value")

In [24]:
# WIDGETS CONTROLLING HOW THE AGGLOMERATIVE CLUSTERING IS (RE-)COMPUTED
agglo_scatter_cluster_col_mult_select = SelectMultiple(
    options=usable_column_names,
    value=usable_column_names,
    description="(CLUSTER) Selected Columns",
)
agglo_scatter_cluster_nc_slider = IntSlider(
    value=6,
    min=1,
    max=6,
    step=1,
    description="No. of Clusters",
)
agglo_scatter_cluster_aff_select = Dropdown(
    options=["euclidean","l1","l2","manhattan","cosine"],
    value="euclidean",
    description="Affinity Metric",
)
agglo_scatter_cluster_linkage_select = Dropdown(
    options=["ward","complete","average"],
    value="ward",
    description="Linkage Metric",
)
agglo_scatter_cluster_button = Button(description="Perform Clustering")

agglo_scatter_cluster_box = VBox([
    agglo_scatter_cluster_button,
    agglo_scatter_cluster_col_mult_select,
    agglo_scatter_cluster_nc_slider,
    agglo_scatter_cluster_aff_select,
    agglo_scatter_cluster_linkage_select,
])


def aggloClusterCallback(b):
    """Re-run agglomerative clustering with the current widget settings and refresh every dependent data source."""
    chosen_columns = list(agglo_scatter_cluster_col_mult_select.value)

    settings = {
        "n_clusters": int(agglo_scatter_cluster_nc_slider.value),
        "affinity": agglo_scatter_cluster_aff_select.value,
        "linkage": agglo_scatter_cluster_linkage_select.value,
    }

    # With ward linkage the affinity is forced to euclidean, both in the
    # settings and in the dropdown shown to the user
    if settings["linkage"] == "ward":
        settings["affinity"] = "euclidean"
        agglo_scatter_cluster_aff_select.value = "euclidean"

    new_labels = getLabels("agglo", agglo_data_CDS, chosen_columns, settings)
    assignLabels(new_labels, agglo_data_CDS, agglo_scatter_label_mult_select)

    # The three summary plots are all derived from the labels just stored
    processClusterColBox(agglo_data_CDS, agglo_colbox_data_CDS, agglo_colbox_figure)
    processGroupToCluster(agglo_data_CDS, agglo_group_to_cluster_data_CDS)
    processClusterToGroup(agglo_data_CDS, agglo_cluster_to_group_data_CDS, agglo_cluster_to_group_figure)

    push_notebook()


agglo_scatter_cluster_button.on_click(aggloClusterCallback)

In [25]:
# DROPDOWN CHOOSING WHICH VARIABLE THE PER-CLUSTER BOX PLOT SUMMARISES
agglo_colbox_col_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="Boxplot Variable",
)

agglo_colbox_view_box = agglo_colbox_col_select


def aggloColBoxViewCallback(change):
    """Switch the agglomerative box plot to the newly chosen variable."""
    if change['type'] != 'change':
        return
    clusterColBoxViewCallback(
        agglo_colbox_figure,
        agglo_colbox_data_CDS,
        agglo_data_CDS,
        agglo_colbox_col_select,
    )
    push_notebook()


agglo_colbox_col_select.observe(aggloColBoxViewCallback, names="value")

In [26]:
# DROPDOWNS SWITCHING EACH STACKED-BAR PLOT BETWEEN PROPORTIONS AND RAW COUNTS
agglo_group_to_cluster_type_select = Dropdown(
    options=["Proportion","Count"],
    value="Proportion",
    description="Clusters in Groups: Type",
)
agglo_cluster_to_group_type_select = Dropdown(
    options=["Proportion","Count"],
    value="Proportion",
    description="Groups in Clusters: Type",
)

agglo_group_cluster_type_box = VBox([
    agglo_group_to_cluster_type_select,
    agglo_cluster_to_group_type_select,
])


def aggloGroupToClusterViewCallback(change):
    """Apply the chosen view type to the clusters-in-groups plot."""
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        agglo_group_to_cluster_data_CDS,
        agglo_group_to_cluster_type_select.value,
        agglo_group_to_cluster_figure,
    )
    push_notebook()


def aggloClusterToGroupViewCallback(change):
    """Apply the chosen view type to the groups-in-clusters plot."""
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        agglo_cluster_to_group_data_CDS,
        agglo_cluster_to_group_type_select.value,
        agglo_cluster_to_group_figure,
    )
    push_notebook()


agglo_group_to_cluster_type_select.observe(aggloGroupToClusterViewCallback, names="value")
agglo_cluster_to_group_type_select.observe(aggloClusterToGroupViewCallback, names="value")

In [27]:
# Scatter plot of two columns, coloured per cluster. Both axes start at 0 and
# extend to 1.1x the maximum of the first / second usable column of all_data.
# hover_tool and scatter_tools are defined elsewhere in the notebook.
agglo_scatter_figure = figure(
    title="Scatter Plot (Cluster-coloring, 2-variables)",
    plot_width=500,
    plot_height=300,
    x_range=[0, max(all_data[usable_column_names[0]]) * 1.1],
    y_range=[0, max(all_data[usable_column_names[1]]) * 1.1],
    x_axis_label=usable_column_names[0],
    y_axis_label=usable_column_names[1],
    tools=[hover_tool, scatter_tools],
)
# Glyph fields are column names looked up in agglo_data_CDS.
agglo_scatter_figure.circle(
    x='x_data',
    y='y_data',
    fill_color='colors',
    line_color='colors',
    legend='ClusterLabel',
    size=10,
    hover_fill_color="#000000",
    source=agglo_data_CDS,
)

# Per-cluster distribution plot for one column. The categorical x axis holds the
# distinct cluster labels; the y axis is padded to 0.9x min / 1.1x max of the
# first usable column.
agglo_colbox_figure = figure(
    title="Distribution of data in Clusters (1-variable)",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(agglo_data_CDS.data['ClusterLabel'])),
    y_range=[
        min(agglo_data_CDS.data[usable_column_names[0]]) * 0.9,
        max(agglo_data_CDS.data[usable_column_names[0]]) * 1.1,
    ],
    x_axis_label="Cluster Labels",
    y_axis_label=usable_column_names[0],
)
# Bar between the 'lowerq_val' and 'upperq_val' columns (presumably the quartiles).
agglo_colbox_figure.vbar(
    x='x_range',
    width=0.7,
    top='upperq_val',
    bottom='lowerq_val',
    fill_color='colors',
    line_color='colors',
    source=agglo_colbox_data_CDS,
)
# Vertical line between the 'min_val' and 'max_val' columns.
agglo_colbox_figure.segment(
    x0='x_range',
    y0='min_val',
    x1='x_range',
    y1='max_val',
    line_color='colors',
    line_width=3,
    source=agglo_colbox_data_CDS,
)

# Bar chart with one categorical x position per distinct UniqueGroup value;
# the y axis initially spans [0, 1].
agglo_group_to_cluster_figure = figure(
    title="(Proportion/Count) distribution of Clusters in Groups",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(agglo_data_CDS.data['UniqueGroup'])),
    y_range=[0, 1],
    x_axis_label="Group Labels",
    y_axis_label="Proportion of Cluster",
)
# Each bar runs from 'bottom_val' to 'top_val' as stored in the data source.
agglo_group_to_cluster_figure.vbar(
    x='x_range',
    width=0.7,
    top='top_val',
    bottom='bottom_val',
    fill_color='colors',
    line_color='colors',
    source=agglo_group_to_cluster_data_CDS,
)

# Bar chart with one categorical x position per distinct ClusterLabel value;
# the y axis initially spans [0, 1].
agglo_cluster_to_group_figure = figure(
    title="(Proportion/Count) distribution of Groups in Clusters",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(agglo_data_CDS.data['ClusterLabel'])),
    y_range=[0, 1],
    x_axis_label="Cluster Labels",
    y_axis_label="Proportion of Group",
)
# Each bar runs from 'bottom_val' to 'top_val' as stored in the data source.
agglo_cluster_to_group_figure.vbar(
    x='x_range',
    width=0.7,
    top='top_val',
    bottom='bottom_val',
    fill_color='colors',
    line_color='colors',
    source=agglo_cluster_to_group_data_CDS,
)

In [28]:
# Render the four agglomerative figures as a 2x2 grid with a notebook handle.
# NOTE(review): the handle returned by show() is not kept, and the callbacks call
# push_notebook() without a handle, which targets the most recent
# show(..., notebook_handle=True). Confirm these plots still update once a later
# show() cell has been executed.
show(
    column(
        row(agglo_scatter_figure, agglo_colbox_figure),
        row(agglo_group_to_cluster_figure, agglo_cluster_to_group_figure),
    ),
    notebook_handle=True,
)

In [29]:
# Show the agglomerative controls side by side: clustering box, scatter view box,
# then a column holding the box-plot selector and the two type selectors.
display(
    HBox([
        agglo_scatter_cluster_box,
        agglo_scatter_view_box,
        VBox([agglo_colbox_view_box, agglo_group_cluster_type_box]),
    ])
)

## Spectral Clustering

In [30]:
# Work on a private copy so the spectral columns do not leak into all_data.
spectral_data = all_data.copy()

# Initial clustering: 6 clusters, fitted on the numeric (usable) columns only.
spectral_results = SpectralClustering(n_clusters=6).fit(
    spectral_data[usable_column_names]
)

# Labels are stored as strings; each row's colour is looked up in cluster_colors
# (defined elsewhere in the notebook) by that string label.
spectral_data['ClusterLabel'] = spectral_results.labels_.astype(str)
spectral_data['colors'] = [
    cluster_colors[label] for label in spectral_data['ClusterLabel']
]

# The scatter plot reads 'x_data' / 'y_data'; start with the first two usable columns.
spectral_data['x_data'] = spectral_data[usable_column_names[0]].copy()
spectral_data['y_data'] = spectral_data[usable_column_names[1]].copy()

spectral_data_CDS = ColumnDataSource(spectral_data)

# Derived data sources for the three summary figures, built by helpers defined
# elsewhere in the notebook.
spectral_colbox_data_CDS = ColumnDataSource(
    makeClusterColBoxData(spectral_data_CDS)
)
spectral_group_to_cluster_data_CDS = ColumnDataSource(
    makeGroupToClusterData(spectral_data_CDS)
)
spectral_cluster_to_group_data_CDS = ColumnDataSource(
    makeClusterToGroupData(spectral_data_CDS)
)



In [31]:
# Axis selectors for the scatter plot, defaulting to the first two usable columns.
spectral_scatter_view_x_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="X Data",
)
spectral_scatter_view_y_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[1],
    description="Y Data",
)

# Multi-selects listing every distinct group / cluster label, all selected initially.
spectral_scatter_group_mult_select = SelectMultiple(
    options=list(np.unique(spectral_data_CDS.data['UniqueGroup'])),
    value=list(np.unique(spectral_data_CDS.data['UniqueGroup'])),
    description="(VIEW) Selected Groups",
)
spectral_scatter_label_mult_select = SelectMultiple(
    options=list(np.unique(spectral_data_CDS.data['ClusterLabel'])),
    value=list(np.unique(spectral_data_CDS.data['ClusterLabel'])),
    description="(VIEW) Selected Labels",
)

spectral_scatter_view_box = VBox([
    spectral_scatter_view_x_select,
    spectral_scatter_view_y_select,
    spectral_scatter_group_mult_select,
    spectral_scatter_label_mult_select,
])


def spectralScatterViewCallback(change):
    """Refresh the spectral scatter plot after any of the four view widgets changes.

    Notifications whose 'type' is not 'change' are ignored. The update itself
    is delegated to scatterViewCallback (defined elsewhere in the notebook),
    then the notebook output is pushed.
    """
    if change['type'] != 'change':
        return
    scatterViewCallback(
        spectral_scatter_figure,
        spectral_data_CDS,
        spectral_scatter_view_x_select,
        spectral_scatter_view_y_select,
        spectral_scatter_group_mult_select,
        spectral_scatter_label_mult_select,
    )
    push_notebook()


# All four view widgets share the same handler.
spectral_scatter_view_x_select.observe(spectralScatterViewCallback, names="value")
spectral_scatter_view_y_select.observe(spectralScatterViewCallback, names="value")

spectral_scatter_group_mult_select.observe(spectralScatterViewCallback, names="value")
spectral_scatter_label_mult_select.observe(spectralScatterViewCallback, names="value")

In [32]:
# Widgets that collect the settings read by the spectral clustering callback.
spectral_scatter_cluster_col_mult_select = SelectMultiple(
    options=usable_column_names,
    value=usable_column_names,
    description="(CLUSTER) Selected Columns",
)
spectral_scatter_cluster_nc_slider = IntSlider(
    value=6,
    min=1,
    max=6,
    step=1,
    description="No. of Clusters",
)
spectral_scatter_cluster_aff_select = Dropdown(
    options=["rbf", "sigmoid", "polynomial", "cosine"],
    value="rbf",
    description="Affinity Metric",
)
spectral_scatter_cluster_gamma_input = FloatText(
    value=1.0,
    description="Gamma Value (NOT FOR Cosine)",
)
spectral_scatter_cluster_degree_slider = IntSlider(
    value=1,
    min=1,
    max=8,
    step=1,
    description="Degree of Polynomial",
)
spectral_scatter_cluster_zerocoeff_input = FloatText(
    value=1.0,
    description="Zero Coefficient (Polynomial and Sigmoid)",
)
spectral_scatter_cluster_button = Button(description="Perform Clustering")

# Stack the button on top, followed by the parameter widgets.
spectral_scatter_cluster_box = VBox([
    spectral_scatter_cluster_button,
    spectral_scatter_cluster_col_mult_select,
    spectral_scatter_cluster_nc_slider,
    spectral_scatter_cluster_aff_select,
    spectral_scatter_cluster_gamma_input,
    spectral_scatter_cluster_degree_slider,
    spectral_scatter_cluster_zerocoeff_input,
])


def spectralClusterCallback(b):
    """Re-cluster with the current widget settings and refresh the dependent plots.

    `b` is the argument supplied by the button's click mechanism; it is not
    used. The selected columns and a parameter dict are handed to getLabels,
    and the result is applied through assignLabels and the process* helpers
    (all defined elsewhere in the notebook) before the notebook output is
    pushed.
    """
    selected_columns = list(spectral_scatter_cluster_col_mult_select.value)

    # Widget values are coerced explicitly to the numeric types stored in params.
    params = {
        "n_clusters": int(spectral_scatter_cluster_nc_slider.value),
        "affinity": spectral_scatter_cluster_aff_select.value,
        "gamma": float(spectral_scatter_cluster_gamma_input.value),
        "degree": int(spectral_scatter_cluster_degree_slider.value),
        "zerocoeff": float(spectral_scatter_cluster_zerocoeff_input.value),
    }

    new_labels = getLabels("spectral", spectral_data_CDS, selected_columns, params)
    assignLabels(new_labels, spectral_data_CDS, spectral_scatter_label_mult_select)

    # Rebuild the three summary views from the relabelled data source.
    processClusterColBox(spectral_data_CDS, spectral_colbox_data_CDS, spectral_colbox_figure)
    processGroupToCluster(spectral_data_CDS, spectral_group_to_cluster_data_CDS)
    processClusterToGroup(spectral_data_CDS, spectral_cluster_to_group_data_CDS, spectral_cluster_to_group_figure)

    push_notebook()


spectral_scatter_cluster_button.on_click(spectralClusterCallback)

In [33]:
# Dropdown offering every entry of usable_column_names; labelled "Boxplot Variable".
spectral_colbox_col_select = Dropdown(
    options=usable_column_names,
    value=usable_column_names[0],
    description="Boxplot Variable",
)

# The box-plot controls consist of this single dropdown, so the "box" is the widget itself.
spectral_colbox_view_box = spectral_colbox_col_select


def spectralColBoxViewCallback(change):
    """Refresh the spectral box-plot figure when the dropdown value changes.

    `change` is the notification dict delivered by the widget's observe
    mechanism; anything whose 'type' is not 'change' is ignored. The actual
    update is delegated to clusterColBoxViewCallback (defined elsewhere in
    the notebook), after which the notebook output is pushed.
    """
    if change['type'] != 'change':
        return
    clusterColBoxViewCallback(
        spectral_colbox_figure,
        spectral_colbox_data_CDS,
        spectral_data_CDS,
        spectral_colbox_col_select,
    )
    push_notebook()


spectral_colbox_col_select.observe(spectralColBoxViewCallback, names="value")

In [34]:
# One Proportion/Count selector per stacked-bar figure.
spectral_group_to_cluster_type_select = Dropdown(
    options=["Proportion", "Count"],
    value="Proportion",
    description="Clusters in Groups: Type",
)
spectral_cluster_to_group_type_select = Dropdown(
    options=["Proportion", "Count"],
    value="Proportion",
    description="Groups in Clusters: Type",
)

spectral_group_cluster_type_box = VBox([
    spectral_group_to_cluster_type_select,
    spectral_cluster_to_group_type_select,
])


def spectralGroupToClusterViewCallback(change):
    """Re-render the clusters-in-groups figure with the selected display type.

    Ignores notifications whose 'type' is not 'change'. Delegates to
    groupClusterViewCallback (defined elsewhere in the notebook) and then
    pushes the notebook output.
    """
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        spectral_group_to_cluster_data_CDS,
        spectral_group_to_cluster_type_select.value,
        spectral_group_to_cluster_figure,
    )
    push_notebook()


def spectralClusterToGroupViewCallback(change):
    """Re-render the groups-in-clusters figure with the selected display type.

    Ignores notifications whose 'type' is not 'change'. Delegates to
    groupClusterViewCallback (defined elsewhere in the notebook) and then
    pushes the notebook output.
    """
    if change['type'] != 'change':
        return
    groupClusterViewCallback(
        spectral_cluster_to_group_data_CDS,
        spectral_cluster_to_group_type_select.value,
        spectral_cluster_to_group_figure,
    )
    push_notebook()


spectral_group_to_cluster_type_select.observe(spectralGroupToClusterViewCallback, names="value")
spectral_cluster_to_group_type_select.observe(spectralClusterToGroupViewCallback, names="value")

In [35]:
# Scatter plot of two columns, coloured per cluster. Both axes start at 0 and
# extend to 1.1x the maximum of the first / second usable column of all_data.
# hover_tool and scatter_tools are defined elsewhere in the notebook.
spectral_scatter_figure = figure(
    title="Scatter Plot (Cluster-coloring, 2-variables)",
    plot_width=500,
    plot_height=300,
    x_range=[0, max(all_data[usable_column_names[0]]) * 1.1],
    y_range=[0, max(all_data[usable_column_names[1]]) * 1.1],
    x_axis_label=usable_column_names[0],
    y_axis_label=usable_column_names[1],
    tools=[hover_tool, scatter_tools],
)
# Glyph fields are column names looked up in spectral_data_CDS.
spectral_scatter_figure.circle(
    x='x_data',
    y='y_data',
    fill_color='colors',
    line_color='colors',
    legend='ClusterLabel',
    size=10,
    hover_fill_color="#000000",
    source=spectral_data_CDS,
)

# Per-cluster distribution plot for one column. The categorical x axis holds the
# distinct cluster labels; the y axis is padded to 0.9x min / 1.1x max of the
# first usable column.
spectral_colbox_figure = figure(
    title="Distribution of data in Clusters (1-variable)",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(spectral_data_CDS.data['ClusterLabel'])),
    y_range=[
        min(spectral_data_CDS.data[usable_column_names[0]]) * 0.9,
        max(spectral_data_CDS.data[usable_column_names[0]]) * 1.1,
    ],
    x_axis_label="Cluster Labels",
    y_axis_label=usable_column_names[0],
)
# Bar between the 'lowerq_val' and 'upperq_val' columns (presumably the quartiles).
spectral_colbox_figure.vbar(
    x='x_range',
    width=0.7,
    top='upperq_val',
    bottom='lowerq_val',
    fill_color='colors',
    line_color='colors',
    source=spectral_colbox_data_CDS,
)
# Vertical line between the 'min_val' and 'max_val' columns.
spectral_colbox_figure.segment(
    x0='x_range',
    y0='min_val',
    x1='x_range',
    y1='max_val',
    line_color='colors',
    line_width=3,
    source=spectral_colbox_data_CDS,
)

# Bar chart with one categorical x position per distinct UniqueGroup value;
# the y axis initially spans [0, 1].
spectral_group_to_cluster_figure = figure(
    title="(Proportion/Count) distribution of Clusters in Groups",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(spectral_data_CDS.data['UniqueGroup'])),
    y_range=[0, 1],
    x_axis_label="Group Labels",
    y_axis_label="Proportion of Cluster",
)
# Each bar runs from 'bottom_val' to 'top_val' as stored in the data source.
spectral_group_to_cluster_figure.vbar(
    x='x_range',
    width=0.7,
    top='top_val',
    bottom='bottom_val',
    fill_color='colors',
    line_color='colors',
    source=spectral_group_to_cluster_data_CDS,
)

# Bar chart with one categorical x position per distinct ClusterLabel value;
# the y axis initially spans [0, 1].
spectral_cluster_to_group_figure = figure(
    title="(Proportion/Count) distribution of Groups in Clusters",
    plot_width=500,
    plot_height=300,
    x_range=list(np.unique(spectral_data_CDS.data['ClusterLabel'])),
    y_range=[0, 1],
    x_axis_label="Cluster Labels",
    y_axis_label="Proportion of Group",
)
# Each bar runs from 'bottom_val' to 'top_val' as stored in the data source.
spectral_cluster_to_group_figure.vbar(
    x='x_range',
    width=0.7,
    top='top_val',
    bottom='bottom_val',
    fill_color='colors',
    line_color='colors',
    source=spectral_cluster_to_group_data_CDS,
)

In [36]:
# Render the four spectral figures as a 2x2 grid with a notebook handle.
# NOTE(review): the handle returned by show() is not kept, and the callbacks call
# push_notebook() without a handle, which targets the most recent
# show(..., notebook_handle=True). Confirm the intended plots update if another
# show() cell is (re-)executed after this one.
show(
    column(
        row(spectral_scatter_figure, spectral_colbox_figure),
        row(spectral_group_to_cluster_figure, spectral_cluster_to_group_figure),
    ),
    notebook_handle=True,
)

In [37]:
# Show the spectral controls side by side: clustering box, scatter view box,
# then a column holding the box-plot selector and the two type selectors.
display(
    HBox([
        spectral_scatter_cluster_box,
        spectral_scatter_view_box,
        VBox([spectral_colbox_view_box, spectral_group_cluster_type_box]),
    ])
)