In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_notebook
from bokeh.io import push_notebook
from ipywidgets import interact, fixed
from collections import OrderedDict
from math import log, sqrt
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, ColumnDataSource, Select, CustomJS, Title, Slider
from bokeh.models.widgets import CheckboxButtonGroup
from bokeh.palettes import Category20_20 as palette
from bokeh.application.handlers.function import FunctionHandler
from bokeh.application.application import Application
from sklearn.cluster import AffinityPropagation, KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.decomposition import PCA
# Route Bokeh output to the notebook.
output_notebook()

# Toolbar tools handed to every figure created below.
TOOLS = ("crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,"
         "undo,redo,reset,tap,save,box_select,poly_select,lasso_select,")

# Input table; `columns` keeps its column index for the axis selectors.
data = pd.read_csv('Wholesale customers data.csv')
columns = data.columns

# Distance metrics offered by the "Affinity" selector.
affinity_metric = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']

# One default-configured estimator per clustering algorithm.
ap = AffinityPropagation()
km = KMeans()
ac = AgglomerativeClustering()
db = DBSCAN()
sc = SpectralClustering()

# Estimators keyed by the name shown in the UI.
clusterers = {
    'AffinityPropagation': ap,
    'KMeans': km,
    'AgglomerativeClustering': ac,
    'DBSCAN': db,
    'SpectralClustering': sc,
}

# Two-dimensional embeddings of the full table (PCA first, then t-SNE).
transformed_data = {
    'PCA': PCA(n_components=2).fit_transform(data.values),
    't-SNE': TSNE(n_components=2, perplexity=10, learning_rate=200,
                  random_state=31).fit_transform(data.values),
}

# Initial cluster labels on the raw table, fitted in a fixed order.
clusters_info = {
    algo: clusterers[algo].fit_predict(data.values)
    for algo in ('AffinityPropagation', 'KMeans', 'AgglomerativeClustering',
                 'SpectralClustering', 'DBSCAN')
}

# Divisor applied when turning a column into circle radii.
dwarf_constant = 10
# Figure size in pixels.
width, height = 700, 700

def _cluster_colors(labels):
    """Map integer cluster labels to palette colours (label -1 maps to palette[0])."""
    return [palette[(c + 1) % 20] for c in labels]


def _column_coordinates(frame, x_col, y_col, size_col, use_log):
    """Return (x, y, radii) for plotting two columns of *frame*.

    *size_col* names the column that drives the circle radius, or the
    string 'None' for a constant radius.  With *use_log* the natural log
    of every column is used.
    """
    if use_log:
        x = np.log(frame[x_col])
        y = np.log(frame[y_col])
        if size_col != 'None':
            radii = np.log(frame[size_col]) / (dwarf_constant * x.mean())
        else:
            radii = 0.15 * np.ones((x.shape[0],))
    else:
        x = frame[x_col]
        y = frame[y_col]
        if size_col != 'None':
            radii = frame[size_col] / dwarf_constant
        else:
            # Constant radius; built directly rather than as col/col, which
            # turns into NaN wherever the column holds a zero.
            radii = np.full(x.shape[0], 100.0 * dwarf_constant)
    return x, y, radii


def _fit_labels(algo_name, points, n_clusters, affinity_name):
    """Fit the clustering algorithm named *algo_name* on *points* and return its labels.

    Algorithms that take a cluster count are rebuilt with *n_clusters*;
    the remaining ones reuse the shared estimator from `clusterers`.
    """
    if algo_name == 'KMeans':
        model = KMeans(n_clusters=n_clusters)
    elif algo_name == 'AgglomerativeClustering':
        model = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete',
                                        affinity=affinity_name)
    elif algo_name == 'SpectralClustering':
        model = SpectralClustering(n_clusters=n_clusters)
    else:
        model = clusterers[algo_name]
    return model.fit_predict(points)


def scatter_plots(df, column1, column2, radius_column, take_log=False):
    """Build an interactive, cluster-coloured scatter plot with its control widgets.

    Parameters
    ----------
    df : DataFrame whose columns are plotted.
    column1, column2 : names of the columns initially shown on the x and y axes.
    radius_column : name of the column that sets the circle radius, or 'None'
        for a constant radius.
    take_log : plot the natural log of the columns and start with the
        "Log normalize" button pressed.

    Returns
    -------
    (layout, figure, renderer) : the row layout holding widgets and plot,
        the figure itself, and the glyph renderer returned by `p.scatter`.
    """
    # Main function for interaction with the scatter plot
    def update(attr, old, new):
        cluster_algo = cluster_algo_select.value
        # `active` is the list of pressed button indices; empty means "off".
        use_log = bool(toggle_log_normalization.active)

        if domain_transform.value == 'None':
            points = df
            x, y, radii = _column_coordinates(points, xaxis.value, yaxis.value,
                                              size.value, use_log)
            title = xaxis.value+" vs "+yaxis.value +" scatter plot (radius of circles="+size.value+")"
            x_label = xaxis.value
            y_label = yaxis.value
        else:
            points = transformed_data[domain_transform.value]
            # Embedding coordinates can be negative, so they are plotted
            # as-is: taking their log would turn those points into NaN.
            x = points[:, 0]
            y = points[:, 1]
            # Radius is in data units, so scale it to the embedding's extent.
            extent = max(np.ptp(x), np.ptp(y)) or 1.0
            radii = np.full(x.shape[0], 0.01 * extent)
            title = "Component 1 vs Component 2"
            x_label = "Component 1"
            y_label = "Component 2"

        # Always refit on the domain being displayed so that labels fitted
        # on an embedding are never reused for the raw columns (or vice versa).
        clusters_info[cluster_algo] = _fit_labels(cluster_algo, points,
                                                  int(clusters_slider.value),
                                                  affinity.value)
        colors = _cluster_colors(clusters_info[cluster_algo])

        p.title.text = title
        p.xaxis.axis_label = x_label
        p.yaxis.axis_label = y_label
        # Replace all columns in one assignment so the plot is refreshed once.
        p_s.data_source.data = dict(x=x, y=y, radius=radii, fill_color=colors)

    x, y, radii = _column_coordinates(df, column1, column2, radius_column, take_log)

    # list(...) keeps this working on Python 3, where dict views cannot be indexed.
    algo_names = list(clusters_info)
    colors = _cluster_colors(clusters_info[algo_names[0]])
    p = figure(tools=TOOLS, plot_width=width, plot_height=height, title=column1+" vs "+column2 +" scatter plot (radius of circles="+radius_column+")")
    p_s = p.scatter(x, y, radius=radii,
          fill_color=colors, fill_alpha=0.6, line_color=None)
    p.xaxis.axis_label = column1
    p.yaxis.axis_label = column2
    hover = HoverTool(tooltips=[
    ("Type", "$color:fill_color"),
    ("(x,y)", "($x, $y)"),
    ])
    p.add_tools(hover)

    cluster_algo_select = Select(title='Clustering Algo', value=algo_names[0], options=algo_names)
    cluster_algo_select.on_change('value', update)

    # The first two columns are skipped, as in the module-level `columns[2:]` usage.
    plottable = list(df.columns[2:].values)

    xaxis = Select(title='X-Axis', value=column1, options=plottable)
    xaxis.on_change('value', update)

    yaxis = Select(title='Y-Axis', value=column2, options=plottable)
    yaxis.on_change('value', update)

    size = Select(title='Point Size', value=radius_column, options=['None'] + plottable)
    size.on_change('value', update)

    domain_transform = Select(title='Domain Transform', value='None', options=['None'] + list(transformed_data.keys()))
    domain_transform.on_change('value', update)

    affinity = Select(title='Affinity', value='euclidean', options=affinity_metric)
    affinity.on_change('value', update)

    clusters_slider = Slider(start=1, end=20, value=1, step=1, title="No. of Clusters")
    clusters_slider.on_change('value', update)

    # Only one button exists (index 0); start it pressed iff take_log is set.
    toggle_log_normalization = CheckboxButtonGroup(labels=["Log normalize"],
                                                   active=[0] if take_log else [])
    toggle_log_normalization.on_change('active', update)

    controls = widgetbox([cluster_algo_select, xaxis, yaxis, size, domain_transform, affinity, clusters_slider, toggle_log_normalization], width=250)

    layout = row([controls, p])
    return layout, p, p_s



In [2]:
def make_doc(doc):
    """Add the clustering scatter-plot layout to *doc* and set the document title."""
    # Only the layout (first element of the returned triple) is needed here.
    layout = scatter_plots(data, "Milk", "Fresh", "Grocery", take_log=True)[0]
    doc.add_root(layout)
    doc.title = "Clustering Visualization"
    
# Wrap make_doc in a function handler and build the Bokeh application from it.
handler = FunctionHandler(make_doc)
app = Application(handler)
# Create a document from the application, then display the application.
doc = app.create_document()
show(app)