In [12]:
import pandas as pd
import numpy as np
import math
from bokeh.plotting import figure, show, output_notebook
from bokeh.io import push_notebook
from ipywidgets import interact, fixed
from collections import OrderedDict
from math import log, sqrt
from bokeh.layouts import column, row, widgetbox
from bokeh.models import HoverTool, ColumnDataSource, Select, CustomJS, Title
from bokeh.models.widgets import CheckboxButtonGroup
from bokeh.application.handlers.function import FunctionHandler
from bokeh.application.application import Application
output_notebook()

# Tool list handed to every bokeh figure() created in this notebook.
TOOLS="crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"

# Complete dataset and the copy of it that contains missing values.
orig_data = pd.read_csv('Wholesale customers data.csv')
data = pd.read_csv('Wholesale customers data-missing.csv')
columns = data.columns

# Display names of the dataset variants; also used as the options of the dataset selectors.
imputing_data_approaches = ['Original', 'Missing values dataset', 'Drop NAs', 'Fill NA with 1', 'Fill NA with mean', 'Nearest', 'Slinear', 'Quadratic', '5th Order Polynomial']

# Row positions (0-based) of every row of `data` that has at least one missing value.
# np.flatnonzero replaces Series.nonzero(), which was removed in pandas 1.0, and
# `axis=1` is passed by keyword because newer pandas rejects the positional form.
missing_val_indices = np.flatnonzero(data.isnull().any(axis=1).values)
print("Missing values index array: "+str(missing_val_indices))

# Column means, used by the 'Fill NA with mean' approach.
mean = data.mean(axis=0)

# One DataFrame per approach, keyed by the names in imputing_data_approaches.
dataframe_dict = {
    'Original': orig_data,
    'Missing values dataset': data,
    # approach 1: ignore rows with a missing value
    'Drop NAs': data.dropna(axis=0),
    # approach 2: fill missing values with 1
    'Fill NA with 1': data.fillna(1),
    # approach 3: fill missing values with the column mean
    'Fill NA with mean': data.fillna(mean),
    # approach 4: interpolate with method='nearest'
    'Nearest': data.interpolate(method='nearest'),
    # approach 5: spline interpolation of first order
    'Slinear': data.interpolate(method='slinear'),
    # approach 6: spline interpolation of second order
    'Quadratic': data.interpolate(method='quadratic'),
    # approach 7: interpolation with method='polynomial', order=5
    '5th Order Polynomial': data.interpolate(method='polynomial', order=5),
}
# Every selectable approach must have a dataset behind it, and vice versa.
assert set(dataframe_dict) == set(imputing_data_approaches), "dataset names out of sync"

# Fill colour per region name; insertion order is the order used for the chart legend.
region_color = OrderedDict(
    (
        ("Lisbon", "#0d3362"),
        ("Oporto", "#c64737"),
        ("Other Region", "black"),
    )
)

# Divisor applied to the size column when computing scatter-plot circle radii.
dwarf_constant = 10

# Figure size passed to figure(plot_width=..., plot_height=...).
width = height = 700

# Inner and outer radius of the annular chart, and the span between them.
inner_radius, outer_radius = 90, 300 - 10
delta = outer_radius - inner_radius

# Linear map r -> a*r + b that sends maxr to inner_radius and minr to outer_radius;
# used to place the concentric guide circles.
minr, maxr = sqrt(log(0.01*1E4)), sqrt(log(2000 * 1E4))
a = delta / (minr - maxr)
b = inner_radius - a * maxr

def rad(mic, max_vals):
    """Scale a value linearly onto the chart's radial band.

    Returns ``inner_radius`` for 0 and ``inner_radius + delta`` when
    ``mic == max_vals``. Works element-wise when both arguments are
    array-like (the caller passes pandas Series).
    """
    scaled = delta * mic
    return inner_radius + scaled / max_vals

def circle_renderer(df, categorical_column, columns_to_drop):
    """Build a circular ("Burtin"-style) chart of per-category column means.

    ``columns_to_drop`` is removed from ``df``, the rest is grouped by
    ``categorical_column`` and averaged. Every remaining column gets one grey
    sector of the ring; inside each sector three thin wedges show the mean for
    the groups keyed 1, 2 and 3, scaled by ``rad`` against the largest group
    mean of that column.

    NOTE(review): the group keys 1/2/3 are hard-coded and coloured as
    Lisbon/Oporto/Other Region — confirm this matches the coding of the
    categorical column.

    Returns the bokeh figure.
    """
    # Mean of each remaining column for every value of the categorical column.
    category_means = (
        df.drop(columns_to_drop, axis=1)
        .groupby(categorical_column)
        .aggregate('mean')
    )
    n_sectors = len(category_means.columns)
    big_angle = 2.0 * np.pi / (n_sectors)
    small_angle = big_angle / 7

    p = figure(tools=TOOLS, plot_width=width, plot_height=height, title="Burtin",
        x_axis_type=None, y_axis_type=None,
        x_range=(-420, 420), y_range=(-420, 420),
        min_border=0, outline_line_color="black",
        background_fill_color="#f0e1d2", border_fill_color="#f0e1d2",
        toolbar_sticky=False)
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None

    # One grey background sector per column.
    angles = np.pi/2 - big_angle/2 - np.arange(0, n_sectors)*big_angle
    sector_starts = -big_angle + angles
    p.annular_wedge(
        0, 0, inner_radius, outer_radius, sector_starts, angles,
        color=["#aeaeb8"] * n_sectors,
    )

    # Thin wedges: (group key, region colour name, slot inside the sector).
    max_vals = category_means.max()
    wedge_specs = ((1, 'Lisbon', 5), (2, 'Oporto', 3), (3, 'Other Region', 1))
    for group_key, region_name, slot in wedge_specs:
        p.annular_wedge(0, 0, inner_radius, rad(category_means.loc[group_key], max_vals),
                        sector_starts + slot*small_angle,
                        sector_starts + (slot + 1)*small_angle,
                        color=region_color[region_name])

    # Circular axes and labels.
    # NOTE(review): these rings are placed with a sqrt(log) mapping while the
    # wedges above use the linear rad() scale — confirm the labels are meant
    # to describe the wedge lengths.
    labels = np.power(10.0, np.arange(-3, 4))
    radii = a * np.sqrt(np.log(labels * 1E4)) + b
    p.circle(0, 0, radius=radii, fill_color=None, line_color="white")
    ring_text = ['$'+str((1/r)*1000) for r in labels[2:]]
    p.text(0, radii[2:], ring_text,
           text_font_size="8pt", text_align="center", text_baseline="middle")

    # Radial axes: zero-width wedges drawn on the sector boundaries.
    p.annular_wedge(0, 0, inner_radius, outer_radius,
                    sector_starts, sector_starts, color="black")

    # Column-name labels, placed on the outermost ring at each sector's mid angle.
    mid_angles = -big_angle/2 + angles
    xr = radii[0]*np.cos(mid_angles)
    yr = radii[0]*np.sin(mid_angles)
    # Flip labels on the left side by pi so they are easier to read.
    label_angle = np.where(mid_angles < -np.pi/2, mid_angles + np.pi, mid_angles)
    p.text(xr, yr, category_means.columns, angle=label_angle,
           text_font_size="9pt", text_align="center", text_baseline="middle")

    # Legend in the centre: one colour swatch and one name per region.
    legend_y = [18, 0, -18]
    p.rect([-40, -40, -40], legend_y, width=30, height=13,
           color=list(region_color.values()))
    p.text([-15, -15, -15], legend_y, text=list(region_color),
           text_font_size="9pt", text_align="left", text_baseline="middle")
    return p

def _scatter_series(df, x_column, y_column, size_column, take_log):
    """Return ``(x, y, radii)`` for the scatter plot.

    ``radii`` is ``None`` when ``size_column`` is the string ``'None'``.
    With ``take_log`` the natural log of every column is used and radii are
    divided by ``dwarf_constant * mean(log x)``; otherwise radii are the raw
    size column divided by ``dwarf_constant``.
    """
    wants_radius = size_column != 'None'
    if take_log:
        x = np.log(df[x_column])
        y = np.log(df[y_column])
        radii = np.log(df[size_column])/(dwarf_constant*x.mean()) if wants_radius else None
    else:
        x = df[x_column]
        y = df[y_column]
        radii = df[size_column]/dwarf_constant if wants_radius else None
    return x, y, radii


def _point_colors(x, y, flag_missing_rows):
    """Return a Series of "blue"/"red" aligned with ``x.index``.

    A point is red when either coordinate is NaN. When ``flag_missing_rows``
    is true, the rows at the positions in ``missing_val_indices`` are red too
    (they may have been imputed and therefore no longer be NaN).
    """
    colors = pd.Series(np.where(x.isnull() | y.isnull(), "red", "blue"), index=x.index)
    if flag_missing_rows:
        colors.iloc[missing_val_indices] = "red"
    return colors


def scatter_plots(df, column1, column2, radius_column, take_log=False):
    """Build the interactive scatter plot and its control widgets.

    Parameters
    ----------
    df : DataFrame shown initially.
    column1, column2 : column names plotted on the x and y axis.
    radius_column : column that drives the circle radius; must be a real
        column for the initial plot (the 'None' choice is only available
        through the Size selector afterwards).
    take_log : plot the natural log of the columns; also sets the initial
        state of the "Log normalize" toggle.

    Returns
    -------
    (layout, figure, glyph_renderer) : the row layout holding widgets and
        figure, the figure itself, and the scatter renderer whose data source
        is replaced on every widget change.
    """
    # Main function for interaction with the scatter plot.
    # Bokeh calls on_change callbacks as (attr, old, new); the arguments are
    # ignored because the current state is read from the widgets directly.
    def update(attr, old, new):
        frame = dataframe_dict[dataset.value]
        use_log = bool(toggle_log_normalization.active)
        x, y, radii = _scatter_series(frame, xaxis.value, yaxis.value, size.value, use_log)

        # 'Drop NAs' no longer contains the rows with missing values, so the
        # missing/normal split and the red flagging do not apply to it.
        has_all_rows = dataset.value != 'Drop NAs'
        if color.value == 'missing' and has_all_rows:
            selected = missing_val_indices
        elif color.value == 'normal' and has_all_rows:
            selected = (x.index.difference(missing_val_indices)).values
        else:
            selected = x.index.values

        # Colours are aligned on the frame's index, so selecting by label also
        # works for 'Drop NAs', whose labels run past its number of rows.
        colors = _point_colors(x, y, has_all_rows)

        p.title.text = xaxis.value+" vs "+yaxis.value +" scatter plot (radius of circles="+size.value+")"
        p.xaxis.axis_label = xaxis.value
        p.yaxis.axis_label = yaxis.value

        # Replace the data in a single assignment so the columns never have
        # different lengths between two partial updates. With Size == 'None'
        # the previous radius column is kept, as before.
        new_data = dict(p_s.data_source.data)
        new_data['x'] = x.loc[selected].values
        new_data['y'] = y.loc[selected].values
        if radii is not None:
            new_data['radius'] = radii.loc[selected].values
        new_data['fill_color'] = list(colors.loc[selected])
        p_s.data_source.data = new_data

    # Initial plot; uses the same scaling as update() so that toggling a
    # widget back to its initial value reproduces the initial picture.
    x, y, radii = _scatter_series(df, column1, column2, radius_column, take_log)
    colors = list(_point_colors(x, y, True))
    p = figure(tools=TOOLS, plot_width=width, plot_height=height, title=column1+" vs "+column2 +" scatter plot (radius of circles="+radius_column+")")
    p_s = p.scatter(x, y, radius=radii,
          fill_color=colors, fill_alpha=0.6, line_color=None)
    p.xaxis.axis_label = column1
    p.yaxis.axis_label = column2
    hover = HoverTool(tooltips=[
    ("index", "$index"),
    ("(x,y)", "($x, $y)"),
    ])
    p.add_tools(hover)

    dataset = Select(title='Dataset', value=imputing_data_approaches[0], options=imputing_data_approaches)
    dataset.on_change('value', update)

    # The first two columns are not offered as plot axes.
    plottable = list(columns[2:].values)
    xaxis = Select(title='X-Axis', value=column1, options=plottable)
    xaxis.on_change('value', update)

    yaxis = Select(title='Y-Axis', value=column2, options=plottable)
    yaxis.on_change('value', update)

    size = Select(title='Size', value=radius_column, options=['None'] + plottable)
    size.on_change('value', update)

    color = Select(title='Imputed Values selection', value="all", options=["missing", "normal", "all"])
    color.on_change('value', update)

    # Only one button exists (index 0); its initial state mirrors take_log.
    toggle_log_normalization = CheckboxButtonGroup(labels=["Log normalize"], active=[0] if take_log else [])
    toggle_log_normalization.on_change('active', update)

    controls = widgetbox([dataset, xaxis, yaxis, size, color, toggle_log_normalization], width=250)
    layout = row([controls, p])
    return layout, p, p_s

def change_scatter_plot(handle_number, dataset=imputing_data_approaches[0], x_value='Fresh', y_value='Milk', size_value='Grocery', take_log=False):
    """Redraw the scatter plot for the given selection and push it to the notebook.

    Looks up ``dataset`` in ``dataframe_dict``, recomputes coordinates, radii
    (skipped when ``size_value`` is ``'None'``) and colours, writes them into
    the existing scatter renderer and calls ``push_notebook`` with
    ``handle_number``.

    NOTE(review): ``scatter_figure`` and ``scatter_plot`` are read as globals
    but are not assigned at module scope in the visible cells — confirm they
    exist before enabling the interact() call that uses this function.
    """
    frame = dataframe_dict[dataset]
    has_size = size_value != 'None'
    if take_log:
        x = np.log(frame[x_value])
        y = np.log(frame[y_value])
        if has_size:
            radii = np.log(frame[size_value])/(dwarf_constant*x.mean())
    else:
        x, y = frame[x_value], frame[y_value]
        if has_size:
            radii = frame[size_value]/dwarf_constant

    # Red marks a NaN coordinate; everything else starts out blue.
    point_colors = np.array(
        ["red" if (math.isnan(xv) or math.isnan(yv)) else "blue" for xv, yv in zip(x, y)]
    )
    # The 'Drop NAs' frame no longer holds the rows at missing_val_indices.
    if dataset != imputing_data_approaches[2]:
        point_colors[missing_val_indices] = "red"
    point_colors = list(point_colors)

    scatter_figure.title.text = x_value+" vs "+y_value +" scatter plot (radius of circles="+size_value+")"
    scatter_figure.xaxis.axis_label = x_value
    scatter_figure.yaxis.axis_label = y_value

    source_data = scatter_plot.data_source.data
    source_data['x'] = x
    source_data['y'] = y
    if has_size:
        source_data['radius'] = radii
    source_data['fill_color'] = point_colors
    push_notebook(handle=handle_number)
  


Missing values index array: [ 75 172 180 225 274 370]


In [13]:
def make_doc(doc):
    """Populate a Bokeh document with the circular chart above the scatter plot.

    Both charts start from the 'Original' dataset; the scatter plot shows
    Milk against Fresh, sized by Grocery, with log scaling enabled.
    """
    original = dataframe_dict[imputing_data_approaches[0]]
    burtin_chart = circle_renderer(original, "Region", "Channel")
    # Only the layout is needed here; the figure and renderer are not used.
    scatter_layout, scatter_figure, scatter_plot = scatter_plots(original, "Milk", "Fresh", "Grocery", True)
    doc.add_root(column([burtin_chart, scatter_layout]))
    doc.title = "Missing Data"
    
# Wrap make_doc in a Bokeh Application so each document is built by calling it.
app = Application(FunctionHandler(make_doc))
# NOTE(review): `doc` is not referenced again in the visible cells — confirm
# whether this extra document is needed.
doc = app.create_document()
# Display the application in the notebook; the returned handle is kept in t1
# (the commented-out interact() call in the next cell refers to it).
t1 = show(app, notebook_handle=True)

In [14]:
# Use this ipywidgets selector when the notebook is run through nbviewer; otherwise the default widgets above will work.
# interact(change_scatter_plot, handle_number=fixed(t1), dataset=imputing_data_approaches, x_value=list(columns[2:].values), y_value=list(columns[2:].values), size_value=['None'] + list(columns[2:].values), take_log=True)

