# EDA for Instacart analysis

Includes code to generate all JavaScript visualizations used in my [first](https://p-mckenzie.github.io/2017/12/12/instacart-part-1/ "Instacart Part 1 - Feature Engineering") and [second](https://p-mckenzie.github.io/2017/12/12/instacart-part-2/ "Instacart Part 2 - Modeling") blog posts.

In [1]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.layouts import column, row
from bokeh.models import HoverTool, ColumnDataSource, Label
from bokeh.embed import components
from bokeh.transform import factor_cmap
from bokeh.models.widgets import Panel, Tabs

import numpy as np
import pandas as pd

import matplotlib
from matplotlib.cm import get_cmap
import matplotlib.pyplot as plt
output_notebook()

# Post 1
## Visualizations on engineered features

In [2]:
# Load the engineered feature table, using the CSV's first column as the row
# index, then preview the first five rows.
train = pd.read_csv('x_train.csv', index_col=0)
train.head(n=5)

Unnamed: 0,user_id,order_id,product_id,target,avg_order_size,prev_ord_size,avg_days_between_orders,num_orders_placed,reordered_usr_avg,overall_avg_prod_disp,...,usr_avg_aisle_disp,usr_avg_dept_disp,usr_avg_prod_disp,prod_due_overall_perc,prod_due_user_perc,aisle_due_overall_perc,aisle_due_user_perc,dept_due_overall_perc,dept_due_user_perc,reorder_custom
0,1,1187899,196,1,5.9,9.0,19.56,11,0.695,66.272329,...,19.0,19.0,19.0,0.909,0.909,0.695,0.909,0.909,0.695,1
1,1,1187899,10258,1,5.9,9.0,19.56,11,0.695,56.35544,...,19.44,19.0,19.44,0.818,0.909,0.695,0.818,0.909,0.695,1
2,1,431534,10326,0,5.9,9.0,19.56,11,0.695,57.078752,...,87.5,87.5,97.0,0.182,0.182,0.695,0.182,0.182,0.695,0
3,1,2550362,12427,0,5.9,9.0,19.56,11,0.695,76.984336,...,19.0,19.0,19.0,0.909,0.909,0.695,0.909,0.909,0.695,1
4,1,1187899,13032,1,5.9,9.0,19.56,11,0.695,83.045575,...,58.33,58.33,58.33,0.273,0.273,0.695,0.273,0.273,0.695,1


In [3]:
def purchase_status_histograms(frame, feature, center_label, bins=75):
    """Build stacked 'Bought' / 'Not bought' histograms of one feature.

    Parameters
    ----------
    frame : DataFrame
        Must contain a ``target`` column (1 is plotted as 'Bought', 0 as
        'Not bought') and a numeric column named ``feature``.
    feature : str
        Column to histogram; also used in the title and as the x-axis label.
    center_label : str
        Tooltip caption displayed next to each bar's bin center.
    bins : int
        Number of histogram bins used for each panel.

    Returns
    -------
    A bokeh ``column`` layout with the 'Bought' panel on top of the
    'Not bought' panel.
    """
    hover = HoverTool(tooltips=[
        ("Number of products", "@hist"),
        (center_label, "@center"),
    ])
    tools = [hover, "pan,reset,ywheel_zoom"]

    bought_fig = figure(
        plot_height=300, plot_width=800,
        title="Distribution of {}, separated by purchase status".format(feature),
        tools=tools)
    not_bought_fig = figure(plot_height=300, plot_width=800, tools=tools)

    panels = (
        (bought_fig, 1, '#66c2a5', 'Bought'),
        (not_bought_fig, 0, '#ed7953', 'Not bought'),
    )
    for panel, target_value, fill, legend_text in panels:
        values = frame[frame['target'] == target_value][feature].values
        hist, edges = np.histogram(values, bins=bins)
        # Each panel is binned independently, so the tooltip centers must be
        # computed from that panel's own edges.
        source = ColumnDataSource(data=dict(
            hist=hist,
            left=edges[:-1],
            right=edges[1:],
            center=(edges[:-1] + edges[1:]) / 2))
        panel.quad(source=source, top='hist', bottom=0, left='left', right='right',
                   fill_color=fill, legend=legend_text)

    # Only the lower panel carries the x-axis label; only the upper one a title.
    not_bought_fig.xaxis.axis_label = feature
    bought_fig.title.align = "center"
    bought_fig.title.text_font_size = "20px"

    return column(bought_fig, not_bought_fig)


p_perc = purchase_status_histograms(train, 'prod_due_user_perc', "Average percentage")
p_support = purchase_status_histograms(train, 'perc_prod_support', "Average percentage")
p_sep = purchase_status_histograms(train, 'days_since_prod', "Average")
p_ratio = purchase_status_histograms(train, 'prod_dept_ratio', "Average ratio")

show(column(p_perc, p_support, p_sep, p_ratio))

In [4]:
# Export the four feature-distribution layouts as one <script> plus one <div>
# per layout, for embedding in the blog post.
script, div = components([p_perc, p_support, p_sep, p_ratio])
# A single parenthesised string prints identically under the Python 2 print
# statement and the Python 3 print function: space-separated pieces, in order.
print(' '.join([script] + list(div)))


<script type="text/javascript">
  (function() {
    var fn = function() {
      Bokeh.safely(function() {
        (function(root) {
          function embed_document(root) {
            
          var docs_json = '{"c9158e7e-3ff4-4e6e-9c8f-ba27de4a680a":{"roots":{"references":[{"attributes":{"active_drag":"auto","active_inspect":"auto","active_scroll":"auto","active_tap":"auto","tools":[{"id":"9e78d943-1bf6-44aa-acd7-c89949dba18c","type":"HoverTool"},{"id":"c156f9eb-2df8-48e7-ba82-ccc19f39dfd0","type":"PanTool"},{"id":"419f093b-7951-449d-8191-2a24253c5eb2","type":"ResetTool"},{"id":"78d3fc8e-a498-4193-891e-8e60769344fc","type":"WheelZoomTool"}]},"id":"e3004fcc-6e6c-47a2-a3f3-43de9691b661","type":"Toolbar"},{"attributes":{"formatter":{"id":"c64a8d60-0c7f-4df3-948e-5f886f5f39d5","type":"BasicTickFormatter"},"plot":{"id":"5f19d810-35a1-441f-af91-c0def86fcae2","subtype":"Figure","type":"Plot"},"ticker":{"id":"092c85dc-51b1-4baf-bad2-b70a8e5fd0e2","type":"BasicTicker"}},"id":"ff4db281-1bfc

# Post 2
## Visualizations for final analysis

In [5]:
# Stack the prior and train order-product tables, then attach each row's
# order-level columns with a left join on order_id.
orders = pd.read_csv('orders.csv')
# pd.concat replaces DataFrame.append, which was deprecated and later removed
# from pandas; with ignore_index=True the result is the same 0..n-1 indexed
# stack of the two frames.
order_products = pd.concat(
    [pd.read_csv('order_products__prior.csv'),
     pd.read_csv('order_products__train.csv')],
    ignore_index=True)
all_orders = order_products.merge(orders, how='left', on='order_id')
# Drop the inputs once merged so only the combined frame stays in memory.
del orders, order_products
all_orders.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


## Average order size by user

In [6]:
# Histogram of each user's mean order size (distinct products per order),
# with bars coloured along the Spectral colormap by rounded bin center.
hover = HoverTool(tooltips=[("Number of Users", "@hist"), ("Avg Order Size", "@center")])

p_size1 = figure(plot_height=250, plot_width=800,
                 title="Distribution of Average Order Size",
                 tools=[hover, "ywheel_zoom,pan,reset"])

# Distinct products per (user, order), flattened to user_id / count columns.
order_sizes = (
    all_orders.groupby(['user_id', 'order_id'])['product_id']
    .nunique()
    .reset_index()
    .rename(columns={'product_id': 'count'})[['user_id', 'count']]
)
avg_order_size = order_sizes.groupby('user_id')['count'].mean().values

hist, edges = np.histogram(avg_order_size, bins=75)
center = np.round((edges[:-1] + edges[1:]) / 2)

# Normalize() rescales the centers before the colormap lookup; the resulting
# RGBA rows are scaled to 0-255 and rendered as hex strings (alpha dropped).
rgba = 255 * matplotlib.cm.Spectral(matplotlib.colors.Normalize()(center))
colors = ['#{:02x}{:02x}{:02x}'.format(int(r), int(g), int(b)) for r, g, b, _ in rgba]

source = ColumnDataSource(data=dict(
    hist=hist,
    left=edges[:-1],
    right=edges[1:],
    center=center,
    colors=colors))

p_size1.quad(source=source, top='hist', bottom=0, left='left', right='right', fill_color='colors')

p_size1.xaxis.axis_label = 'Average Order Size'
p_size1.yaxis.axis_label = 'Number of Users'

show(p_size1)

## Previous Order Size by User

In [7]:
# Histogram of previous order size, one value per user, with bars coloured
# along the Spectral colormap by rounded bin center.
hover = HoverTool(tooltips=[
    ("Number of Users", "@hist"),
    ("Prev Order Size", "@center")
    ])

p_size2 = figure(plot_height=250, plot_width=800, title="Distribution of Previous Order Size",tools=[hover, "ywheel_zoom,pan,reset"])

# `train` was loaded from this same x_train.csv at the top of the notebook, so
# reuse it instead of parsing the file a second time. Only the first row per
# user is kept, exactly as before.
prev_order_sizes = train.drop_duplicates('user_id')['prev_ord_size'].values
hist, edges = np.histogram(prev_order_sizes, bins=75)
center = np.round((edges[:-1]+edges[1:])/2)
# Colormap lookup on the normalized centers, rendered as hex strings (alpha dropped).
colors = ["#%02x%02x%02x" % (int(r), int(g), int(b)) for r, g, b, _ in 255*matplotlib.cm.Spectral(matplotlib.colors.Normalize()(center))]
source = ColumnDataSource(data=dict(hist=hist, left=edges[:-1], right=edges[1:],
                                    center=center,
                                    colors=colors))

p_size2.quad(source=source, top='hist', bottom=0, left='left', right='right', fill_color='colors')

p_size2.xaxis.axis_label = 'Previous Order Size'
p_size2.yaxis.axis_label = 'Number of Users'

show(p_size2)

# Cross-validation results

## MLP

In [8]:
# MLP tuning results: test_scores[k] belongs to the configuration
# (units[k], layers[k]).
test_scores = [
    0.273223238, 0.268352593, 0.268273315, 0.268010023, 0.26852826,
    0.273200709, 0.267458391, 0.273462352, 0.268104988, 0.268226137,
    0.267937949, 0.268119689, 0.267529202, 0.269416292, 0.267846049,
    0.267441873, 0.2728082582, 0.268245887248, 0.268024671953, 0.267624619082,
]
units = [10, 20, 30, 40, 30, 10, 40, 10, 20, 40, 40, 30, 50, 10, 60, 60, 10, 30, 40, 50]
layers = [10, 20, 30, 40, 10, 30, 10, 40, 40, 20, 30, 40, 10, 50, 10, 20, 5, 5, 5, 5]

# Tooltip text: every score formatted to four decimals.
disp2 = [format(score, '.4f') for score in test_scores]

# Marker size: min-max normalised score stretched to 1..121, then capped at 18.
score_min, score_max = min(test_scores), max(test_scores)
scale = [min((score - score_min) / (score_max - score_min) * 120 + 1, 18)
         for score in test_scores]

source_mlp = ColumnDataSource(data=dict(
    units=units,
    layers=layers,
    test_scores=test_scores,
    scale=scale,
    disp2=disp2))

In [9]:
from bokeh.models import Label

# Scatter of the MLP tuning grid: one circle per (units, layers) pair, sized by
# the `scale` column of source_mlp, plus a translucent square on one highlighted
# configuration.
hover = HoverTool(tooltips=[
    ("RMSE", "@disp2"),
    ("Nodes", "@units"),
    ("Layers", "@layers")
    ])

p_mlp = figure(plot_height=600, plot_width=600, x_range=[0,70], y_range=[0,60],
           tools=[hover, 'wheel_zoom,pan,reset'], title="MLP Parameter Tuning")

p_mlp.circle(source=source_mlp, x='units', y='layers', size='scale', color='#ab0f45')

# NOTE(review): the highlighted point's tooltip text is hard-coded as '0.2674',
# while source_mlp formats the score for units=40, layers=10 (0.267458391) as
# '0.2675'; the lowest score in the grid belongs to units=60, layers=20.
# Confirm which configuration and value are meant to be highlighted.
babysource_mlp = ColumnDataSource(data=dict(units=[40], layers=[10], disp2=['0.2674'], color=['green']))
p_mlp.square('units', 'layers', size=30, color='color', alpha=0.4, source=babysource_mlp)

p_mlp.xaxis.axis_label = 'Nodes'
p_mlp.yaxis.axis_label = 'Layers'
p_mlp.title.align = "center"
p_mlp.title.text_font_size = "20px"

# The marker sizes come from the same scores that the tooltip labels "RMSE",
# so the footnote names that metric rather than AUC.
citation = Label(x=10, y=495, x_units='screen', y_units='screen',
                 text='* points sized by RMSE', render_mode='css')

p_mlp.add_layout(citation)

show(p_mlp)

## XGBoost

In [10]:
# XGBoost tuning results. test_scores holds 36 values: three consecutive runs
# of 12, one run per learning-rate list below (in the same order), each run
# aligned element-wise with n_tree and depth.
learning_rate = [[0.05] * 12, [0.1] * 12, [0.2] * 12]
n_tree = [40] * 3 + [140] * 3 + [240] * 3 + [340] * 3
depth = [5, 10, 15] * 4

test_scores = [
    0.272747516, 0.270945475, 0.271032318, 0.265477977, 0.264535903, 0.265742237,
    0.265084157, 0.264508324, 0.265742237, 0.264891058, 0.264508324, 0.265742237,
    0.266136178, 0.264920637, 0.266208122, 0.265052263, 0.264704484, 0.266417481,
    0.264751236, 0.264704484, 0.266417481, 0.264689308, 0.264704484, 0.266417481,
    0.265478048, 0.26494173, 0.267652538, 0.264842843, 0.26515672, 0.267652538,
    0.264685497, 0.26515672, 0.267652538, 0.264685497, 0.26515672, 0.267652538,
]

# Tooltip text: every score formatted to four decimals.
disp2 = [format(score, '.4f') for score in test_scores]

# Marker size: min-max normalised over all 36 scores, stretched to 1..121,
# then capped at 18.
score_min, score_max = min(test_scores), max(test_scores)
scale = [min((score - score_min) / (score_max - score_min) * 120 + 1, 18)
         for score in test_scores]


def _xgb_source(run):
    """Return the ColumnDataSource for the `run`-th learning rate (12 rows)."""
    rows = slice(12 * run, 12 * (run + 1))
    rates = learning_rate[run]
    return ColumnDataSource(data=dict(
        learning_rate=rates, n_tree=n_tree, depth=depth, test_scores=test_scores[rows],
        disp2=disp2[rows], scale=scale[rows], disp=[str(r) for r in rates]))


source_xgb1 = _xgb_source(1)
source_xgb2 = _xgb_source(2)
source_xgb05 = _xgb_source(0)

In [11]:
# Scatter of the XGBoost tuning grid for the 0.05 learning-rate run: one circle
# per (depth, n_tree) pair, sized by the `scale` column of source_xgb05, plus a
# translucent square on one highlighted configuration.
hover = HoverTool(tooltips=[
    ("RMSE", "@disp2"),
    ("# of trees", "@n_tree"),
    ("Tree depth", "@depth")
    ])

p_xgb05 = figure(plot_height=600, plot_width=600, y_range=[0, 380], x_range=[0,20],
           tools=[hover, 'wheel_zoom,pan,reset'], title="XGBoost Parameter Tuning")

p_xgb05.circle(source=source_xgb05, x='depth', y='n_tree', size='scale', color='#f46d43')

babysource_xgb = ColumnDataSource(data=dict(depth=[10], n_tree=[240], disp2=['0.2645'], disp=['.05'], learning_rate=[.05], color=['green']))
p_xgb05.square('depth', 'n_tree', size=30, color='color', alpha=0.4, source=babysource_xgb)

p_xgb05.yaxis.axis_label = '# of trees'
p_xgb05.xaxis.axis_label = 'Tree depth'
p_xgb05.title.align = "center"
p_xgb05.title.text_font_size = "20px"

# The marker sizes come from the same scores that the tooltip labels "RMSE",
# so the footnote names that metric rather than AUC.
size = Label(x=10, y=495, x_units='screen', y_units='screen',
             text='* points sized by RMSE', render_mode='css')
p_xgb05.add_layout(size)

rate = Label(x=10, y=5, x_units='screen', y_units='screen',
             text='Learning rate=0.05', render_mode='css')
p_xgb05.add_layout(rate)

show(p_xgb05)

## LightGBM

In [12]:
# LightGBM tuning results: 24 rows, the first 12 at learning rate 0.1 and the
# last 12 at 0.2; all four lists are aligned element-wise.
learning_rate = [0.1] * 12 + [0.2] * 12
n_tree = ([40] * 3 + [140] * 3 + [240] * 3 + [340] * 3) * 2
num_leaves = [20, 40, 80] * 8

test_scores = [
    0.266010643, 0.2654812, 0.265193886, 0.264974643, 0.264730081, 0.264635793,
    0.264766324, 0.264612124, 0.264605123, 0.264647122, 0.264598483, 0.264606057,
    0.265384372, 0.265092002, 0.264933189, 0.264837342, 0.26482876, 0.264864249,
    0.264732721, 0.26482876, 0.264864249, 0.264732721, 0.26482876, 0.264864249,
]

# Tooltip text: every score formatted to four decimals.
disp2 = [format(score, '.4f') for score in test_scores]

# Marker size: min-max normalised over all 24 scores, stretched to 1..121,
# then capped at 18.
score_min, score_max = min(test_scores), max(test_scores)
scale = [min((score - score_min) / (score_max - score_min) * 120 + 1, 18)
         for score in test_scores]

first_run = slice(None, 12)
second_run = slice(12, None)

source_lgb1 = ColumnDataSource(data=dict(
    learning_rate=learning_rate[first_run],
    n_tree=n_tree[first_run],
    num_leaves=num_leaves[first_run],
    test_scores=test_scores[first_run],
    disp2=disp2[first_run],
    scale=scale[first_run],
    disp=[str(r) for r in learning_rate[first_run]]))

source_lgb2 = ColumnDataSource(data=dict(
    learning_rate=learning_rate[second_run],
    n_tree=n_tree[second_run],
    num_leaves=num_leaves[second_run],
    test_scores=test_scores[second_run],
    disp2=disp2[second_run],
    scale=scale[second_run],
    disp=[str(r) for r in learning_rate[second_run]]))

In [13]:
from bokeh.models import LogColorMapper, Ticker, ColorBar
from bokeh.models import LabelSet
from matplotlib.pyplot import get_cmap

# Scatter of the LightGBM tuning grid for the 0.1 learning-rate run: one circle
# per (n_tree, num_leaves) pair, sized by the `scale` column of source_lgb1,
# plus a translucent square on one highlighted configuration.
hover = HoverTool(tooltips=[
    ("RMSE", "@disp2"),
    ("# of trees", "@n_tree"),
    ("# of leaves", "@num_leaves")
    ])

p_lgb1 = figure(plot_height=600, plot_width=600, x_range=[20,360], y_range=[10, 90],
           tools=[hover, 'wheel_zoom,pan,reset'], title="LightGBM Parameter Tuning")

p_lgb1.circle(source=source_lgb1, x='n_tree', y='num_leaves', size='scale', color='#0d0887')

# NOTE(review): the highlighted point's tooltip text is hard-coded as '0.2645',
# while source_lgb1 formats the score for n_tree=340, num_leaves=40
# (0.264598483) as '0.2646'. Confirm which value should be shown.
babysource_lgb = ColumnDataSource(data=dict(n_tree=[340], num_leaves=[40], disp2=['0.2645'], color=['green'], disp=['0.1']))
p_lgb1.square('n_tree', 'num_leaves', size=30, color='color', alpha=0.4, source=babysource_lgb)

p_lgb1.xaxis.axis_label = '# of trees'
p_lgb1.yaxis.axis_label = 'num_leaves'
p_lgb1.title.align = "center"
p_lgb1.title.text_font_size = "20px"

# The marker sizes come from the same scores that the tooltip labels "RMSE",
# so the footnote names that metric rather than AUC.
size = Label(x=10, y=495, x_units='screen', y_units='screen',
                 text='* points sized by RMSE', render_mode='css')

p_lgb1.add_layout(size)

rate = Label(x=10, y=5, x_units='screen', y_units='screen',
             text='Learning rate=0.1', render_mode='css')
p_lgb1.add_layout(rate)

show(p_lgb1)

# AUC Comparison - initial

In [14]:
# Three-point polylines (start, one interior point, end) for the four
# classifiers, drawn as a single multi_line glyph with one legend entry each.
auc_initial = figure(plot_width=600, plot_height=400,
                     tools=['wheel_zoom,pan,reset,'],
                     title='ROC Curves for Classifiers')

x = [
    [0.0, 0.010549844357798795, 1.0],
    [0.0, 0.011759197647638072, 1.0],
    [0.0, 0.011478408276887052, 1.0],
    [0.0, 0.011795112567152736, 1.0],
]
y = [
    [0.0, 0.1614487415592388, 1.0],
    [0.0, 0.17122858965562898, 1.0],
    [0.0, 0.18905499584732605, 1.0],
    [0.0, 0.19128179202927334, 1.0],
]
auc = [0.575449448601, 0.579734696004, 0.588788293785, 0.589743339731]
label = ['Logistic', 'MLP', 'XGBoost', 'LightGBM']
color = ['#0d0887', '#9c179e', '#ed7953', '#f0f921']
# Reversed palette; computed here but not passed to the data source below.
col_inv = color[::-1]
legend = ['{}: AUC {}'.format(name, str(round(score, 4))) for name, score in zip(label, auc)]

# Every per-curve column is reversed except `color`, which keeps its listed order.
source1 = ColumnDataSource(data=dict(
    x=list(reversed(x)),
    y=list(reversed(y)),
    auc=list(reversed(auc)),
    label=list(reversed(label)),
    color=color,
    legend=list(reversed(legend))))

auc_initial.multi_line('x', 'y', source=source1, line_width=2, color='color', legend='legend')

auc_initial.legend.location = "bottom_right"
auc_initial.title.align = "center"
auc_initial.title.text_font_size = "20px"
show(auc_initial)

# AUC Comparison - final

In [15]:
# Three-point polylines (start, one interior point, end) for the four
# regressors, drawn as a single multi_line glyph with one legend entry each,
# plus a footnote label.
auc_final = figure(plot_width=600, plot_height=410,
                   tools=['wheel_zoom,pan,reset,'],
                   title='ROC Curves for Regressors')

x = [
    [0.0, 0.12180512303471928, 1.0],
    [0.0, 0.12087068212880135, 1.0],
    [0.0, 0.11934005355893998, 1.0],
    [0.0, 0.11926430572869086, 1.0],
]
y = [
    [0.0, 0.5151662875094789, 1.0],
    [0.0, 0.52377857220236157, 1.0],
    [0.0, 0.5379638657181719, 1.0],
    [0.0, 0.53865597804499332, 1.0],
]
auc = [0.69668058223737983, 0.70145394503678016, 0.70931190607961603, 0.70969583615815124]
label = ['Linear', 'MLP', 'XGBoost', 'LightGBM']
color = ['#0d0887', '#9c179e', '#ed7953', '#f0f921']
# Reversed palette; computed here but not passed to the data source below.
col_inv = color[::-1]
legend = ['{}: AUC {}'.format(name, str(round(score, 4))) for name, score in zip(label, auc)]

# Every per-curve column is reversed except `color`, which keeps its listed order.
source1 = ColumnDataSource(data=dict(
    x=list(reversed(x)),
    y=list(reversed(y)),
    auc=list(reversed(auc)),
    label=list(reversed(label)),
    color=color,
    legend=list(reversed(legend))))

auc_final.multi_line('x', 'y', source=source1, line_width=2, color='color', legend='legend')

auc_final.legend.location = "bottom_right"
auc_final.title.align = "center"
auc_final.title.text_font_size = "20px"

citation = Label(x=10, y=325, x_units='screen', y_units='screen',
                 text='* Previous order size assumption', render_mode='css')
auc_final.add_layout(citation)

show(auc_final)

# Feature importance comparison

In [16]:
# Feature names, aligned element-wise with the two importance lists below.
labels = [
    'avg_order_size', 'prev_ord_size', 'avg_days_between_orders', 'num_orders_placed',
    'reordered_usr_avg', 'overall_avg_prod_disp', 'overall_avg_aisle_disp',
    'perc_aisle_support', 'overall_avg_dept_disp', 'perc_dept_support', 'avg_ord_pos',
    'days_since_aisle', 'days_since_department', 'days_since_prod',
    'order_aisle_displacement', 'orders_since_prod', 'perc_prod_support',
    'prod_aisle_ratio', 'prod_dept_ratio', 'streak_length', 'usr_avg_aisle_disp',
    'usr_avg_dept_disp', 'usr_avg_prod_disp', 'prod_due_overall_perc',
    'prod_due_user_perc', 'aisle_due_overall_perc', 'aisle_due_user_perc',
    'dept_due_overall_perc', 'dept_due_user_perc', 'reorder_custom',
]

xgboost = [13753, 11916, 11483, 9056, 14336, 12422, 9280, 8025, 5562, 6987,
           10107, 7595, 7325, 7281, 9243, 4867, 7680, 2124, 3632, 3294,
           6580, 7206, 1412, 1150, 1739, 2442, 381, 236, 611, 119]
lightgbm = [801, 581, 644, 459, 942, 644, 425, 281, 296, 262,
            349, 453, 408, 551, 574, 341, 619, 139, 228, 272,
            234, 296, 119, 45, 46, 120, 1, 14, 30, 5]


def _importance_bars(importances, title, bar_color):
    """Return a bar chart of `importances` over `labels`, with the y-axis hidden."""
    chart = figure(plot_width=800, plot_height=410, x_range=labels,
                   tools=['wheel_zoom,pan,reset,'], title=title)
    chart.vbar(x=labels, top=importances, width=0.9, color=bar_color)
    chart.xaxis.major_label_orientation = 1
    chart.yaxis.visible = False
    chart.x_range.range_padding = 0.1
    return chart


feat_xgb = _importance_bars(xgboost, 'XGBoost Feature Importance', '#9c179e')
feat_lgb = _importance_bars(lightgbm, 'LightGBM Feature Importance', '#0d0887')

# One tab per model so the two charts occupy the same space in the post.
tabs = Tabs(tabs=[
    Panel(child=feat_xgb, title='XGBoost'),
    Panel(child=feat_lgb, title='LightGBM'),
])

show(tabs)

# Kaggle submissions

In [17]:
from bokeh.models import Span
from bokeh.models import ColumnDataSource, FactorRange
# Grouped bar chart of submission scores: one group per model, one bar per
# order-size assumption, with a dashed horizontal reference line.
model_names = ['Linear', 'MLP', 'XGB', 'LGB']
assumptions = ['Previous', 'Average']

data = {
    'models': model_names,
    'Previous': [0.3528851, 0.3590081, 0.3680951, 0.3671077],
    'Average': [0.3602763, 0.3655024, 0.3752033, 0.3746257],
}

# Nested factors: [("Linear", "Previous"), ("Linear", "Average"), ("MLP", "Previous"), ...]
x = [(model, assumption) for model in model_names for assumption in assumptions]
# Interleave the two score lists so they line up with the factor order above.
counts = tuple(score
               for pair in zip(data['Previous'], data['Average'])
               for score in pair)

# Alternate the two bar colours across the eight bars.
color = ['#ed7953', '#f0f921'] * 4
source = ColumnDataSource(data=dict(x=x, counts=counts, color=color))

p = figure(x_range=FactorRange(*x), y_range=[.3, .377],
           plot_height=250, title="Submission Scores by Model and Assumption",
           toolbar_location=None, tools=['wheel_zoom,pan,reset,'])

p.vbar(x='x', top='counts', width=0.9, source=source,
       line_color="white", fill_color='color')

p.x_range.range_padding = 0.2
p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
p.title.align = "center"
p.title.text_font_size = "15px"

# Dashed horizontal reference line at y = 0.3118025.
benchmark = Span(location=0.3118025, dimension='width',
                 line_color='black', line_dash='dashed', line_width=2)
p.add_layout(benchmark)

show(p)

In [18]:
# Export every Post 2 figure as one <script> plus one <div> per figure, in the
# order listed, for embedding in the blog post.
script, div = components([auc_initial, p_size1, p_size2, p_mlp, p_xgb05, p_lgb1, auc_final, tabs, p])
# A single parenthesised string prints identically under the Python 2 print
# statement and the Python 3 print function: space-separated pieces, in order.
print(' '.join([script] + list(div)))


<script type="text/javascript">
  (function() {
    var fn = function() {
      Bokeh.safely(function() {
        (function(root) {
          function embed_document(root) {
            
          var docs_json = '{"4e8649e5-301f-451f-b972-016625ce512e":{"roots":{"references":[{"attributes":{"line_alpha":{"value":0.1},"line_color":{"value":"#1f77b4"},"line_width":{"value":2},"xs":{"field":"x"},"ys":{"field":"y"}},"id":"4f21b1e5-5fab-40e7-8509-583406fdab27","type":"MultiLine"},{"attributes":{},"id":"f729b5a9-c730-466c-b863-ce79b029581d","type":"ResetTool"},{"attributes":{"source":{"id":"0dd6542e-47a3-4e89-a4c4-ee087af4b743","type":"ColumnDataSource"}},"id":"4cc0e61c-822c-40d7-8ea9-8bc1fa4579d0","type":"CDSView"},{"attributes":{"label":{"value":"Not bought"},"renderers":[{"id":"23d8009a-9857-477b-a4c0-8fbda3c061a2","type":"GlyphRenderer"}]},"id":"98f4836c-22e7-4c76-b36d-990c667d889d","type":"LegendItem"},{"attributes":{"callback":null},"id":"989f44da-ac29-492d-81f4-32838d4b1c7c","type":