In [1]:
import numpy as np
import pandas as pd

from bokeh.layouts import column, row
from bokeh.io import output_notebook, push_notebook
from bokeh.palettes import Category20
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, FuncTickFormatter

from ipywidgets import interact, Dropdown, HBox, VBox, SelectMultiple, Button
from IPython.display import display

from helper_functions import *

# Configure Bokeh to render its output inline in this notebook; must run
# before any show() call below.
output_notebook()

In [2]:
# Load the raw, anonymised nutrition survey. The cells below add derived
# columns to this frame in place.
all_data = pd.read_csv("Datasets/nutrition_raw_anonymized_data.csv")

# DataFrame.shape already is (row count, column count); there is no need to
# copy the whole index / column labels into lists just to measure them.
num_rows, num_cols = all_data.shape

# Column labels as a plain list (snapshot taken before derived columns exist).
all_columns = list(all_data.columns)

In [3]:
# Group the column labels by dtype; `temp_dict` re-keys the groups by the
# dtype's name (e.g. 'int64') so they can be looked up with a string.
g = all_data.columns.to_series().groupby(all_data.dtypes).groups
temp_dict = {k.name: v for k, v in g.items()}

# NOTE: despite its name, `object_columns` holds the *int64* columns. The name
# is kept because other cells may refer to it. `.get` keeps the cell usable
# (empty result) when the frame has no int64 column at all.
object_columns = list(temp_dict.get('int64', []))

# Distinct values per column. nunique(dropna=False) gives the same count as
# len(series.unique()) without materialising the unique values as a list.
num_unique_elems = [all_data[col].nunique(dropna=False) for col in object_columns]

# Positions (within `object_columns`) of columns with exactly two distinct
# values, i.e. the candidates for binary flags.
binary_cols = [i for i, j in enumerate(num_unique_elems) if j == 2]
print([object_columns[i] for i in binary_cols])

['BREAKFASTSANDWICHQUAN', 'BUNSQUAN', 'PEACHESQUAN', 'DRIEDFRUITQUAN', 'MEATSUBSTITUTESQUAN', 'OTHERSOUPQUAN', 'EATMEAT', 'VARIETYMEATQUAN', 'EATFISH', 'FRIEDORBREADEDFISHQUAN', 'CEREALBARSQUAN', 'PUMPKINPIEFREQ', 'PUDDINGQUAN', 'SAUCEICECREAMQUAN', 'MILKQUAN', 'MEALREPLACEMENTDRINKSQUAN', 'DECAFCOFFEETYPE', 'REGULARCOFFEETYPE', 'BOTHKINDSCOFFEETYPE', 'SUGARINCOFFEE', 'DECAFHOTTEATYPE', 'REGULARHOTTEATYPE', 'BOTHKINDSHOTTEATYPE', 'CREAMINTEA', 'SUGARINTEA', 'ALLBRANORIGTYPE', 'APPLEJACKSTYPE', 'CAPNCRUNCHTYPE', 'CHEERIOSPLAINTYPE', 'CHEERIOSHONNUTTYPE', 'CHEXWHEATTYPE', 'CHEXOTHERTYPE', 'CINNTOASTCRTYPE', 'COCOAKRISPIESTYPE', 'CORNFLAKESTYPE', 'FIBERONETYPE', 'FROSTEDFLAKESTYPE', 'FROSTEDMINIWHEATSTYPE', 'GRANOLATYPE', 'GRAPENUTSTYPE', 'HONBUNCHOATSTYPE', 'KASHIGOLNORHR2HRTYPE', 'LIFETYPE', 'OATSQUARESTYPE', 'RAISINBRANTYPE', 'SHREDDEDWHEATTYPE', 'SPECIALKPLAINTYPE', 'SPECIALKFLAVSTYPE', 'OTHERUNSWEETCEREALTYPE', 'OTHERWHOLEGRAINCEREALTYPE', 'OTHERFIBERCEREALTYPE', 'DONTEATORDONTKNOWCE

In [4]:
# Derived categorical columns, produced by the helper_functions encoders.
# Each entry reads: (new column, encoder, source columns, remaining encoder
# arguments). Entries are applied strictly in this order, so the resulting
# column order of `all_data` is the order of this table.
_derived_column_specs = [
    ('bool_disease',       makeBoolColumn, ['cancer', 'diabetes', 'heart_disease'], ('Yes',)),
    ('type_smoking',       makeTypeColumn, ['smoke_rarely', 'smoke_often'], ('Yes', False)),
    ('bool_smoking',       makeBoolColumn, ['smoke_rarely', 'smoke_often'], ('Yes',)),
    ('type_hand',          makeTypeColumn, ['left_hand', 'right_hand'], ('Yes', True)),
    ('type_pisa',          makeTypeColumn, ['readingMath', 'mathReading'], ('Yes', False)),
    ('type_cable',         makeTypeColumn, ['unfavCable', 'neutralCable', 'favCable'], ('Yes', True)),
    ('type_crash',         makeTypeColumn, ['noCrash', 'uhCrash', 'yesCrash'], ('Yes', True)),
    ('type_pet',           makeTypeColumn, ['cat', 'dog'], ('Yes', False)),
    ('bool_pet',           makeBoolColumn, ['cat', 'dog'], ('Yes',)),
    ('bool_belly',         makeBoolColumn, ['belly'], ('Outie',)),
    ('bool_hist_smoked',   makeBoolColumn, ['ever_smoked'], ('Yes',)),
    ('bool_rash',          makeBoolColumn, ['rash'], ('Yes',)),
    ('type_race',          makeTypeColumn, ['LATINO', 'WHITE', 'BLACK', 'ASIAN', 'NATIVEAMER', 'HAWAIIAN'], (1, False)),
    ('bool_cancer',        makeBoolColumn, ['cancer'], ('Yes',)),
    ('bool_diabetes',      makeBoolColumn, ['diabetes'], ('Yes',)),
    ('bool_heart_disease', makeBoolColumn, ['heart_disease'], ('Yes',)),
]

for _new_column, _encoder, _source_columns, _encoder_args in _derived_column_specs:
    # The encoder sees the frame as it stands, including columns added by
    # earlier entries, exactly as in a sequence of one-by-one assignments.
    all_data[_new_column] = pd.Series(_encoder(all_data, _source_columns, *_encoder_args))

# Display label for every integer code of each derived column:
# label_dict[column][code] -> text shown on axes, legends and in the widgets.
label_dict = {
    'bool_disease': {0: 'No Disease', 1: 'Diseased'},
    'type_smoking': {0: 'No Smoking', 1: 'Smoke Rarely', 2: 'Smoke Often'},
    'bool_smoking': {0: 'No Smoking', 1: 'Smoking'},
    'type_hand': {0: 'Left-Handed', 1: 'Right-Handed'},
    'type_pisa': {0: 'Science', 1: 'Reading', 2: 'Math'},
    'type_cable': {0: 'Not Favourite-Cable', 1: 'Neutral-Cable', 2: 'Favourite-Cable'},
    'type_crash': {0: 'No Crash', 1: 'Uh Crash', 2: 'Yes Crash'},
    'type_pet': {0: 'No Pet', 1: 'Cat', 2: 'Dog'},
    'bool_pet': {0: 'No Pet', 1: 'Has Pet'},
    'bool_belly': {0: 'Innie', 1: 'Outie'},
    'bool_hist_smoked': {0: 'Never Smoked', 1: 'Has Smoked'},
    'bool_rash': {0: 'No Rash', 1: 'Has Rash'},
    'type_race': {
        0: 'Not Specified',
        1: 'Latino',
        2: 'White',
        3: 'Black',
        4: 'Asian',
        5: 'Native American',
        6: 'Hawaiian',
    },
    'bool_cancer': {0: 'No Cancer', 1: 'Cancer'},
    'bool_diabetes': {0: 'No Diabetes', 1: 'Diabetes'},
    'bool_heart_disease': {0: 'No Heart Disease', 1: 'Heart Disease'},
}

# Caption shown in the dropdowns -> name of the derived column it selects.
# Insertion order matters: the first three values become the default x, y and
# target selections.
cat_cat_column_names_dict = dict([
    # yes/no flags
    ('(Boolean) Disease', 'bool_disease'),
    ('(Boolean) Cancer', 'bool_cancer'),
    ('(Boolean) Diabetes', 'bool_diabetes'),
    ('(Boolean) Heart Disease', 'bool_heart_disease'),
    ('(Boolean) Currently Smoking', 'bool_smoking'),
    ('(Boolean) Historical Smoking', 'bool_hist_smoked'),
    ('(Boolean) Pets', 'bool_pet'),
    ('(Boolean) Belly', 'bool_belly'),
    ('(Boolean) Rash', 'bool_rash'),
    # multi-valued categories
    ('(Type of) Smoking', 'type_smoking'),
    ('(Type of) Handedness', 'type_hand'),
    ('(Type of) Pisa Best Score', 'type_pisa'),
    ('(Type of) Cable Favoritism', 'type_cable'),
    ('(Type of) Crash', 'type_crash'),
    ('(Type of) Pet', 'type_pet'),
    ('(Type of) Race', 'type_race'),
])

In [5]:
# Colour per category code 0..8: every second entry (indices 0, 2, ..., 16)
# of the 20-colour Category20 palette.
color_dict = dict(enumerate(Category20[20][0:17:2]))

In [6]:
# Initial plot configuration: the first three columns of the caption mapping
# are used as x axis, y axis and target variable respectively.
_selectable_columns = list(cat_cat_column_names_dict.values())

cat_cat_params_dict = dict(
    x_name=_selectable_columns[0],
    y_name=_selectable_columns[1],
    target_name=_selectable_columns[2],
    radius_scale=1,
)

# Start with every label of the target variable selected.
_initial_targets = list(label_dict[cat_cat_params_dict['target_name']].values())

cat_cat_data_dict = makeDataDictForScatter(
    cat_cat_params_dict,
    _initial_targets,
    all_data,
    color_dict,
    label_dict,
)

# Data source shared between the glyph renderer and the widget callbacks.
cat_cat_CDS = ColumnDataSource(data=cat_cat_data_dict)

In [7]:
# One dropdown per plot parameter; all three offer the same caption -> column
# mapping and start on the value stored in cat_cat_params_dict.
cat_cat_x_select, cat_cat_y_select, cat_cat_target_select = [
    Dropdown(
        options=cat_cat_column_names_dict,
        value=cat_cat_params_dict[_param_key],
        description=_caption,
    )
    for _param_key, _caption in (
        ('x_name', "X Data"),
        ('y_name', "Y Data"),
        ('target_name', "Target Variable"),
    )
]

# Multi-select listing the labels of the current target variable, with all of
# them selected initially.
_target_labels = list(label_dict[cat_cat_target_select.value].values())
cat_cat_mult_select = SelectMultiple(
    options=_target_labels,
    value=list(_target_labels),
    description="Selected Targets",
)

# Layout: the three dropdowns side by side, the multi-select underneath.
_dropdown_row = HBox([cat_cat_x_select, cat_cat_y_select, cat_cat_target_select])
cat_cat_options_box = VBox([_dropdown_row, cat_cat_mult_select])

def catCatAxesCallback(change):
    """Rebuild the scatter plot after the X or Y dropdown changes.

    Copies both dropdown values into ``cat_cat_params_dict``, regenerates the
    data of ``cat_cat_CDS`` for the currently selected targets, replaces the
    categorical factors of both axes with the labels of the newly chosen
    columns and pushes the update to the notebook.

    Parameters
    ----------
    change : dict-like
        Change notification passed by the widget; anything whose ``"type"``
        is not ``"change"`` is ignored.
    """
    if change["type"] != "change":
        return

    cat_cat_params_dict.update(
        x_name=cat_cat_x_select.value,
        y_name=cat_cat_y_select.value,
    )

    cat_cat_CDS.data = makeDataDictForScatter(
        cat_cat_params_dict, cat_cat_mult_select.value, all_data, color_dict, label_dict
    )

    # The axes are categorical, so their factors must follow the new columns.
    for range_attr, param_key in (('x_range', 'x_name'), ('y_range', 'y_name')):
        axis_labels = list(label_dict[cat_cat_params_dict[param_key]].values())
        getattr(cat_cat_figure, range_attr).factors = axis_labels

    push_notebook()
        
def catCatTargetSelectCallback(change):
    """React to a new target variable chosen in the target dropdown.

    Stores the chosen column in ``cat_cat_params_dict``, repopulates the
    multi-select with that column's labels (all of them selected), regenerates
    the data of ``cat_cat_CDS`` and pushes the update to the notebook.

    Parameters
    ----------
    change : dict-like
        Change notification passed by the widget; anything whose ``"type"``
        is not ``"change"`` is ignored.
    """
    if change["type"] != "change":
        return

    target_column = cat_cat_target_select.value
    cat_cat_params_dict['target_name'] = target_column

    # Offer the labels of the new target and select every one of them.
    # NOTE(review): assigning options/value presumably also fires the
    # multi-select's own value observer, so the data source may be rebuilt
    # more than once per target change - confirm.
    target_labels = list(label_dict[target_column].values())
    cat_cat_mult_select.options = target_labels
    cat_cat_mult_select.value = list(target_labels)

    cat_cat_CDS.data = makeDataDictForScatter(
        cat_cat_params_dict, cat_cat_mult_select.value, all_data, color_dict, label_dict
    )

    push_notebook()
        

def catCatMultSelectCallback(change):
    """Redraw the plot for the target labels currently picked in the multi-select.

    Regenerates the data of ``cat_cat_CDS`` from the current parameters and
    selection, then pushes the update to the notebook.

    Parameters
    ----------
    change : dict-like
        Change notification passed by the widget; anything whose ``"type"``
        is not ``"change"`` is ignored.
    """
    if change["type"] != "change":
        return

    selected_targets = cat_cat_mult_select.value
    cat_cat_CDS.data = makeDataDictForScatter(
        cat_cat_params_dict, selected_targets, all_data, color_dict, label_dict
    )
    push_notebook()
        
    
# Hook every widget's `value` trait up to its handler. Both axis dropdowns
# share one handler because it always re-reads x and y together.
for _widget, _handler in (
    (cat_cat_x_select, catCatAxesCallback),
    (cat_cat_y_select, catCatAxesCallback),
    (cat_cat_target_select, catCatTargetSelectCallback),
    (cat_cat_mult_select, catCatMultSelectCallback),
):
    _widget.observe(_handler, names="value")

In [8]:
# Both axes are categorical: their factors are the display labels of the
# columns currently selected for X and Y.
_x_factors = list(label_dict[cat_cat_params_dict['x_name']].values())
_y_factors = list(label_dict[cat_cat_params_dict['y_name']].values())

cat_cat_figure = figure(
    title="Category-Category Scatter Plot",
    plot_width=500,
    plot_height=450,
    x_range=_x_factors,
    y_range=_y_factors,
)

# Position, colour, size and legend entry all come from columns of cat_cat_CDS,
# so replacing the source's data is enough to redraw the plot.
_circle_style = dict(
    fill_color='color_data',
    line_color='color_data',
    fill_alpha=0.8,
    radius='radius_data',
    legend='legend_data',
)
cat_cat_figure.circle(x='x_data', y='y_data', source=cat_cat_CDS, **_circle_style)

# notebook_handle=True makes show() return a handle, which is what lets the
# push_notebook() calls in the widget callbacks update the rendered plot.
show(cat_cat_figure, notebook_handle=True)

In [9]:
# Render the control panel: the three dropdowns in a row with the target
# multi-select underneath. Changing any of them updates the plot shown above.
display(cat_cat_options_box)