# Setup

In [34]:
from bokeh.io import output_notebook, show
from bokeh.models import CustomJS, ColumnDataSource, Slider, CDSView, IndexFilter
from bokeh.models.widgets import Select
from bokeh.layouts import column, row
from bokeh.plotting import figure
from bokeh.charts import Line, Scatter
import pandas as pd
import numpy as np

# Render Bokeh output inline in the notebook
output_notebook()

# Load the complete data set and the copy that contains missing values
df = pd.read_csv("data.csv")
df_missing = pd.read_csv("data-missing.csv")

# Boolean mask: True for every row of df_missing holding at least one NaN.
# The same mask selects the matching rows from the complete frame.
missing_mask = df_missing.isnull().any(axis=1)
rows_missing = df_missing[missing_mask]
rows_correct = df[missing_mask]

print(rows_missing)
print(rows_correct)

     Channel  Region  Fresh    Milk  Grocery  Frozen  Detergents_Paper  \
75         1       3  20398  1137.0      NaN    4407               3.0   
172        1       3    955  5479.0      NaN     333            2840.0   
180        1       3  12356     NaN   8887.0     402            1382.0   
225        1       1  12680  3243.0   4157.0     660               NaN   
274        1       3    894  1703.0   1841.0     744             759.0   
370        2       3  39679  3944.0   4955.0    1364               NaN   

     Delicassen  
75        975.0  
172       707.0  
180      2794.0  
225       786.0  
274         NaN  
370      2235.0  
     Channel  Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  \
75         1       3  20398  1137        3    4407                 3   
172        1       3    955  5479     6536     333              2840   
180        1       3  12356  6036     8887     402              1382   
225        1       1  12680  3243     4157     660               76

AttributeError: 'Figure' object has no attribute 'Line'

# Display missing and correct data

In [36]:
# Slice out the value columns (label range 'Fresh' through 'Delicassen')
missing_values = rows_missing.loc[:, 'Fresh':'Delicassen']
correct_values = rows_correct.loc[:, 'Fresh':'Delicassen']

# Keyword arguments common to both charts
shared_chart_opts = dict(legend="top_right", ylabel='Value')

line1 = Line(missing_values, title="Missing", **shared_chart_opts)

# The second chart reuses the first chart's ranges so both panels share axes
line2 = Line(
    correct_values,
    title="Correct",
    x_range=line1.x_range,
    y_range=line1.y_range,
    **shared_chart_opts
)

# Show the two charts side by side
show(row(line1, line2))

# Impute using attribute mean

In [37]:
# Select the incomplete rows straight from df_missing and keep the imputed
# result under its own name. Overwriting rows_missing here would remove the
# NaNs that later imputation cells expect to fill, and would make this cell
# give a different result when it is re-run.
incomplete_rows = df_missing[df_missing.isnull().any(axis=1)]

# Replace every NaN with the mean of its column (truncated to an integer),
# computed over the whole df_missing frame.
rows_mean_imputed = incomplete_rows.fillna(df_missing.mean().astype(int))

# Left panel: the rows after mean imputation
line1 = Line(
        rows_mean_imputed.loc[:,'Fresh':'Delicassen'],
        title="Missing",
        legend="top_right",
        ylabel='Value'
    )
# Right panel: the true values, drawn on the same axis ranges as the left panel
line2 = Line(
        rows_correct.loc[:,'Fresh':'Delicassen'],
        title="Correct",
        legend="top_right",
        ylabel='Value',
        x_range=line1.x_range,
        y_range=line1.y_range
    )


show(row(line1, line2))

# Impute using attribute median

In [48]:
# Select the incomplete rows straight from df_missing. The previous cell has
# already filled rows_missing with column means, so calling fillna on
# rows_missing here would find no NaNs and the chart would silently show the
# mean-imputed values instead of the median-imputed ones.
incomplete_rows = df_missing[df_missing.isnull().any(axis=1)]

# Replace every NaN with the median of its column (truncated to an integer),
# computed over the whole df_missing frame.
rows_median_imputed = incomplete_rows.fillna(df_missing.median().astype(int))

# Left panel: the rows after median imputation
line1 = Line(
        rows_median_imputed.loc[:,'Fresh':'Delicassen'],
        title="Missing",
        legend="top_right",
        ylabel='Value'
    )
# Right panel: the true values, drawn on the same axis ranges as the left panel
line2 = Line(
        rows_correct.loc[:,'Fresh':'Delicassen'],
        title="Correct",
        legend="top_right",
        ylabel='Value',
        x_range=line1.x_range,
        y_range=line1.y_range
    )


show(row(line1, line2))

# Impute using pandas interpolation


In [47]:
# Re-select the rows of df_missing that contain at least one NaN
rows_missing = df_missing[df_missing.isnull().any(axis=1)]

# Interpolate over the whole frame with pandas' default settings, then keep
# only the rows that originally had gaps
df_interpolated = df_missing.interpolate()
rows_interpolated = df_interpolated.loc[rows_missing.index]

# Keyword arguments common to both charts
shared_chart_opts = dict(legend="top_right", ylabel='Value')

line1 = Line(
    rows_interpolated.loc[:, 'Fresh':'Delicassen'],
    title="Missing",
    **shared_chart_opts
)

# The second chart reuses the first chart's ranges so both panels share axes
line2 = Line(
    rows_correct.loc[:, 'Fresh':'Delicassen'],
    title="Correct",
    x_range=line1.x_range,
    y_range=line1.y_range,
    **shared_chart_opts
)

# Show the two charts side by side
show(row(line1, line2))