# `Data Visualization Using Bokeh`

#### `Packages`

In [125]:
import pandas as pd
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, HoverTool, Select
from bokeh.layouts import column
from bokeh.io import output_notebook, show

#### `Data & Data Handling`

In [106]:
jan_first_data = pd.read_csv("archive\KwhConsumptionBlower78_1.csv", sep=",", index_col=0)
jan_second_data = pd.read_csv("archive\KwhConsumptionBlower78_2.csv", sep=",", index_col=0)
feb_data = pd.read_csv("archive\KwhConsumptionBlower78_3.csv", sep=",", index_col=0)

In [107]:
# Merged datasets
data_df = pd.concat([jan_first_data, jan_second_data, feb_data])

In [108]:
total_df_length = (len(jan_first_data) + len(jan_second_data) + len(feb_data))
if total_df_length == len(data_df):
    print("DataFrames successfully merged")
else:
    print("DataFrames NOT merged! Perform the merge operation again")

DataFrames successfully merged


In [109]:
class DataCheck():
    def __init__(self, data):
        self.data = data

    def missing_records(self):
        self.missing = self.data.isna().sum()
        return print("Missing Records: ", self.missing)
    
    def duplicated_records(self):
        self.duplicates = self.data.duplicated().sum()
        return print("Duplicated Records: ", self.duplicates)

In [110]:
check = DataCheck(data_df)
check.missing_records()
print("-----------------------")
check.duplicated_records()

Missing Records:  TxnDate        0
TxnTime        0
Consumption    0
dtype: int64
-----------------------
Duplicated Records:  24


- `There are 24 duplicated records in the data. We will drop them.`

In [111]:
data_df.drop_duplicates(inplace=True)
if (data_df.duplicated().sum() > 0):
    print("Duplicated records still present in the data. Drop them!")
else:
    print("Duplicated records dropped successfully")

Duplicated records dropped successfully


#### `Exploratory Data Analysis`

- `Note: Power consumption of less than 0.5KWh means that the power was off at that particular time period.`

In [112]:
# Records when the power was off - consumption less than 0.5KWh
power_off = data_df[data_df["Consumption"] < 0.5]
print("Power was off for {} days in total for the whole period".format(len(power_off)))
print("The power outage rate for the entire period is {}%".format(round((len(power_off)/(len(data_df))*100), 2)))

power_off

Power was off for 3 days in total for the whole period
The power outage rate for the entire period is 0.08%


Unnamed: 0,TxnDate,TxnTime,Consumption
966,16 Feb 2022,20:21:23,0.204
995,16 Feb 2022,20:01:43,0.096
1051,17 Feb 2022,00:53:11,0.0


- `Change the format of the data to proper datetime ISO: %d %b %Y %H:%M:%S`

In [113]:
data_df["Datetime"] = pd.to_datetime(
    data_df["TxnDate"] + " " + data_df["TxnTime"],
    format="%d %b %Y %H:%M:%S"
)

In [114]:
# Set the Datetime column as index 
# Sort the data according to the index
# Drop the TxnDate and TxnTime columns from the data_df
data_df = data_df.set_index("Datetime").sort_index().drop(columns=["TxnDate","TxnTime"])

In [115]:
# We can further localize the time zone for our context - Kenya
data_df.index = data_df.index.tz_localize("Africa/Nairobi")

In [117]:
# Data 
data_df[-10:]

Unnamed: 0_level_0,Consumption
Datetime,Unnamed: 1_level_1
2022-02-28 20:39:22+03:00,0.996
2022-02-28 20:58:52+03:00,0.92
2022-02-28 21:19:30+03:00,0.952
2022-02-28 21:39:55+03:00,0.968
2022-02-28 22:01:52+03:00,1.02
2022-02-28 22:25:43+03:00,1.152
2022-02-28 22:44:37+03:00,0.888
2022-02-28 23:07:15+03:00,1.064
2022-02-28 23:29:13+03:00,1.036
2022-02-28 23:50:27+03:00,1.0


#### `Bokeh`

In [119]:
output_notebook()

source = ColumnDataSource(data_df.reset_index())

p = figure(
    x_axis_type="datetime",
    title="Energy Consumption Over Time",
    width=900,
    height=400,
    tools="pan,wheel_zoom,reset,save"
)

p.line(
    x="Datetime",
    y="Consumption",
    source=source,
    line_width=2
)

p.xaxis.axis_label = "Time"
p.yaxis.axis_label = "Consumption (KWh)"

show(p)