In [68]:
import pandas as pd

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource

# Configure Bokeh so that subsequent show() calls render plots inline in the notebook
output_notebook()

In [30]:
# Read the query result from its CSV part file; no automatic date parsing,
# the date-part columns are combined into real datetimes in a later step.
canceled_df = pd.read_csv(
    'query1.csv/part-00000-52553417-36e9-4c25-b381-2f5ee8d6c911-c000.csv',
    parse_dates=False,
)
canceled_df.head()

Unnamed: 0,Year,Month,DayofMonth,percentageCancelled
0,1994,1,1,0.005264
1,1994,1,2,0.004492
2,1994,1,3,0.015418
3,1994,1,4,0.155609
4,1994,1,5,0.047656


In [48]:
# Build a datetime index from the Year / Month / Day columns.
# Each step returns a new frame instead of mutating in place, so the frame
# produced by the loading cell is never modified as a side effect.

# pd.to_datetime assembles dates from year/month/day columns, so
# 'DayofMonth' is renamed to 'Day' first.
canceled_df = canceled_df.rename(columns={'DayofMonth': 'Day'})
dates = pd.to_datetime(canceled_df[['Year', 'Month', 'Day']])

# Attach the dates as a 'Date' column and promote it to the index
canceled_df = canceled_df.assign(Date=dates).set_index('Date')
canceled_df.head()

Unnamed: 0_level_0,Year,Month,Day,percentageCancelled
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1994-01-01,1994,1,1,0.005264
1994-01-02,1994,1,2,0.004492
1994-01-03,1994,1,3,0.015418
1994-01-04,1994,1,4,0.155609
1994-01-05,1994,1,5,0.047656


# Percentage of canceled flights aggregated per year

### Percentage of canceled flights per day

In [237]:
# Wrap the DataFrame in a Bokeh data source; the 'Date' index is exposed as a column
source = ColumnDataSource(canceled_df)

# Create a new plot (with a title) whose x axis displays dates
p = figure(
    plot_width=900,
    plot_height=400,
    title="Percentage of canceled flights per day",
    x_axis_type='datetime',
)

# Draw the daily values as a line and anchor the y axis at zero
p.line(x='Date', y='percentageCancelled', source=source)
p.y_range.start = 0

show(p)  # render the plot

### Percentage of canceled flights aggregated per year (mean)

In [238]:
from bokeh.models.annotations import Span
from bokeh.palettes import viridis
from bokeh.transform import factor_cmap

In [239]:
# Keep only the columns needed for the yearly aggregation; the year is cast
# to str because it is used as a categorical axis factor.
yearly_df = (
    canceled_df
    .drop(columns=['Month', 'Day'])
    .assign(Year=lambda df: df.Year.astype(str))
)

# Prepare the data source.
# NOTE: despite its name, `year_avg` is a pandas GroupBy object, not an
# average. Bokeh derives per-group summary statistics from it, which is where
# the 'percentageCancelled_mean' column used below comes from.
year_avg = yearly_df.groupby('Year')
source = ColumnDataSource(year_avg)

# One factor (and one palette color) per year actually present in the data.
# Sorting keeps the color gradient in the same order as the categorical axis,
# and sizing the palette from the data avoids a hard-coded year count.
years = sorted(yearly_df.Year.unique())

p = figure(plot_width=900, plot_height=400, x_range=year_avg,
           title="Percentage of canceled flights aggregated per year (mean)")
p.vbar(x='Year', top='percentageCancelled_mean', width=.9, source=source, line_color="white",
       fill_color=factor_cmap('Year', palette=viridis(len(years)), factors=years))

# Styling
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Year"
p.yaxis.axis_label = "Mean percentage of canceled flights"
p.y_range.start = 0

show(p)