In [1]:
import datetime

In [2]:
import pandas as pd
import random
import numpy as np

In [3]:
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook, reset_output
from bokeh.layouts import gridplot


In [4]:
output_notebook()

In [5]:
def resample(l):
    """Return a bootstrap resample of ``l``.

    Draws ``len(l)`` items from ``l`` uniformly at random, with replacement,
    using the standard-library ``random`` module.

    Args:
        l: sequence to sample from.

    Returns:
        A new list with the same length as ``l``; an empty input yields an
        empty list.
    """
    # One independent draw per element of the input.
    return [random.choice(l) for _ in range(len(l))]

In [6]:
def repeat_resample(sample_a, sample_b, num_iter = 1000):
    """Bootstrap the difference in means between two samples.

    Args:
        sample_a: first sample, passed to ``resample``.
        sample_b: second sample, passed to ``resample``.
        num_iter: number of bootstrap experiments to run.

    Returns:
        A list of ``num_iter`` values, each being
        ``mean(resample(sample_a)) - mean(resample(sample_b))``.
    """
    def one_difference():
        # Resample A first, then B, so draws are consumed in a fixed order.
        mean_a = np.mean(resample(sample_a))
        mean_b = np.mean(resample(sample_b))
        return mean_a - mean_b

    return [one_difference() for _ in range(num_iter)]

In [7]:
def make_bar(labels, nums, title = None, y_range = None, plot_width = 350, plot_height = 350):
    """Build a Bokeh figure containing a vertical bar chart.

    Args:
        labels: x positions of the bars.
        nums: bar heights (the ``top`` of each bar).
        title: figure title, forwarded to ``figure``.
        y_range: y-axis range, forwarded to ``figure``.
        plot_width: figure width, forwarded to ``figure``.
        plot_height: figure height, forwarded to ``figure``.

    Returns:
        The figure with the bars added and the x-grid line color set to None.
    """
    chart = figure(
        title=title,
        y_range=y_range,
        plot_width=plot_width,
        plot_height=plot_height,
    )
    chart.vbar(x=labels, top=nums, width=0.9)
    # Turn off the x-grid lines.
    chart.xgrid.grid_line_color = None
    return chart

In [8]:
def hist(l):
    """Build a Bokeh figure showing a density histogram of ``l``.

    The bins come from ``np.histogram(l, density=True)`` and are drawn as
    quads rising from 0 with alpha 0.4.

    Args:
        l: values to histogram.

    Returns:
        The Bokeh figure holding the histogram quads.
    """
    densities, bin_edges = np.histogram(l, density=True)
    # Consecutive edges give each bin's left and right side.
    left_edges = bin_edges[:-1]
    right_edges = bin_edges[1:]
    plot = figure()
    plot.quad(left=left_edges, right=right_edges, bottom=0, top=densities,
              alpha=.4)
    return plot

In [9]:
# Load the energy data and convert the Date column with pd.to_datetime.
df = pd.read_csv('data/energy.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
df.head()


Unnamed: 0,Date,Region,thermal_actual,thermal_est,nuclear_actual,nuclear_est,hydro_actual,hydro_est
0,2017-09-01,Northern,624.23,484.21,30.36,35.57,273.27,320.81
1,2017-09-01,Western,1106.89,1024.33,25.17,3.81,72.0,21.53
2,2017-09-01,Southern,576.66,578.55,62.73,49.8,111.57,64.78
3,2017-09-01,Eastern,441.02,429.39,,,85.94,69.36
4,2017-09-01,NorthEastern,29.11,15.91,,,24.64,21.21


In [10]:
def thermal_lines():
    """Plot actual thermal generation over time for three regions.

    Reads the module-level ``df`` and draws one line per region (Northern,
    Southern, Eastern) of ``thermal_actual`` against ``Date`` on a datetime
    x-axis.

    Returns:
        The Bokeh figure holding the three lines.
    """
    # (value in df['Region'], line color, legend label)
    series = [
        ('Northern', 'blue', 'north'),
        ('Southern', 'green', 'south'),
        ('Eastern', 'orange', 'east'),
    ]
    p = figure(x_axis_type = 'datetime', title = 'Thermal Comparison',
               plot_width = 450, plot_height = 450, y_range = None)
    for region, color, label in series:
        rows = df[df['Region'] == region]
        # Pair each region's values with that region's own dates instead of
        # reusing the Northern dates, so x and y come from the same rows.
        p.line(x = rows['Date'], y = rows['thermal_actual'], color = color,
               legend_label = label)
    return p

show(thermal_lines())

In [11]:
def nuclear_lines():
    """Plot actual nuclear generation over time for two regions.

    Reads the module-level ``df`` and draws one line per region (Northern,
    Southern) of ``nuclear_actual`` against ``Date`` on a datetime x-axis.

    Returns:
        The Bokeh figure holding the two lines.
    """
    # (value in df['Region'], line color, legend label)
    series = [
        ('Northern', 'blue', 'north'),
        ('Southern', 'red', 'south'),
    ]
    p = figure(x_axis_type = 'datetime', title = 'Nuclear Comparison',
               plot_width = 450, plot_height = 450, y_range = None)
    for region, color, label in series:
        rows = df[df['Region'] == region]
        # Pair each region's values with that region's own dates instead of
        # reusing the Northern dates, so x and y come from the same rows.
        p.line(x = rows['Date'], y = rows['nuclear_actual'], color = color,
               legend_label = label)
    return p
show(nuclear_lines())

In [12]:
def bar_compare():
    """Build side-by-side bar charts of actual thermal generation.

    Reads the module-level ``df`` and makes one bar chart for the Northern
    region and one for the Southern region. Bars are labelled by position
    (0, 1, 2, ...) and both charts share the y-range (0, 825).

    Returns:
        A two-column Bokeh grid containing the Northern and Southern charts.
    """
    charts = []
    for region in ('Northern', 'Southern'):
        values = df[df['Region'] == region]['thermal_actual']
        positions = list(range(len(values)))
        charts.append(make_bar(labels = positions, nums = values,
                               y_range = (0, 825)))
    return gridplot(charts, ncols = 2)

show(bar_compare())

In [13]:
def resample_thermal(region_a = 'Northern', region_b = 'Southern',
                     hypothesized_diff = 14, num_iter = 1000):
    """Bootstrap the difference in mean actual thermal generation of two regions.

    Reads the module-level ``df``, prints the two observed means, bootstraps
    ``mean(region_a) - mean(region_b)`` with ``repeat_resample``, prints
    whether ``hypothesized_diff`` lies strictly between the 2.5th and 97.5th
    percentiles of the bootstrap differences, and shows a histogram of them.

    Args:
        region_a: value of ``df['Region']`` selecting the first sample.
        region_b: value of ``df['Region']`` selecting the second sample.
        hypothesized_diff: difference in means to check against the
            percentile interval.
        num_iter: number of bootstrap iterations.

    Returns:
        None. Results are printed and plotted.
    """
    thermal_a = df[df['Region'] == region_a]['thermal_actual']
    thermal_b = df[df['Region'] == region_b]['thermal_actual']
    print( np.mean(thermal_a), np.mean(thermal_b))
    results = repeat_resample(thermal_a.tolist(), thermal_b.tolist(), num_iter)
    # 2.5th and 97.5th percentiles bound the middle 95% of the differences.
    lower = np.percentile(results, 2.5)
    upper = np.percentile(results, 97.5)
    lies_inside =   lower < hypothesized_diff < upper
    print(lies_inside)
    # False means hypothesized_diff falls outside that interval, i.e. the
    # bootstrap does not support a difference in means of hypothesized_diff.
    show(hist(results))
resample_thermal()

658.1080258899675 619.0249514563106
False


In [16]:
def year_diff(region = 'Northern', year_a = 2018, year_b = 2019,
              hypothesized_diff = 14, num_iter = 1000):
    """Bootstrap the change in mean actual thermal generation between two years.

    Reads the module-level ``df``, prints the observed mean for each year in
    ``region``, bootstraps ``mean(year_b) - mean(year_a)`` with
    ``repeat_resample``, and prints whether ``hypothesized_diff`` lies
    strictly between the 2.5th and 97.5th percentiles of the bootstrap
    differences.

    Args:
        region: value of ``df['Region']`` to analyse.
        year_a: first calendar year.
        year_b: second calendar year.
        hypothesized_diff: difference in means to check against the
            percentile interval.
        num_iter: number of bootstrap iterations.

    Returns:
        None. Results are printed.
    """
    def thermal_for_year(year):
        # Rows for `region` dated in [Jan 1 of year, Jan 1 of year + 1).
        in_region = df['Region'] == region
        in_year = ((df['Date'] >= datetime.datetime(year, 1, 1))
                   & (df['Date'] < datetime.datetime(year + 1, 1, 1)))
        return df[in_region & in_year]['thermal_actual']

    thermal_a = thermal_for_year(year_a)
    thermal_b = thermal_for_year(year_b)
    print('mean of {y1} is {m1} and mean of {y2} is {m2}'.format(
        y1 = year_a, m1 = np.mean(thermal_a),
        y2 = year_b, m2 = np.mean(thermal_b)))
    # Differences are taken as year_b minus year_a.
    results = repeat_resample(thermal_b.tolist(), thermal_a.tolist(), num_iter)
    # 2.5th and 97.5th percentiles bound the middle 95% of the differences.
    lower = np.percentile(results, 2.5)
    upper = np.percentile(results, 97.5)

    lies_inside =   lower < hypothesized_diff < upper
    print(lies_inside)

    # False means hypothesized_diff falls outside that interval, i.e. the
    # bootstrap does not support a year-over-year change of hypothesized_diff.

year_diff()



mean of 2018 is 642.194120879121 and mean of 2019 is 681.9789589041096
False


Exercises:

1. Repeat the first example (`resample_thermal`), this time comparing the Eastern and Southern regions, and show whether there is a difference between them.

2. Repeat the second example (`year_diff`) for the Southern region, and determine whether there is a difference between 2018 and 2019.
