In [1]:
import json

import pandas as pd
import numpy as np

from eemeter import (
    load_sample,
    # merge_temperature_data,
    get_baseline_data,
    # segment_timeseries,
    get_feature_hour_of_week,
    get_feature_occupancy,
    get_design_matrix,
    caltrack_hourly_method,
    get_feature_binned_temperatures,
)

from eemeter import (
    merge_features,
    segment_time_series,
    compute_time_features,
    compute_temperature_features,
    estimate_hour_of_week_occupancy,
    fit_temperature_bins,
    iterate_segmented_dataset,
)

import matplotlib.pyplot as plt

%matplotlib inline

%load_ext autoreload
%autoreload 2

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
meter_data, temperature_data, metadata = \
    load_sample('il-electricity-cdd-hdd-hourly')

In [3]:
meter_data.head()

Unnamed: 0_level_0,value
start,Unnamed: 1_level_1
2015-11-22 06:00:00+00:00,0.29
2015-11-22 07:00:00+00:00,1.47
2015-11-22 08:00:00+00:00,0.58
2015-11-22 09:00:00+00:00,0.28
2015-11-22 10:00:00+00:00,1.25


In [4]:
segment_time_series(meter_data.index, 'single').head()

Unnamed: 0_level_0,all
start,Unnamed: 1_level_1
2015-11-22 06:00:00+00:00,1.0
2015-11-22 07:00:00+00:00,1.0
2015-11-22 08:00:00+00:00,1.0
2015-11-22 09:00:00+00:00,1.0
2015-11-22 10:00:00+00:00,1.0


In [5]:
segment_time_series(meter_data.index, 'one_month').head()

Unnamed: 0_level_0,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-11-22 06:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2015-11-22 07:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2015-11-22 08:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2015-11-22 09:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2015-11-22 10:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
segment_time_series(meter_data.index, 'three_month').head()

Unnamed: 0_level_0,dec-jan-feb,jan-feb-mar,feb-mar-apr,mar-apr-may,apr-may-jun,may-jun-jul,jun-jul-aug,jul-aug-sep,aug-sep-oct,sep-oct-nov,oct-nov-dec,nov-dec-jan
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-11-22 06:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2015-11-22 07:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2015-11-22 08:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2015-11-22 09:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2015-11-22 10:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


In [7]:
segment_time_series(meter_data.index, 'three_month_weighted').head()

Unnamed: 0_level_0,dec-jan-feb-weighted,jan-feb-mar-weighted,feb-mar-apr-weighted,mar-apr-may-weighted,apr-may-jun-weighted,may-jun-jul-weighted,jun-jul-aug-weighted,jul-aug-sep-weighted,aug-sep-oct-weighted,sep-oct-nov-weighted,oct-nov-dec-weighted,nov-dec-jan-weighted
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-11-22 06:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5
2015-11-22 07:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5
2015-11-22 08:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5
2015-11-22 09:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5
2015-11-22 10:00:00+00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5


In [8]:
time_features = compute_time_features(meter_data.index)
time_features.head()

Unnamed: 0_level_0,hour_of_week
start,Unnamed: 1_level_1
2015-11-22 06:00:00+00:00,151
2015-11-22 07:00:00+00:00,152
2015-11-22 08:00:00+00:00,153
2015-11-22 09:00:00+00:00,154
2015-11-22 10:00:00+00:00,155


In [9]:
temperature_features = compute_temperature_features(
    meter_data.index, temperature_data,
    heating_balance_points=[50], cooling_balance_points=[65],
    degree_day_method='hourly'
)
temperature_features.head()

Unnamed: 0_level_0,temperature_mean,cdd_65,hdd_50
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-11-22 06:00:00+00:00,21.01,0.0,28.99
2015-11-22 07:00:00+00:00,20.35,0.0,29.65
2015-11-22 08:00:00+00:00,19.38,0.0,30.62
2015-11-22 09:00:00+00:00,19.02,0.0,30.98
2015-11-22 10:00:00+00:00,17.82,0.0,32.18


In [10]:
merged_data = merge_features([  # not sure how to represent this in platform
    meter_data.value.to_frame('meter_value'),
    temperature_features,
    time_features
])
merged_data.head()

Unnamed: 0_level_0,meter_value,temperature_mean,cdd_65,hdd_50,hour_of_week
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-11-22 06:00:00+00:00,0.29,21.01,0.0,28.99,151
2015-11-22 07:00:00+00:00,1.47,20.35,0.0,29.65,152
2015-11-22 08:00:00+00:00,0.58,19.38,0.0,30.62,153
2015-11-22 09:00:00+00:00,0.28,19.02,0.0,30.98,154
2015-11-22 10:00:00+00:00,1.25,17.82,0.0,32.18,155


In [11]:
baseline_data, warnings = get_baseline_data(
    data=merged_data, end=merged_data.index[-1], max_days=365)
baseline_data.shape

(8761, 5)

In [12]:
segmented_weights = segment_time_series(baseline_data.index, 'three_month_weighted')
segmented_weights.head()

Unnamed: 0_level_0,dec-jan-feb-weighted,jan-feb-mar-weighted,feb-mar-apr-weighted,mar-apr-may-weighted,apr-may-jun-weighted,may-jun-jul-weighted,jun-jul-aug-weighted,jul-aug-sep-weighted,aug-sep-oct-weighted,sep-oct-nov-weighted,oct-nov-dec-weighted,nov-dec-jan-weighted
start,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-02-08 06:00:00+00:00,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-08 07:00:00+00:00,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-08 08:00:00+00:00,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-08 09:00:00+00:00,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-08 10:00:00+00:00,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
occupancy_lookup = estimate_hour_of_week_occupancy(baseline_data, segmentation=segmented_weights)
occupancy_lookup.head()

Unnamed: 0_level_0,dec-jan-feb-weighted,jan-feb-mar-weighted,feb-mar-apr-weighted,mar-apr-may-weighted,apr-may-jun-weighted,may-jun-jul-weighted,jun-jul-aug-weighted,jul-aug-sep-weighted,aug-sep-oct-weighted,sep-oct-nov-weighted,oct-nov-dec-weighted,nov-dec-jan-weighted
hour_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# now do binned temperatures
temperature_bins = fit_temperature_bins(
    baseline_data, segmentation=segmented_weights,
    default_bins=[30, 45, 55, 65, 75, 90],
    min_temperature_count=20
)
temperature_bins

Unnamed: 0_level_0,dec-jan-feb-weighted,jan-feb-mar-weighted,feb-mar-apr-weighted,mar-apr-may-weighted,apr-may-jun-weighted,may-jun-jul-weighted,jun-jul-aug-weighted,jul-aug-sep-weighted,aug-sep-oct-weighted,sep-oct-nov-weighted,oct-nov-dec-weighted,nov-dec-jan-weighted
bin_endpoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
30,True,True,True,True,False,False,False,False,False,True,True,True
45,True,True,True,True,True,False,False,False,True,True,True,True
55,True,True,True,True,True,True,True,True,True,True,True,True
65,True,True,True,True,True,True,True,True,True,True,True,True
75,False,False,True,True,True,True,True,True,True,True,True,False
90,False,False,False,False,True,True,True,True,True,True,False,False


In [56]:

def get_temperature_bin_features(temperatures, bin_endpoints):
    bin_endpoints = [-np.inf] + bin_endpoints + [np.inf]
    
    bins = {}

    for i, (left_bin, right_bin) in enumerate(zip(bin_endpoints, bin_endpoints[1:])):
        
        bin_name = 'bin_{}'.format(i)
        
        lt_bin = temperatures < left_bin
        in_bin = (temperatures > left_bin) & (temperatures <= right_bin)
        gt_bin = temperatures > right_bin
        is_na = temperatures.isna()
        
        not_in_bin_index = temperatures.index[~in_bin]
        gt_bin_index = temperatures.index[gt_bin]

        if i == 0:
            temps_in_bin = temperatures[in_bin].reindex(temperatures.index, fill_value=0)
            temps_out_of_bin = pd.Series(right_bin, index=not_in_bin_index).reindex(temperatures.index, fill_value=0)
            bin_values = (
                temps_in_bin + temps_out_of_bin
            )
        else:
            temps_in_bin = (temperatures[in_bin] - left_bin).reindex(temperatures.index, fill_value=0)
            temps_gt_bin = pd.Series(right_bin - left_bin, index=gt_bin_index).reindex(temperatures.index, fill_value=0)
            bin_values = (
                temps_in_bin + temps_gt_bin
            )
        bins[bin_name] = bin_values[~is_na].reindex(temperatures.index)
    return pd.DataFrame(bins)


def get_hourly_design_matrices(data, segmentation=None, occupancy_lookup=None, temperature_bins=None):
    
    for segment_name, segmented_data in \
            iterate_segmented_dataset(data, segmentation=segmentation):
        
        # get occupied feature
        hour_of_week = segmented_data.hour_of_week.to_frame()
        occupancy = occupancy_lookup[segment_name].to_frame('occupancy')
        occupied_feature = pd.merge(
            hour_of_week,
            occupancy,
            how='left', 
            left_on='hour_of_week',
            right_index=True,
        ).occupancy.rename('occupied')
        
        # get temperature bin features
        temperature_bin_endpoints = temperature_bins[segment_name].index[temperature_bins[segment_name]].tolist()
        print(temperature_bin_endpoints)
        
        temperatures = segmented_data.temperature_mean.rename('temperatures')
        temperature_bin_features = get_temperature_bin_features(
            temperatures, temperature_bin_endpoints
        )
        
        df = merge_features([
            segmented_data.meter_value,
            segmented_data.hour_of_week,
            occupied_feature,
            temperature_bin_features,
            segmented_data.weight,
        ])
        print(df.head())
        
get_hourly_design_matrices(
    baseline_data, segmentation=segmented_weights,
    occupancy_lookup=occupancy_lookup,
    temperature_bins=temperature_bins)

[30, 45, 55, 65]
                           meter_value hour_of_week  occupied  bin_0  bin_1  \
start                                                                         
2017-02-08 06:00:00+00:00         0.96           55       0.0   30.0   3.39   
2017-02-08 07:00:00+00:00         0.02           56       0.0   30.0   2.02   
2017-02-08 08:00:00+00:00         3.28           57       0.0   30.0   1.20   
2017-02-08 09:00:00+00:00         1.29           58       0.0   30.0   0.56   
2017-02-08 10:00:00+00:00         4.13           59       0.0   30.0   0.36   

                           bin_2  bin_3  bin_4  weight  
start                                                   
2017-02-08 06:00:00+00:00    0.0    0.0    0.0     0.5  
2017-02-08 07:00:00+00:00    0.0    0.0    0.0     0.5  
2017-02-08 08:00:00+00:00    0.0    0.0    0.0     0.5  
2017-02-08 09:00:00+00:00    0.0    0.0    0.0     0.5  
2017-02-08 10:00:00+00:00    0.0    0.0    0.0     0.5  
[30, 45, 55, 65]
             

In [16]:
def caltrack_hourly_method(data, segmentation):
    pass

what is the output of make design matrix?

get_design_matrix()
pass baseline data, segmentation, occupancy lookup, bins



In [17]:
design_matrix, preprocessors_fit, warnings = \
    get_design_matrix(
        baseline_data,
        functions=[
            {
                'function': get_feature_occupancy,
                'kwargs': {
                    'occupancy_lookup': occupancy_lookup
                }
            },
            {
                'function': get_feature_temperature_bins,
                'kwargs': {}
            },
        ],
        segmentation=None,
    )

NameError: name 'get_feature_temperature_bins' is not defined

In [None]:
preprocessors_fit

In [None]:
design_matrix.shape

In [None]:
design_matrix.head()

In [None]:
preprocessors = {
    'segment_timeseries': {
        'function': segment_timeseries,
        'kwargs': {'segment_type': 'three_month_weighted'}
    },
    'get_feature_hour_of_week': {
        'function': get_feature_hour_of_week,
        'kwargs': {}
    },
    'get_feature_occupancy': {
        'function': get_feature_occupancy,
        'kwargs': {'threshold': 0.6}
    },
    'get_feature_binned_temperatures': {
        'function': get_feature_binned_temperatures,
        'kwargs': {}
    }
}
formula = (
    'meter_value ~ C(hour_of_week) - 1 + '
    'bin_0:occupancy + '
    'bin_1:occupancy + bin_2:occupancy + '
    'bin_3:occupancy + bin_4:occupancy + '
    'bin_5:occupancy + bin_6:occupancy'
)
model_fit = caltrack_hourly_method(
    baseline_data, formula, preprocessors)

In [None]:
model_fit

In [None]:
model_fit.__dict__

In [None]:
results, design_matrix, warnings = model_fit.model.predict(baseline_data)

In [None]:
design_matrix.shape

In [None]:
design_matrix.head()

In [None]:
results.shape

In [None]:
results.head()

meter data, temperature data

-> merge into meter data/temperature data single

merged data

-> baseline

baseline merged data

-> segment

segmented merged data - consider making segments a set of weights over a time index, like a mask. Make input just the time index. main advantage: don't have to repeat data, feature hour of week can be agnostic to segmenting. If a method could work with and without segmenting it's nice to be able to pass the segmenting in as a separate object (potentially none).

missing hour of week warnings can be on whole or parts


-> fit features

occupancy lookup

pivot the occupancy lookup to have rows of hours, columns of model ids and contents of booleans
dict of models like before.

-> compute features

occupancy matrix from occupancy lookup and hour of week features, one column per model?

compute temperature bins - maybe name the bins by their extents? then you could create the bins once for every temperature and use the bins that are 

-> merge features

feature-rich segmented data, fitted features.
(could these be separated into a fit step and a compute step?)
(some features depend on each other like occupancy on time of week.)
(dependency ordering)

-> compute "granular" dataframe

result

-> predict

What did he mean by "replace with design matrix function"? i think he meant column - instead of precomputing just get the column
