### Load data set with features

In [1]:
# Load the data set through the project-local helper in create_df.py.
# The five returned objects are passed unchanged to find_optimal_features and
# create_predictions in later cells.
# NOTE(review): the structure of each returned object is defined in
# create_df.py and is not visible in this notebook — confirm there.
from create_df import read_df
df, dfc, all_homes, appliance_min, national_average = read_df()

Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 20 days


In [2]:
import numpy as np
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the 'fivethirtyeight' style sheet to all figures drawn from here on.
plt.style.use('fivethirtyeight')

In [3]:
run all_functions.py

In [4]:
run features.py

In [5]:
import json

#### First round of CV, finding optimal features maximising median accuracy

In [7]:
# First cross-validation round: one call covering every appliance, scored with
# metric="median". No outlier feature set is supplied (outlier_features=None)
# and test_outlier is disabled for this round.
cv1_appliances = ["hvac", "light", "fridge", "dr", "wm", "dw"]
cv1_feature_groups = {"All": feature_map["All"]}

out_median, optimal_dict_median = find_optimal_features(
    df,
    dfc,
    all_homes,
    appliance_min,
    national_average,
    cv1_appliances,
    cv1_feature_groups,
    NUM_NEIGHBOURS_MAX=7,
    F_length_max=7,
    metric="median",
    train_outlier=True,
    test_outlier=False,
    outlier_features=None,
    outlier_fraction=0.1,
    print_steps=True
)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
hvac
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
********************
All
********************
include: aggregate_5 -> feature_subset: ['aggregate_5']. Accuracy: 81.5093101239
include: seasonal_energy_5 -> feature_subset: ['aggregate_5', 'seasonal_energy_5']. Accuracy: 84.0200083767
include: gt_1000 -> feature_subset: ['aggregate_5', 'seasonal_energy_5', 'gt_1000']. Accuracy: 86.4613257867
include: seasonal_energy_8 -> feature_subset: ['aggregate_5', 'seasonal_energy_5', 'gt_1000', 'seasonal_energy_8']. Accuracy: 86.786292743
include: aggregate_3 -> feature_subset: ['aggregate_5', 'seasonal_energy_5', 'gt_1000', 'seasonal_energy_8', 'aggregate_3']. Accuracy: 86.7656903574
include: seasonal_energy_6 -> feature_subset: ['aggregate_5', 'seasonal_energy_5', 'gt_1000', 'seasonal_energy_8', 'aggregate_3', 'seasonal_energy_6']. Accuracy: 86.7656903574
include: seasonal_energy_9 -> feature_subset: ['aggregate_5', 'seasonal_energy_5', 'gt_1000', 'seasonal_e

In [10]:
import json
# Persist the first-round results so that a later session can reload them
# instead of re-running the search. The context managers guarantee that each
# file is flushed and closed as soon as it has been written, rather than
# whenever the anonymous file object happens to be garbage collected.
with open("optimal_dict_cv1.json", "w") as cv1_optimal_file:
    json.dump(optimal_dict_median, cv1_optimal_file)
with open("out_cv1.json", "w") as cv1_out_file:
    json.dump(out_median, cv1_out_file)


### Second round of cross validation: maximising mean accuracy after removing test outliers

In [11]:
# Second cross-validation round, run one appliance at a time with
# metric="mean" and both train_outlier and test_outlier enabled. The outlier
# feature set for each appliance is the feature list ("f") selected for it in
# the first round.
#
# find_optimal_features returns dictionaries that are already keyed by
# appliance, and they are stored here under [appliance]["All"]; later lookups
# therefore read optimal_cv2[appliance]["All"][appliance]["All"].
out_cv2, optimal_cv2 = {}, {}
for appliance in ["fridge", "hvac", "dw", "dr", "wm", "light"]:
    out_cv2[appliance], optimal_cv2[appliance] = {}, {}
    round_one_features = optimal_dict_median[appliance]["All"]["f"]

    cv_output, cv_optimum = find_optimal_features(
        df,
        dfc,
        all_homes,
        appliance_min,
        national_average,
        [appliance],
        {"All": feature_map["All"]},
        NUM_NEIGHBOURS_MAX=7,
        F_length_max=7,
        metric="mean",
        train_outlier=True,
        test_outlier=True,
        outlier_features=round_one_features,
        outlier_fraction=0.1,
        print_steps=True
    )

    out_cv2[appliance]["All"] = cv_output
    optimal_cv2[appliance]["All"] = cv_optimum
    

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
fridge
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
********************
All
********************
include: aggregate_9 -> feature_subset: ['aggregate_9']. Accuracy: 73.5329848207
include: temperature_corr -> feature_subset: ['aggregate_9', 'temperature_corr']. Accuracy: 75.0146394211
include: max_seasonal_12 -> feature_subset: ['aggregate_9', 'temperature_corr', 'max_seasonal_12']. Accuracy: 77.9851874786
include: gt_1000 -> feature_subset: ['aggregate_9', 'temperature_corr', 'max_seasonal_12', 'gt_1000']. Accuracy: 78.1906563659
include: aggregate_11 -> feature_subset: ['aggregate_9', 'temperature_corr', 'max_seasonal_12', 'gt_1000', 'aggregate_11']. Accuracy: 77.9851874786
include: fft_4 -> feature_subset: ['aggregate_9', 'temperature_corr', 'max_seasonal_12', 'gt_1000', 'aggregate_11', 'fft_4']. Accuracy: 77.8694101893
include: cluster_big -> feature_subset: ['aggregate_9', 'temperature_corr', 'max_seasonal_12', 'gt_1000', 'aggregate_11', 'fft_4'

In [59]:
# Persist the second-round results. The context managers guarantee that each
# file is flushed and closed as soon as it has been written, rather than
# whenever the anonymous file object happens to be garbage collected.
with open("optimal_cv2.json", "w") as cv2_optimal_file:
    json.dump(optimal_cv2, cv2_optimal_file)
with open("out_cv2.json", "w") as cv2_out_file:
    json.dump(out_cv2, cv2_out_file)

### Storing the predictions in pd.Panel

In [1]:
import json

In [2]:
# Reload the saved results of both cross-validation rounds. Each file is read
# inside a context manager so its handle is closed immediately instead of
# being left open for the lifetime of the kernel.
with open("optimal_cv2.json", "r") as cv2_optimal_file:
    optimal_cv2 = json.load(cv2_optimal_file)
with open("optimal_dict_cv1.json", "r") as cv1_optimal_file:
    optimal_dict_median = json.load(cv1_optimal_file)

with open("out_cv2.json", "r") as cv2_out_file:
    out_cv2 = json.load(cv2_out_file)

In [8]:
# Inspect one entry to see the nesting of the reloaded results: the value held
# under "All" is itself keyed by the appliance name and then by "All", and the
# innermost dictionary carries 'accuracy', 'f' (feature list) and 'k'.
optimal_cv2['wm']

{u'All': {u'wm': {u'All': {u'accuracy': 66.44378118606728,
    u'f': [u'cluster_big',
     u'difference_min_max',
     u'area',
     u'bet_500_1000',
     u'ratio_min_max',
     u'aggregate_2',
     u'aggregate_4'],
    u'k': 2}}}}

In [13]:
# Build one accuracy table per appliance (rows: months, columns: homes after
# the final transpose) using the feature list and k selected in the second
# cross-validation round.
#
# Requires create_predictions, percentage_error and pd in the namespace, plus
# the data objects loaded by read_df().
# NOTE(review): test_outlier is False here, whereas the second CV round and the
# hvac-only cell further down use test_outlier=True — confirm this is intended.
predictions = {}
for appliance in ["fridge", "hvac", "dw", "dr", "wm", "light"]:
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print(appliance)

    best = optimal_cv2[appliance]["All"][appliance]["All"]
    temp = create_predictions(df, dfc, all_homes, appliance_min,
                              national_average, appliance=appliance,
                              feature=best["f"], NUM_NEIGHBOURS=best["k"],
                              train_outlier=True, test_outlier=False,
                              outlier_features=optimal_dict_median[appliance]["All"]["f"],
                              outlier_fraction=0.1)

    # hvac is evaluated for months 5-10 only; every other appliance for 1-12.
    if appliance == "hvac":
        start_month, end_month = 5, 11
    else:
        start_month, end_month = 1, 13

    errors = {
        month: percentage_error(temp[month]["gt"], temp[month]["pred"])
        for month in range(start_month, end_month)
    }

    # Build the frame once, after all months are collected, instead of
    # rebuilding it on every loop iteration.
    error_df = pd.DataFrame(errors)
    accur_df = 100 - error_df
    # Errors above 100% would give a negative accuracy; floor those at zero.
    accur_df[accur_df < 0] = 0

    tdf = accur_df
    if appliance == "hvac":
        # Blank out months 5 and 10 for these homes.
        # NOTE(review): the reason for excluding them is not recorded here.
        for home in [624, 1953, 6636, 6836, 7769, 9922]:
            # np.nan is the spelling that exists in every NumPy release
            # (the np.NaN alias was removed in NumPy 2.0).
            tdf.loc[home, 5] = np.nan
            tdf.loc[home, 10] = np.nan

    predictions[appliance] = tdf.T

fridge
hvac
dw
dr
wm
light


In [14]:
# Combine the per-appliance accuracy tables into a single 3-D container keyed
# by appliance name.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so
# this cell only runs on an older pandas. Migrating (e.g. to a MultiIndex
# DataFrame) would change what is written to accuracy.h5 and must be
# coordinated with whatever reads that file.
p = pd.Panel(predictions)

In [19]:
# Open the HDF5 store "accuracy.h5"; it is closed again in a later cell.
accuracy_store = pd.HDFStore("accuracy.h5")

In [20]:
# Write the panel to the store under the key "N-NILM".
# NOTE(review): append adds to whatever is already stored under this key, so
# re-running the cell against an existing accuracy.h5 may duplicate data —
# confirm whether the file is expected to be fresh.
accuracy_store.append("N-NILM", p)



In [21]:
# Close the store opened above so the file handle on accuracy.h5 is released.
accuracy_store.close()

In [9]:
# Recompute the accuracy table for a single appliance (hvac) with
# test_outlier=True and store it in `predictions` under that appliance.
#
# Prerequisites: create_predictions, percentage_error and pd must already be
# in the namespace, and `predictions` must exist (it is created by the
# all-appliance loop cell). Running this cell before those are defined raises
# a NameError.
appliance = "hvac"
best = optimal_cv2[appliance]["All"][appliance]["All"]
temp = create_predictions(df, dfc, all_homes, appliance_min,
                          national_average, appliance=appliance,
                          feature=best["f"], NUM_NEIGHBOURS=best["k"],
                          train_outlier=True, test_outlier=True,
                          outlier_features=optimal_dict_median[appliance]["All"]["f"],
                          outlier_fraction=0.1)

# hvac is evaluated for months 5-10 only; any other appliance for 1-12. The
# branch is kept so the cell can be reused by changing `appliance` above.
if appliance == "hvac":
    start_month, end_month = 5, 11
else:
    start_month, end_month = 1, 13

errors = {
    month: percentage_error(temp[month]["gt"], temp[month]["pred"])
    for month in range(start_month, end_month)
}

# Build the frame once, after all months are collected, instead of rebuilding
# it on every loop iteration.
error_df = pd.DataFrame(errors)
accur_df = 100 - error_df
# Errors above 100% would give a negative accuracy; floor those at zero.
accur_df[accur_df < 0] = 0

tdf = accur_df
if appliance == "hvac":
    # Blank out months 5 and 10 for these homes.
    # NOTE(review): the reason for excluding them is not recorded here.
    for home in [624, 1953, 6636, 6836, 7769, 9922]:
        # np.nan is the spelling that exists in every NumPy release
        # (the np.NaN alias was removed in NumPy 2.0).
        tdf.loc[home, 5] = np.nan
        tdf.loc[home, 10] = np.nan

predictions[appliance] = tdf.T

NameError: name 'create_predictions' is not defined