In [1]:
# Sebastian Raschka
# last updated: 03/29/2014 
# Sequential Floating Forward Selection (SFFS)

def seq_float_forw_select(features, max_k, criterion_func, print_steps=False):
    """
    Implementation of Sequential Floating Forward Selection.
    
    Keyword Arguments:
        features (list): The feature space as a list of features.
        max_k: Termination criterion; the size of the returned feature subset.
        criterion_func (function): Function that is used to evaluate the
            performance of the feature subset.
        print_steps (bool): Prints the algorithm procedure if True.
    
    Returns the selected feature subset, a list of features of length max_k.

    """

    # Initialization
    feat_sub = []
    k = 0

    while True:

        # Step 1: Inclusion
        if print_steps:
            print('\nInclusion from features', features)
        if len(features) > 0:
            crit_func_max = criterion_func(feat_sub + [features[0]])
            best_feat = features[0]
            if len(features) > 1:
                for x in features[1:]:
                    crit_func_eval = criterion_func(feat_sub + [x])
                    if crit_func_eval > crit_func_max:
                        crit_func_max = crit_func_eval
                        best_feat = x
            features.remove(best_feat)
            feat_sub.append(best_feat)
            if print_steps:
                print('include: {} -> feature_subset: {}'.format(best_feat, feat_sub))

        # Step 2: Conditional Exclusion
            worst_feat_val = None
            if len(features) + len(feat_sub) > max_k:
                crit_func_max = criterion_func(feat_sub)
                for i in reversed(range(0,len(feat_sub))):
                    crit_func_eval = criterion_func(feat_sub[:i] + feat_sub[i+1:])
                    if crit_func_eval > crit_func_max:
                        worst_feat, crit_func_max = i, crit_func_eval
                        worst_feat_val = feat_sub[worst_feat]
                if worst_feat_val:
                    del feat_sub[worst_feat]
            if print_steps:
                print('exclude: {} -> feature subset: {}'.format(worst_feat_val, feat_sub))


        # Termination condition
        k = len(feat_sub)
        if k == max_k:
            break

    return feat_sub

In [15]:
__author__ = 'nipunbatra'

import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

df = pd.read_csv("../main_15min_decomposition_12_daily_weekly_cluster_diff_frac_temp.csv",index_col=0)
dfc = df.copy()

df = df.drop(871)
df = df.drop(1169)



w=df[['aggregate_%d' %i for i in range(1,13)]]

df = df.ix[w[w>0].dropna().index]

"""
features_individual = {#'fraction':["fraction_%d" % i for i in range(1, 25)],
                       'area': 'area',
                       'autocorr':'autocorr',
                       'month': ["aggregate_%d" % i for i in range(1, 13)],
                       'occupants': 'total_occupants',
                       'rooms': 'num_rooms',
                       #'seasonal_daily':['stdev_seasonal_daily','max_seasonal_daily'],
                       #'trend_daily':['stdev_trend_daily','max_trend_daily'],
                       'seasonal_weekly':['stdev_seasonal_weekly','max_seasonal_weekly'],
                       'trend_weekly':['stdev_trend_weekly','max_trend_weekly'],}
                       #'disag_fridge':'disag_fridge'}
                       #'mins_hvac':'mins_hvac',}
                       #'month_extract':['variance','ratio_min_max', 'difference_min_max',
                        #                'ratio_difference_min_max']}

"""
features_individual = {#'fraction':["fraction_%d" % i for i in range(1, 25)],
                       'area': 'area',
                       'autocorr':'autocorr',
                       'month': ["aggregate_%d" % i for i in range(1, 13)],
                       'occupants': 'total_occupants',
                       'rooms': 'num_rooms',
                       'seasonal_12':['stdev_seasonal_12','max_seasonal_12'],
                       'trend_12':['stdev_trend_12','max_trend_12'],

                       'seasonal_daily':['stdev_seasonal_daily','max_seasonal_daily'],
                       'trend_daily':['stdev_trend_daily','max_trend_daily'],
                       'seasonal_weekly':['stdev_seasonal_weekly','max_seasonal_weekly'],
                       'trend_weekly':['stdev_trend_weekly','max_trend_weekly'],
                       'cluster_big':'cluster_big',
                       'cluster_small':'cluster_small',
                       'diff':['lt_500','bet_500_1000','gt_1000'],
                       'temp':'temperature_corr',
                       #'disag_fridge':'disag_fridge'}
                       'mins_hvac':'mins_hvac',
                       'month_extract':['variance','ratio_min_max', 'difference_min_max',
                                        'ratio_difference_min_max']}

### Monthly ONLY
"""
features_individual = {#'fraction':["fraction_%d" % i for i in range(1, 25)],
                       'area': 'area',
                       #'autocorr':'autocorr',
                       'month': ["aggregate_%d" % i for i in range(1, 13)],
                       'occupants': 'total_occupants',
                       'rooms': 'num_rooms',
                       #'seasonal_12':['stdev_seasonal_12','max_seasonal_12'],
                       #'trend_12':['stdev_trend_12','max_trend_12'],
                       #'seasonal_daily':['stdev_seasonal_daily','max_seasonal_daily'],
                       #'trend_daily':['stdev_trend_daily','max_trend_daily'],
                       #'seasonal_weekly':['stdev_seasonal_weekly','max_seasonal_weekly'],
                       #'trend_weekly':['stdev_trend_weekly','max_trend_weekly'],}
                       #'disag_fridge':'disag_fridge'}
                       #'mins_hvac':'mins_hvac',
                       #'cluster_big':'cluster_big',
                       #'diff':['lt_500','bet_500_1000','gt_1000'],
                       'temp':'temperature_corr',
                       'month_extract':['variance','ratio_min_max', 'difference_min_max',
                                        'ratio_difference_min_max']}


"""

from itertools import combinations
features_dict = {}
for feature_size in range(1,min(4,len(features_individual))):
    combinations_size_n = list(combinations(features_individual.keys(), feature_size))
    for com in combinations_size_n:
        features_dict[com] = np.hstack([features_individual[x] for x in com]).tolist()



hvac_fhmm_pred = pd.read_csv("../fhmm_disag_new.csv", index_col=0)
fridge_fhmm_pred = pd.read_csv("../fridge_fhmm.csv", index_col=0)
appliance_fhmm = {'fridge': fridge_fhmm_pred,
                  'hvac': hvac_fhmm_pred}

national_average = {"fridge": 0.07, "hvac": 0.18, 'wm': 0.01, 'furnace': 0.09, 'dw': 0.02, 'dr': 0.04, 'light': .11}


#Normalising features
max_aggregate = df[["aggregate_%d" % i for i in range(1, 13)]].max().max()
df[["aggregate_%d" % i for i in range(1, 13)]] = df[["aggregate_%d" % i for i in range(1, 13)]].div(max_aggregate)

df['area'] = df['area'].div(df['area'].max())

df['num_rooms'] = df['num_rooms'].div(df['num_rooms'].max())
df['total_occupants'] = df['total_occupants'].div(df['total_occupants'].max())
df['mins_hvac'] =  df['mins_hvac'].div(df['mins_hvac'].max())

max_cols = {}
for col in ["stdev_trend_12","stdev_seasonal_12","max_seasonal_12","max_trend_12",
            "stdev_trend_daily","stdev_seasonal_daily","max_seasonal_daily","max_trend_daily",
            "stdev_trend_weekly","stdev_seasonal_weekly","max_seasonal_weekly","max_trend_weekly","disag_fridge",
            'stdev_trend','stdev_seasonal','max_seasonal','max_trend',
            'cluster_small','cluster_big', 'temperature_corr']:
    if col in df.columns:
        max_cols[col] = dfc[col].max()
        df[col] = df[col].div(df[col].max())


# Adding new feature
aa = df[["aggregate_%d" % i for i in range(1, 13)]].copy()
df['variance'] = df[["aggregate_%d" % i for i in range(1, 13)]].var(axis=1)
df['ratio_min_max'] = aa.min(axis=1)/aa.max(axis=1)

df['difference_min_max'] = aa.max(axis=1)-aa.min(axis=1)
df['ratio_difference_min_max'] = (aa.max(axis=1)-aa.min(axis=1)).div(aa.max(axis=1))

dfs = {}
total = features_dict.values()[np.array(map(len, features_dict.values())).argmax()]
for appliance in ['fridge','hvac','dr','light','dw','wm']:
    temp=df.ix[df[['%s_%d' %(appliance, i) for i in range(1,13)]].dropna().index]
    dfs[appliance] =temp.ix[temp[total].dropna().index]
    print appliance, len(dfs[appliance])

appliance_min = {'fridge':5,'hvac':5,'wm':0,'dw':0,'dr':0,'light':0}

all_homes = {
    'dw':[  94,  370,  545,  624, 2156, 2242, 2814, 2829, 3723,
            4767, 5357,6636, 6910, 7769, 9934],
    'wm':[  94,  370,  545,  624, 2156, 2242, 2814, 3367, 3456, 3723, 3967,
            5357, 7769, 9654, 9922, 9934],
    'hvac':[  26,   94,  370,  410,  545,  624, 1283, 1642, 1953, 2129,
            2156, 2242, 2470, 2814, 2829,  3367, 3456, 3723,
            3967, 4767, 5218, 5357, 5371, 5746, 5785, 5814, 6072,
            6636, 6836, 6910, 7731, 7769, 7866, 9609, 9654, 9922, 9933, 9934],
    'fridge':[  94,  370,  410,  545,  624, 1953, 2156, 2242, 2814, 2829, 3367,
            3456, 3723, 3967, 4767, 5357, 5371, 6072, 6636, 6910, 7769, 7866],
    'light':df.index.tolist(),
        #[ 624, 1334, 2814, 2925, 2986, 3367, 3456, 3482, 3723, 3967, 4732,
        #    4767, 5814, 5817, 6072, 6266, 6910, 7016, 7429, 7731, 7769, 7866,
        #    8317, 8626, 9052, 9654, 9922],
    'dr':[  94,  370,  410, 2156, 2242, 2814, 3456, 3723, 4767,
            5785, 5814, 6072, 6636, 6836, 7731, 7769, 7866, 9654, 9922,
            9933, 9982]
}

all_homes = {appliance:dfs[appliance].index for appliance in dfs.keys()}

all_homes['fridge'] = np.array(np.setdiff1d(all_homes['fridge'], [2233, 5746, 7016]))
all_homes['hvac'] = np.array(np.setdiff1d(all_homes['hvac'], [252, 2925, 2986, 3482, 4732, 5439, 6266,
                                                              8626, 1800, 2233, 5817, 7016, 7429, 8317,
                                                              9052, 9982]))

all_homes['dw'] =  np.array(np.setdiff1d(all_homes['dw'],[2233, 7016]))


def create_predictions(appliance="hvac", feature=['num_rooms', 'total_occupants'],k=2, weights='uniform'):
    print "IN THIS BLOCK"
    print feature
    out_month = {}
    gt_month = {}
    overall_dfs = {}
    df_pred_copy = df.copy()
    #df_pred_copy = dfs[appliance].copy()
    df_pred_copy = df_pred_copy.ix[all_homes[appliance]]
    for i, month in enumerate(["%s_%d" %(appliance,i) for i in range(1,13)]):
        y = df_pred_copy[month]
        y2 = y.dropna()
        y3 = y2[y2>appliance_min[appliance]].dropna()
        df3 = df_pred_copy[feature].ix[y3.index].dropna()
        
        #df3 = df.ix[y3.index].dropna()
        y3 = y3.ix[df3.index]
        #df3 = df3.ix[appliance_fhmm[appliance].index].dropna()
        #y3 = y3.ix[df3.index]
        from sklearn.cross_validation import LeaveOneOut
        from sklearn.neighbors import RadiusNeighborsRegressor
        #clf = RadiusNeighborsRegressor(radius=k)
        clf = KNeighborsRegressor(n_neighbors=k, weights=weights)
        #clf = KNeighborsRegressor(n_neighbors=k, weights = 'distance' )
        loo = LeaveOneOut(len(df3))
        out_pred = []

        for train, test in loo:
            #clf.fit(preprocessing.normalize(df3[feature_columns[feature]].values[train]), y3.values[train])
            clf.fit(df3[feature].values[train], y3.values[train])
            #out_pred.append(clf.predict(preprocessing.normalize(df3[feature_columns[feature]].values[test])))
            out_pred.append(clf.predict(df3[feature].values[test]))

        out_pred = np.hstack(out_pred)

        out_month[i+1] = out_pred
        gt_month[i+1] = y3.values
        overall_dfs[i+1] = pd.DataFrame({"gt":y3.values, "pred":out_pred,
                                              "gt_total":dfc.ix[y3.index]["aggregate_"+str(i+1)].values}, index=y3.index)
        overall_dfs[i+1]["national average"] = overall_dfs[i+1]["gt_total"]*national_average[appliance]
    return overall_dfs

def percentage_error(gt, pred):
    return 100*np.abs(gt-pred)/(gt)




def compute_metrics(df):
    temp = df[df.gt_total>0.0]
    temp = temp[temp.gt>temp.gt_total]
    return {"Percentage error in appliance energy":np.median(percentage_error(df["gt"], df["pred"]))
            }



fridge 34
hvac 57
dr 35
light 24
dw 31
wm 28


In [19]:
def criterion_function(feature_set):
    
    k=1
    appliance="hvac"
    temp = create_predictions(appliance, feature_set, k)
    errors = {}
    for i in range(1, 13):
        errors[i] = percentage_error(temp[i]["gt"], temp[i]["pred"])
    error_df = pd.DataFrame(errors)
    accur_df = 100-error_df
    accur_df[accur_df<0]=0
   
    if appliance is "hvac":
         tdf = accur_df[range(5, 11)]
    else:
        tdf = accur_df
    print tdf.dropna().median().mean()
    return tdf.dropna().median().mean()

In [27]:
seq_float_forw_select(features=['area','autocorr', 'temperature_corr','total_occupants','num_rooms'], max_k=1,
                      criterion_func=criterion_function, print_steps=True)

('\nInclusion from features', ['area', 'autocorr', 'temperature_corr', 'total_occupants', 'num_rooms'])
IN THIS BLOCK
['area']
62.6659668625
IN THIS BLOCK
['autocorr']
52.7725688538
IN THIS BLOCK
['temperature_corr']
56.5084909322
IN THIS BLOCK
['total_occupants']
20.5161972056
IN THIS BLOCK
['num_rooms']
39.5001105926
include: area -> feature_subset: ['area']
IN THIS BLOCK
['area']
62.6659668625
IN THIS BLOCK
[]


ValueError: Found array with 0 feature(s) (shape=(13, 0)) while a minimum of 1 is required.

In [28]:
def seq_forw_select(features, max_k, criterion_func, print_steps=False):
    """
    Implementation of a Sequential Forward Selection algorithm.
    
    Keyword Arguments:
        features (list): The feature space as a list of features.
        max_k: Termination criterion; the size of the returned feature subset.
        criterion_func (function): Function that is used to evaluate the
            performance of the feature subset.
        print_steps (bool): Prints the algorithm procedure if True.
    
    Returns the selected feature subset, a list of features of length max_k.

    """

    # Initialization
    feat_sub = []
    k = 0
    d = len(features)
    if max_k > d:
        max_k = d

    while True:

        # Inclusion step
        if print_steps:
            print('\nInclusion from feature space', features)
        crit_func_max = criterion_func(feat_sub + [features[0]])
        best_feat = features[0]
        for x in features[1:]:
            crit_func_eval = criterion_func(feat_sub + [x])
            if crit_func_eval > crit_func_max:
                crit_func_max = crit_func_eval
                best_feat = x
        feat_sub.append(best_feat)
        if print_steps:
            print('include: {} -> feature subset: {}'.format(best_feat, feat_sub))
        features.remove(best_feat)

        # Termination condition
        k = len(feat_sub)
        if k == max_k:
            break

    return feat_sub

In [34]:
seq_float_forw_select(features=['autocorr', 'temperature_corr','total_occupants','num_rooms','area', 'aggregate_1'], max_k=4,
                      criterion_func=criterion_function, print_steps=True)

('\nInclusion from features', ['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'area', 'aggregate_1'])
IN THIS BLOCK
['autocorr']
52.7725688538
IN THIS BLOCK
['temperature_corr']
56.5084909322
IN THIS BLOCK
['total_occupants']
20.5161972056
IN THIS BLOCK
['num_rooms']
39.5001105926
IN THIS BLOCK
['area']
62.6659668625
IN THIS BLOCK
['aggregate_1']
60.6085118348
include: area -> feature_subset: ['area']
IN THIS BLOCK
['area']
62.6659668625
IN THIS BLOCK
[]


ValueError: Found array with 0 feature(s) (shape=(13, 0)) while a minimum of 1 is required.

In [35]:
from copy import deepcopy

def seq_backw_select(features, max_k, criterion_func, print_steps=False):
    """
    Implementation of a Sequential Backward Selection algorithm.
    
    Keyword Arguments:
        features (list): The feature space as a list of features.
        max_k: Termination criterion; the size of the returned feature subset.
        criterion_func (function): Function that is used to evaluate the
            performance of the feature subset.
        print_steps (bool): Prints the algorithm procedure if True.
        
    Returns the selected feature subset, a list of features of length max_k.

    """
    # Initialization
    feat_sub = deepcopy(features)
    k = len(feat_sub)
    i = 0

    while True:

        # Exclusion step
        if print_steps:
            print('\nExclusion from feature subset', feat_sub)
        worst_feat = len(feat_sub)-1
        worst_feat_val = feat_sub[worst_feat]
        crit_func_max = criterion_func(feat_sub[:-1])

        for i in reversed(range(0,len(feat_sub)-1)):
            crit_func_eval = criterion_func(feat_sub[:i] + feat_sub[i+1:])
            if crit_func_eval > crit_func_max:
                worst_feat, crit_func_max = i, crit_func_eval
                worst_feat_val = feat_sub[worst_feat]
        del feat_sub[worst_feat]
        if print_steps:
            print('exclude: {} -> feature subset: {}'.format(worst_feat_val, feat_sub))

        # Termination condition
        k = len(feat_sub)
        if k == max_k:
            break

    return feat_sub

In [36]:
seq_backw_select(features=['autocorr', 'temperature_corr','total_occupants','num_rooms','area', 'aggregate_1'], max_k=4,
                      criterion_func=criterion_function, print_steps=True)

('\nExclusion from feature subset', ['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'area', 'aggregate_1'])
IN THIS BLOCK
['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'area']
60.7848622505
IN THIS BLOCK
['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'aggregate_1']
59.1731614297
IN THIS BLOCK
['autocorr', 'temperature_corr', 'total_occupants', 'area', 'aggregate_1']
60.4731194239
IN THIS BLOCK
['autocorr', 'temperature_corr', 'num_rooms', 'area', 'aggregate_1']
58.5853586772
IN THIS BLOCK
['autocorr', 'total_occupants', 'num_rooms', 'area', 'aggregate_1']
58.1013026508
IN THIS BLOCK
['temperature_corr', 'total_occupants', 'num_rooms', 'area', 'aggregate_1']
59.5159888871
exclude: aggregate_1 -> feature subset: ['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'area']
('\nExclusion from feature subset', ['autocorr', 'temperature_corr', 'total_occupants', 'num_rooms', 'area'])
IN THIS BLOCK
['autocorr', 'temperature_cor

['autocorr', 'temperature_corr', 'num_rooms', 'area']

In [40]:
criterion_function(a)

IN THIS BLOCK
['fraction_1' 'fraction_2' 'fraction_3' 'fraction_4' 'fraction_5'
 'fraction_6' 'fraction_7' 'fraction_8' 'fraction_9' 'fraction_10'
 'fraction_11' 'fraction_12' 'fraction_13' 'fraction_14' 'fraction_15'
 'fraction_16' 'fraction_17' 'fraction_18' 'fraction_19' 'fraction_20'
 'fraction_21' 'fraction_22' 'fraction_23' 'fraction_24' 'autocorr'
 'max_seasonal_12' 'stdev_seasonal_12' 'max_trend_12' 'stdev_trend_12'
 'max_seasonal_daily' 'stdev_seasonal_daily' 'max_trend_daily'
 'stdev_trend_daily' 'max_seasonal_weekly' 'stdev_seasonal_weekly'
 'max_trend_weekly' 'stdev_trend_weekly' 'disag_fridge' 'cluster_small'
 'cluster_big' 'lt_500' 'bet_500_1000' 'gt_1000' 'temperature_corr']
66.1889825289


66.188982528901349

In [38]:
a = np.hstack([
            ["fraction_%d" %i for i in range(1,25)],
         "autocorr",
         "max_seasonal_12",
        "stdev_seasonal_12",
        "max_trend_12",
        "stdev_trend_12",
        "max_seasonal_daily",
        "stdev_seasonal_daily",
        "max_trend_daily",
        "stdev_trend_daily",
         "max_seasonal_weekly",
        "stdev_seasonal_weekly",
        "max_trend_weekly",
        "stdev_trend_weekly",
        "disag_fridge",
        "cluster_small",
        "cluster_big",
        "lt_500",
        "bet_500_1000",
        "gt_1000",
        "temperature_corr"

           ])

In [39]:
a

array(['fraction_1', 'fraction_2', 'fraction_3', 'fraction_4',
       'fraction_5', 'fraction_6', 'fraction_7', 'fraction_8',
       'fraction_9', 'fraction_10', 'fraction_11', 'fraction_12',
       'fraction_13', 'fraction_14', 'fraction_15', 'fraction_16',
       'fraction_17', 'fraction_18', 'fraction_19', 'fraction_20',
       'fraction_21', 'fraction_22', 'fraction_23', 'fraction_24',
       'autocorr', 'max_seasonal_12', 'stdev_seasonal_12', 'max_trend_12',
       'stdev_trend_12', 'max_seasonal_daily', 'stdev_seasonal_daily',
       'max_trend_daily', 'stdev_trend_daily', 'max_seasonal_weekly',
       'stdev_seasonal_weekly', 'max_trend_weekly', 'stdev_trend_weekly',
       'disag_fridge', 'cluster_small', 'cluster_big', 'lt_500',
       'bet_500_1000', 'gt_1000', 'temperature_corr'], 
      dtype='|S21')