In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import impute
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [4]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs on the
# original author's machine; consider a configurable data directory.
# Pickle files can execute arbitrary code when loaded: only read trusted files.
TRAIN_PICKLE_PATH = r'C:\Users\lukep\Documents\big_data\ASHRAE\PROCESSED_TRAIN_DF.pkl'
df = pd.read_pickle(TRAIN_PICKLE_PATH)

In [27]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,sea_level_pressure,wind_direction,wind_speed,had_air_temperature,had_cloud_coverage,had_dew_temperature,had_precip_depth_1_hr,had_sea_level_pressure,had_wind_direction,had_wind_speed
0,0,0,0.0,0.0,0,Education,7432,2008.0,-1.0,19.40625,...,1019.5,0.0,0.0,True,False,True,True,False,True,True
1,1,0,0.0,0.0,0,Education,2720,2004.0,-1.0,19.40625,...,1019.5,0.0,0.0,True,False,True,True,False,True,True
2,2,0,0.0,0.0,0,Education,5376,1991.0,-1.0,19.40625,...,1019.5,0.0,0.0,True,False,True,True,False,True,True
3,3,0,0.0,0.0,0,Education,23685,2002.0,-1.0,19.40625,...,1019.5,0.0,0.0,True,False,True,True,False,True,True
4,4,0,0.0,0.0,0,Education,116607,1975.0,-1.0,19.40625,...,1019.5,0.0,0.0,True,False,True,True,False,True,True


In [21]:
# List the available columns; the attribute lists used for the pipelines below
# are chosen from these names.
df.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'year_built', 'floor_count',
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'had_air_temperature', 'had_cloud_coverage',
       'had_dew_temperature', 'had_precip_depth_1_hr',
       'had_sea_level_pressure', 'had_wind_direction', 'had_wind_speed'],
      dtype='object')

In [24]:
def make_is_bad_zero(Xy_subset, min_interval=48, summer_start=3000, summer_end=7500):
    """Flag the zero readings of one (building, meter) group that should be dropped.

    Helper routine for 'find_bad_zeros'. It operates on a single sub-frame
    produced by grouping on building and meter, and reads that sub-frame's
    'meter', 'meter_reading' and 'timestamp' columns. The meter code is taken
    from the first row, so every row is expected to share the same meter.

    Rules, by meter code:
      * 0 (electricity): every zero reading is flagged.
      * 2, 3 (steam / hot water): a run of consecutive zeros is flagged when it
        has at least `min_interval` rows and at least one of its rows lies
        before `summer_start` or after `summer_end`.
      * 1 (cold water): a run of consecutive zeros is flagged when it has at
        least `min_interval` rows, except that the run containing timestamp 0
        and the run containing timestamp 8283 are both exempt when the first
        also contains timestamp 500 and the second also contains timestamp 8783.

    Parameters
    ----------
    Xy_subset : pandas.DataFrame
        Rows of a single building/meter combination.
    min_interval : int
        Minimum number of rows in a run of zeros for it to be considered bad.
    summer_start, summer_end : number
        Bounds of the summer window, in the same units as the 'timestamp'
        column (presumably hours since the start of the year -- confirm
        against the preprocessing that produced the frame).

    Returns
    -------
    pandas.Series
        Boolean Series indexed like `Xy_subset`; True marks a row to drop.

    Raises
    ------
    ValueError
        If the meter code is not one of 0, 1, 2, 3.
    """
    meter = Xy_subset.meter.iloc[0]
    is_zero = Xy_subset.meter_reading == 0
    if meter == 0:
        # Electrical meters should never be zero. Keep all zero-readings in this table so that
        # they will all be dropped in the train set.
        return is_zero

    if meter not in (1, 2, 3):
        raise ValueError(f"Unexpected meter type: {meter}")

    # Give every run of identical zero / non-zero state its own sequence id,
    # then keep only the ids that belong to zero readings.
    transitions = (is_zero != is_zero.shift(1))
    all_sequence_ids = transitions.cumsum()
    ids = all_sequence_ids[is_zero].rename("ids")
    # True for zero rows whose run has at least `min_interval` rows.
    long_run = ids.map(ids.value_counts()) >= min_interval

    if meter in (2, 3):
        # It's normal for steam and hotwater to be turned off during the summer
        outside_summer = ((Xy_subset.timestamp < summer_start) |
                          (Xy_subset.timestamp > summer_end))
        # Restrict the mask to the zero rows so it lines up with `ids` exactly.
        keep = set(ids[outside_summer[is_zero]].unique())
        is_bad = ids.isin(keep) & long_run
    else:
        # Sequence ids addressed by timestamp instead of by row label.
        time_ids = ids.to_frame().join(Xy_subset.timestamp).set_index("timestamp").ids
        is_bad = long_run

        # Cold water may be turned off during the winter
        # NOTE(review): 0 / 500 / 8283 / 8783 are hard-coded timestamps; 8783 is
        # presumably the last hour of a 366-day year -- confirm.
        jan_id = time_ids.get(0, False)
        dec_id = time_ids.get(8283, False)
        if (jan_id and dec_id and jan_id == time_ids.get(500, False) and
                dec_id == time_ids.get(8783, False)):
            is_bad = is_bad & ~ids.isin({jan_id, dec_id})

    # Non-zero rows are never bad; zero rows take their verdict from `is_bad`.
    # Reindexing (rather than Series.update) keeps the result a plain bool Series.
    result = is_bad.reindex(is_zero.index, fill_value=False).astype(bool)
    return result.rename(is_zero.name)


def find_bad_zeros(df):
    """Return an Index containing only the row labels which should be deleted.

    The frame is split by ('building_id', 'meter') and every group is passed to
    'make_is_bad_zero'; the labels of the rows it flags are collected in group
    order (groups sorted by key, rows in their original order within a group).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'building_id' and 'meter' columns plus whatever columns
        'make_is_bad_zero' reads.

    Returns
    -------
    pandas.Index
        Labels of `df` rows to drop, suitable for `df.drop(...)`. Empty when
        `df` has no groups.
    """
    flagged = []
    # Iterate over the groups explicitly instead of using groupby.apply: each
    # group frame then always still carries its 'meter' column (newer pandas
    # releases deprecate handing the grouping columns to the applied function),
    # and no group-key index levels have to be stripped afterwards.
    for _, readings in df.groupby(["building_id", "meter"]):
        is_bad_zero = make_is_bad_zero(readings)
        flagged.append(is_bad_zero.index[is_bad_zero.to_numpy(dtype=bool)])

    if not flagged:
        return df.index[:0]
    return flagged[0].append(flagged[1:])

In [25]:
# Row labels of the zero readings that find_bad_zeros marks for deletion.
index = find_bad_zeros(df)

In [26]:
# Display the labels (and their count) that are about to be dropped.
index

Int64Index([       0,     2301,     4594,     6893,     9189,    11485,
               13780,    16073,    18367,    20661,
            ...
            20192453, 20194817, 20197182, 20199546, 20201908, 20206634,
            20209000, 20211365, 20213731, 20216097],
           dtype='int64', length=1175847)

In [28]:
# Remove the flagged rows; `df` itself is left untouched.
df_new = df.drop(index=index)

In [31]:
# Keep only the rows of meter code 0 (the electricity meter).
elec = df_new.loc[df_new['meter'] == 0]

In [61]:
# One shuffled 80/20 split, stratified on building_id so that each building is
# represented proportionally in both parts. The seed is fixed for repeatability.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair of positions.
train_index, test_index = next(split.split(elec, elec['building_id']))
strat_train_set = elec.iloc[train_index]
strat_test_set = elec.iloc[test_index]

In [53]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed list of DataFrame columns as an array."""

    def __init__(self, attribute_names):
        # Column labels to select in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured columns of `X` via the frame's `.values`."""
        selected = X[self.attribute_names]
        return selected.values
    
# Columns that are median-imputed and standardised.
# NOTE(review): 'building_id', 'meter' and 'site_id' are identifier codes but are
# scaled here like continuous measurements -- confirm this is intended.
num_attribs = ['building_id', 'meter', 'site_id',
       'square_feet', 
       'air_temperature', 'cloud_coverage', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed',]
# Columns that are one-hot encoded: the building use label and the 'had_*' flags.
cat_attribs = ['primary_use',  'had_air_temperature', 'had_cloud_coverage',
       'had_dew_temperature', 'had_precip_depth_1_hr',
       'had_sea_level_pressure', 'had_wind_direction', 'had_wind_speed']
    
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', impute.SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    # Without handle_unknown='ignore' the encoder raises in transform() as soon
    # as a column holds a value that was not present when it was fitted (for
    # example a 'had_*' flag that is constant in the training rows). Values seen
    # during fit are encoded exactly as before; unseen ones become all zeros.
    ('cat_encoder', OneHotEncoder(handle_unknown="ignore"))
])

# Concatenate the numeric block and the one-hot block column-wise.
full_pipeline = FeatureUnion(transformer_list =[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [62]:
# Fit the preprocessing on the training rows only, then reuse the fitted
# pipeline for the test rows. Targets are modelled on the log1p scale.
elec_prepared_train = full_pipeline.fit_transform(strat_train_set)
elec_labels_train = np.log1p(strat_train_set['meter_reading'].to_numpy())

elec_prepared_test = full_pipeline.transform(strat_test_set)
elec_labels_test = np.log1p(strat_test_set['meter_reading'].to_numpy())

In [63]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the prepared features against the log1p targets.
lin_reg = LinearRegression()
lin_reg.fit(X=elec_prepared_train, y=elec_labels_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [64]:
# Predictions for the held-out rows; the model was fitted on log1p targets, so
# these values are on the log1p scale as well.
elec_labels_test_pred = lin_reg.predict(elec_prepared_test)

In [66]:
from sklearn.metrics import mean_squared_log_error
# Undo the log1p transform so that both arrays are back on the meter-reading scale.
sampled_y = np.expm1(elec_labels_test)
sampled_prediction = np.expm1(elec_labels_test_pred)
# Negative predictions are clipped to 0 before they are passed to
# mean_squared_log_error.
non_negative_prediction = np.clip(sampled_prediction, 0, None)
rmsle = np.sqrt(mean_squared_log_error(sampled_y, non_negative_prediction))

In [67]:
# Root mean squared logarithmic error of the linear model on the held-out rows.
rmsle

1.103942249170461