In [1]:
import os
import warnings

import pandas as pd
import numpy as np
# Star import kept ahead of the explicit imports below so that those still win on any name clash.
from hf import *
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn import impute
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression

# Classes

This class is used to extract values from a DataFrame

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks columns out of a DataFrame and returns their raw values.

    Parameters
    ----------
    attribute_names : column label or list of column labels
        Passed unchanged to ``X[...]`` when transforming.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the ``.values`` of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values

This class is used to add attributes such as hour of day and day of week

In [3]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that derives calendar-style columns from ``X.timestamp``.

    Parameters
    ----------
    add_hourday : bool, default True
        When true, ``transform`` adds ``tm_day_of_week`` and ``tm_hour_of_day``;
        when false, the input is returned untouched.
    """

    def __init__(self, add_hourday = True):
        self.add_hourday = add_hourday

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the two derived columns added (or ``X`` itself if disabled).

        NOTE(review): the arithmetic presumes ``timestamp`` counts whole hours - confirm
        against the preprocessing step that produced the frame.
        """
        if not self.add_hourday:
            return X
        ts = X.timestamp
        day_of_week = (ts // 24) % 7
        hour_of_day = ts % 24
        return X.assign(tm_day_of_week=day_of_week, tm_hour_of_day=hour_of_day)

This class fits a separate model for each distinct value of a chosen column (for example, one model per meter type). Note that any model can be used.

In [4]:
class CatSplitRegressor(BaseEstimator, RegressorMixin):
    """Fit one clone of ``model`` per distinct value of the column ``col``.

    Parameters
    ----------
    model : estimator
        Template estimator; it is cloned (never fitted itself) once per split value.
    col : column label
        Column of ``X`` whose values define the partitions. The column is dropped
        before the rows are handed to the sub-models.

    Attributes
    ----------
    fitted : dict
        Maps each split value seen in ``fit`` to its fitted sub-model.
    feature_importances_ : list
        Only set when every sub-model exposes ``feature_importances_``. It is the
        unweighted average over the sub-models, with a 0 inserted at the position of
        ``col`` so that the list lines up with ``X.columns``.
    """

    def __init__(self, model, col):
        self.model = model
        self.col = col

    def fit(self, X, y):
        """Fit the per-value sub-models.

        ``X`` must be a DataFrame containing ``col``. ``y`` may be a pandas object or
        any array-like. A pandas ``y`` whose index differs from ``X.index`` is aligned
        by label (``reindex_like``); otherwise rows are matched by position.
        """
        self.fitted = {}
        # Drop importances left over from a previous fit; they may not be recomputed below.
        self.__dict__.pop("feature_importances_", None)

        y_is_pandas = isinstance(y, (pd.Series, pd.DataFrame))
        if not y_is_pandas:
            y = np.asarray(y)
        # Positional selection also works when the shared index has duplicate labels,
        # which reindex_like cannot handle.
        by_position = (not y_is_pandas) or (len(y) == len(X) and y.index.equals(X.index))

        split_values = X[self.col]
        importances = []
        for val in split_values.unique():
            mask = (split_values == val).to_numpy()
            X1 = X.loc[mask].drop(columns=[self.col])
            y1 = y[mask] if by_position else y.reindex_like(X1)
            sub_model = clone(self.model).fit(X1, y1)
            self.fitted[val] = sub_model
            if importances is not None:
                if hasattr(sub_model, "feature_importances_"):
                    importances.append(sub_model.feature_importances_)
                else:
                    # Models without importances (e.g. linear ones) are still supported;
                    # the aggregate attribute is simply not provided.
                    importances = None
            del X1, y1

        if importances:
            fi = np.average(importances, axis=0)
            col_index = list(X.columns).index(self.col)
            self.feature_importances_ = [*fi[:col_index], 0, *fi[col_index:]]
        return self

    def transform(self, X):
        """Identity transform; returns ``X`` unchanged."""
        return X

    def predict(self, X):
        """Predict each row with the sub-model fitted for that row's ``col`` value.

        Raises
        ------
        KeyError
            If ``X[col]`` contains a value for which no sub-model was fitted.
        """
        result = np.zeros(len(X))
        split_values = X[self.col]
        for val in split_values.unique():
            if val not in self.fitted:
                raise KeyError(
                    f"no model was fitted for {self.col}={val!r}; "
                    f"known values: {list(self.fitted)}"
                )
            mask = (split_values == val).to_numpy()
            result[mask] = self.fitted[val].predict(X.loc[mask].drop(columns=[self.col]))
        return result

This class wraps the LightGBM regressor (a model that is separate from scikit-learn's own models) so that it can be used like a scikit-learn estimator.

In [5]:
# Columns that LGBMWrapper should hand to LightGBM as categorical features.
categorical_columns = [
    # identifiers and building metadata
    "building_id",
    "meter",
    "site_id",
    "primary_use",
    # "had_*" indicator columns
    "had_air_temperature",
    "had_cloud_coverage",
    "had_dew_temperature",
    "had_precip_depth_1_hr",
    "had_sea_level_pressure",
    "had_wind_direction",
    "had_wind_speed",
    # columns produced by CombinedAttributesAdder
    "tm_day_of_week",
    "tm_hour_of_day",
]


class LGBMWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style wrapper around ``lgb.LGBMRegressor``.

    Parameters
    ----------
    categorical_feature : list of column labels or None
        Candidate categorical columns. At fit time only those actually present in
        ``X.columns`` are passed on to LightGBM. ``None`` leaves the choice to LightGBM.
    **params
        Forwarded unchanged to ``lgb.LGBMRegressor``.

    Attributes
    ----------
    model : lgb.LGBMRegressor
        The wrapped regressor.
    feature_importances_ : array-like
        Copied from the wrapped regressor after ``fit``.
    """

    def __init__(self, categorical_feature=None, **params):
        self.model = lgb.LGBMRegressor(**params)
        self.categorical_feature = categorical_feature

    def fit(self, X, y):
        """Fit the wrapped regressor on DataFrame ``X`` and target ``y``; returns ``self``."""
        fit_kwargs = {}
        if self.categorical_feature is not None:
            # Restrict to columns that exist in X, so that one shared list can be reused
            # even after a caller has dropped some of the columns.
            fit_kwargs["categorical_feature"] = list(
                X.columns.intersection(self.categorical_feature))
        with warnings.catch_warnings():
            # Silence LightGBM's notice about overriding the Dataset's categorical_feature.
            warnings.filterwarnings("ignore",
                                    "categorical_feature in Dataset is overridden".lower())
            self.model.fit(X, y, **fit_kwargs)
        self.feature_importances_ = self.model.feature_importances_
        return self

    def transform(self, X):
        """Identity transform; returns ``X`` unchanged."""
        return X

    def predict(self, X):
        """Return the wrapped regressor's predictions for ``X``."""
        return self.model.predict(X)

    def get_params(self, deep=True):
        """Return the wrapped regressor's parameters plus ``categorical_feature``."""
        return {**self.model.get_params(deep), "categorical_feature": self.categorical_feature}

    def set_params(self, **params):
        """Set ``categorical_feature`` here and every other parameter on the wrapped model.

        Returns ``self`` so that calls can be chained, as scikit-learn tooling expects.
        """
        if "categorical_feature" in params:
            # Explicit membership test so the value can also be reset to None.
            self.categorical_feature = params.pop("categorical_feature")
        if params:
            # Must be unpacked: set_params only accepts keyword arguments.
            self.model.set_params(**params)
        return self

# Pipelines

In [6]:
# Columns the linear model treats as numeric (median-imputed, then standardized).
num_attribs = [
    'building_id', 'meter', 'site_id',
    'square_feet',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
    'wind_speed',
]
# Columns the linear model one-hot encodes.
cat_attribs = [
    'primary_use',
    'had_air_temperature', 'had_cloud_coverage', 'had_dew_temperature',
    'had_precip_depth_1_hr', 'had_sea_level_pressure', 'had_wind_direction',
    'had_wind_speed',
]

num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', impute.SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder()),
])

# Numeric and categorical branches run side by side and their outputs are concatenated.
numandcat_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])


## The regressors
## 1. Linear regression on the formatted feature matrix
lin_reg_clf = Pipeline(steps=[
    ('formatter', numandcat_pipeline),
    ('model', LinearRegression()),
])

## 2. LightGBM, one model per value of the split column
# NOTE(review): the split column is "had_air_temperature", whereas the notebook text
# describes splitting by meter type - confirm which one is intended.
lgbm_base_model = LGBMWrapper(categorical_feature=categorical_columns)
lgbm_reg_clf = CatSplitRegressor(lgbm_base_model, "had_air_temperature")


# Train Model

## Open Data, Get Train/Test Sets

In [7]:
# Feature adder with the day-of-week / hour-of-day columns switched on (the default).
c = CombinedAttributesAdder(add_hourday=True)

In [8]:
# Location of the preprocessed training frame. Set the ASHRAE_TRAIN_PKL environment
# variable to run the notebook on another machine; the default is the original path.
TRAIN_PICKLE_PATH = os.environ.get(
    "ASHRAE_TRAIN_PKL",
    r'C:\Users\lukep\Documents\big_data\ASHRAE\PROCESSED_TRAIN_DF.pkl',
)

# NOTE: unpickling can execute arbitrary code - only load pickle files you produced yourself.
df = pd.read_pickle(TRAIN_PICKLE_PATH)
# df = df[df['meter']==0] # Remove Later
df = c.transform(df)

# Single 80/20 split, stratified on building_id and seeded for reproducibility.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(split.split(df, df['building_id']))
strat_train_set = df.iloc[train_index]
strat_test_set = df.iloc[test_index]

In [9]:
# Columns withheld from the models: the target itself and the raw timestamp.
non_feature_cols = ['meter_reading', 'timestamp']

X_train = strat_train_set.drop(columns=non_feature_cols)
X_test = strat_test_set.drop(columns=non_feature_cols)

# Targets are put on the log1p scale because the evaluation metric computed later is RMSLE.
y_train = strat_train_set['meter_reading'].apply(np.log1p)
y_test = strat_test_set['meter_reading'].apply(np.log1p)

## Train Linear Regression and LGBM Model

Fit linear model

In [10]:
#lin_reg_clf.fit(X_train, y_train)

Fit LGBM model (note that this class takes in a DataFrame)

In [15]:
# Fits one LGBMWrapper per distinct value of the split column configured on lgbm_reg_clf.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so the cells
# below have no saved output - re-run to completion before relying on them.
lgbm_reg_clf.fit(X_train, y_train)

KeyboardInterrupt: 

## Look at Predictions

### Linear

Get predictions

In [12]:
# y_pred = lin_reg_clf.predict(X_test)

Compute RMSLE

In [13]:
# Disabled evaluation of the linear model, kept as a string literal so nothing executes.
# Re-enabling it also requires the commented-out linear fit and predict cells above,
# since it reads y_pred.
'''
sampled_y = np.expm1(y_test)
sampled_prediction = np.expm1(y_pred)

rmsle = np.sqrt(mean_squared_log_error(sampled_y, np.clip(sampled_prediction, 0, None)))
print(rmsle)
'''

'\nsampled_y = np.expm1(y_test)\nsampled_prediction = np.expm1(y_pred)\n\nrmsle = np.sqrt(mean_squared_log_error(sampled_y, np.clip(sampled_prediction, 0, None)))\nprint(rmsle)\n'

### LGBM

Get predictions

In [None]:
# Predictions are on the log1p scale, because the model was trained on log1p(meter_reading).
y_pred = lgbm_reg_clf.predict(X_test)

Compute RMSLE

In [None]:
# Undo the log1p transform to get back to meter-reading units.
sampled_y = np.expm1(y_test)
sampled_prediction = np.expm1(y_pred)

# mean_squared_log_error rejects negative values, so predictions are floored at 0 first.
non_negative_prediction = np.clip(sampled_prediction, 0, None)
msle = mean_squared_log_error(sampled_y, non_negative_prediction)
rmsle = np.sqrt(msle)
print(rmsle)

In [None]:
# Averaged per-feature importances of the LightGBM sub-models, labelled by training column.
importances = pd.Series(
    lgbm_reg_clf.feature_importances_,
    index=X_train.columns,
    name="Importance",
)
# Show as a single wide row, most important feature first.
ranked_importances = importances.sort_values(ascending=False)
display(ranked_importances.to_frame().T)