In [13]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from numpy import argmax
from boruta import BorutaPy
from tpot import TPOTRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [15]:
dataset = pd.DataFrame.from_csv("https://raw.githubusercontent.com/LuisM78/Appliances-energy-prediction-data/master/energydata_complete.csv")

class Cleaner(BaseEstimator, TransformerMixin):
    """Takes in dataframe, performs cleaning if needed and returns cleaned dataframe"""

    def __init__(self):
        pass
    
    def seconds(self, x):
        sec = x.hour*3600+x.minute*60+x.second
        return sec
    
    def day_week(self, z):
        a=[]
        for y in z:
            if y == 0:
                a.append('Monday')
            elif y == 1:
                a.append('Tuesday')
            elif y == 2:
                a.append('Wednesday')
            elif y == 3:
                a.append('Thrusday')
            elif y == 4:
                a.append('Friday')
            elif y == 5:
                a.append('Saturday')
            elif y == 6:
                a.append('Sunday')
        return a
    
    def week(self, x):
        a=[]
        for y in x:
            if y == 'Saturday' or y == 'Sunday':
                a.append('weekend')
            else:
                a.append('weekday')
        return a
    
    def one_hot_encode(self, Data):
        label_encoder = LabelEncoder()
        int_encoded = label_encoder.fit_transform(Data['week_status'])
        int_encoded_day = label_encoder.fit_transform(Data['Day_Status'])
        onehot_encoder = OneHotEncoder(sparse=False)
        int_encoded = int_encoded.reshape(len(int_encoded), 1)
        int_encoded_day = int_encoded_day.reshape(len(int_encoded_day), 1)
        newWeek = onehot_encoder.fit_transform(int_encoded)
        newDay = onehot_encoder.fit_transform(int_encoded_day)
        # new2 = label_encoder.inverse_transform([argmax(new[len(new)-1, :])])
        Data.drop(['week_status', 'Day_Status'], axis=1, inplace=True)
        Data['Friday'] = pd.Series(newDay[:,0], index=Data.index)
        Data['Monday'] = pd.Series(newDay[:,1], index=Data.index)
        Data['Saturday'] = pd.Series(newDay[:,2], index=Data.index)
        Data['Sunday'] = pd.Series(newDay[:,3], index=Data.index)
        Data['Thursday'] = pd.Series(newDay[:,4], index=Data.index)
        Data['Tuesday'] = pd.Series(newDay[:,5], index=Data.index)
        Data['Wednesday'] = pd.Series(newDay[:,6], index=Data.index)
        Data['WeekDay'] = pd.Series(newWeek[:,0], index=Data.index)
        Data['Weekend'] = pd.Series(newWeek[:,1], index=Data.index)
        return Data

    def transform(self, df, y=None):
        """Adding the columns Day_Status, week_status and Num_sec_midnight"""
        
        df['Num_sec_midnight']=self.seconds(df.index)
        z = df.index.dayofweek
        df['Day_Status'] = z
        df['Day_Status'] = self.day_week(df.Day_Status)
        df['week_status'] = self.week(df.Day_Status)
        
        """Performing one hot encoding on week_status and day_status columns"""
        df=self.one_hot_encode(df)
        return df

    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self


class Normalizer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self, df, y=None):
        """Performs Normalization on all the columns except for Appliances"""
        for j in range(1, len(df.columns)-1,1):
            df.iloc[:,[j]] = (df.iloc[:,[j]] - df.iloc[:,[j]].mean())/df.iloc[:,[j]].std()
        df.to_csv("normalized.csv")
        return df
    
    def fit(self, df, y=None):
        """Returns `self` unless something different happens in train and test"""
        return self
    
class SplitData(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def transform(self, df, y=None):
        y = df['Appliances']
        df4 = df.iloc[:,1:]
        X_train, X_test, y_train, y_test = train_test_split(df4, y, test_size=0.25)
        train = X_train.join(y_train)
        test = X_test.join(y_test)
        train.to_csv("train.csv")
        test.to_csv("test.csv")
        return X_train, X_test, y_train, y_test
    
    def fit(self, df, y=None):
        return self
    
pipeline = Pipeline([("cleaner", Cleaner()),
                     ("normalizer", Normalizer()),
                     ("train_test_split", SplitData()),
                     ("features", FeatureUnion([
                         ("Boruta_GB", BorutaPy(GradientBoostingRegressor())),
                         ("Boruta_rf", BorutaPy(RandomForestRegressor())),
                         ("Boruta_TP", BorutaPy(TPOTRegressor())),
                         ("Boruta_ML", BorutaPy(MLPRegressor(hidden_layer_sizes=(50,50,50,50,50,50), max_iter=1000)))
                     ])),
                     ("estimator", RandomForestRegressor())
                    ])
pipeline

Pipeline(memory=None,
     steps=[('cleaner', Cleaner()), ('normalizer', Normalizer()), ('train_test_split', SplitData()), ('features', FeatureUnion(n_jobs=1,
       transformer_list=[('Boruta_GB', BorutaPy(alpha=0.05,
     estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learni...timators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [10]:
pipeline.steps

[('cleaner', Cleaner()),
 ('normalizer', Normalizer()),
 ('train_test_split', SplitData()),
 ('features', FeatureUnion(n_jobs=1,
         transformer_list=[('Boruta_GB', BorutaPy(alpha=0.05,
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
               max_leaf_nodes=None, min_impurity_decrease=0.0,
               min_impur...
       max_iter=100, n_estimators=1000, perc=100, random_state=None,
       two_step=True, verbose=0))],
         transformer_weights=None)),
 ('estimator',
  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0, warm_start=False

In [16]:
dataset = Cleaner().transform(dataset)
dataset = Normalizer().transform(dataset)
X_train, X_test, y_train, y_test = SplitData().transform(dataset)

In [17]:
param_grid = [{'Boruta_GB__n_estimators': [500,1000,1500], 'Boruta_GB__max_depth':[4,5,6]},
            {'Boruta_rf__n_estimators': [100,200,300]},
             {'Boruta_ML__max_iter':[500,1000,1500]},
             {'Boruta_TP__generations':[100,200,300], 'Boruta_ML__population_size':[100,300,500], 'Boruta_ML__offspring_size':[40,50,60],
             'Boruta_ML__mutation_rate':[0.5,0.7,0.9], 'Boruta_ML__crossover_rate':[0.3,0.5,0.7]}]
grid = GridSearchCV(pipeline, cv = 10, param_grid=param_grid, n_jobs=2,verbose=2)
grid.fit(X_train, y_train)
pred = grid.predict(X_test)

Fitting 10 folds for each of 258 candidates, totalling 2580 fits


PicklingError: Can't pickle <class 'tpot.operator_utils.TPOT_FeatureAgglomeration'>: attribute lookup TPOT_FeatureAgglomeration on tpot.operator_utils failed