# In [3]:
import numpy as np
import pandas as pd
from numpy import argmax
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# In [None]:
# Experiment: compare a random forest's cross-validated score on (1) the full
# data, (2) only the rows left untouched by artificial "missing" values, and
# (3) all rows after mean-imputing those missing values.

# Global NumPy random module; call np.random.seed(...) first for a
# reproducible missing-value mask.
rng = np.random

# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults (first column = datetime index).
dataset = pd.read_csv(
    "https://raw.githubusercontent.com/LuisM78/Appliances-energy-prediction-data/master/energydata_complete.csv",
    index_col=0,
    parse_dates=True)

# A DataFrame has no .data/.target attributes (those belong to sklearn's
# bundled datasets). 'Appliances' is the target column; everything else is a
# feature. Plain NumPy arrays are used because the boolean-mask and
# fancy-index operations below are array operations, not DataFrame ones.
y_full = dataset["Appliances"].values
X_full = dataset.drop("Appliances", axis=1).values
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

# Add missing values in 75% of the lines
missing_rate = 0.75
n_missing_samples = int(np.floor(n_samples * missing_rate))
# Boolean row mask with exactly n_missing_samples True entries, shuffled.
# The builtin bool replaces the np.bool alias removed in NumPy 1.24.
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=bool),
                             np.ones(n_missing_samples,
                                     dtype=bool)))
rng.shuffle(missing_samples)
# One randomly chosen feature index per affected row.
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
# 0 is the "missing" marker, so any genuine zero in the features is imputed too.
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()
# NOTE(review): Imputer was removed in scikit-learn 0.22; on newer releases
# use sklearn.impute.SimpleImputer(missing_values=0, strategy="mean").
estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])
score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)


class Cleaner(BaseEstimator, TransformerMixin):
    """Derive time-of-day and day-of-week features from a datetime-indexed frame.

    ``transform`` adds ``Num_sec_midnight``, one 0/1 indicator column per day
    of the week and the ``WeekDay``/``Weekend`` indicator pair. The
    transformer keeps no state, so ``fit`` is a no-op.
    """

    # Day names indexed by ``DatetimeIndex.dayofweek`` (0 = Monday ... 6 = Sunday).
    _DAY_NAMES = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                  'Saturday', 'Sunday')
    # Indicator columns, in the (alphabetical) order they are appended.
    _DAY_COLUMNS = ('Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday',
                    'Tuesday', 'Wednesday')
    # (output column, ``week_status`` label) pairs.
    _WEEK_COLUMNS = (('WeekDay', 'weekday'), ('Weekend', 'weekend'))

    def __init__(self):
        pass

    def seconds(self, x):
        """Return the number of seconds elapsed since midnight for ``x``.

        ``x`` is anything exposing ``hour``, ``minute`` and ``second``
        (a ``DatetimeIndex`` gives an element-wise result).
        """
        return x.hour * 3600 + x.minute * 60 + x.second

    def day_week(self, z):
        """Map day-of-week numbers (0 = Monday ... 6 = Sunday) to day names.

        Values outside 0..6 are skipped, so the result can be shorter than
        the input.
        """
        names = dict(enumerate(self._DAY_NAMES))
        return [names[y] for y in z if y in names]

    def week(self, x):
        """Label each day name as ``'weekend'`` (Saturday/Sunday) or ``'weekday'``."""
        return ['weekend' if y in ('Saturday', 'Sunday') else 'weekday'
                for y in x]

    def one_hot_encode(self, Data):
        """Replace ``Day_Status``/``week_status`` with 0/1 indicator columns.

        ``Data`` is modified in place and returned. Each indicator is built
        by comparing against the category name, so the columns are correct
        (all zeros where needed) even when some days, or one of the
        weekday/weekend classes, are absent from ``Data``.
        """
        day_status = Data['Day_Status']
        week_status = Data['week_status']
        Data.drop(['week_status', 'Day_Status'], axis=1, inplace=True)
        for day in self._DAY_COLUMNS:
            Data[day] = (day_status == day).astype(float)
        for column, label in self._WEEK_COLUMNS:
            Data[column] = (week_status == label).astype(float)
        return Data

    def transform(self, df, y=None):
        """Return a copy of ``df`` with the engineered time features added.

        ``df`` must be indexed by a ``DatetimeIndex``; the caller's frame is
        left untouched.
        """
        out = df.copy()
        out['Num_sec_midnight'] = self.seconds(out.index)
        out['Day_Status'] = self.day_week(out.index.dayofweek)
        out['week_status'] = self.week(out['Day_Status'])
        # one_hot_encode already returns the finished frame; it must not be
        # passed to DataFrame.apply, which expects a function.
        return self.one_hot_encode(out)

    def fit(self, df, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self


class Normalizer(BaseEstimator, TransformerMixin):
    """Standardize every column except the first to zero mean, unit std.

    The first column (expected to be the ``Appliances`` target) is passed
    through unchanged. Statistics are computed from the frame handed to
    ``transform``; ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def transform(self, df, y=None):
        """Performs Normalization on all the columns except for Appliances.

        Returns a new frame; ``df`` itself is not modified.
        """
        # Every column after the first, including the last one.
        features = df.iloc[:, 1:]
        # A constant column has std 0; divide by 1 instead so it becomes all
        # zeros rather than NaN.
        spread = features.std().replace(0, 1)
        scaled = (features - features.mean()) / spread
        # Re-attach the untouched first column in its original position.
        return pd.concat([df.iloc[:, :1], scaled], axis=1)

    def fit(self, df, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self
    
class SplitData(BaseEstimator, TransformerMixin):
    """Write a random 75/25 train/test split of the incoming frame to CSV.

    ``transform`` is a pass-through: it saves ``train.csv`` and ``test.csv``
    in the working directory and returns its input unchanged.
    """

    def __init__(self):
        pass

    def transform(self, df, y=None):
        """Split ``df`` into train/test sets, save both and return ``df``.

        ``df`` must contain an ``Appliances`` column, which is used as the
        target. The split is drawn without a fixed seed, so the files differ
        between runs.
        """
        # Work on the frame passed in by the pipeline rather than on a
        # notebook-level global.
        target = df['Appliances']
        features = df.drop('Appliances', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.25)
        # Join the target back on so each file is self-contained.
        X_train.join(y_train).to_csv("train.csv")
        X_test.join(y_test).to_csv("test.csv")
        return df

    def fit(self, df, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self
    
# Preprocessing chain: feature engineering, then normalization, then the
# step that performs the train/test split.
pipeline = Pipeline(steps=[
    ("cleaner", Cleaner()),
    ("normalizer", Normalizer()),
    ("train_test_split", SplitData()),
])