In [356]:
import import_ipynb

import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
import json
from data_generator import DataLoaderSequence
import math
import random
from PyEMD import EMD

In [357]:
class DataLoader():
    """A class for loading and transforming data for the lstm model"""

    def __init__(self, filename, split1, split2, cols, pre_len, input_timesteps,seq_len,**EMD_para):
        self.dataframe = pd.read_csv(filename,sep=',')
        if not isinstance(self.dataframe.index, pd.DatetimeIndex):
            self.dataframe['Date_Time'] = pd.to_datetime(self.dataframe['Date_Time'])
            self.dataframe = self.dataframe.set_index('Date_Time')
        
        self.cols = cols
        self.split1 = split1
        self.split2 = split2
        self.len_train_windows = None
        self.pre_len = pre_len
        self.input_timesteps = input_timesteps
        self.seq_len = seq_len
        print('the input cols are:', self.cols)
        self.Normalization(**EMD_para)
    
    def scale_EMD(self, activate_EMD=False):
        for col in self.cols:
            if col == 'Consumption':
                self.dataframe['Consumption'] = self.dataframe.set_index('Consumption').index.map(lambda x: math.log(x))
                print('scaling Consumption is done!')
        
        if activate_EMD==True:

            self.IMFs = EMD().emd(self.dataframe['Consumption'].values)
            print('the signal is decomposed into '+ str(self.IMFs.shape[0]) +' parts')
            
            self.df_names_IMF = locals()
            
            for ind, IMF in enumerate(self.IMFs):
                
                IMF_name = 'IMF'+str(ind)+'_consumption'
                data={IMF_name:self.IMFs[ind]}
                IMF_i = pd.DataFrame(data=data)
                self.df_names_IMF['IMF'+str(ind)] = pd.concat([IMF_i[IMF_name], self.dataframe.get(self.cols[1:])],axis=1)



    def Normalization(self, **EMD_para):
        
        i_split1 = int(len(self.dataframe) * self.split1)
        i_split2 = int(len(self.dataframe) * self.split2)
        
        if len(EMD_para) ==0:
            self.scale_EMD()
            
            self.data_train_original = self.dataframe.get(self.cols)[:i_split1]
            self.data_val_original   = self.dataframe.get(self.cols)[i_split1:i_split2]
            self.data_test_original  = self.dataframe.get(self.cols)[i_split2:]
            
        else:
            self.scale_EMD(activate_EMD=True)
            IMF_number = EMD_para['IMF_num']
            
            print('processing the data of IM'+ str(IMF_number))
            
            if IMF_number in range(self.IMFs.shape[0]):
                self.data_train_original = self.df_names_IMF['IMF'+str(IMF_number)][:i_split1]
                self.data_val_original   = self.df_names_IMF['IMF'+str(IMF_number)][i_split1:i_split2]
                self.data_test_original  = self.df_names_IMF['IMF'+str(IMF_number)][i_split2:]
            else:
                print("Oops!IMF_number was no valid number. it must between 0 and "+str(self.IMFs.shape[0]-1))

        self.min_max_scaler = preprocessing.MinMaxScaler().fit(self.data_train_original.values)

        self.data_train = self.min_max_scaler.transform(self.data_train_original.values)
        self.data_val = self.min_max_scaler.transform(self.data_val_original.values)
        self.data_test = self.min_max_scaler.transform(self.data_test_original.values)

        self.len_train  = len(self.data_train_original)
        self.len_val    = len(self.data_val_original)
        self.len_test   = len(self.data_test_original)
    
    def get_pre_time(self):
        data_windows = []

        for i in range((self.len_test-self.input_timesteps)//self.pre_len):
            data_windows.append(self.data_test_original.index[i*self.pre_len:i*self.pre_len+self.seq_len])
            
        pre_time = np.array([p[self.input_timesteps:] for p in data_windows])
       
        return pre_time
    
    def get_test_data(self):
        '''
        Create x, y test data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise reduce size of the training split.
        '''
        data_windows = []

        for i in range((self.len_test-self.input_timesteps)//self.pre_len):
            data_windows.append(self.data_test[i*self.pre_len:i*self.pre_len+self.seq_len])

        x = np.array([p[:self.input_timesteps,:] for p in data_windows])
        y = np.array([p[self.input_timesteps:,0] for p in data_windows])
        return x,y
    
    def get_train_data(self):
        '''
        Create x, y train data windows
        Warning: batch method, not generative, make sure you have enough memory to
        load data, otherwise use generate_training_window() method.
        '''
        train_x = []
        train_y = []
        for i in range(self.len_train-self.seq_len):
            data_window = self.data_train[i:i+self.seq_len]
            train_x.append(data_window[:self.input_timesteps,:])
            train_y.append(data_window[self.input_timesteps:,0])
            
        train_x = np.array(train_x)
        train_y = np.array(train_y)
        
        sfl = list(range(len(train_x)))
        random.shuffle(sfl)
        train_x = train_x[sfl]
        train_y = train_y[sfl]
        
        return train_x, train_y
            
    def get_val_data(self):
        val_x = []
        val_y = []
        for i in range(self.len_val-self.seq_len):
            data_window = self.data_val[i:i+self.seq_len]
            val_x.append(data_window[:self.input_timesteps,:])
            val_y.append(data_window[self.input_timesteps:,0])
        val_x = np.array(val_x)
        val_y = np.array(val_y)
        
        sfl = list(range(len(val_x)))
        random.shuffle(sfl)
        val_x = val_x[sfl]
        val_y = val_y[sfl]
        
        return val_x, val_y
    
    def training_batch_generator(self, batch_size):
        train_x, train_y = self.get_train_data()
        return DataLoaderSequence(train_x,train_y, batch_size)
    
    def val_batch_generator(self, batch_size):
        val_x,val_y = self.get_val_data()
        return DataLoaderSequence(val_x, val_y, batch_size)

In [360]:
configs = json.load(open('config.json', 'r'))
data = DataLoader(
    filename=os.path.join('data', configs['data']['filename']),
    split1=configs['data']['train_test_split1'],
    split2=configs['data']['train_test_split2'],
    cols=configs['data']['columns'],
    pre_len=configs['model']['layers'][4]['neurons'],
    input_timesteps=configs['model']['layers'][0]['input_timesteps'],
    seq_len = configs['data']['sequence_length'],
    IMF_num = 10
)

the input cols are: ['Consumption', 'hour', 'holiday_indicator']
scaling Consumption is done!
the signal is decomposed into 11 parts
processing the data of IM10


  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [332]:
data.dataframe

Unnamed: 0_level_0,year,month,day,hour,week,day_of_week,weekday_,Consumption,quarter,holiday,holiday_indicator,1_hour_before,168_hour_before,2_hour_before,72_hour_before,96_hour_before
Date_Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-08-23 08:00:00,2018,8,23,8,33,4,False,1.798404,3,False,0,3.24,5.88,3.74,3.66,5.53
2018-08-23 09:00:00,2018,8,23,9,33,4,False,2.784394,3,False,0,6.04,14.44,3.24,3.82,13.27
2018-08-23 10:00:00,2018,8,23,10,33,4,False,2.830858,3,False,0,16.19,13.00,6.04,3.80,14.21
2018-08-23 11:00:00,2018,8,23,11,33,4,False,2.532903,3,False,0,16.96,8.60,16.19,6.20,7.14
2018-08-23 12:00:00,2018,8,23,12,33,4,False,2.830268,3,False,0,12.59,12.65,16.96,13.09,18.45
2018-08-23 13:00:00,2018,8,23,13,33,4,False,2.581731,3,False,0,16.95,3.72,12.59,10.68,3.75
2018-08-23 14:00:00,2018,8,23,14,33,4,False,2.158715,3,False,0,13.22,3.43,16.95,8.45,14.22
2018-08-23 15:00:00,2018,8,23,15,33,4,False,2.681706,3,False,0,8.66,3.77,13.22,19.57,16.21
2018-08-23 16:00:00,2018,8,23,16,33,4,False,2.887033,3,False,0,14.61,3.58,8.66,7.90,3.79
2018-08-23 17:00:00,2018,8,23,17,33,4,False,2.043814,3,False,0,17.94,5.34,14.61,11.75,3.36


In [333]:
# data.scale_EMD(activate_EMD=True)

In [334]:
emd=EMD()

In [335]:
data.Normalization()

scaling Consumption is done!
