In [2]:
import import_ipynb

import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
import json
from data_generator import DataLoaderSequence
import math
import random

importing Jupyter notebook from data_generator.ipynb


Using TensorFlow backend.


In [None]:
class DataLoader():
    """A class for loading and transforming data for the lstm model.

    The CSV is read into ``self.dataframe`` (indexed by its ``Date_Time``
    column), split by row position into train / validation / test parts and
    min-max scaled.  The ``get_*`` methods cut those parts into windows of
    ``seq_len`` rows: the first ``input_timesteps`` rows of a window (all
    selected columns) form ``x`` and the remaining rows of column 0 form
    ``y``.
    """

    def __init__(self, filename, split1, split2, cols, pre_len, input_timesteps, seq_len):
        """Read ``filename`` and prepare the scaled train/val/test arrays.

        Args:
            filename: Path of a comma-separated file containing a
                ``Date_Time`` column plus every column named in ``cols``.
            split1: Fraction of the rows (counted from the start) that
                forms the training part.
            split2: Fraction of the rows at which the validation part ends
                and the test part begins.
            cols: Names of the columns to use.  Column 0 of this selection
                is the one the ``y`` arrays are taken from.
            pre_len: Step, in rows, between consecutive test windows.
            input_timesteps: Number of leading rows of each window that go
                into ``x``.
            seq_len: Total number of rows in one window.
        """
        self.dataframe = pd.read_csv(filename, sep=',')
        if not isinstance(self.dataframe.index, pd.DatetimeIndex):
            self.dataframe['Date_Time'] = pd.to_datetime(self.dataframe['Date_Time'])
            self.dataframe = self.dataframe.set_index('Date_Time')

        self.cols = cols
        self.split1 = split1
        self.split2 = split2
        self.len_train_windows = None
        self.pre_len = pre_len
        self.input_timesteps = input_timesteps
        self.seq_len = seq_len
        print('the input cols are:', self.cols)
        self.Normalization()

    def Normalization(self):
        """Log-transform ``Consumption``, split the rows and min-max scale them.

        Sets ``data_{train,val,test}_original`` (unscaled DataFrame slices),
        ``data_{train,val,test}`` (scaled arrays) and
        ``len_{train,val,test}`` (row counts).

        Raises:
            KeyError: If ``self.cols`` cannot be selected from the dataframe.
            ValueError: If ``Consumption`` is selected and contains a
                non-positive value (raised by ``math.log``).
        """
        # Apply the natural log once, even if the column is listed twice.
        if any(col == 'Consumption' for col in self.cols):
            self.dataframe['Consumption'] = self.dataframe['Consumption'].map(math.log)
            print('scaling Consumption is done!')

        # ``DataFrame.get`` returns None when the selection fails; report
        # that explicitly instead of crashing later on ``None[...]``.
        selected = self.dataframe.get(self.cols)
        if selected is None:
            raise KeyError(
                'columns {!r} could not be selected from the input data'.format(self.cols)
            )

        i_split1 = int(len(self.dataframe) * self.split1)
        i_split2 = int(len(self.dataframe) * self.split2)

        self.data_train_original = selected[:i_split1]
        self.data_val_original = selected[i_split1:i_split2]
        self.data_test_original = selected[i_split2:]

        # The scaler is fitted on the training rows only and then reused for
        # the validation and test rows.
        min_max_scaler = preprocessing.MinMaxScaler().fit(self.data_train_original.values)

        self.data_train = min_max_scaler.transform(self.data_train_original.values)
        self.data_val = min_max_scaler.transform(self.data_val_original.values)
        self.data_test = min_max_scaler.transform(self.data_test_original.values)

        self.len_train = len(self.data_train_original)
        self.len_val = len(self.data_val_original)
        self.len_test = len(self.data_test_original)

    def _test_window_bounds(self):
        """Return the ``(start, stop)`` row bounds of every complete test window."""
        window_count = (self.len_test - self.input_timesteps) // self.pre_len
        bounds = []
        for k in range(window_count):
            start = k * self.pre_len
            stop = start + self.seq_len
            # A window running past the end of the test part would be shorter
            # than the others and make the stacked arrays ragged, so skip it.
            if stop <= self.len_test:
                bounds.append((start, stop))
        return bounds

    def _shuffled_windows(self, data, n_rows):
        """Cut ``data`` into every stride-1 window and return shuffled ``(x, y)``."""
        xs = []
        ys = []
        # "+ 1" so that the window ending on the last row is included.
        for start in range(n_rows - self.seq_len + 1):
            window = data[start:start + self.seq_len]
            xs.append(window[:self.input_timesteps, :])
            ys.append(window[self.input_timesteps:, 0])

        xs = np.array(xs)
        ys = np.array(ys)

        # Shuffle x and y with the same permutation so pairs stay aligned.
        order = list(range(len(xs)))
        random.shuffle(order)
        return xs[order], ys[order]

    def get_pre_time(self):
        """Return the index labels of the predicted rows of each test window.

        The windows are the same ones ``get_test_data`` uses, so row ``i`` of
        the result holds the labels belonging to row ``i`` of its ``y``.
        """
        test_index = self.data_test_original.index
        data_windows = [test_index[start:stop] for start, stop in self._test_window_bounds()]

        pre_time = np.array([p[self.input_timesteps:] for p in data_windows])

        return pre_time

    def get_test_data(self):
        """Create x, y test data windows.

        Windows start every ``pre_len`` rows and are not shuffled.

        Warning: batch method, not generative, make sure you have enough
        memory to load data, otherwise reduce size of the training split.

        Returns:
            Tuple ``(x, y)``: ``x`` stacks the first ``input_timesteps`` rows
            of each window, ``y`` the remaining rows of column 0.
        """
        data_windows = [self.data_test[start:stop] for start, stop in self._test_window_bounds()]

        x = np.array([p[:self.input_timesteps, :] for p in data_windows])
        y = np.array([p[self.input_timesteps:, 0] for p in data_windows])
        return x, y

    def get_train_data(self):
        """Create shuffled x, y train data windows (one window per start row).

        Warning: batch method, not generative, make sure you have enough
        memory to load data.

        Returns:
            Tuple ``(train_x, train_y)`` shuffled with one shared permutation.
        """
        return self._shuffled_windows(self.data_train, self.len_train)

    def get_val_data(self):
        """Create shuffled x, y validation data windows (one window per start row).

        Returns:
            Tuple ``(val_x, val_y)`` shuffled with one shared permutation.
        """
        return self._shuffled_windows(self.data_val, self.len_val)

    def training_batch_generator(self, batch_size):
        """Wrap the training windows in a ``DataLoaderSequence`` of ``batch_size``."""
        train_x, train_y = self.get_train_data()
        return DataLoaderSequence(train_x, train_y, batch_size)

    def val_batch_generator(self, batch_size):
        """Wrap the validation windows in a ``DataLoaderSequence`` of ``batch_size``."""
        val_x, val_y = self.get_val_data()
        return DataLoaderSequence(val_x, val_y, batch_size)