# `LSTMPacker`

A simple example of how to use `LSTMPacker`.

In [1]:
# When working from a clone of the repository, add the parent directory to the
# import path (presumably so the local `pipesnake` package can be imported
# without installing it).
import sys
sys.path.append('../')

import logging
import pandas

# Lower the root logger threshold to DEBUG so the log records emitted while
# the pipeline runs are shown below the cells.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)

# Load some data

More datasets are available here: https://archive.ics.uci.edu/ml/datasets.html

_Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science._

In [2]:
# Location of the CSV file on the UCI Machine Learning Repository.
ENERGY_DATA_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'

# Download the file into a DataFrame and preview its first rows.
df = pandas.read_csv(ENERGY_DATA_URL)
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
# List the available column names before choosing the features and the target.
df.columns

Index(['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3',
       'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8',
       'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed',
       'Visibility', 'Tdewpoint', 'rv1', 'rv2'],
      dtype='object')

# Raw data

I'm going to define my target as the total energy used, in `Wh`, by summing `Appliances` and `lights`. The feature set is composed of all the remaining columns except `date` (the dataset is already sorted), `rv1` and `rv2` (see here for more details: https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction).

In [4]:
# Feature columns: the 24 measurement columns from T1 to Tdewpoint.
# `date`, `rv1`, `rv2` and the two columns used for the target are left out.
feature_columns = [
    'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3',
    'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6',
    'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9',
    'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint',
]
x = df[feature_columns]

# Target: element-wise sum of `Appliances` and `lights`, kept as a
# one-column DataFrame rather than a Series.
appliances_wh = df['Appliances']
lights_wh = df['lights']
y = (appliances_wh + lights_wh).to_frame()

# Create a preprocessing pipeline using `pipesnake` for LSTM 

In [5]:
from pipesnake.pipe import SeriesPipe
from pipesnake.transformers.dropper import DropDuplicates
from pipesnake.transformers.imputer import KnnImputer
from pipesnake.transformers.misc import ColumnRenamer
from pipesnake.transformers.scaler import MadScaler
from pipesnake.transformers.scaler import UnitLenghtScaler

In [6]:
# Preprocessing steps, listed in the order they are handed to the pipe.
preprocessing_steps = [
    ColumnRenamer(),                        # normalize column names
    DropDuplicates(),                       # drop duplicated rows and columns
    KnnImputer(x_cols='all'),               # impute missing values
    MadScaler(x_cols='all', y_cols='all'),  # scale by feature (columns)
    UnitLenghtScaler(x_cols='all'),         # scale by feature vector (rows)
]
my_pipe = SeriesPipe(transformers=preprocessing_steps)

In [7]:
# Fit the pipeline on the raw features and target, and keep the transformed
# versions of both.
x_new, y_new = my_pipe.fit_transform(x, y)

DEBUG:root:[series_pipe_95c4] : fitting...
DEBUG:root:Function: timed before Memory: 108.81 MB
INFO:root:[series_pipe_95c4] : fitting x...
INFO:root:[series_pipe_95c4] : -> column_renamer_3a21
DEBUG:root:[column_renamer_3a21] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23']
INFO:root:[series_pipe_95c4] : -> drop_duplicates_eb03
DEBUG:root:[drop_duplicates_eb03] : x shape: (19735, 24)
DEBUG:root:[drop_duplicates_eb03] :   shape: (19735, 24)
INFO:root:[series_pipe_95c4] : -> knn_imputer_f7a6
DEBUG:root:[knn_imputer_f7a6] : x_cols: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23']
INFO:root:[knn_imputer_f7a6] : current shape: (19735, 24)
DEBUG:root:[category2_number_5f5d] : fitting...


In [8]:
# Preview the transformed features; the pipeline renamed the columns to
# x_00 ... x_23.
x_new.head()

Unnamed: 0,x_00,x_01,x_02,x_03,x_04,x_05,x_06,x_07,x_08,x_09,...,x_14,x_15,x_16,x_17,x_18,x_19,x_20,x_21,x_22,x_23
0,-0.16423,0.293406,-0.05615,0.160984,-0.173926,0.276208,-0.123446,0.23878,-0.181978,0.116963,...,-0.298576,0.182212,-0.173692,0.162597,-0.009017,-0.456834,0.081433,0.202389,0.317786,0.066402
1,-0.168801,0.267261,-0.057712,0.162861,-0.178766,0.286642,-0.126882,0.260008,-0.187042,0.120218,...,-0.306885,0.18623,-0.176001,0.168205,-0.012682,-0.46747,0.083699,0.187219,0.272192,0.064593
2,-0.17273,0.258195,-0.059055,0.162869,-0.182927,0.300029,-0.135548,0.262468,-0.191395,0.120801,...,-0.314027,0.186649,-0.185265,0.169903,-0.016471,-0.476224,0.085647,0.17029,0.222821,0.062355
3,-0.17566,0.253353,-0.060057,0.164161,-0.186031,0.308296,-0.140752,0.260981,-0.194642,0.122851,...,-0.327544,0.185634,-0.188408,0.16903,-0.020304,-0.482142,0.0871,0.151532,0.169951,0.059609
4,-0.177921,0.267289,-0.06083,0.163834,-0.188425,0.312264,-0.142564,0.257361,-0.194192,0.124432,...,-0.331759,0.188023,-0.190833,0.171205,-0.024164,-0.486156,0.088221,0.131556,0.114759,0.056522


In [9]:
# Preview the transformed target (MadScaler was configured with y_cols='all',
# so y is scaled as well).
y_new.head()

Unnamed: 0,y_0
0,0.159805
1,0.159805
2,0.106537
3,0.159805
4,0.213073


In [10]:
# Map the transformed target back to its original values by inverting the
# pipeline. Only y is passed (x=None), so the first returned value is not
# needed; it is bound to a throwaway name other than `_` so that IPython's
# `_` (the last cell output) is not shadowed.
_unused_x, y_org = my_pipe.inverse_transform(x=None, y=y_new)

DEBUG:root:[series_pipe_95c4] : inverse transforming...
DEBUG:root:Function: timed before Memory: 318.09 MB
INFO:root:[series_pipe_95c4] : inverse transforming y...
INFO:root:[series_pipe_95c4] : -> unit_lenght_scaler_148
INFO:root:[series_pipe_95c4] : -> mad_scaler_68d
INFO:root:[series_pipe_95c4] : -> knn_imputer_f7a6
INFO:root:[series_pipe_95c4] : -> drop_duplicates_eb03
INFO:root:[series_pipe_95c4] : -> column_renamer_3a21
DEBUG:root:[column_renamer_3a21] : y original column names: [0]
DEBUG:root:Function: inverse_transform_y: 0.02 sec
DEBUG:root:Function: timed after Memory: 318.09 MB


In [11]:
# Check the recovered target: the first rows should equal
# Appliances + lights from the raw data (90, 90, 80, 90, 100).
y_org.head()

Unnamed: 0,0
0,90.0
1,90.0
2,80.0
3,90.0
4,100.0


In [12]:
from pipesnake.transformers.deeplearning import LSTMPacker
from pipesnake.transformers.misc import ToNumpy

# Extra steps appended to the existing pipe. Note that this mutates `my_pipe`,
# so running this cell more than once appends the steps again.
lstm_steps = [
    ToNumpy(),                   # returns x and y as numpy matrices
    LSTMPacker(sequence_len=5),  # pack rows into sequences of length 5
]
my_pipe.extend(lstm_steps)

In [13]:
# Re-run the pipeline, which now also contains ToNumpy and LSTMPacker.
# This overwrites the x_new / y_new DataFrames produced by the earlier run.
x_new, y_new = my_pipe.fit_transform(x, y)

DEBUG:root:[series_pipe_95c4] : fitting...
DEBUG:root:Function: timed before Memory: 318.12 MB
INFO:root:[series_pipe_95c4] : fitting x...
INFO:root:[series_pipe_95c4] : -> column_renamer_3a21
DEBUG:root:[column_renamer_3a21] : x new column names: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23']
INFO:root:[series_pipe_95c4] : -> drop_duplicates_eb03
DEBUG:root:[drop_duplicates_eb03] : x shape: (19735, 24)
DEBUG:root:[drop_duplicates_eb03] :   shape: (19735, 24)
INFO:root:[series_pipe_95c4] : -> knn_imputer_f7a6
DEBUG:root:[knn_imputer_f7a6] : x_cols: ['x_00', 'x_01', 'x_02', 'x_03', 'x_04', 'x_05', 'x_06', 'x_07', 'x_08', 'x_09', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23']
INFO:root:[knn_imputer_f7a6] : current shape: (19735, 24)
DEBUG:root:[category2_number_28ac] : fitting...


INFO:root:[series_pipe_95c4] : -> unit_lenght_scaler_148
INFO:root:[series_pipe_95c4] : -> to_numpy_4ab9
INFO:root:[series_pipe_95c4] : -> lstm_packer_192e
DEBUG:root:Function: transform_y: 0.02 sec
DEBUG:root:Function: timed after Memory: 346.46 MB


In [14]:
# Show the packed features: a 3-D numpy array in which each entry is a window
# of 5 rows (sequence_len=5). In the output, the second window starts at the
# second row of the first one, i.e. consecutive windows overlap.
x_new

array([[[-0.16423042,  0.2934058 , -0.05614959, ...,  0.2023891 ,
          0.31778608,  0.06640165],
        [-0.16880086,  0.26726139, -0.0577122 , ...,  0.18721932,
          0.27219157,  0.06459335],
        [-0.17272954,  0.25819465, -0.0590554 , ...,  0.17029038,
          0.22282126,  0.06235537],
        [-0.17566019,  0.25335294, -0.06005737, ...,  0.15153219,
          0.16995135,  0.05960854],
        [-0.1779208 ,  0.26728895, -0.06083027, ...,  0.13155625,
          0.114759  ,  0.05652189]],

       [[-0.16880086,  0.26726139, -0.0577122 , ...,  0.18721932,
          0.27219157,  0.06459335],
        [-0.17272954,  0.25819465, -0.0590554 , ...,  0.17029038,
          0.22282126,  0.06235537],
        [-0.17566019,  0.25335294, -0.06005737, ...,  0.15153219,
          0.16995135,  0.05960854],
        [-0.1779208 ,  0.26728895, -0.06083027, ...,  0.13155625,
          0.114759  ,  0.05652189],
        [-0.17955591,  0.25735566, -0.0613893 , ...,  0.11063772,
          0.05

In [15]:
# Show the packed target as a 2-D numpy array with a single column.
# NOTE(review): presumably one target row per packed sequence; confirm how
# LSTMPacker aligns y with the x windows.
y_new

array([[ 0.15980476],
       [ 0.26634127],
       [ 0.26634127],
       ..., 
       [ 1.1719016 ],
       [ 1.97092541],
       [ 2.02419367]])

Now your data is properly packed for a Keras or TensorFlow LSTM ;-)