# Step 4) Preparing training data

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from tsm.data_utils import time_processing, ordinal2wave, compress_memory_usage

In [1]:
from tsm.normalizers import load_normalisers_and_scale_data_frame

In [5]:
import os

### Inputs

In [6]:
# Seed for NumPy's global random number generator (applied via np.random.seed).
SEED = 42

In [7]:
# Seed NumPy's legacy global RNG so np.random draws in this kernel are repeatable.
np.random.seed(SEED)

## Pipeline for training data preparation

In [9]:
def pipeline_prepare_training_data(meter_no: int):
    """Build the prepared training frame for one meter type.

    Steps, in order:
      1. Load ``data/comp/train_meter_<meter_no>.pkl`` and reset its index.
      2. Run ``time_processing`` on the ``timestamp`` column.
      3. Apply ``ordinal2wave`` to each of the ``dt_*`` columns listed below
         (presumably a cyclical encoding -- confirm in ``tsm.data_utils``).
      4. Scale the frame with the normalisers stored under
         ``data/objects/train_meter_<meter_no>``.
      5. Left-join building metadata on ``building_id`` and weather data on
         ``(site_id, timestamp)``.
      6. Pass the result through ``compress_memory_usage``.

    Args:
        meter_no: Meter type identifier used to select the input pickle and
            the matching normaliser store.

    Returns:
        The first element of the pair returned by ``compress_memory_usage``,
        i.e. the merged, compressed frame.
    """
    # Read the *train* split: the normalisers below and the cell that calls
    # this function (which saves to and cleans up ``train_meter_*`` files) all
    # operate on training data. The previous ``test_meter_*`` path loaded the
    # wrong split.
    data_path = 'data/comp/train_meter_{}.pkl'.format(meter_no)
    df = pd.read_pickle(data_path).reset_index(drop=True)
    df = time_processing(df, 'timestamp')

    # Same columns and same order as the original one-call-per-column sequence.
    wave_columns = (
        'dt_m',
        'dt_w',
        'dt_d',
        'dt_hour',
        'dt_day_week',
        'dt_day_month',
        'dt_week_month',
    )
    for column in wave_columns:
        df = ordinal2wave(column, df)

    df = load_normalisers_and_scale_data_frame(
        df=df,
        store_path='data/objects/train_meter_{}'.format(meter_no),
    )

    building_data = pd.read_pickle('data/prep/building_metadata.pkl')
    df = df.merge(building_data, on='building_id', how='left')
    del building_data  # release the metadata frame before loading weather

    # NOTE(review): switched from 'weather_test.pkl' so training rows are
    # joined with weather from the same split. The file name is assumed to
    # mirror the test one -- confirm it exists under data/prep.
    weather_data = pd.read_pickle('data/prep/weather_train.pkl')
    df = df.merge(weather_data, on=['site_id', 'timestamp'], how='left')
    del weather_data

    df, _ = compress_memory_usage(df)
    return df

In [10]:
%%time
# Prepare each of the four meter types in turn: run the pipeline, persist the
# result under data/prep, then delete data/comp/train_meter_<n>.pkl.
for mtno in range(4):
    prepared_path = 'data/prep/train_meter_{}.pkl'.format(mtno)
    intermediate_path = 'data/comp/train_meter_{}.pkl'.format(mtno)

    mt = pipeline_prepare_training_data(meter_no=mtno)
    mt.to_pickle(prepared_path)

    # Only reached once the prepared frame has been written successfully.
    os.remove(intermediate_path)

HBox(children=(IntProgress(value=0, max=12060910), HTML(value='')))






HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=66, style=ProgressStyl…


Memory usage pre-compression was 3956.749954223633
Memory usage after-compression was 2990.566825866699
This is  75.5813953488372% of the initial size


HBox(children=(IntProgress(value=0, max=4182440), HTML(value='')))






HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=66, style=ProgressStyl…


Memory usage pre-compression was 1372.1078491210938
Memory usage after-compression was 1037.0582580566406
This is  75.5813953488372% of the initial size


HBox(children=(IntProgress(value=0, max=2708713), HTML(value='')))






HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=66, style=ProgressStyl…


Memory usage pre-compression was 888.6311264038086
Memory usage after-compression was 671.6398048400879
This is  75.5813953488372% of the initial size


HBox(children=(IntProgress(value=0, max=1264037), HTML(value='')))






HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=66, style=ProgressStyl…


Memory usage pre-compression was 414.6849899291992
Memory usage after-compression was 313.4247016906738
This is  75.5813953488372% of the initial size
CPU times: user 4min 54s, sys: 1min 20s, total: 6min 14s
Wall time: 2min 45s
