# Step 4) Preparing training data

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from tsm.data_utils import time_processing, ordinal2wave, compress_memory_usage

In [4]:
from tsm.normalizers import scale_and_save_data_frame_columns

In [5]:
import os

### Inputs

In [6]:
# Seed value passed to np.random.seed in the next cell.
SEED = 42

In [7]:
# Seed NumPy's global random generator with SEED.
# NOTE(review): nothing in the visible cells draws random numbers directly;
# presumably this is for the tsm helpers — confirm it is still needed.
np.random.seed(SEED)

## Pipeline for training data preparation

In [8]:
def pipeline_prepare_training_data(meter_no: int,
                                   comp_dir: str = 'data/comp',
                                   prep_dir: str = 'data/prep',
                                   objects_dir: str = 'data/objects') -> pd.DataFrame:
    """Build the training frame for a single meter type.

    Steps, in order:

    1. Read ``<comp_dir>/train_meter_<meter_no>.pkl`` and reset its index.
    2. Add ``log_meter_reading = log1p(meter_reading)``.
    3. Run ``time_processing`` on ``timestamp`` and pass every ``dt_*``
       column through ``ordinal2wave``.
       NOTE(review): both helpers live in ``tsm.data_utils``; this function
       assumes ``time_processing`` creates the ``dt_*`` columns listed in
       ``time_columns`` below — confirm against that module.
    4. Flag rows whose (building, ``dt_d``) or (building, ``dt_m``) group has
       a minimum raw ``meter_reading`` below 1 as ``day_suspicious`` /
       ``month_suspicious``.
    5. Scale the ``dt_*`` columns with ``scale_and_save_data_frame_columns``,
       passing ``<objects_dir>/train_meter_<meter_no>`` as ``store_path``.
    6. Left-join ``<prep_dir>/building_metadata.pkl`` on ``building_id`` and
       ``<prep_dir>/weather_train.pkl`` on ``(site_id, timestamp)``.
    7. Drop the raw ``meter_reading`` column and run ``compress_memory_usage``.

    Parameters
    ----------
    meter_no : int
        Meter number used to build the input pickle name and the scaler
        ``store_path``.
    comp_dir : str, default 'data/comp'
        Directory holding the per-meter input pickle.
    prep_dir : str, default 'data/prep'
        Directory holding ``building_metadata.pkl`` and ``weather_train.pkl``.
    objects_dir : str, default 'data/objects'
        Directory prefix used for the ``store_path`` handed to the scaler.

    Returns
    -------
    pd.DataFrame
        The first element returned by ``compress_memory_usage`` (assumed to
        be the compressed frame — the second element is discarded).

    Notes
    -----
    ``pd.read_pickle`` can execute arbitrary code while loading; only point
    the directory arguments at trusted, pipeline-generated files.
    """
    # Single source of truth for the calendar features: the same columns are
    # wave-encoded first and scaled afterwards, in this order.
    time_columns = ('dt_m', 'dt_w', 'dt_d', 'dt_hour',
                    'dt_day_week', 'dt_day_month', 'dt_week_month')

    data_path = '{}/train_meter_{}.pkl'.format(comp_dir, meter_no)
    df = pd.read_pickle(data_path).reset_index(drop=True)
    df['log_meter_reading'] = np.log1p(df['meter_reading'])

    df = time_processing(df, 'timestamp')
    for column in time_columns:
        df = ordinal2wave(column, df)

    # The flags are computed from the raw reading, so this must happen before
    # 'meter_reading' is dropped further down.
    readings = df['meter_reading']
    df['day_suspicious'] = readings.groupby(
        [df['building_id'], df['dt_d']]).transform('min') < 1
    df['month_suspicious'] = readings.groupby(
        [df['building_id'], df['dt_m']]).transform('min') < 1

    df = scale_and_save_data_frame_columns(
        df=df,
        columns=list(time_columns),
        store_path='{}/train_meter_{}'.format(objects_dir, meter_no))

    # Load, merge and release each auxiliary table in turn so that both are
    # never held in memory at the same time.
    building_data = pd.read_pickle('{}/building_metadata.pkl'.format(prep_dir))
    df = df.merge(building_data, on='building_id', how='left')
    del building_data
    weather_data = pd.read_pickle('{}/weather_train.pkl'.format(prep_dir))
    df = df.merge(weather_data, on=['site_id', 'timestamp'], how='left')
    del weather_data

    # Only the log-transformed target is kept in the training frame.
    df = df.drop(columns='meter_reading')
    df, _ = compress_memory_usage(df)
    return df

In [27]:
# Meter number for this run; it is substituted into the train_meter_{}.pkl
# file names that the cells below read, write and delete.
mtno = 3

In [28]:
%%time
# Run the full preparation pipeline for the selected meter.
mt = pipeline_prepare_training_data(meter_no=mtno)

HBox(children=(IntProgress(value=0, max=1264037), HTML(value='')))




HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=66, style=ProgressStyl…


Memory usage pre-compression was 417.09594917297363
Memory usage after-compression was 315.83566093444824
This is  75.72254335260115% of the initial size
CPU times: user 8.18 s, sys: 884 ms, total: 9.06 s
Wall time: 9 s


In [29]:
# Sanity check: display the first rows of the prepared frame.
mt.head()

Unnamed: 0,building_id,timestamp,log_meter_reading,dt_m,dt_w,dt_d,dt_hour,dt_day_week,dt_day_month,dt_week_month,...,sea_level_pressure_ewm_01,sea_level_pressure_ewm_02,wind_direction_ewm_001,wind_direction_ewm_005,wind_direction_ewm_01,wind_direction_ewm_02,wind_speed_ewm_001,wind_speed_ewm_005,wind_speed_ewm_01,wind_speed_ewm_02
0,106,2016-01-01,0.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,...,0.685824,0.685824,0.666667,0.666667,0.666667,0.666667,0.163158,0.163158,0.163158,0.163158
1,109,2016-01-01,0.0,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,...,0.685824,0.685824,0.666667,0.666667,0.666667,0.666667,0.163158,0.163158,0.163158,0.163158
2,112,2016-01-01,4.584743,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,...,0.685824,0.685824,0.666667,0.666667,0.666667,0.666667,0.163158,0.163158,0.163158,0.163158
3,113,2016-01-01,3.025146,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,...,0.685824,0.685824,0.666667,0.666667,0.666667,0.666667,0.163158,0.163158,0.163158,0.163158
4,114,2016-01-01,4.61512,0.0,1.0,0.0,0.0,0.666667,0.0,0.0,...,0.685824,0.685824,0.666667,0.666667,0.666667,0.666667,0.163158,0.163158,0.163158,0.163158


In [30]:
# Save the prepared frame under data/prep, using the same file name as the
# data/comp input it was built from.
mt.to_pickle('data/prep/train_meter_{}.pkl'.format(mtno))

In [31]:
# Drop the reference to the prepared frame now that it has been written to disk.
del mt

In [32]:
# Delete the data/comp input file that pipeline_prepare_training_data read.
# NOTE(review): this removes the pipeline's own input, so re-running the
# notebook for the same meter will fail at read_pickle unless the file is
# regenerated first.
os.remove('data/comp/train_meter_{}.pkl'.format(mtno))