In [1]:
%load_ext autoreload
%autoreload 2

# Preparing Data for Training

In [4]:
import pandas as pd
from tsm.data_utils import time_processing, encode_categories, ordinal2wave
from tsm.data_selector import get_series_past_k_lags

In [3]:
# Load the training frame from a local pickle file.
# NOTE(review): this cell carries execution count 3 while the import cell above
# carries 4, so on a fresh kernel the imports must be run first.
# Only unpickle files you produced yourself; unpickling can execute arbitrary code.
train_data = pd.read_pickle('kaggle/train_data.pkl')

In [5]:
# Preview the first rows to check which columns the loaded frame contains.
train_data.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008,99,25.0,6,20.0,999,1019.700012,0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004,99,25.0,6,20.0,999,1019.700012,0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991,99,25.0,6,20.0,999,1019.700012,0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002,99,25.0,6,20.0,999,1019.700012,0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975,99,25.0,6,20.0,999,1019.700012,0,0.0


## Step 1) Add ordinal2wave for temporal features

In [None]:
from tsm.data_utils import ordinal2wave

In [None]:
# Run the project helper on the 'timestamp' column and keep the returned frame.
# NOTE(review): presumably this adds the dt_* columns used by the ordinal2wave
# cell further down — confirm in tsm.data_utils.
# This cell has no execution count, i.e. it was not run in the saved session.
train_data = time_processing(train_data, 'timestamp')

In [6]:
# Encode the categorical 'primary_use' column. The helper returns the updated
# frame plus a second object (presumably the fitted label encoder — confirm in
# tsm.data_utils) which is persisted a few cells below.
# The spelling `econder` is kept because a later cell refers to this name.
train_data, econder = encode_categories(train_data, cat_cols='primary_use')

In [7]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package (a scikit-learn dependency) and
# fall back to the vendored copy only on very old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [11]:
# Persist the object returned by encode_categories so the same encoding can be
# reloaded later. The file name is left untouched: it is a runtime path that
# other code may read.
joblib.dump(econder, 'models/LabelEconder_train_data.pkl')

['models/LabelEconder_train_data.pkl']

In [None]:
# Apply ordinal2wave to every temporal column, one after another, feeding the
# frame returned by each call into the next.
# NOTE(review): presumably a cyclical (wave) encoding of each ordinal column —
# confirm in tsm.data_utils.
for _dt_col in (
    'dt_m',
    'dt_w',
    'dt_d',
    'dt_hour',
    'dt_day_week',
    'dt_day_month',
    'dt_week_month',
):
    train_data = ordinal2wave(_dt_col, train_data)

## Step 2) Add weather lags (mainly air and dew temperature): alpha 0.01, 0.1, 0.25

In [3]:
# Load the partially processed training frame from disk.
# NOTE(review): the same path is written again at the end of this step, so this
# file is presumably the product of an earlier run of the notebook — confirm
# which cells produced it.
train_data_pp = pd.read_pickle('kaggle/train_processed.pkl')

In [12]:
# Exponentially weighted mean of air temperature with alpha=0.01, re-indexed
# from 0 and named after the column it will become.
# NOTE(review): the average runs over the frame's row order as loaded; if rows
# of different buildings are interleaved this is not a per-building time lag —
# confirm that this is intended.
k_air_e_001 = (
    train_data['air_temperature']
    .ewm(alpha=0.01)
    .mean()
    .reset_index(drop=True)
    .rename('k_air_e_001')
)

In [13]:
# Append the smoothed series as a new column. Both sides carry a fresh 0..n-1
# index, so rows are matched purely by position.
# NOTE(review): assumes train_data and train_data_pp have the same row order —
# verify. Re-running this cell adds a second 'k_air_e_001' column.
train_data_pp = pd.concat([train_data_pp.reset_index(drop=True), k_air_e_001], axis=1, sort=False)

In [14]:
# Drop the standalone series; its values were concatenated into train_data_pp.
del k_air_e_001

In [15]:
# Exponentially weighted mean of air temperature with alpha=0.1, re-indexed
# from 0 and named after the column it will become.
# NOTE(review): computed over the frame's row order as loaded — confirm this
# matches the intended per-series ordering.
k_air_e_01 = (
    train_data['air_temperature']
    .ewm(alpha=0.1)
    .mean()
    .reset_index(drop=True)
    .rename('k_air_e_01')
)

In [16]:
# Append the alpha=0.1 series as a new column, matched to rows by position.
# NOTE(review): re-running this cell adds a duplicate 'k_air_e_01' column.
train_data_pp = pd.concat([train_data_pp.reset_index(drop=True), k_air_e_01], axis=1, sort=False)

In [17]:
# Drop the standalone series; its values were concatenated into train_data_pp.
del k_air_e_01

In [18]:
# Exponentially weighted mean of air temperature with alpha=0.25, re-indexed
# from 0 and named after the column it will become.
# NOTE(review): computed over the frame's row order as loaded — confirm this
# matches the intended per-series ordering.
k_air_e_025 = (
    train_data['air_temperature']
    .ewm(alpha=0.25)
    .mean()
    .reset_index(drop=True)
    .rename('k_air_e_025')
)

In [19]:
# Append the alpha=0.25 series as a new column, matched to rows by position.
# NOTE(review): re-running this cell adds a duplicate 'k_air_e_025' column.
train_data_pp = pd.concat([train_data_pp.reset_index(drop=True), k_air_e_025], axis=1, sort=False)

In [20]:
# Drop the standalone series; its values were concatenated into train_data_pp.
del k_air_e_025

In [21]:
# Save the frame with the three new k_air_e_* columns.
# NOTE(review): this overwrites the same file that was read at the start of
# this step, so running the step again from the load cell stacks duplicate
# k_air_e_* columns onto the stored frame. Consider a separate output path.
train_data_pp.to_pickle('kaggle/train_processed.pkl')

In [23]:
# Preview the first rows to check that the new k_air_e_* columns are present.
train_data_pp.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,air_temperature_t-2,air_temperature_t-3,air_temperature_t-6,dew_temperature_t-1,dew_temperature_t-2,dew_temperature_t-3,dew_temperature_t-6,k_air_e_001,k_air_e_01,k_air_e_025
0,0,0,2016-01-01,0.0,0,0,7432,2008,99,25.0,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,25.0,25.0,25.0
1,1,0,2016-01-01,0.0,0,0,2720,2004,99,25.0,...,99.0,99.0,99.0,20.0,99.0,99.0,99.0,25.0,25.0,25.0
2,2,0,2016-01-01,0.0,0,0,5376,1991,99,25.0,...,25.0,99.0,99.0,20.0,20.0,99.0,99.0,25.0,25.0,25.0
3,3,0,2016-01-01,0.0,0,0,23685,2002,99,25.0,...,25.0,25.0,99.0,20.0,20.0,20.0,99.0,25.0,25.0,25.0
4,4,0,2016-01-01,0.0,0,0,116607,1975,99,25.0,...,25.0,25.0,99.0,20.0,20.0,20.0,99.0,25.0,25.0,25.0


In [24]:
from tsm.data_selector import data_subset_by_dict

## Step 3) Sample validation set

In [15]:
# Let pandas display up to 100 columns so wide frames are not elided with '...'.
pd.set_option('display.max_columns', 100)

In [20]:
# Keep only the three calendar columns needed to choose validation weeks.
weeks_no_and_of_m = train_data_pp.loc[:, ['dt_w', 'dt_m', 'dt_week_month']]

In [22]:
# Restrict to rows whose month number (dt_m) is even.
only_even_months = weeks_no_and_of_m.loc[weeks_no_and_of_m['dt_m'].mod(2).eq(0)]

In [28]:
# Pick one validation week per even month by drawing a single random row from
# that month and taking its week number (dt_w).
#
# The draw is seeded so the train/validation split is identical on every
# Restart & Run All; previously each run produced a different split. Months are
# visited in sorted order instead of arbitrary set order, so the resulting list
# is stable as well.
VALIDATION_SEED = 42

validation_weeks = []
for m in sorted(only_even_months.dt_m.unique()):
    m_data = only_even_months[only_even_months.dt_m == m]
    # Offset the seed by the month so each month gets its own reproducible draw.
    m_data_sample = m_data.sample(n=1, axis=0, random_state=VALIDATION_SEED + int(m))
    print(m_data_sample)
    validation_weeks.append(m_data_sample.dt_w.values[0])

         dt_w  dt_m  dt_week_month
1758636     5     2              1
         dt_w  dt_m  dt_week_month
5746876    16     4              3
         dt_w  dt_m  dt_week_month
8922900    24     6              2
          dt_w  dt_m  dt_week_month
12067377    32     8              2
          dt_w  dt_m  dt_week_month
15652993    41    10              2
          dt_w  dt_m  dt_week_month
19628517    51    12              3


In [29]:
# Show the week numbers selected for validation.
validation_weeks

[5, 16, 24, 32, 41, 51]

In [30]:
# Every week number from 1 to 53 that was not drawn for validation, ascending.
training_weeks = sorted(set(range(1, 54)).difference(validation_weeks))

In [32]:
# Split rows by week number: weeks in training_weeks go to the training frame,
# weeks in validation_weeks go to the validation frame.
prepo_train = train_data_pp.loc[train_data_pp['dt_w'].isin(training_weeks)]
prepo_val = train_data_pp.loc[train_data_pp['dt_w'].isin(validation_weeks)]

In [33]:
# Persist the training split.
prepo_train.to_pickle('kaggle/preproc_train.pkl')

In [34]:
# Persist the validation split.
prepo_val.to_pickle('kaggle/preproc_val.pkl')

In [35]:
# Drop the full frame; only prepo_train and prepo_val are used from here on.
del train_data_pp

## Step 4) Divide by meter type

In [37]:
from tsm.data_selector import data_subset_by_dict

### Meter 0

In [38]:
# Build the meter-0 subsets of the training and validation splits.
# NOTE(review): data_subset_by_dict presumably keeps rows where 'meter' == 0 —
# confirm in tsm.data_selector.
prepo_train_met0, prepo_val_met0 = (
    data_subset_by_dict(split, {'meter': 0})
    for split in (prepo_train, prepo_val)
)

In [39]:
# Persist the meter-0 training and validation subsets as separate pickles.
prepo_train_met0.to_pickle('kaggle/prepo_train_met0.pkl')
prepo_val_met0.to_pickle('kaggle/prepo_val_met0.pkl')

In [40]:
# Both meter-0 subsets are on disk; drop the names before the next meter.
del prepo_train_met0, prepo_val_met0

### Meter 1

In [41]:
# Build the meter-1 subsets of the training and validation splits.
# NOTE(review): data_subset_by_dict presumably keeps rows where 'meter' == 1 —
# confirm in tsm.data_selector.
prepo_train_met1, prepo_val_met1 = (
    data_subset_by_dict(split, {'meter': 1})
    for split in (prepo_train, prepo_val)
)

In [42]:
# Persist the meter-1 training and validation subsets as separate pickles.
prepo_train_met1.to_pickle('kaggle/prepo_train_met1.pkl')
prepo_val_met1.to_pickle('kaggle/prepo_val_met1.pkl')

In [43]:
# Both meter-1 subsets are on disk; drop the names before the next meter.
del prepo_train_met1, prepo_val_met1

### Meter 2

In [44]:
# Build the meter-2 subsets of the training and validation splits.
# NOTE(review): data_subset_by_dict presumably keeps rows where 'meter' == 2 —
# confirm in tsm.data_selector.
prepo_train_met2, prepo_val_met2 = (
    data_subset_by_dict(split, {'meter': 2})
    for split in (prepo_train, prepo_val)
)

In [46]:
# Persist the meter-2 training and validation subsets as separate pickles.
prepo_train_met2.to_pickle('kaggle/prepo_train_met2.pkl')
prepo_val_met2.to_pickle('kaggle/prepo_val_met2.pkl')

In [47]:
# Both meter-2 subsets are on disk; drop the names before the next meter.
del prepo_train_met2, prepo_val_met2

### Meter 3

In [48]:
# Build the meter-3 subsets of the training and validation splits.
# NOTE(review): data_subset_by_dict presumably keeps rows where 'meter' == 3 —
# confirm in tsm.data_selector.
prepo_train_met3, prepo_val_met3 = (
    data_subset_by_dict(split, {'meter': 3})
    for split in (prepo_train, prepo_val)
)

In [49]:
# Persist the meter-3 training and validation subsets as separate pickles.
prepo_train_met3.to_pickle('kaggle/prepo_train_met3.pkl')
prepo_val_met3.to_pickle('kaggle/prepo_val_met3.pkl')