# Prepare the Data

## Table of Contents
* [Splitting the data into train and test sets](#splitting)
* [Data cleaning](#cleaning)
* [Feature selection](#feature_selection)
* [Feature engineering](#feature_eng)

In [1]:
# Libraries

%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import pandas as pd
import numpy as np
import datetime as dt
import gc
import missingno as msno
import pandas_profiling
import statsmodels as sm
from statsmodels.tsa.seasonal import seasonal_decompose
import random

from src.functions import data_import as dimp
from src.functions import data_exploration as dexp

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as pty

import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)

In [210]:
# Load data
# Site 4 observations, read through the project loader in src.functions.data_import.
# NOTE(review): the memory-usage report printed below presumably comes from
# dimp.import_data downcasting dtypes - confirm in the helper.
file = '../../data/interim/by_site/site_4.csv'
df_s4 = dimp.import_data(file)

Memory usage of dataframe is 85.46 MB
Memory usage after optimization is: 21.03 MB
Decreased by 75.4%


## Splitting the data into training and test sets
<a id="splitting" />

We use the last two months of each building's observations (every reading after 2016-10-31 23:00) to test the models. The remaining rows make up the training set.

In [214]:
# convert timestamp to datetime
# (needed so the column can be compared against the datetime cut-off used for the split)
df_s4['timestamp'] = pd.to_datetime(df_s4['timestamp'])

In [235]:
# Preview the first rows of the site 4 frame
df_s4.head()

Unnamed: 0,building_id,meter,year_built,primary_use,floor_count,square_feet,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,meter_reading
0,565,0,1954.0,Education,2,15326,2016-01-01 00:00:00,,,,,,,,7.25
1,565,0,1954.0,Education,2,15326,2016-01-01 01:00:00,9.398438,0.0,-2.199219,0.0,1021.5,360.0,3.099609,8.5
2,565,0,1954.0,Education,2,15326,2016-01-01 02:00:00,8.296875,0.0,-2.800781,0.0,1021.5,0.0,0.0,7.25
3,565,0,1954.0,Education,2,15326,2016-01-01 03:00:00,7.800781,0.0,-2.800781,0.0,1021.5,90.0,1.5,7.5
4,565,0,1954.0,Education,2,15326,2016-01-01 04:00:00,5.0,0.0,-0.600098,0.0,1022.0,0.0,0.0,8.0


In [280]:
# Chronological split on a single cut-off shared by every building: rows after
# 2016-10-31 23:00 go to the test set, everything up to and including it to training.
# NOTE(review): this equals "the last two months of each building" only if every
# building has readings for the whole period after the cut-off - confirm.
date_time_str = '2016-10-31 23:00:00'
date_time_cut = dt.datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')

# create test and training sets
# .copy() makes each split an independent frame, so the cleaning cells that modify
# train_s4 do not trigger pandas' SettingWithCopyWarning.
test_s4 = df_s4[df_s4['timestamp'] > date_time_cut].copy()
train_s4 = df_s4[df_s4['timestamp'] <= date_time_cut].copy()

In [281]:
# Sanity check: total rows across both splits (compare with len(df_s4) to confirm
# that the split dropped no rows)
len(test_s4) + len(train_s4)

746746

In [283]:
# First rows of the test set: timestamps should start right after the cut-off
test_s4.head()

Unnamed: 0,building_id,meter,year_built,primary_use,floor_count,square_feet,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,meter_reading
7319,565,0,1954.0,Education,2,15326,2016-11-01 00:00:00,18.296875,,11.101562,,1015.5,200.0,2.099609,8.25
7320,565,0,1954.0,Education,2,15326,2016-11-01 01:00:00,17.796875,,11.101562,,1015.0,220.0,8.796875,8.5
7321,565,0,1954.0,Education,2,15326,2016-11-01 02:00:00,17.203125,,11.101562,,1015.0,180.0,7.199219,8.0
7322,565,0,1954.0,Education,2,15326,2016-11-01 03:00:00,16.703125,,11.703125,,1014.5,170.0,7.699219,8.25
7323,565,0,1954.0,Education,2,15326,2016-11-01 04:00:00,16.09375,,12.203125,,1015.0,180.0,6.199219,8.25


In [284]:
# Last rows of the training set: timestamps should end at the cut-off
train_s4.tail()

Unnamed: 0,building_id,meter,year_built,primary_use,floor_count,square_feet,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,meter_reading
745277,655,0,1964.0,Education,11,222434,2016-10-31 19:00:00,17.203125,,11.703125,,1019.5,150.0,3.099609,124.875
745278,655,0,1964.0,Education,11,222434,2016-10-31 20:00:00,17.796875,,11.101562,,1018.5,160.0,3.599609,122.1875
745279,655,0,1964.0,Education,11,222434,2016-10-31 21:00:00,18.90625,4.0,10.601562,,1017.0,150.0,3.599609,122.1875
745280,655,0,1964.0,Education,11,222434,2016-10-31 22:00:00,18.90625,4.0,10.0,,1016.5,170.0,3.599609,120.625
745281,655,0,1964.0,Education,11,222434,2016-10-31 23:00:00,18.296875,,11.101562,,1015.5,170.0,3.099609,108.5625


In [285]:
# export training and test sets
# NOTE: this runs before the "Data cleaning" cells below, so both files contain the
# uncleaned split. to_csv keeps its default index=True, so the row index is written
# as an unnamed first column.
train_s4.to_csv('../../data/interim/site_4/train_s4_2.csv')
test_s4.to_csv('../../data/interim/site_4/test_s4_2.csv')

## Data cleaning
<a id="cleaning" />

In [286]:
# 1. Convert 'timestamp' to datetime
# train_s4 comes from boolean-filtering df_s4, and assigning a column into such a
# result triggers SettingWithCopyWarning. Rebinding the name to the frame returned
# by .assign avoids that and keeps the cell safe to re-run.
# (df_s4['timestamp'] was already converted before the split, so this only
# re-asserts the dtype when the notebook is run top to bottom.)
train_s4 = train_s4.assign(timestamp=pd.to_datetime(train_s4['timestamp']))

In [287]:
# 2. Convert 'year_built' to integer
# The nullable Int16 dtype is used because the column has missing values, which a
# plain NumPy integer column cannot hold. The column is replaced via .assign rather
# than by assigning into train_s4, which would trigger SettingWithCopyWarning on a
# frame obtained by filtering df_s4.
train_s4 = train_s4.assign(
    year_built=pd.array(train_s4['year_built'], dtype=pd.Int16Dtype())
)

In [288]:
# 3. Delete 'meter' as there's only one type of meter (0: electricity)
# drop(...) returns a new frame instead of mutating train_s4 in place, and
# errors='ignore' makes the cell idempotent: `del train_s4['meter']` raised
# KeyError when the cell was executed a second time.
train_s4 = train_s4.drop(columns=['meter'], errors='ignore')

In [293]:
# Missing-value percentage per column of the training set. building_id and
# timestamp are moved into the index first, so the report covers only the
# remaining columns.
train_s4_by_building_time = train_s4.set_index(['building_id', 'timestamp'])
dexp.get_missing_percentage(train_s4_by_building_time)


year_built             1.24
primary_use            0.00
floor_count            0.00
square_feet            0.00
air_temperature        0.01
cloud_coverage        47.90
dew_temperature        0.04
precip_depth_1_hr     17.02
sea_level_pressure     0.84
wind_direction         1.21
wind_speed             0.01
meter_reading          0.00
dtype: float64