In [1]:
# Load libraries

%load_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import datetime as dt
from datetime import date
import gc
from src.functions import data_exploration as dexp
from src.functions import data_import as dimp
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pandas_profiling

import chart_studio as cs
from chart_studio import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Configure cufflinks for offline mode so the .iplot() calls below are set up
# to render without the online chart_studio service.
cf.set_config_file(offline=True)

### Loading data

In [2]:
# Read the merged dataset without missing values that was produced by the
# previous notebook (1.2-vcp-missing_values.ipynb). The loader prints the
# frame's memory usage before and after its own optimization step.
df = dimp.import_data('../../data/interim/site_1/processed/merged_data_no_nans.csv')

Memory usage of dataframe is 59.11 MB
Memory usage after optimization is: 15.16 MB
Decreased by 74.4%


In [3]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier CSV
# export -- confirm). errors='ignore' keeps this cell re-runnable: the original
# `del` raised KeyError on a second execution, once the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Parse timestamps into datetime64 so they can be used as a time-series index.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [4]:
# (rows, columns) of the frame after the column drop and timestamp conversion.
df.shape

(553357, 13)

## Outliers visualization

### Each feature independently

In [None]:
# Box plots, one subplot per feature, to inspect each distribution for outliers.
box_plot_columns = [
    'floor_count',
    'building_age',
    'square_feet',
    'air_temperature',
    'dew_temperature',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]

# A 3x3 grid gives nine slots for the eight features listed above.
df[box_plot_columns].iplot(
    kind='box',
    subplots=True,
    shape=(3, 3),
    filename='cufflinks/box-plots',
)


### Time depending variables: time series

In [None]:
# Visualizations for building 106
# Filter and index in one chained expression so df_106 is a new frame; the
# original called set_index(inplace=True) on a filtered slice of df.
df_106 = df[df.building_id == 106].set_index('timestamp')

# weather variables
weather_columns = [
    'air_temperature',
    'dew_temperature',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
df_106[weather_columns].iplot(kind='scatter', filename='cufflinks/cf-simple-line')

# target variable meter_reading, one trace per meter
fig = go.Figure()

for meter_id in (0, 3):
    meter_readings = df_106.loc[df_106.meter == meter_id, 'meter_reading']
    fig.add_trace(
        go.Scatter(
            # Use the filtered series' own index: passing the full df_106.index
            # (all meters) against a single-meter y gives arrays of different
            # lengths and pairs readings with the wrong timestamps.
            x=meter_readings.index,
            y=meter_readings,
            mode="lines",
            name=f'meter {meter_id}',
        )
    )

fig.show()

As a first approximation, we'll only clean outliers for `meter_reading`; specifically, all rows for meter 0 (electricity) with a `meter_reading` value of 0 will be dropped. A reading of 0 makes no sense for that meter: there is always some electricity consumption in "alive" buildings.

In [5]:
# Index labels of meter-0 rows whose meter_reading is exactly zero.
is_zero_meter0_reading = (df.meter == 0) & (df.meter_reading == 0)
rows_to_drop = df.index[is_zero_meter0_reading]

In [6]:
# Remove the selected rows by index label, mutating df in place.
df.drop(index=rows_to_drop, inplace=True)

In [7]:
# (rows, columns) after dropping the zero-reading meter-0 rows.
df.shape

(553353, 13)

In [8]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='../../data/interim/site_1/data_cleaned.csv',
    index=False,
)