In [1]:
%cd ../..

/Users/mlevydaniel/Desktop/modern-time-series-forecasting-with-python


In [2]:
import numpy as np
import os
import pandas as pd
from pathlib import Path
from tqdm.autonotebook import tqdm
import statsmodels.api as sm
import warnings
import random
from IPython.display import display, HTML
# One seed value shared by the stdlib RNG and NumPy's global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Registers `progress_apply` on pandas objects (used later for the imputation step).
tqdm.pandas()

  from tqdm.autonotebook import tqdm


# Reading and Selecting Households

In [3]:
# Pickled household -> ACORN lookup table, expected to exist from an earlier notebook.
# NOTE: pickle files should only be loaded when they come from a trusted source.
acorn_map_path = "data/london_smart_meters/preprocessed/london_smart_meters_lclid_acorn_map.pkl"

# Markup shown in place of a traceback when the pickle is missing.
missing_file_warning = """
    <div class="alert alert-block alert-warning">
    <b>Warning!</b> File not found. Please make sure you have run 02 - Preprocessing London Smart Meter Dataset.ipynb in Chapter02
    </div>
    """

try:
    lclid_acorn_map = pd.read_pickle(acorn_map_path)
except FileNotFoundError:
    # Only a missing file is handled; `lclid_acorn_map` stays undefined in that case.
    display(HTML(missing_file_warning))

ModuleNotFoundError: No module named 'numpy._core'

In [4]:
# Keep only the household id and its `file` label for each Acorn group.
household_cols = ["LCLid", "file"]
acorn_group = lclid_acorn_map.Acorn_grouped

affluent_households = lclid_acorn_map.loc[acorn_group == "Affluent", household_cols]
adversity_households = lclid_acorn_map.loc[acorn_group == "Adversity", household_cols]
comfortable_households = lclid_acorn_map.loc[acorn_group == "Comfortable", household_cols]


Let's take a subset of the data, because loading everything will exhaust your RAM. Depending on how much RAM you have, you can choose a larger subset. To maintain the variety in the dataset, we will do stratified sampling based on the Acorn classifications.

* <= 50 households for 4GB RAM
* 50 - 100 households for 8GB RAM
* 100-150 households for 16GB RAM
* 250 households for 32GB RAM

Let's sample 150 households now, but feel free to reduce or increase this as per your hardware constraints.

150 households means 50 each from the three Acorn Groups - Affluent, Comfortable, Adversity (we are ignoring the households with unknown ACORN groups)

In [5]:
# Number of households drawn from EACH Acorn group. Change this single value to
# fit the available RAM; the total selected is 3 * HOUSEHOLDS_PER_GROUP.
# `sample` raises a ValueError if a group has fewer households than requested.
HOUSEHOLDS_PER_GROUP = 50
# Fixed seed so the same households are selected on every run.
SAMPLING_SEED = 76

# Stratified sample: the same number of households from each of the three groups,
# concatenated in the order Affluent, Comfortable, Adversity.
selected_households = pd.concat(
    [
        group.sample(HOUSEHOLDS_PER_GROUP, random_state=SAMPLING_SEED)
        for group in (affluent_households, comfortable_households, adversity_households)
    ]
)
# Take the second "_"-separated token of `file` as an integer block number.
selected_households['block'] = selected_households.file.str.split("_", expand=True).iloc[:, 1].astype(int)

In [7]:
def parse_block_range(block_path):
    """Return the ``(start, end)`` block numbers encoded in a merged-block file name.

    The sixth "_"-separated token of the file name is expected to look like
    ``<start>-<end>.<extension>``; for example a name ending in
    ``_block_0-7.parquet`` yields ``(0, 7)``.
    """
    range_token = block_path.name.split("_")[5].split(".")[0]
    return tuple(int(bound) for bound in range_token.split("-"))


# Pair every merged-block file with the starting and ending block it covers.
# glob makes no promise about ordering, so sort by block range to keep the order of
# the loaded data (and everything derived from it) the same on every machine.
path_blocks = sorted(
    (
        (p, *parse_block_range(p))
        for p in Path("data/london_smart_meters/preprocessed").glob(
            "london_smart_meters_merged_block*"
        )
    ),
    key=lambda entry: entry[1:],
)

In [10]:
# Read each merged-block file and keep only the rows of the sampled households.
household_df_l = []
for path, start_b, end_b in tqdm(path_blocks):
    block_df = pd.read_parquet(path)
    # Sampled households whose block number lies in this file's range
    # (`between` includes both end points by default).
    mask = selected_households['block'].between(start_b, end_b)
    lclids = selected_households.loc[mask, "LCLid"]
    household_df_l.append(block_df.loc[block_df.LCLid.isin(lclids)])

  0%|          | 0/14 [00:00<?, ?it/s]

In [16]:
# Stack the per-file selections into a single frame.
block_df = pd.concat(objs=household_df_l)
# The list of partial frames is no longer needed; drop the name.
del household_df_l
block_df.head(n=5)

Unnamed: 0,LCLid,start_timestamp,frequency,energy_consumption,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
57,MAC000768,2012-04-21,30min,"[0.8440000000000001, 0.265, 0.262, 0.233999999...",32544,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[251.0, nan, 251.0, nan, 246.0, nan, 242.0, na...","[6.42, nan, 6.2, nan, 5.68, nan, 5.16, nan, 4....","[3.54, nan, 3.61, nan, 3.52, nan, 3.11, nan, 2...","[994.96, nan, 994.98, nan, 994.82, nan, 994.79...","[3.79, nan, 3.67, nan, 3.15, nan, 2.61, nan, 1...","[3.64, nan, 3.42, nan, 3.25, nan, 3.13, nan, 2...","[rain, None, rain, None, rain, None, rain, Non...","[partly-cloudy-night, None, partly-cloudy-nigh...","[0.82, nan, 0.83, nan, 0.86, nan, 0.87, nan, 0...","[Partly Cloudy, None, Partly Cloudy, None, Cle..."
63,MAC000948,2012-05-02,30min,"[0.008, 0.009, 0.008, 0.008, 0.008, 0.009, 0.0...",32016,Std,ACORN-A,Affluent,block_1,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[351.0, nan, 0.0, nan, 0.0, nan, 351.0, nan, 3...","[11.81, nan, 11.12, nan, 11.2, nan, 11.18, nan...","[10.47, nan, 10.15, nan, 9.89, nan, 9.29, nan,...","[1021.42, nan, 1021.44, nan, 1021.33, nan, 102...","[11.81, nan, 11.12, nan, 11.2, nan, 11.18, nan...","[2.53, nan, 2.41, nan, 2.06, nan, 2.98, nan, 3...","[rain, None, rain, None, rain, None, rain, Non...","[partly-cloudy-night, None, partly-cloudy-nigh...","[0.91, nan, 0.94, nan, 0.92, nan, 0.88, nan, 0...","[Mostly Cloudy, None, Mostly Cloudy, None, Ove..."
2827,MAC003299,2012-09-25,30min,"[0.254, 0.201, 0.183, 0.2189999999999999, 0.18...",25008,Std,ACORN-C,Affluent,block_5,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[220.0, nan, 218.0, nan, 215.0, nan, 211.0, na...","[10.93, nan, 10.81, nan, 10.27, nan, 10.2, nan...","[7.76, nan, 8.07, nan, 8.04, nan, 7.62, nan, 7...","[989.26, nan, 989.27, nan, 989.0, nan, 988.78,...","[10.93, nan, 10.81, nan, 10.27, nan, 10.2, nan...","[4.9, nan, 4.98, nan, 4.45, nan, 4.51, nan, 4....","[rain, None, rain, None, rain, None, rain, Non...","[clear-night, None, clear-night, None, clear-n...","[0.81, nan, 0.83, nan, 0.86, nan, 0.84, nan, 0...","[Clear, None, Clear, None, Clear, None, Clear,..."
3389,MAC003157,2012-07-15,30min,"[0.181, 0.126, 0.13, 0.134, 0.18, 0.179, 0.118...",28464,ToU,ACORN-C,Affluent,block_6,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[250.0, nan, 259.0, nan, 267.0, nan, 284.0, na...","[12.72, nan, 12.72, nan, 12.98, nan, 12.63, na...","[12.01, nan, 12.0, nan, 12.32, nan, 11.83, nan...","[1011.2, nan, 1011.17, nan, 1011.23, nan, 1011...","[12.72, nan, 12.72, nan, 12.98, nan, 12.63, na...","[1.73, nan, 2.15, nan, 2.31, nan, 2.28, nan, 2...","[rain, None, rain, None, rain, None, rain, Non...","[clear-night, None, partly-cloudy-night, None,...","[0.95, nan, 0.95, nan, 0.96, nan, 0.95, nan, 0...","[Clear, None, Partly Cloudy, None, Partly Clou..."
3916,MAC000193,2012-01-01,30min,"[0.368, 0.386, 0.17, 0.021, 0.038, 0.038, 0.02...",37872,ToU,ACORN-D,Affluent,block_7,"[NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDAY, NO_HOLIDA...",...,"[229.0, nan, 238.0, nan, 229.0, nan, 231.0, na...","[12.12, nan, 12.59, nan, 12.45, nan, 12.03, na...","[10.97, nan, 11.02, nan, 11.04, nan, 10.94, na...","[1008.1, nan, 1007.88, nan, 1007.95, nan, 1007...","[12.12, nan, 12.59, nan, 12.45, nan, 12.03, na...","[5.9, nan, 6.06, nan, 5.31, nan, 4.68, nan, 4....","[rain, None, rain, None, rain, None, rain, Non...","[partly-cloudy-night, None, cloudy, None, part...","[0.93, nan, 0.9, nan, 0.91, nan, 0.93, nan, 0....","[Mostly Cloudy, None, Overcast, None, Mostly C..."


In [19]:
from src.utils.data_utils import compact_to_expanded

In [20]:
# Convert the compact frame into the expanded form. The recorded outputs show the
# compact frame holding array-valued cells per household and the expanded frame
# holding one row per timestamp.
static_columns = ["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"]
time_varying_columns = [
    'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
    'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
    'humidity', 'summary',
]
exp_block_df = compact_to_expanded(
    block_df,
    timeseries_col='energy_consumption',
    static_cols=static_columns,
    time_varying_cols=time_varying_columns,
    ts_identifier="LCLid",
)

exp_block_df.head()

  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-04-21 00:00:00,MAC000768,0.844,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251.0,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
1,2012-04-21 00:30:00,MAC000768,0.265,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,,,,,,,,,,
2,2012-04-21 01:00:00,MAC000768,0.262,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251.0,6.2,3.61,994.98,3.67,3.42,rain,partly-cloudy-night,0.83,Partly Cloudy
3,2012-04-21 01:30:00,MAC000768,0.234,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,,,,,,,,,,
4,2012-04-21 02:00:00,MAC000768,0.046,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,246.0,5.68,3.52,994.82,3.15,3.25,rain,clear-night,0.86,Clear


## Reduce Memory Footprint

In [21]:
from src.utils.data_utils import reduce_memory_footprint

In [22]:
# Memory baseline before dtype reduction; "deep" also measures the contents of object columns.
exp_block_df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 23471
Columns: 21 entries, timestamp to summary
dtypes: datetime64[ns](1), float64(9), int64(1), object(10)
memory usage: 3.0 GB


In [23]:
# Shrink the frame with the project helper. The `.info()` outputs recorded around this
# cell show object columns becoming `category`, 64-bit numerics becoming 32-bit, and
# memory usage dropping from 3.0 GB to 301.1 MB.
exp_block_df = reduce_memory_footprint(exp_block_df)

In [82]:
# Re-check dtypes and memory usage after the reduction.
exp_block_df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4711440 entries, 0 to 33263
Columns: 21 entries, timestamp to summary
dtypes: category(10), datetime64[ns](1), float32(8), int32(2)
memory usage: 301.1 MB


# Train Test Validation Split

We are going to keep the 2014 data as the validation and test period. We have 2 months (Jan and Feb) of data in 2014: Jan is the validation period and Feb is the test period.

In [28]:
# Hold out 2014: January rows are validation, February rows are test,
# and every remaining row is training data.
timestamps = exp_block_df.timestamp
in_2014 = timestamps.dt.year == 2014
val_mask = in_2014 & (timestamps.dt.month == 1)
test_mask = in_2014 & (timestamps.dt.month == 2)

train = exp_block_df[~val_mask & ~test_mask]
val = exp_block_df[val_mask]
test = exp_block_df[test_mask]
# Sanity check: split sizes and the date boundaries between the splits.
print(f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}")
print(f"Max Date in Train: {train.timestamp.max()} | Min Date in Validation: {val.timestamp.min()} | Min Date in Test: {test.timestamp.min()}")

# of Training samples: 4293840 | # of Validation samples: 223200 | # of Test samples: 194400
Max Date in Train: 2013-12-31 23:30:00 | Min Date in Validation: 2014-01-01 00:00:00 | Min Date in Test: 2014-02-01 00:00:00


In [30]:
# Persist each split (taken from the un-imputed frame) to its own parquet file.
for split_df, out_path in (
    (train, "data/london_smart_meters/preprocessed/selected_blocks_train.parquet"),
    (val, "data/london_smart_meters/preprocessed/selected_blocks_val.parquet"),
    (test, "data/london_smart_meters/preprocessed/selected_blocks_test.parquet"),
):
    split_df.to_parquet(out_path)

## Train Test Split after filling in missing values

In [31]:
from src.imputation.interpolation import SeasonalInterpolation

def _impute_series(values):
    """Fill the missing readings of one household's consumption array.

    Seasonality is removed and the gaps are interpolated by ``SeasonalInterpolation``.
    The seasonal period is 48*7 samples, presumably one week of half-hourly
    readings (the data's ``frequency`` column shows ``30min``).
    """
    imputer = SeasonalInterpolation(seasonal_period=48*7)
    # The imputer is given a single-column 2-D array; squeeze restores the 1-D shape.
    return imputer.fit_transform(values.reshape(-1, 1)).squeeze()


# Impute every household's series, with a progress bar.
block_df["energy_consumption"] = block_df["energy_consumption"].progress_apply(_impute_series)

  0%|          | 0/150 [00:00<?, ?it/s]

In [32]:
# Expand the compact frame again, this time with the imputed consumption series.
expand_kwargs = dict(
    timeseries_col='energy_consumption',
    static_cols=["frequency", "series_length", "stdorToU", "Acorn", "Acorn_grouped", "file"],
    time_varying_cols=[
        'holidays', 'visibility', 'windBearing', 'temperature', 'dewPoint',
        'pressure', 'apparentTemperature', 'windSpeed', 'precipType', 'icon',
        'humidity', 'summary',
    ],
    ts_identifier="LCLid",
)
exp_block_df = compact_to_expanded(block_df, **expand_kwargs)

exp_block_df.head()

  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,timestamp,LCLid,energy_consumption,frequency,series_length,stdorToU,Acorn,Acorn_grouped,file,holidays,...,windBearing,temperature,dewPoint,pressure,apparentTemperature,windSpeed,precipType,icon,humidity,summary
0,2012-04-21 00:00:00,MAC000768,0.844,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251.0,6.42,3.54,994.96,3.79,3.64,rain,partly-cloudy-night,0.82,Partly Cloudy
1,2012-04-21 00:30:00,MAC000768,0.265,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,,,,,,,,,,
2,2012-04-21 01:00:00,MAC000768,0.262,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,251.0,6.2,3.61,994.98,3.67,3.42,rain,partly-cloudy-night,0.83,Partly Cloudy
3,2012-04-21 01:30:00,MAC000768,0.234,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,,,,,,,,,,
4,2012-04-21 02:00:00,MAC000768,0.046,30min,32544,Std,ACORN-A,Affluent,block_1,NO_HOLIDAY,...,246.0,5.68,3.52,994.82,3.15,3.25,rain,clear-night,0.86,Clear


## Reduce Memory Footprint

In [33]:
from src.utils.data_utils import reduce_memory_footprint

In [34]:
# Memory baseline of the imputed frame before dtype reduction.
exp_block_df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 23471
Columns: 21 entries, timestamp to summary
dtypes: datetime64[ns](1), float64(9), int64(1), object(10)
memory usage: 3.0 GB


In [35]:
# Shrink the imputed frame with the same project helper used for the un-imputed one.
exp_block_df = reduce_memory_footprint(exp_block_df)

In [36]:
# Re-check dtypes and memory usage after the reduction.
exp_block_df.info(memory_usage="deep", verbose=False)

<class 'pandas.core.frame.DataFrame'>
Index: 4711440 entries, 0 to 23471
Columns: 21 entries, timestamp to summary
dtypes: category(10), datetime64[ns](1), float32(9), int32(1)
memory usage: 301.1 MB


In [37]:
# Same hold-out as for the un-imputed data: Jan 2014 is validation, Feb 2014 is test,
# and all other rows are training data.
ts = exp_block_df.timestamp.dt
val_mask = ts.year.eq(2014) & ts.month.eq(1)
test_mask = ts.year.eq(2014) & ts.month.eq(2)

train = exp_block_df.loc[~(val_mask | test_mask)]
val = exp_block_df.loc[val_mask]
test = exp_block_df.loc[test_mask]
# Sanity check: split sizes and the date boundaries between the splits.
print(f"# of Training samples: {len(train)} | # of Validation samples: {len(val)} | # of Test samples: {len(test)}")
print(f"Max Date in Train: {train.timestamp.max()} | Min Date in Validation: {val.timestamp.min()} | Min Date in Test: {test.timestamp.min()}")

# of Training samples: 4293840 | # of Validation samples: 223200 | # of Test samples: 194400
Max Date in Train: 2013-12-31 23:30:00 | Min Date in Validation: 2014-01-01 00:00:00 | Min Date in Test: 2014-02-01 00:00:00


In [38]:
# Persist the imputed splits next to the un-imputed ones, under distinct file names.
for split_df, out_path in (
    (train, "data/london_smart_meters/preprocessed/selected_blocks_train_missing_imputed.parquet"),
    (val, "data/london_smart_meters/preprocessed/selected_blocks_val_missing_imputed.parquet"),
    (test, "data/london_smart_meters/preprocessed/selected_blocks_test_missing_imputed.parquet"),
):
    split_df.to_parquet(out_path)