# Step 2) Preparing weather data

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (including the local `tsm` helpers used below) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
from tsm.weather import interpolate_weather_data, add_ewm_lags

In [3]:
from tsm.data_utils import compress_memory_usage

In [5]:
from tsm.normalizers import scale_and_save_data_frame_columns, load_normalisers_and_scale_data_frame

## Process Train

In [6]:
import pandas as pd

In [7]:
# Load the training weather table from data/comp.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted step.
w_train = pd.read_pickle("data/comp/weather_train.pkl")

In [8]:
# Run the project's weather interpolation helper on the training frame.
# Its log reports the frame being split into 16 dataframes.
# NOTE(review): presumably one per site_id — confirm in tsm.weather.
w_train = interpolate_weather_data(w_train)

Splitter will return list of 16 dataframe


In [9]:
# Reduce the frame's memory footprint (the helper prints before/after usage).
# The second return value is not used in this notebook.
w_train, _ = compress_memory_usage(w_train)

HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=9, style=ProgressStyle…


Memory usage pre-compression was 6.531720161437988
Memory usage after-compression was 4.932145118713379
This is  75.51066176765822% of the initial size


In [10]:
# Weather measurement columns to be scaled; timestamp and site_id are left untouched.
col_to_normalize = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]

In [11]:
# Scale the selected training columns and save the fitted normalisers so the
# test set can be scaled with the same parameters later in this notebook.
# The 'data/objects/weather' store path shows up as the file-name prefix
# (data/objects/weather_<column>_normalizer.pkl) when the objects are loaded again.
w_train = scale_and_save_data_frame_columns(
    df=w_train,
    columns=col_to_normalize,
    store_path='data/objects/weather',
)

In [12]:
# Inspect the training frame after scaling.
w_train

Unnamed: 0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,2016-01-01 00:00:00,8,0.708279,0.70,0.900164,0.000000,0.670499,0.000000,0.000000
1,2016-01-01 01:00:00,8,0.700394,0.50,0.918167,0.002899,0.676884,0.194444,0.078947
2,2016-01-01 02:00:00,8,0.679369,0.30,0.918167,0.005797,0.676884,0.000000,0.000000
3,2016-01-01 03:00:00,8,0.657030,0.30,0.909984,0.005797,0.675607,0.000000,0.000000
4,2016-01-01 04:00:00,8,0.642576,0.30,0.900164,0.002899,0.674330,0.694444,0.136842
...,...,...,...,...,...,...,...,...,...
139768,2016-12-31 19:00:00,15,0.419185,0.25,0.441899,0.002899,0.515326,0.500000,0.300000
139769,2016-12-31 20:00:00,15,0.416557,0.30,0.427169,0.002899,0.513411,0.500000,0.405263
139770,2016-12-31 21:00:00,15,0.416557,0.30,0.454992,0.002899,0.514688,0.500000,0.268421
139771,2016-12-31 22:00:00,15,0.408673,0.30,0.463175,0.002899,0.521073,0.472222,0.242105


In [13]:
# Append the project's EWM lag features to the training frame.
# NOTE(review): presumably exponentially weighted means computed per site
# (the log again reports 16 splits) — confirm spans/alphas in tsm.weather.
w_train = add_ewm_lags(w_train)

Splitter will return list of 16 dataframe


In [14]:
# Compress again: the lag step added new columns (the progress bar now counts 37 instead of 9).
w_train, _ = compress_memory_usage(w_train)

HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=37, style=ProgressStyl…


Memory usage pre-compression was 35.85714054107666
Memory usage after-compression was 20.927773475646973
This is  58.36431226765799% of the initial size


In [15]:
# Persist the prepared training weather frame.
w_train.to_pickle("data/prep/weather_train.pkl")

## Process Test

In [21]:
# Load the test weather table from data/comp.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted step.
w_test = pd.read_pickle("data/comp/weather_test.pkl")

In [22]:
# Run the same interpolation helper that was applied to the training frame.
w_test = interpolate_weather_data(w_test)

Splitter will return list of 16 dataframe


In [23]:
# Reduce the test frame's memory footprint; the second return value is not used.
w_test, _ = compress_memory_usage(w_test)

HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=9, style=ProgressStyle…


Memory usage pre-compression was 12.95569896697998
Memory usage after-compression was 9.782904624938965
This is  75.51043482773508% of the initial size


In [24]:
# Inspect the test frame before scaling (values are still in their original units).
w_test

Unnamed: 0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,2017-01-01 00:00:00,8,17.799999,4.0,11.7,-2.0,1021.400024,100.0,3.6
1,2017-01-01 01:00:00,8,17.799999,2.0,12.8,0.0,1022.000000,130.0,3.1
2,2017-01-01 02:00:00,8,16.100000,0.0,12.8,0.0,1021.900024,140.0,3.1
3,2017-01-01 03:00:00,8,17.200001,0.0,13.3,0.0,1022.200012,140.0,3.1
4,2017-01-01 04:00:00,8,16.700001,2.0,13.3,0.0,1022.299988,130.0,2.6
...,...,...,...,...,...,...,...,...,...
277238,2018-12-31 19:00:00,7,-0.800000,-1.0,-4.3,2.0,1019.599976,50.0,3.1
277239,2018-12-31 20:00:00,7,-1.200000,-1.0,-4.4,2.0,1018.299988,60.0,4.1
277240,2018-12-31 21:00:00,7,-1.400000,-1.0,-4.4,2.0,1018.400024,60.0,3.1
277241,2018-12-31 22:00:00,7,-1.700000,-1.0,-3.5,2.0,1017.799988,60.0,3.6


In [25]:
# Scale the test frame with the normalisers saved during training, rather than
# fitting new ones on the test data.
# NOTE(review): training saved with store_path='data/objects/weather' but this loads
# from 'data/objects'. The log shows the weather_* files are found, so the helper
# appears to search the directory — confirm it cannot pick up unrelated normalisers
# stored in the same folder.
w_test = load_normalisers_and_scale_data_frame(df=w_test, store_path='data/objects')

Found data/objects/weather_air_temperature_normalizer.pkl for air_temperature
Found data/objects/weather_cloud_coverage_normalizer.pkl for cloud_coverage
Found data/objects/weather_dew_temperature_normalizer.pkl for dew_temperature
Found data/objects/weather_precip_depth_1_hr_normalizer.pkl for precip_depth_1_hr
Found data/objects/weather_sea_level_pressure_normalizer.pkl for sea_level_pressure
Found data/objects/weather_wind_direction_normalizer.pkl for wind_direction
Found data/objects/weather_wind_speed_normalizer.pkl for wind_speed


In [26]:
# Append the EWM lag features; the result gains columns named
# <column>_ewm_001 / _005 / _01 / _02 for each weather measurement.
w_test = add_ewm_lags(w_test)

Splitter will return list of 16 dataframe


In [27]:
# Inspect the scaled test frame together with its new lag columns.
w_test

Unnamed: 0,timestamp,site_id,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,air_temperature_ewm_001,...,sea_level_pressure_ewm_01,sea_level_pressure_ewm_02,wind_direction_ewm_001,wind_direction_ewm_005,wind_direction_ewm_01,wind_direction_ewm_02,wind_speed_ewm_001,wind_speed_ewm_005,wind_speed_ewm_01,wind_speed_ewm_02
0,2017-01-01 00:00:00,8,0.613666,0.5,0.764321,0.000000,0.692210,0.277778,0.189474,0.613666,...,0.692210,0.692210,0.277778,0.277778,0.277778,0.277778,0.189474,0.189474,0.189474,0.189474
1,2017-01-01 01:00:00,8,0.613666,0.3,0.782324,0.005797,0.699872,0.361111,0.163158,0.613666,...,0.696243,0.696243,0.319654,0.320513,0.321637,0.321637,0.176250,0.175978,0.175623,0.175623
2,2017-01-01 02:00:00,8,0.591327,0.1,0.782324,0.005797,0.698596,0.388889,0.163158,0.606145,...,0.697111,0.697111,0.342965,0.344483,0.346453,0.346453,0.171842,0.171484,0.171023,0.171023
3,2017-01-01 03:00:00,8,0.605782,0.1,0.790507,0.005797,0.702427,0.388889,0.163158,0.606053,...,0.698657,0.698657,0.354619,0.356453,0.358793,0.358793,0.169638,0.169240,0.168736,0.168736
4,2017-01-01 04:00:00,8,0.599212,0.3,0.790507,0.005797,0.703704,0.361111,0.136842,0.604657,...,0.699889,0.699889,0.355944,0.357483,0.359359,0.359359,0.162946,0.162079,0.160948,0.160948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277238,2018-12-31 19:00:00,7,0.369251,0.0,0.502455,0.011594,0.669221,0.138889,0.163158,0.281538,...,0.705524,0.705524,0.452178,0.262821,0.193830,0.193830,0.153679,0.127680,0.120315,0.120315
277239,2018-12-31 20:00:00,7,0.363995,0.0,0.500818,0.011594,0.652618,0.166667,0.215789,0.282363,...,0.700234,0.700234,0.449323,0.258014,0.191114,0.191114,0.154300,0.132086,0.129862,0.129862
277240,2018-12-31 21:00:00,7,0.361367,0.0,0.500818,0.011594,0.653895,0.166667,0.163158,0.283153,...,0.695600,0.695600,0.446496,0.253446,0.188669,0.188669,0.154389,0.133639,0.133192,0.133192
277241,2018-12-31 22:00:00,7,0.357424,0.0,0.515548,0.011594,0.646233,0.166667,0.189474,0.283896,...,0.690663,0.690663,0.443698,0.249107,0.186469,0.186469,0.154740,0.136431,0.138820,0.138820


In [28]:
# Compress again now that the lag columns have been added.
w_test, _ = compress_memory_usage(w_test)

HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=37, style=ProgressStyl…


Memory usage pre-compression was 71.12347316741943
Memory usage after-compression was 41.51072597503662
This is  58.36431226765799% of the initial size


In [29]:
# Persist the prepared test weather frame.
w_test.to_pickle("data/prep/weather_test.pkl")

## fin