# Step 2) Preparing weather data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from tsm.data_selector import split_data_frame_by_column

In [5]:
from tsm.data_utils import compress_memory_usage

In [6]:
# Per-column placeholder values that interpolate_weather_data() swaps for NaN
# before interpolating. NOTE(review): presumably these are the data set's
# "missing reading" codes -- confirm against the data source.
NA_REPLACER = dict(
    air_temperature=99,
    cloud_coverage=99,
    dew_temperature=99,
    precip_depth_1_hr=999,
    sea_level_pressure=9999,
    wind_direction=999,
    wind_speed=99,
)

In [11]:
def interpolate_weather_data(data):
    """Replace sentinel values with NaN and time-interpolate each site's columns.

    The frame is split per ``site_id``; within each site every column except
    ``site_id`` has its ``NA_REPLACER`` sentinel (when one is registered for
    that column) turned into NaN and is then interpolated along the
    ``timestamp`` index with ``method='time'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``site_id`` and ``timestamp`` columns; ``timestamp`` must
        be parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        All sites concatenated, with ``timestamp`` restored as a regular
        (first) column and a fresh integer index.
    """
    sites_data = []
    for site_data in split_data_frame_by_column(data, by='site_id', drop=False):
        # Work on a private copy so the caller's frame (or a slice of it
        # returned by the splitter) is never mutated.
        site_data = site_data.copy()
        site_data['timestamp'] = pd.to_datetime(site_data['timestamp'])
        site_data = site_data.set_index('timestamp')

        for col in site_data.columns:
            if col == 'site_id':
                continue

            series = site_data[col]
            # Columns without a registered sentinel (e.g. features added by an
            # earlier run of this notebook) are left as-is instead of raising
            # KeyError.
            if col in NA_REPLACER:
                # Assign the result back explicitly: an in-place replace on
                # ``site_data[col]`` is a chained mutation that pandas does
                # not guarantee will reach the parent frame.
                series = series.replace(NA_REPLACER[col], np.nan)
            # A column that is entirely NaN has nothing to interpolate from.
            if series.notna().any():
                series = series.interpolate(method='time')
            site_data[col] = series

        sites_data.append(site_data)

    return pd.concat(sites_data).reset_index()

In [8]:
def add_ewm_lags(data):
    """Append exponentially weighted moving averages of every weather column.

    The frame is split per ``site_id`` so the averages never mix readings
    from different sites. For each column other than ``site_id`` and
    ``timestamp`` four new columns are added, named ``<col>_ewm_001``,
    ``<col>_ewm_005``, ``<col>_ewm_01`` and ``<col>_ewm_02``, holding
    ``Series.ewm(alpha=...).mean()`` for alpha 0.01, 0.05, 0.1 and 0.2.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``site_id`` column; rows are used in their given order.

    Returns
    -------
    pandas.DataFrame
        All sites concatenated, original columns followed by the new features.
    """
    # Column-name suffix -> smoothing factor. ``_ewm_02`` uses alpha=0.2; it
    # used to reuse 0.1, which made it a duplicate of ``_ewm_01``.
    ewm_alphas = {
        '_ewm_001': 0.01,
        '_ewm_005': 0.05,
        '_ewm_01': 0.1,
        '_ewm_02': 0.2,
    }

    sites_data = []
    for site_data in split_data_frame_by_column(data, by='site_id', drop=False):
        # Copy so that adding columns never writes into the caller's frame.
        site_data = site_data.copy()
        # Snapshot the source columns before any feature column is added.
        base_columns = [
            col for col in site_data.columns
            if col not in ('site_id', 'timestamp')
        ]
        for col in base_columns:
            for suffix, alpha in ewm_alphas.items():
                site_data[col + suffix] = site_data[col].ewm(alpha=alpha).mean()

        sites_data.append(site_data)

    return pd.concat(sites_data)

In [19]:
# Path of the weather pickle that this notebook loads, processes and then
# writes back to (the same variable is used for both the read and the write).
weather_file = "data/comp/weather_test.pkl"

In [20]:
# Full preparation pipeline: load the pickle, replace sentinels and interpolate
# per site, add the EWM feature columns, then shrink the frame's memory use.
# compress_memory_usage returns a pair; only the first element (the frame) is
# kept. The meaning of the `{}` argument and of the discarded second value is
# defined in tsm.data_utils and is not visible here.
# NOTE: pd.read_pickle can execute arbitrary code -- only load trusted files.
weather_data, _ = compress_memory_usage(add_ewm_lags(interpolate_weather_data(pd.read_pickle(weather_file))), {})

Splitter will return list of 16 dataframe
Splitter will return list of 16 dataframe


HBox(children=(IntProgress(value=0, description='DataFrame: compress_memory_usage', max=37, style=ProgressStyl…


Memory usage pre-compression was 74.29626750946045
Memory usage after-compression was 41.51072597503662
This is  55.87188612099644% of the initial size


In [21]:
# Writes the processed frame back to the path it was loaded from, replacing the
# original file. NOTE(review): a second run of the notebook will therefore
# start from already-processed data -- keep a copy of the raw pickle if a clean
# re-run is needed.
weather_data.to_pickle(weather_file)