In [1]:
import pandas as pd
import utils as utl
import math
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

In [2]:
def build_some_features(df_, num_periods_ahead, num_periods_lagged=1, num_periods_diffed=0, rolling=[]):
    """
    Build lag, difference, rolling and cyclical calendar features from 'value'.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a 'value' column. Its index must expose ``weekday``,
        ``month`` and ``hour`` (e.g. a DatetimeIndex); the '1D' rolling window
        also needs a datetime-like index when ``rolling`` is non-empty.
    num_periods_ahead : int
        Accepted for interface compatibility; not used by this function.
    num_periods_lagged : int, default 1
        Adds 'lagged_1' .. 'lagged_k', each being ``value.shift(i)``.
    num_periods_diffed : int, default 0
        Adds 'diff_1' .. 'diff_k', each being ``value.diff(i)``.
    rolling : list, default []
        Aggregations (e.g. 'mean') applied over a '1D' rolling window; each
        adds a 'rolling_<stat>' column. The list is only iterated, never
        mutated, so the shared default is safe.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_`` with the feature columns appended. The input frame
        is not modified.
    """
    # Work on a copy so the caller's frame is left untouched.
    df_ = df_.copy()
    values = df_['value']

    # Lagged observations: value at t-1, t-2, ...
    for i in range(1, num_periods_lagged + 1):
        df_['lagged_%s' % str(i)] = values.shift(i)

    # Differences against the observation i periods back.
    for i in range(1, num_periods_diffed + 1):
        df_['diff_%s' % str(i)] = values.diff(i)

    # Rolling statistics over a one-day window.
    for stat in rolling:
        df_['rolling_%s' % str(stat)] = values.rolling('1D').aggregate(stat)

    # Cyclical encoding of the calendar components. Each component needs BOTH
    # the sine and the cosine of its angle: the pair places the value on the
    # unit circle, which a sine alone cannot do unambiguously.
    index = df_.index
    for name, component, period in (
        ('weekday', index.weekday, 7),
        ('month', index.month, 12),
        ('hour', index.hour, 24),
    ):
        angle = 2 * np.pi * np.asarray(component) / period
        df_['sin_%s' % name] = np.sin(angle)
        df_['cos_%s' % name] = np.cos(angle)

    return df_

In [3]:
# Load the raw measurements; later cells read its 'date' and 'variable' columns.
df = pd.read_csv('data/data.csv')

In [4]:
# Parse the whole 'date' column in one vectorised call instead of invoking
# pd.to_datetime once per row, which is far slower on large frames.
df["date"] = pd.to_datetime(df["date"])

In [5]:
# Use the timestamps as the index and put the rows in chronological order.
df = df.set_index('date').sort_index()

In [6]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [7]:
# One frame per area: keep the rows whose 'variable' matches the area label,
# then drop the now-constant 'variable' column.
df_area1, df_area2, df_area3 = (
    df.loc[df["variable"] == area_label].drop(columns="variable")
    for area_label in ("area_1", "area_2", "area_3")
)

In [8]:
# Put each area on a regular hourly grid, averaging the observations that fall
# in the same hour (hours without observations become NaN).
df_area1, df_area2, df_area3 = (
    frame.resample("h").mean()
    for frame in (df_area1, df_area2, df_area3)
)

In [9]:
# Fill each gap with the mean of the last value before it and the first value
# after it; existing values are unchanged because (x + x) / 2 == x.
df_area1, df_area2, df_area3 = (
    (frame.ffill() + frame.bfill()) / 2
    for frame in (df_area1, df_area2, df_area3)
)

In [10]:
# Chronologically sorted copies under the shorter names used from here on.
area_1, area_2, area_3 = (
    frame.sort_index()
    for frame in (df_area1, df_area2, df_area3)
)

In [11]:
# Hold out the last 168 rows (168 = 7 * 24) of every area as the test set;
# everything before them is used for training.
train_1, test_1 = area_1.iloc[:-168], area_1.iloc[-168:]
train_2, test_2 = area_2.iloc[:-168], area_2.iloc[-168:]
train_3, test_3 = area_3.iloc[:-168], area_3.iloc[-168:]

In [None]:
# Presumably forecasts the 168 periods following train_1 (same length as the
# held-out test_1) -- confirm against utl.predict_n_periods.
predictions = utl.predict_n_periods(
    series_=train_1,
    n_periods=168,
    model=GradientBoostingRegressor(random_state=1),
    num_periods_lagged=6,
    num_periods_diffed=1,
    weekday=True,
    month=False,
    rolling=[],
    hour=False,
)

In [None]:
# Mean absolute error of the forecast against the test_1 rows, with the
# arguments passed in scikit-learn's (y_true, y_pred) roles.
mean_absolute_error(y_true=test_1, y_pred=predictions) #1.49831455990887