In [1]:
import pandas as pd
import numpy as np

In [2]:
# Add the python path to the folder containing some useful custom packages.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP

## Dataset

In [3]:
# Folder holding the CSV time-series, given relative to the notebook's working directory.
PATH_TO_DATA_FOLDER = "../../Dataset time-series/"

In [4]:
# Read the training data: the two header rows become a two-level column index,
# the first column becomes the row index.
freq = "D"
train = pd.read_csv(PATH_TO_DATA_FOLDER + "train_smooth.csv", header = [0, 1], index_col = 0)
# Parse the index as dates, label it and declare its (daily) frequency.
train.index = pd.to_datetime(train.index)
train.index.name = "Datetime"
train.index.freq = freq

In [5]:
# Read `test_target.csv`: the two header rows become a two-level column index,
# the first column becomes the row index.
freq = "D"
test = pd.read_csv(PATH_TO_DATA_FOLDER + "test_target.csv", header = [0, 1], index_col = 0)
# Parse the index as dates, label it and declare its (daily) frequency.
test.index = pd.to_datetime(test.index)
test.index.name = "Datetime"
test.index.freq = freq

In [6]:
# Read `all_target.csv`: the two header rows become a two-level column index,
# the first column becomes the row index.
freq = "D"
target = pd.read_csv(PATH_TO_DATA_FOLDER + "all_target.csv", header = [0, 1], index_col = 0)
# Parse the index as dates, label it and declare its (daily) frequency.
target.index = pd.to_datetime(target.index)
target.index.name = "Datetime"
target.index.freq = freq

In [7]:
# Number of forecast horizons: the forecasting cell trains one model per horizon.
# NOTE(review): presumably has to equal the number of rows of the test set, since the
# forecasts are later written column-wise into a copy of it — confirm.
TEST_SIZE = 30
# Frequency attached to the training index (assigned from `freq` when the data is loaded).
FREQ = train.index.freq

In [8]:
# Working copies, so that the frames loaded above are not modified by the cells below.
TRAIN = train.copy()
# `TEST` is read by the prediction and error cells further down but was never assigned:
# on a fresh kernel they would fail with a NameError.
TEST = test.copy()

In [9]:
# Distinct names of the outer column level, in order of appearance.
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [10]:
# Distinct names of the inner column level, in order of appearance.
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

## Forecasting

In [35]:
# NOTE(review): the output saved below lists different indicators than the `PREDICTORS`
# output of cell [10] (e.g. 'NDVI' and 'Exchange rate (unofficial) (USD/LCU)'), so the
# two cells were presumably executed against different data — re-run from a fresh kernel.
PREDICTORS

Index(['3 Months Anomaly (%) Rainfall', 'Cereals and tubers',
       'Exchange rate (unofficial) (USD/LCU)', 'FCS', 'Fatality', 'Lat', 'Lon',
       'NDVI', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [37]:
# Number of lags used for each indicator.
# N.B. A value of 0 marks the feature as static (used as is, without lags). An indicator
# that is missing from this dictionary is not used as predictor at all.
lags_dict = {
    "3 Months Anomaly (%) Rainfall": 10,
    "Cereals and tubers": 10,
    "Exchange rate (unofficial) (USD/LCU)": 10,
    "FCS": 10,
    "Fatality": 10,
    "NDVI": 10,
    "NDVI Anomaly": 10,
    "Rainfall (mm)": 10,
    "rCSI": 10,
    "Lat": 0,
    "Lon": 0,
    "Population": 0,
    "Ramadan": 10,
}

In [38]:
# Display the lag configuration that the forecasting cells will use.
lags_dict

{'3 Months Anomaly (%) Rainfall': 10,
 'Cereals and tubers': 10,
 'Exchange rate (unofficial) (USD/LCU)': 10,
 'FCS': 10,
 'Fatality': 10,
 'NDVI': 10,
 'NDVI Anomaly': 10,
 'Rainfall (mm)': 10,
 'rCSI': 10,
 'Lat': 0,
 'Lon': 0,
 'Population': 0,
 'Ramadan': 10}

In [39]:
def create_lags(group, lags_dict, label, n_out = 1):
    """Build the supervised-learning samples of a single administrative area.

    Parameters
    ----------
    group : pd.DataFrame
        Time-series of one area. The columns are a two-level index (area, indicator);
        the area name is read from the first column. The index is assumed to be a
        daily DatetimeIndex (the forecast date is computed in days).
    lags_dict : dict
        Maps an indicator name to its number of lags. A value of 0 marks a static
        feature, used without lags. Indicators that are not listed are ignored.
    label : str
        Name of the indicator to predict.
    n_out : int, default 1
        Forecast horizon: the label lies `n_out` steps after the most recent lag.

    Returns
    -------
    X : pd.DataFrame
        Training features: the lagged/static predictors plus the "Day", "Month" and
        "Year" of the date of the corresponding label.
    y : pd.DataFrame
        Single-column frame with the labels, named "<label>(t)" when `n_out` is 1
        and "<label>(t+<n_out>)" otherwise. Rows match `X` by position.
    X_test : pd.DataFrame
        One row, indexed by the area name, built from the most recent observations
        and used to predict the value `n_out` days after the last available date.

    Raises
    ------
    ValueError
        If `n_out` is smaller than 1, or if features and labels end up with a
        different number of rows (e.g. missing values inside the predictors).
    KeyError
        If a predictor of `lags_dict` is not a column of `group`.
    """
    if n_out < 1:
        raise ValueError("n_out must be a positive integer, got %r." % (n_out,))

    adminstrata = group.columns[0][0]
    group = group[adminstrata]

    # Fail early, with a readable message, when a requested predictor is absent.
    missing = [feature for feature in lags_dict if feature not in group.columns]
    if missing:
        raise KeyError("Predictors not found for '%s': %s" % (adminstrata, missing))

    # The training columns and the out-of-sample row are built in the same pass so
    # that they always share the same column names and order.
    train_blocks = list()
    test_values = dict()
    for feature, lags in lags_dict.items():
        serie = group[feature]
        if lags != 0:
            # Dynamic predictor: input sequence (t-n, ..., t-1).
            steps = range(lags, 0, -1)
            names = ["%s(t-%d)" % (feature, i) for i in steps]
            feature_lags = pd.concat([serie.shift(i) for i in steps], axis = 1)
            feature_lags.columns = names
            train_blocks.append(feature_lags)
            # Explicit positional access: integer keys in `serie[-i]` are deprecated
            # on a datetime-indexed Series and will be treated as labels.
            for name, i in zip(names, steps):
                test_values[name] = serie.iloc[-i]
        else:
            # Static predictor: used as is.
            train_blocks.append(serie)
            test_values[feature] = serie.iloc[-1]

    # Matrix of feature lags: the first rows, whose lags are incomplete, are dropped.
    X = pd.concat(train_blocks, axis = 1).dropna()
    if n_out != 1:
        # The last rows have no label `n_out` steps ahead.
        X = X.iloc[:-(n_out - 1)]

    # Create labels.
    y = group[label].iloc[max(lags_dict.values()) + n_out - 1:].to_frame()
    if n_out == 1:
        y.columns = ["%s(t)" % label]
    else:
        y.columns = ["%s(t+%d)" % (label, n_out)]

    if len(X) != len(y):
        raise ValueError("Features and labels are misaligned (%d vs %d rows): check for "
                         "missing values in the predictors." % (len(X), len(y)))

    # Create feature time: calendar information of the date of the label.
    time_features = pd.DataFrame({"Day": y.index.day, "Month": y.index.month, "Year": y.index.year},
                                 index = X.index)
    X = pd.concat([X, time_features], axis = 1)

    # Create x sample test. Its date is the last observed date plus `n_out` days, the
    # same offset that separates the most recent lag from the label in the training set.
    future_step = group.index[-1] + pd.Timedelta(days = n_out)
    test_values.update(Day = future_step.day, Month = future_step.month, Year = future_step.year)
    X_test = pd.DataFrame([test_values], index = [adminstrata])

    return X, y, X_test

In [40]:
import xgboost as xgb

In [41]:
# Direct multi-step forecasting: for every horizon a dedicated model is trained on the
# samples of all the provinces stacked together, then queried once per province.
FORECASTING = {PROVINCE: list() for PROVINCE in PROVINCES}

for h in range(TEST_SIZE):
    # (X, y, X_test) of each province for a target that lies h + 1 steps ahead.
    samples = [create_lags(TRAIN[[PROVINCE]], lags_dict, n_out = h + 1, label = "FCS")
               for PROVINCE in PROVINCES]
    X_list, y_list, X_test_list = map(list, zip(*samples))

    X, y, X_test_tot = pd.concat(X_list), pd.concat(y_list), pd.concat(X_test_list)

    model = xgb.XGBRegressor(n_estimators = 100)
    model.fit(X, y)

    # Prediction: one value per province for the current horizon.
    for PROVINCE in PROVINCES:
        x_sample = X_test_tot.loc[PROVINCE].to_frame().T
        FORECASTING[PROVINCE].append(model.predict(x_sample)[0])



In [42]:
# Start from a copy of the test frame and overwrite the "FCS" column of every province
# with the forecasts collected above.
PREDICTION = TEST.copy()
for PROVINCE, forecast in FORECASTING.items():
    PREDICTION[(PROVINCE, "FCS")] = forecast

In [43]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Mean squared error of the forecasts against the test values.
mean_squared_error(y_true = TEST, y_pred = PREDICTION)

15.983878573594463

In [44]:
# NOTE(review): neither `fcs` nor `plot_comparison` is defined or imported in the visible
# cells — presumably left over from an earlier session; confirm where they come from
# before relying on a Restart & Run All.
plot_comparison(fcs, PREDICTION)

interactive(children=(ToggleButtons(description='AdminStrata', options=('Abyan', 'Aden', 'Al Bayda', "Al Dhale…