In [1]:
import pandas as pd
import numpy as np

In [2]:
# Put the folder with the project's custom packages first on the module search path,
# so that the three project-local imports below can be resolved.
# NOTE: the path is relative, so it is resolved against the notebook's working directory.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP
from tools import find_multiple_sets
from LagsCreator.LagsCreator import LagsCreator

## Dataset

In [3]:
# Folder holding the CSV files of the time-series (relative to the notebook's working directory).
PATH_TO_DATA_FOLDER = "../../Dataset time-series/"

In [4]:
# Training sets: the two header rows become the two column levels and the
# first CSV column becomes a daily datetime index named "Datetime".
freq = "D"
train = pd.read_csv(PATH_TO_DATA_FOLDER + "train_smooth.csv", header = [0, 1], index_col = 0)
train.index = pd.to_datetime(train.index).rename("Datetime")
train.index.freq = freq

In [5]:
# Test sets: the two header rows become the two column levels and the
# first CSV column becomes a daily datetime index named "Datetime".
freq = "D"
test = pd.read_csv(PATH_TO_DATA_FOLDER + "test_target.csv", header = [0, 1], index_col = 0)
test.index = pd.to_datetime(test.index).rename("Datetime")
test.index.freq = freq

In [6]:
# Whole time-series of the fcs indicator: the two header rows become the two column
# levels and the first CSV column becomes a daily datetime index named "Datetime".
freq = "D"
target = pd.read_csv(PATH_TO_DATA_FOLDER + "all_target.csv", header = [0, 1], index_col = 0)
target.index = pd.to_datetime(target.index).rename("Datetime")
target.index.freq = freq

In [7]:
# Number of forecast steps: the forecasting loop trains one model for each step 1..TEST_SIZE.
TEST_SIZE = 30
# Frequency of the training index (assigned when the training data is loaded).
FREQ = train.index.freq

In [8]:
# Independent copy of the training frame; the province and indicator names are read from it.
TRAIN = train.copy()

In [9]:
# Distinct names found in the first column level (one entry per province).
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [10]:
# Distinct names found in the second column level (one entry per indicator).
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [11]:
# Split the training frame and the test frame with the same helper, in that order.
TRAIN_NORMALIZED_SETS, TEST_TARGET_SETS = map(find_multiple_sets, (train, test))

## Forecasting

In [12]:
# Show the indicator names again as a reference for the lag configuration defined next.
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [13]:
# Number of lags for each indicator (insertion order is kept as listed here).
lags_dict = {
    "3 Months Anomaly (%) Rainfall": 10,
    "1 Month Anomaly (%) Rainfall": 10,
    "Cereals and tubers": 10,
    "Exchange rate (USD/LCU)": 10,
    "FCS": 10,
    "Fatality": 10,
    "NDVI Anomaly": 10,
    "Rainfall (mm)": 10,
    "rCSI": 10,
    "Lat": 0,
    "Lon": 0,
    "Population": 0,
    "Ramadan": 10,
}

# N.B. If the lag is set to 0, the feature is considered static. If no lag is provided, the feature is not used as a predictor.

In [14]:
# Display the lag configuration.
lags_dict

{'3 Months Anomaly (%) Rainfall': 10,
 '1 Month Anomaly (%) Rainfall': 10,
 'Cereals and tubers': 10,
 'Exchange rate (USD/LCU)': 10,
 'FCS': 10,
 'Fatality': 10,
 'NDVI Anomaly': 10,
 'Rainfall (mm)': 10,
 'rCSI': 10,
 'Lat': 0,
 'Lon': 0,
 'Population': 0,
 'Ramadan': 10}

In [15]:
import xgboost as xgb

In [16]:
# Forecasting with one model per prediction step.
# For every step h (0-based here, passed to LagsCreator as h+1) a separate XGBoost regressor
# is fitted on the samples pooled over all training sets and all provinces, and is then used
# to fill the h-th row of every test set in FORECASTING.
FORECASTING = test.copy()
for h in range(TEST_SIZE):
    X_list, y_list, X_test_list = list(), list(), list()
    for train_normalized in TRAIN_NORMALIZED_SETS:
        # Create training samples.
        X_test_list_set = list()
        for PROVINCE in PROVINCES:
            creator = LagsCreator(train_normalized[[PROVINCE]], lags_dictionary = lags_dict, target = "FCS")
            # X_test is presumably the input built from the end of this training set,
            # to be used for the out-of-sample prediction -- TODO confirm against LagsCreator.
            X, y, _, _, X_test = creator.to_supervised(n_out = TEST_SIZE, single_step = True, h = h+1, return_dataframe = True,
                                                       feature_time = True, validation = False, return_single_level = True,
                                                       dtype = np.float32)
            X_list.append(X)
            y_list.append(y)
            X_test_list_set.append(X_test)

        # One frame of prediction inputs per training set, with one row per province.
        X_test_list.append(pd.concat(X_test_list_set))

    X = pd.concat(X_list)
    y = pd.concat(y_list)

    model = xgb.XGBRegressor(n_estimators = 100)
    model.fit(X, y)

    # Prediction.
    # NOTE(review): X_test_list[i] is paired with TEST_TARGET_SETS[i] by position, which assumes
    # the training sets and test sets come in the same order -- confirm against find_multiple_sets.
    for i, test_set in enumerate(TEST_TARGET_SETS):
        for PROVINCE in PROVINCES:
            x_sample = X_test_list[i].loc[PROVINCE]
            # Turn the selected row back into a one-row frame, as expected by predict.
            x_sample = x_sample.to_frame().transpose()
            y_hat = model.predict(x_sample)[0]
            # Assign with a single .loc[row, column] call. The chained form
            # FORECASTING[column].loc[row] = value writes into an intermediate Series, which
            # pandas does not guarantee to propagate to FORECASTING (and which never does
            # when Copy-on-Write is enabled).
            FORECASTING.loc[test_set.index[h], (PROVINCE, "FCS")] = y_hat

             1 Month Anomaly (%) Rainfall | x(t-9)  \
AdminStrata                                          
Abyan                                   341.451904   

             1 Month Anomaly (%) Rainfall | x(t-8)  \
AdminStrata                                          
Abyan                                   341.278809   

             1 Month Anomaly (%) Rainfall | x(t-7)  \
AdminStrata                                          
Abyan                                   341.108459   

             1 Month Anomaly (%) Rainfall | x(t-6)  \
AdminStrata                                          
Abyan                                   340.934174   

             1 Month Anomaly (%) Rainfall | x(t-5)  \
AdminStrata                                          
Abyan                                   340.753693   

             1 Month Anomaly (%) Rainfall | x(t-4)  \
AdminStrata                                          
Abyan                                   340.573364   

             1 Month A

NameError: name 'dedeed' is not defined

In [18]:
# Split the forecast frame with the same helper used for the test frame,
# so that the two can be compared set by set.
forecasting_target_sets = find_multiple_sets(FORECASTING)

In [19]:
from sklearn.metrics import mean_squared_error

# Loss: the mean squared error of each (test set, forecast set) pair, averaged over the pairs.
loss = np.mean(list(map(mean_squared_error, TEST_TARGET_SETS, forecasting_target_sets)))
loss

22.05582170460428

In [20]:
# Plot time-series.
TsIP(FORECASTING, target).interactive_plot_df(title = "Forecasting", matplotlib = False, style = "lines", comparison = True)

interactive(children=(RadioButtons(description='Select:', options=('Time-series', 'Missing values'), value='Ti…