In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Make the project's custom packages importable by prepending their folder to the module search path.
# NOTE: the folder is given as a relative path, so it is resolved against the kernel's current working directory.
# NOTE(review): `TsIP` is not used in the cells visible here; confirm it is needed further down.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP
from tools import find_multiple_sets
from LagsCreator.LagsCreator import LagsCreator

## Dataset

In [3]:
# Country under analysis; its name is used as the sub-folder from which the data files are read.
COUNTRY = "Yemen"

In [None]:
# Folder holding the CSV files of the selected country (the trailing slash is kept because file names are appended directly).
PATH_TO_DATA_FOLDER = f"../../Dataset time-series/data/{COUNTRY}/"

In [4]:
# Read the smoothed training data: the CSV has a two-row column header and its first column is used as index.
freq = "D"
train_smooth = pd.read_csv(f"{PATH_TO_DATA_FOLDER}train_smooth.csv", header = [0, 1], index_col = 0)
# Turn the index into datetimes named "Datetime", then declare it as daily.
train_smooth.index = pd.to_datetime(train_smooth.index).rename("Datetime")
train_smooth.index.freq = freq

In [5]:
# Read the test targets: the CSV has a two-row column header and its first column is used as index.
freq = "D"
test = pd.read_csv(f"{PATH_TO_DATA_FOLDER}test_target.csv", header = [0, 1], index_col = 0)
# Turn the index into datetimes named "Datetime", then declare it as daily.
test.index = pd.to_datetime(test.index).rename("Datetime")
test.index.freq = freq

In [6]:
# Read the complete time-series of the FCS indicator: two-row column header, first column used as index.
freq = "D"
target = pd.read_csv(f"{PATH_TO_DATA_FOLDER}all_target.csv", header = [0, 1], index_col = 0)
# Turn the index into datetimes named "Datetime", then declare it as daily.
target.index = pd.to_datetime(target.index).rename("Datetime")
target.index.freq = freq

In [8]:
# Work on a copy of the smoothed training data so that `train_smooth` itself is not modified by later steps.
TRAIN = train_smooth.copy()

In [7]:
# Number of prediction horizons: the training loop fits one model per horizon 1..TEST_SIZE,
# and the same value is passed to LagsCreator as `n_out`.
TEST_SIZE = 30
# Frequency attached to the index of the training data.
# NOTE(review): presumably the daily frequency ("D") assigned when the CSV was loaded; confirm it survives the copy.
FREQ = TRAIN.index.freq

In [9]:
# Province names: the distinct labels found on the outer level of the column index.
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [10]:
# Indicator names: the distinct labels found on the inner level of the column index.
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [11]:
# Build the collections of training frames and of test-target frames with the project helper `find_multiple_sets`.
# Each element of TRAIN_SETS is later indexed by province in the training loop.
# NOTE(review): presumably the helper splits a frame into its separate contiguous sub-series; confirm in `tools`.
TRAIN_SETS = find_multiple_sets(TRAIN)
TEST_TARGET_SETS = find_multiple_sets(test)

## Training & Validation

In [12]:
# Lag setting for each indicator, passed to LagsCreator as `lags_dictionary`.
# NOTE(review): Lat, Lon and Population are set to 0; presumably this marks them as features
# without lagged history. Confirm the meaning of 0 in LagsCreator.
lags_dict = {
    "1 Month Anomaly (%) Rainfall": 2,
    "3 Months Anomaly (%) Rainfall": 1,
    "Cereals and tubers": 2,
    "Exchange rate (USD/LCU)": 5,
    "FCS": 2,
    "Fatality": 2,
    "NDVI Anomaly": 2,
    "Rainfall (mm)": 2,
    "rCSI": 2,
    "Lat": 0,
    "Lon": 0,
    "Population": 0,
    "Ramadan": 1,
}

In [13]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [14]:
# Validation loss of each prediction horizon (key: horizon, value: MSE), filled in by the training loop.
val_losses_h = {}

In [16]:
# Direct multi-horizon training: for every prediction horizon 1..TEST_SIZE, pool the supervised samples of
# all training sets and provinces, fit one XGBoost regressor, and record its validation MSE in `val_losses_h`.
for h in range(TEST_SIZE):
    # Per-horizon accumulators for the samples produced by each (training set, province) pair.
    X_train_list, y_train_list, X_val_list, y_val_list = list(), list(), list(), list()
    for train in TRAIN_SETS:
        # Create training and validation samples.
        for PROVINCE in PROVINCES:
            # Restrict the frame to the columns of the current province and target the "FCS" indicator.
            # NOTE(review): the constructor arguments do not depend on `h`, yet the creator is rebuilt for every
            # horizon; it could be built once per (set, province) if `to_supervised` has no side effects. Confirm.
            creator = LagsCreator(train[[PROVINCE]], lags_dictionary = lags_dict, n_out = TEST_SIZE, target = "FCS", 
                                  return_dataframe = True)
            # Horizons are 1-based, hence `h+1`. The fifth returned value is not used here.
            # NOTE(review): presumably `single_step = True` selects only the h-th step ahead as output; verify in LagsCreator.
            X_train, y_train, X_val, y_val, _ = creator.to_supervised(single_step = True, h = h+1, feature_time = True, 
                                                                      validation = True, dtype = np.float64)
            X_train_list.append(X_train)
            y_train_list.append(y_train)
            X_val_list.append(X_val)
            y_val_list.append(y_val)  

    # Stack the samples of all provinces and training sets; the original row labels are discarded.
    # Note that `X_train`/`y_train` are rebound here from the last per-province frames to the pooled frames.
    X_train = pd.concat(X_train_list).reset_index(drop = True)
    y_train = pd.concat(y_train_list).reset_index(drop = True)
    
    print("Training and validation for prediction horizon: %d" % (h+1))
    # Train the model.
    # `model` is rebound at every iteration, so only the last horizon's model is still available after the loop.
    model = xgb.XGBRegressor(n_estimators = 100, objective = "reg:squarederror")   
    model.fit(X_train, y_train)  
    
    # Pool the validation samples in the same way as the training samples.
    X_val = pd.concat(X_val_list).reset_index(drop = True)
    y_val = pd.concat(y_val_list).reset_index(drop = True)
    
    # Validation.
    y_hats = model.predict(X_val)
    # Compute validation error.
    # Mean squared error between the flattened validation targets and the predictions, stored under the 1-based horizon.
    val_loss = mean_squared_error(y_val.values.flatten(), y_hats)
    val_losses_h[h+1] = val_loss

Training and validation for prediction horizon: 1
Training and validation for prediction horizon: 2
Training and validation for prediction horizon: 3
Training and validation for prediction horizon: 4
Training and validation for prediction horizon: 5
Training and validation for prediction horizon: 6
Training and validation for prediction horizon: 7
Training and validation for prediction horizon: 8
Training and validation for prediction horizon: 9
Training and validation for prediction horizon: 10
Training and validation for prediction horizon: 11
Training and validation for prediction horizon: 12
Training and validation for prediction horizon: 13
Training and validation for prediction horizon: 14
Training and validation for prediction horizon: 15
Training and validation for prediction horizon: 16
Training and validation for prediction horizon: 17


KeyboardInterrupt: 