In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import shutil
from IPython.display import clear_output

In [2]:
# Add the python path to the folder containing some useful custom packages.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP
from tools import find_multiple_sets
from LagsCreator.LagsCreator import LagsCreator

In [3]:
# Create workspace: every run starts from an empty output folder.
dir = "./output"
# Drop whatever a previous run left behind, then (re)create the folder.
if os.path.exists(dir):
    shutil.rmtree(dir)
os.makedirs(dir)

## Dataset

In [4]:
# Folder holding the CSV files read by the loading cells. Keep the trailing slash:
# the file names are appended to this string directly.
PATH_TO_DATA_FOLDER = "../../Dataset time-series/"

In [5]:
# Read the training sets: the first two CSV rows form the column header, the first column is the index.
freq = "D"
train = pd.read_csv(PATH_TO_DATA_FOLDER + "train_smooth.csv", index_col = 0, header = [0, 1])
# Turn the index into a named datetime index carrying the frequency defined above.
train.index = pd.to_datetime(train.index)
train.index.name = "Datetime"
train.index.freq = freq

In [6]:
# Read the test sets: the first two CSV rows form the column header, the first column is the index.
freq = "D"
test = pd.read_csv(PATH_TO_DATA_FOLDER + "test_target.csv", index_col = 0, header = [0, 1])
# Turn the index into a named datetime index carrying the frequency defined above.
test.index = pd.to_datetime(test.index)
test.index.name = "Datetime"
test.index.freq = freq

In [7]:
# Read the whole time-series of the fcs indicator: two header rows, first column as index.
freq = "D"
target = pd.read_csv(PATH_TO_DATA_FOLDER + "all_target.csv", index_col = 0, header = [0, 1])
# Turn the index into a named datetime index carrying the frequency defined above.
target.index = pd.to_datetime(target.index)
target.index.name = "Datetime"
target.index.freq = freq

In [8]:
# Number of prediction horizons looped over in `hyperparameters`; also passed there as `n_out`
# to `LagsCreator.to_supervised`.
TEST_SIZE = 30
# Frequency that was attached to the training index when it was loaded.
FREQ = train.index.freq

In [9]:
# Independent copy of the training frame; the province and indicator lists are read from its columns.
TRAIN = train.copy()

In [10]:
# Distinct labels of the first column level (one entry per province), in order of appearance.
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [11]:
# Distinct labels of the second column level (one entry per indicator), in order of appearance.
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [12]:
# Get the training and test sets.
# `find_multiple_sets` is a project helper imported from `tools`. The training result is iterated
# in `hyperparameters`, where each element is sliced by province.
# NOTE(review): presumably the helper splits each frame into its separate sub-series — confirm in `tools`.
TRAIN_NORMALIZED_SETS = find_multiple_sets(train)
TEST_TARGET_SETS = find_multiple_sets(test)

## Training & Validation
### Parameters grid search

In [23]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error

# Search space explored by hyperopt: one integer hyperparameter per indicator. Each sampled point
# is handed to `LagsCreator` as its `lags_dictionary`.
# NOTE(review): hyperopt documents hp.randint(label, low, high) as drawing from [low, high). If so,
# the (1, 5) entries yield 1..4 and the (0, 1) entries are always 0 — confirm this is intended.
space = {
    indicator: hp.randint(indicator, low, high)
    for indicator, low, high in (
        ("1 Month Anomaly (%) Rainfall", 1, 5),
        ("3 Months Anomaly (%) Rainfall", 1, 5),
        ("Cereals and tubers", 1, 5),
        ("Exchange rate (USD/LCU)", 1, 5),
        ("FCS", 1, 5),
        ("Fatality", 1, 5),
        ("NDVI Anomaly", 1, 5),
        ("Rainfall (mm)", 1, 5),
        ("rCSI", 1, 5),
        ("Lat", 0, 1),
        ("Lon", 0, 1),
        ("Population", 0, 1),
        ("Ramadan", 1, 5),
    )
}

In [24]:
import xgboost as xgb

In [31]:
def hyperparameters(space):
    """Objective function for hyperopt: score one sampled lag configuration.

    For every prediction horizon h = 1..TEST_SIZE, the samples produced by `LagsCreator` for each
    set in TRAIN_NORMALIZED_SETS and each province in PROVINCES are pooled, an XGBoost regressor is
    fitted on the pooled training samples and scored (MSE) on the pooled validation samples.
    One row per horizon (the sampled configuration, h, validation and training loss) is appended
    to `grid_search.csv` inside the module-level `dir` output folder.

    Parameters
    ----------
    space : dict
        The sampled point of the search space; passed unchanged to `LagsCreator` as `lags_dictionary`.

    Returns
    -------
    dict
        "loss": mean validation MSE over all horizons, "status": STATUS_OK.
        If the evaluation raises, "loss" is np.inf (status is still STATUS_OK, as before, so the
        search keeps going) and an extra "error" entry holds the repr of the exception, which
        therefore ends up in the `Trials` object instead of being lost.
    """
    try:
        # Loop-invariant: every horizon appends its row to the same CSV file.
        filename = dir + "/grid_search.csv"
        val_losses_h = list()
        for h in range(TEST_SIZE):
            X_train_list, y_train_list, X_val_list, y_val_list = list(), list(), list(), list()
            for train_normalized in TRAIN_NORMALIZED_SETS:
                # Create training and validation samples.
                for PROVINCE in PROVINCES:
                    creator = LagsCreator(train_normalized[[PROVINCE]], lags_dictionary = space, target = "FCS")
                    X_train, y_train, X_val, y_val, _ = creator.to_supervised(n_out = TEST_SIZE, single_step = True, h = h+1, return_dataframe = True,
                                                                              feature_time = True, validation = True, return_single_level = True,
                                                                              dtype = np.float32)
                    X_train_list.append(X_train)
                    y_train_list.append(y_train)
                    X_val_list.append(X_val)
                    y_val_list.append(y_val)

            # Pool the samples of all sets and provinces for this horizon.
            X_train = pd.concat(X_train_list).reset_index(drop = True)
            y_train = pd.concat(y_train_list).reset_index(drop = True)
            X_val = pd.concat(X_val_list).reset_index(drop = True)
            y_val = pd.concat(y_val_list).reset_index(drop = True)

            # Train the model.
            print("Training %s samples for the prediction horizon h: %d" % (str(X_train.shape), h+1), end = "\r")
            model = xgb.XGBRegressor(objective = "reg:squarederror", n_estimators = 100)   #tree_method = "gpu_hist", gpu_id = 0
            model.fit(X_train, y_train)

            # Compute training error.
            y_hats_train = model.predict(X_train)
            train_loss = mean_squared_error(y_train.values.flatten(), y_hats_train)

            # Compute validation error.
            y_hats_val = model.predict(X_val)
            val_loss = mean_squared_error(y_val.values.flatten(), y_hats_val)
            val_losses_h.append(val_loss)

            # Incrementally save results: the header is written only when the file is first created.
            results = space.copy()
            results["h"] = h+1
            results["val_loss"] = val_loss
            results["train_loss"] = train_loss
            df_space = pd.DataFrame(results, index = [0], dtype = object)
            df_space.to_csv(filename, index = False, header = (not os.path.exists(filename)), mode = "a")
            clear_output(wait = True)

        # Compute mean error of this 'space' for the various prediction horizons.
        val_loss = np.mean(val_losses_h)
    except Exception as exc:
        # Only ordinary errors mark the configuration as failed; KeyboardInterrupt/SystemExit are no
        # longer swallowed, so the search can be interrupted. The cause is reported and stored
        # rather than silently replaced by an infinite loss.
        clear_output(wait = True)
        print("Configuration failed (%s): %s" % (type(exc).__name__, exc))
        return {"loss": np.inf, "status": STATUS_OK, "error": repr(exc)}

    return {"loss": val_loss, "status": STATUS_OK}

In [32]:
# Run the search: `trials` records every evaluated configuration, `best` receives the value returned by fmin.
trials = Trials()
best = fmin(fn = hyperparameters,
            space = space,
            algo = tpe.suggest,
            max_evals = 3,
            trials = trials)

# Save the trials into a file; the context manager closes the handle even if pickling fails.
with open(dir + "/hyp_trials.p", "wb") as trials_file:
    pickle.dump(trials, trials_file)

100%|█████████████████████████████████████████████████████████████████| 3/3 [00:30<00:00, 10.07s/trial, best loss: inf]
