In [33]:
import pandas as pd
import numpy as np
import pickle
import os
import shutil

In [34]:
# Add the python path to the folder containing some useful custom packages.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP
from tools import find_multiple_sets
from LagsCreator.LagsCreator import LagsCreator

In [35]:
# Create a clean workspace: any output folder left by a previous run is wiped first.
dir = "./output"
if os.path.exists(dir):
    shutil.rmtree(dir)
os.makedirs(dir)

## Dataset

In [36]:
# Country whose data folder is loaded below.
COUNTRY = "Yemen"

In [37]:
# Folder holding the csv files of the selected country.
PATH_TO_DATA_FOLDER = f"../../Dataset time-series/data/{COUNTRY}/"

In [38]:
# Read the smoothed training sets: two header rows give the column MultiIndex,
# the first column gives the dates, which are parsed and tagged with a daily frequency.
freq = "D"
train_smooth = pd.read_csv(PATH_TO_DATA_FOLDER + "train_smooth.csv", index_col = 0, header = [0, 1])
train_smooth.index = pd.to_datetime(train_smooth.index)
train_smooth.index.name = "Datetime"
train_smooth.index.freq = freq

In [39]:
# Read the test sets: two header rows give the column MultiIndex,
# the first column gives the dates, which are parsed and tagged with a daily frequency.
freq = "D"
test = pd.read_csv(PATH_TO_DATA_FOLDER + "test_target.csv", index_col = 0, header = [0, 1])
test.index = pd.to_datetime(test.index)
test.index.name = "Datetime"
test.index.freq = freq

In [40]:
# Read the whole time-series of the fcs indicator: two header rows give the column MultiIndex,
# the first column gives the dates, which are parsed and tagged with a daily frequency.
freq = "D"
target = pd.read_csv(PATH_TO_DATA_FOLDER + "all_target.csv", index_col = 0, header = [0, 1])
target.index = pd.to_datetime(target.index)
target.index.name = "Datetime"
target.index.freq = freq

In [41]:
# Work on a copy so that the loaded smoothed training data stays untouched.
TRAIN = train_smooth.copy()

In [42]:
# Number of prediction horizons evaluated during the search (also passed as `n_out` to LagsCreator).
TEST_SIZE = 30
# Frequency carried by the index of the training data.
FREQ = TRAIN.index.freq

In [43]:
# Distinct names found in the first column level (one per province).
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [44]:
# Distinct names found in the second column level (one per indicator).
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

## Data source transformation

The data are normalized indicator by indicator: for each indicator, a single min-max scaling is computed across all the provinces, using only the training sets.

In [45]:
# Target range of the min-max scaling.
MIN = 0
MAX = 1
# Maps each group name to the (min, max) pair used to scale it, so that
# scaled values can be mapped back to the original units later on.
SCALERS = dict()
def normalization(group, feature_range):
    """Min-max scale a whole group of columns with one shared minimum and maximum.

    The minimum and maximum are taken over every value of `group` (all columns
    together), and the pair is stored in the module-level `SCALERS` dictionary
    under `group.name`.

    Parameters
    ----------
    group : pandas.DataFrame
        Columns to scale together. It must expose a `name` attribute, as the
        groups handed over by `groupby(...).apply(...)` do.
    feature_range : tuple (min, max)
        Desired range of the scaled data.

    Returns
    -------
    pandas.DataFrame
        The scaled data, same shape as `group`. A constant group (maximum equal
        to minimum) is mapped to the lower bound of `feature_range` instead of
        becoming NaN through a 0/0 division.
    """
    min_, max_ = feature_range
    min_group = group.min().min()
    max_group = group.max().max()

    # A constant group has a zero range: divide by 1 so that it scales to 0
    # (then to `min_`) rather than to NaN. Denormalizing with the stored
    # (min, max) pair still returns the original constant.
    range_group = max_group - min_group
    if range_group == 0:
        range_group = 1

    # Normalization.
    group_std = (group - min_group) / range_group
    group_scaled = group_std * (max_ - min_) + min_

    # Save the scalers for the various indicators.
    SCALERS[group.name] = (min_group, max_group)

    return group_scaled

In [46]:
# Scale every indicator (second column level) on its own, pooling all the provinces together.
TRAIN_NORMALIZED = TRAIN.groupby(level = 1, axis = 1).apply(
    lambda indicator_group: normalization(indicator_group, feature_range = (MIN, MAX))
)

In [47]:
# Plot time-series.
#TsIP(TRAIN_NORMALIZED).interactive_plot_df(title = "Training sets", matplotlib = False, style = "lines")

In [48]:
def denormalization(group_scaled, indicator, feature_range, scalers):
    """Map min-max scaled values back to their original units.

    Parameters
    ----------
    group_scaled : scalar, array or pandas object
        Values expressed in the `feature_range` scale.
    indicator : hashable
        Key of `scalers` selecting the (min, max) pair of the original data.
    feature_range : tuple (min, max)
        Range that was used when the data were scaled.
    scalers : dict
        Mapping from indicator to the (min, max) pair of the original data.

    Returns
    -------
    Same type as `group_scaled`, holding the values in the original scale.
    """
    lower, upper = feature_range
    original_min, original_max = scalers[indicator]

    # First bring the values back to the [0, 1] interval, then stretch them over the original range.
    unit_values = (group_scaled - lower) / (upper - lower)
    return (unit_values * (original_max - original_min)) + original_min

In [57]:
# Get the training and test sets.
# NOTE(review): `find_multiple_sets` is a project helper whose code is not visible here; the training
# result is iterated over as a collection of dataframes in the search below — confirm its exact output.
TRAIN_NORMALIZED_SETS = find_multiple_sets(TRAIN_NORMALIZED)
TEST_TARGET_SETS = find_multiple_sets(test)

## Training & Validation
### Parameters grid search

In [58]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error

In [91]:
# Search space for the lags of each indicator.
# Most indicators choose among 1, 6 and None; "FCS" (the target) only among 1 and 6;
# "Lat", "Lon" and "Population" among 0 and None.
# NOTE(review): None presumably tells LagsCreator to leave the indicator out — confirm.
space1 = {
    **{indicator: hp.choice(indicator, np.append(np.arange(1, 10, 5), None))
       for indicator in ("1 Month Anomaly (%) Rainfall",
                         "3 Months Anomaly (%) Rainfall",
                         "Cereals and tubers",
                         "Exchange rate (USD/LCU)")},
    "FCS": hp.choice("FCS", np.arange(1, 10, 5)),
    **{indicator: hp.choice(indicator, np.append(np.arange(1, 10, 5), None))
       for indicator in ("Fatality",
                         "NDVI Anomaly",
                         "Rainfall (mm)",
                         "rCSI")},
    **{indicator: hp.choice(indicator, np.append(np.arange(0, 1), None))
       for indicator in ("Lat", "Lon", "Population")},
    "Ramadan": hp.choice("Ramadan", np.append(np.arange(1, 10, 5), None)),
}

In [92]:
# Search space for the hyper-parameters of the SVR model.
space2 = dict(
    C = hp.choice("C", [0.1, 1, 10, 100, 1000]),
    gamma = hp.choice("gamma", [1, 0.1, 0.01, 0.001, 0.0001]),
)

In [93]:
# Single search space: the lags of the indicators followed by the model hyper-parameters.
space = {**space1, **space2}

In [70]:
from sklearn.svm import SVR

In [94]:
def hyperparameters(space):
    """Objective function evaluated by hyperopt for one sampled configuration.

    For every prediction horizon from 1 to TEST_SIZE, the training and validation
    samples of all the training sets and provinces are stacked together, an RBF
    SVR is fitted on the training samples and its mean squared error is computed
    on both the training and the validation samples.

    Parameters
    ----------
    space : dict
        Sampled configuration: one entry per indicator in PREDICTORS (used as the
        lags dictionary of LagsCreator) plus the "C" and "gamma" values of the SVR.

    Returns
    -------
    dict
        "loss" is the validation mean squared error averaged over the prediction
        horizons, "status" is hyperopt's STATUS_OK.

    Side effects
    ------------
    Prints the configuration and one progress line per horizon, and appends one
    row per horizon to the "grid_search.csv" file of the output folder.
    """
    print(space)
    # Lags selected for each indicator.
    lags_dict = {predictor: space[predictor] for predictor in PREDICTORS}

    horizon_val_losses = []
    for horizon in range(1, TEST_SIZE + 1):
        train_inputs, train_targets, val_inputs, val_targets = [], [], [], []
        # Collect the training and validation samples of every training set and province.
        for train_normalized in TRAIN_NORMALIZED_SETS:
            for PROVINCE in PROVINCES:
                creator = LagsCreator(train_normalized[[PROVINCE]], lags_dictionary = lags_dict, target = "FCS")
                X_part, y_part, X_val_part, y_val_part, _ = creator.to_supervised(
                    n_out = TEST_SIZE, single_step = True, h = horizon, return_dataframe = True,
                    feature_time = False, validation = True, return_single_level = True,
                    dtype = np.float64)
                train_inputs.append(X_part)
                train_targets.append(y_part)
                val_inputs.append(X_val_part)
                val_targets.append(y_val_part)

        X_train = pd.concat(train_inputs).reset_index(drop = True).values
        y_train = pd.concat(train_targets).reset_index(drop = True).values.flatten()

        # Fit the model for this horizon.
        print("Training %s samples for the prediction horizon h: %d" % (str(X_train.shape), horizon))
        model = SVR(gamma = space["gamma"], kernel = "rbf", C = space["C"])
        model.fit(X_train, y_train)

        # Training error and score.
        train_predictions = model.predict(X_train)
        train_loss = mean_squared_error(y_train, train_predictions)
        r2 = model.score(X_train, y_train)

        X_val = pd.concat(val_inputs).reset_index(drop = True).values
        y_val = pd.concat(val_targets).reset_index(drop = True).values.flatten()

        # Validation error.
        val_predictions = model.predict(X_val)
        horizon_val_loss = mean_squared_error(y_val, val_predictions)
        horizon_val_losses.append(horizon_val_loss)

        # Append the outcome of this horizon to the results file; the header is
        # written only when the file does not exist yet.
        results = dict(space, h = horizon, r2 = r2, val_loss = horizon_val_loss, train_loss = train_loss)
        df_space = pd.DataFrame(results, index = [0], dtype = object)
        filename = dir + "/grid_search.csv"
        write_header = not os.path.exists(filename)
        df_space.to_csv(filename, index = False, header = write_header, mode = "a")

    # Mean validation error of this configuration over the prediction horizons.
    mean_val_loss = np.mean(horizon_val_losses)

    return {"loss": mean_val_loss, "status": STATUS_OK}

In [95]:
# Run the TPE search over the merged search space; `trials` records every evaluation.
trials = Trials()
best = fmin(fn = hyperparameters,
            space = space,
            algo = tpe.suggest,
            max_evals = 1,
            trials = trials)

# Save the trials into a file. The context manager makes sure the file handle is
# flushed and closed even if pickling fails.
with open(dir + "/hyp_trials.p", "wb") as trials_file:
    pickle.dump(trials, trials_file)

{'1 Month Anomaly (%) Rainfall': 6, '3 Months Anomaly (%) Rainfall': 6, 'C': 1, 'Cereals and tubers': 6, 'Exchange rate (USD/LCU)': None, 'FCS': 1, 'Fatality': None, 'Lat': 0, 'Lon': None, 'NDVI Anomaly': None, 'Population': None, 'Rainfall (mm)': 6, 'Ramadan': None, 'gamma': 1, 'rCSI': 1}
Training (8400, 27) samples for the prediction horizon h: 1                                                            
  0%|                                                                            | 0/1 [00:02<?, ?trial/s, best loss=?]

job exception: name 'deed' is not defined


  0%|                                                                            | 0/1 [00:02<?, ?trial/s, best loss=?]


NameError: name 'deed' is not defined