In [16]:
import pandas as pd
import numpy as np

In [18]:
# Make the project's custom packages importable by putting their folder at the
# front of the module search path. The folder is given as a relative path, so
# the imports below depend on the directory the kernel is running from.
import sys
sys.path.insert(0, "../../packages/")
from TsIP.TsIP import TsIP
from tools import find_multiple_sets
from LagsCreator.LagsCreator import LagsCreator

## Dataset

In [19]:
# Folder holding the CSV files read by the loading cells below (relative path;
# the trailing slash is needed because file names are appended by string concatenation).
PATH_TO_DATA_FOLDER = "../../Dataset time-series/"

In [20]:
# Read the training sets from train_smooth.csv: the first two rows form a two-level
# column header and the first column becomes the index.
freq = "D"
train = pd.read_csv(PATH_TO_DATA_FOLDER + "train_smooth.csv", index_col = 0, header = [0, 1])
# Turn the index labels into timestamps, then name the index and set its frequency
# (assigning the frequency fails if the dates do not conform to it).
train.index = pd.to_datetime(train.index)
train.index.name = "Datetime"
train.index.freq = freq

In [21]:
# Read the test sets from test_target.csv: the first two rows form a two-level
# column header and the first column becomes the index.
freq = "D"
test = pd.read_csv(PATH_TO_DATA_FOLDER + "test_target.csv", index_col = 0, header = [0, 1])
# Turn the index labels into timestamps, then name the index and set its frequency
# (assigning the frequency fails if the dates do not conform to it).
test.index = pd.to_datetime(test.index)
test.index.name = "Datetime"
test.index.freq = freq

In [22]:
# Read the whole time-series of the fcs indicator from all_target.csv: the first two
# rows form a two-level column header and the first column becomes the index.
freq = "D"
target = pd.read_csv(PATH_TO_DATA_FOLDER + "all_target.csv", index_col = 0, header = [0, 1])
# Turn the index labels into timestamps, then name the index and set its frequency
# (assigning the frequency fails if the dates do not conform to it).
target.index = pd.to_datetime(target.index)
target.index.name = "Datetime"
target.index.freq = freq

In [23]:
# Number of prediction horizons: used below both as `n_out` of LagsCreator and as
# the number of models trained in the training loop.
TEST_SIZE = 30
# Frequency object of the training index (set when the training data was loaded).
FREQ = train.index.freq

In [24]:
# Independent copy of the training frame; the cells below read its column levels.
TRAIN = train.copy()

In [25]:
# Distinct labels of the first column level (one per province), in order of appearance.
PROVINCES = TRAIN.columns.unique(level = 0)
PROVINCES

Index(['Abyan', 'Aden', 'Al Bayda', 'Al Dhale'e', 'Al Hudaydah', 'Al Jawf',
       'Al Maharah', 'Al Mahwit', 'Amanat Al Asimah', 'Amran', 'Dhamar',
       'Hajjah', 'Ibb', 'Lahj', 'Marib', 'Raymah', 'Sa'ada', 'Sana'a',
       'Shabwah', 'Taizz'],
      dtype='object', name='AdminStrata')

In [26]:
# Distinct labels of the second column level (one per indicator), in order of appearance.
PREDICTORS = TRAIN.columns.unique(level = 1)
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [27]:
# Split the training frame and the test frame with the same helper, training frame first.
# NOTE(review): each result is iterated as a collection of frames further down — confirm
# against find_multiple_sets.
TRAIN_NORMALIZED_SETS, TEST_TARGET_SETS = map(find_multiple_sets, (train, test))

## Training & Validation

In [32]:
# Show the indicator names again as a reference for the lag dictionary defined next.
PREDICTORS

Index(['1 Month Anomaly (%) Rainfall', '3 Months Anomaly (%) Rainfall',
       'Cereals and tubers', 'Exchange rate (USD/LCU)', 'FCS', 'Fatality',
       'Lat', 'Lon', 'NDVI Anomaly', 'Population', 'Rainfall (mm)', 'Ramadan',
       'rCSI'],
      dtype='object', name='Indicator')

In [33]:
# Lag setting of every indicator, handed to LagsCreator as `lags_dictionary`.
# NOTE(review): the values presumably count the past time steps used per indicator,
# with 0 for the static ones — confirm against LagsCreator.
lags_dict = {
    "1 Month Anomaly (%) Rainfall": 2,
    "3 Months Anomaly (%) Rainfall": 1,
    "Cereals and tubers": 2,
    "Exchange rate (USD/LCU)": 5,
    "FCS": 2,
    "Fatality": 2,
    "NDVI Anomaly": 2,
    "Rainfall (mm)": 2,
    "rCSI": 2,
    "Lat": 0,
    "Lon": 0,
    "Population": 0,
    "Ramadan": 1,
}

In [48]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [49]:
# Train one XGBoost regressor per prediction horizon (h+1 = 1, ..., TEST_SIZE) on the
# samples pooled over all training sets and provinces, and record the validation MSE
# of every horizon instead of keeping only the last one.
val_losses = dict()
for h in range(TEST_SIZE):
    X_train_list, y_train_list, X_val_list, y_val_list = list(), list(), list(), list()
    for train_normalized in TRAIN_NORMALIZED_SETS:
        # Create training and validation samples.
        for PROVINCE in PROVINCES:
            # The per-province validation pieces get their own names so they are not
            # confused with the pooled X_val / y_val built after the loops.
            X, y, X_val_province, y_val_province, X_test = LagsCreator(train_normalized[[PROVINCE]], lags_dictionary = lags_dict, target = "FCS",
                                                                       n_out = TEST_SIZE, single_step = True, h = h+1, return_dataframe = True,
                                                                       feature_time = True, validation = True)
            X_train_list.append(X)
            y_train_list.append(y)
            X_val_list.append(X_val_province)
            y_val_list.append(y_val_province)

    # Pool the training samples of all sets and provinces.
    X_train = pd.concat(X_train_list).reset_index(drop = True)
    y_train = pd.concat(y_train_list).reset_index(drop = True)

    # Train the model.
    model = xgb.XGBRegressor(n_estimators = 100)
    model.fit(X_train, y_train)

    # Pool the validation samples of all sets and provinces.
    X_val = pd.concat(X_val_list).reset_index(drop = True)
    y_val = pd.concat(y_val_list).reset_index(drop = True)

    # Validation.
    y_hats = model.predict(X_val)
    # Compute validation error and store it under its prediction horizon.
    val_loss = mean_squared_error(y_val.values.flatten(), y_hats)
    val_losses[h+1] = val_loss

# Show the validation MSE of every prediction horizon.
pd.Series(val_losses, name = "Validation MSE")

0.3451987851893847


NameError: name 'dedede' is not defined