In [1]:
import numpy as np
import pandas as pd
import pickle
import os

In [2]:
# Add the python path to the folder containing some custom packages.
import sys
sys.path.insert(0, "../../packages/")
from LagsCreator.LagsCreator import LagsCreator
from NestedCV.NestedCV import NestedCV
from TsIP.TsIP import TsIP

In [3]:
# Create the workspace folder for storing training and test points.
# `exist_ok = True` keeps this cell re-runnable: without it, `os.makedirs` raises
# FileExistsError as soon as the folder is left over from a previous execution.
dir_data = "./data_xgboost"
os.makedirs(dir_data, exist_ok = True)

In [4]:
# --- What to predict ---
# Indicator used as the target variable. The original indicator 'FCG <= 2' is
# renamed to 'FCG' (XGBoost issue).
TARGET = "FCG"
# Countries for which training and test points are created.
COUNTRIES = ["Syria"]

# --- Evaluation layout ---
# Number of days of the target variable we want to learn to predict.
TEST_SIZE = 30
# Total number of splits evaluated by the nested cross validation.
NUMBER_OF_SPLITS = 5

# --- Sample construction ---
# Step between consecutive points when building training and test samples.
STEP_BETWEEN_POINTS = 1
# Time features to create for the input samples.
FEATURE_TIMES = ["Day", "Month", "Dayofweek", "Year"]

In [5]:
# Persist the configuration next to the generated data.
# The values are pickled as one list; a reader must unpack them in this same order.
global_variables = [TARGET, TEST_SIZE, FEATURE_TIMES, COUNTRIES, NUMBER_OF_SPLITS, STEP_BETWEEN_POINTS]
with open(dir_data + "/global_variables", "wb") as f:
    pickle.dump(global_variables, f)

## Time-series dataset

In [6]:
# Read the Syria time-series: the first two rows of the file form the column header
# and the first column is used as the index.
df_syria = pd.read_csv(
    "../../Dataset time-series/output_data/Syria/Syria.csv",
    header = [0, 1],
    index_col = 0
)
# Name the index, convert it to datetimes and declare a daily frequency on it.
df_syria.index.name = "Datetime"
df_syria.index = pd.to_datetime(df_syria.index)
freq = "D"
df_syria.index.freq = freq
# Prepend a constant "Syria" level to the two existing column levels, giving
# (Country, AdminStrata, Indicator) columns.
df_syria.columns = pd.MultiIndex.from_tuples(
    [("Syria", admin_strata, indicator) for admin_strata, indicator in df_syria.columns],
    names = ["Country", "AdminStrata", "Indicator"]
)

In [7]:
# Work on a copy of the loaded data, restricted to the dates of interest and to
# the selected countries.
df = df_syria.copy().loc["2018-01-01":"2020-08-31"][COUNTRIES]
# Rename the indicators 'FCG <= 2' -> 'FCG' and 'rCSI >= 19' -> 'rCSI' on the
# Indicator column level (level 2) (XGBoost issue).
df = df.rename(columns = {"FCG <= 2": "FCG", "rCSI >= 19": "rCSI"}, level = 2)
df

Country,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria,Syria
AdminStrata,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,Al-Hasakeh,...,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous,Tartous
Indicator,1 Month Anomaly Rainfalls (%),3 Months Anomaly Rainfalls (%),Code,Exchange rate,FCG,Fatalities,Lat,Lon,NDVI,NDVI Anomaly,...,Fatalities,Lat,Lon,NDVI,NDVI Anomaly,Population,Price cereals and tubers,Rainfalls (mm),Ramadan,rCSI
Datetime,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2018-01-01,,,57,,,,36.491475,40.907354,,,...,,34.959426,36.077933,,,831296,,,,
2018-01-02,,,57,,,,36.491475,40.907354,,,...,,34.959426,36.077933,,,831296,,,,
2018-01-03,,,57,,,,36.491475,40.907354,,,...,,34.959426,36.077933,,,831296,,,,
2018-01-04,,,57,,,,36.491475,40.907354,,,...,,34.959426,36.077933,,,831296,,,,
2018-01-05,,,57,,,,36.491475,40.907354,,,...,,34.959426,36.077933,,,831296,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-08-27,,,57,,43.1629,70.0,36.491475,40.907354,,,...,0.0,34.959426,36.077933,,,831296,,,0.0,42.57902
2020-08-28,,,57,,43.1629,66.0,36.491475,40.907354,,,...,0.0,34.959426,36.077933,,,831296,,,0.0,42.87926
2020-08-29,,,57,,43.1629,74.0,36.491475,40.907354,,,...,0.0,34.959426,36.077933,,,831296,,,0.0,40.19883
2020-08-30,,,57,,43.1629,75.0,36.491475,40.907354,,,...,0.0,34.959426,36.077933,,,831296,,,0.0,40.19883


In [8]:
# Plot time-series.
#TsIP(df).interactive_plot_df(title = "Time-series", matplotlib = False, style = "mix", comparison = False)

## Creation of training and test sets

In [9]:
# Lags dictionary: maps each indicator name to the array of lags defined for it.
# Insertion order is kept identical to the original key-by-key assignments.
lags_dict = {
    # Rainfall indicators.
    "3 Months Anomaly Rainfalls (%)": np.array([1,2,3]),
    "1 Month Anomaly Rainfalls (%)": np.array([1,2,3]),
    "Rainfalls (mm)": np.array([1,2,3]),
    # Economic indicators.
    "Exchange rate": np.array([1,2,3]),
    "Price cereals and tubers": np.array([1,2,3]),
    # Conflict indicator.
    "Fatalities": np.array([1,2,3,4]),
    # Vegetation indicators.
    "NDVI Anomaly": np.array([1,2,3]),
    "NDVI": np.array([1,2,3]),
    # Food security indicators; FCG (the target) uses lags 1 to 15.
    "FCG": np.arange(1, 16),
    "rCSI": np.array([1,2,3,4]),
    # Remaining indicators: a single lag each.
    "Lat": np.array([1]),
    "Lon": np.array([1]),
    "Population": np.array([1]),
    "Code": np.array([1]),
    "Ramadan": np.array([1]),
}

In [10]:
# Persist the lags dictionary in the workspace folder as a pickle file.
lags_dict_path = dir_data + "/lags_dict"
with open(lags_dict_path, "wb") as fp:
    pickle.dump(lags_dict, fp)

In [11]:
# Create the folder tree that receives the generated samples:
#   <dir_data>/train/<country>/<province> and <dir_data>/test/<country>/<province>.
# `exist_ok = True` makes the cell idempotent: without it, re-running the notebook
# fails with FileExistsError as soon as the folders exist from a previous run.
# Create folder for containing training data.
os.makedirs(dir_data + "/train", exist_ok = True)
# Create folder for containing test data.
os.makedirs(dir_data + "/test", exist_ok = True)
for country in COUNTRIES:
    # Provinces are the unique values of the first column level left after selecting the country.
    provinces = df[country].columns.get_level_values(0).unique()
    for province in provinces:
        os.makedirs(dir_data + "/train/%s/%s" % (country, province), exist_ok = True)
        os.makedirs(dir_data + "/test/%s/%s" % (country, province), exist_ok = True)

In [12]:
# Build the nested cross validation from the number of splits and the test size.
cv = NestedCV(NUMBER_OF_SPLITS, TEST_SIZE)
# Compute all the splits on the full dataset; each entry is iterated below as
# split number -> (train, test).
SPLITS = cv.get_splits(df)
for split_number, (train, test) in SPLITS.items():
    # First and last day covered by the test part of this split.
    first_day, last_day = test.index[0].date(), test.index[-1].date()
    print("Split %d: range of days to predict between %s - %s" % (split_number, first_day, last_day))

Split 1: range of days to predict between 2020-04-01 - 2020-04-30
Split 2: range of days to predict between 2020-05-01 - 2020-05-30
Split 3: range of days to predict between 2020-06-01 - 2020-06-30
Split 4: range of days to predict between 2020-07-01 - 2020-07-30
Split 5: range of days to predict between 2020-08-01 - 2020-08-30


In [13]:
# For every split, country, province and prediction horizon (1..TEST_SIZE), build the
# supervised samples from the training part of the split and write them to CSV files.
# Only `train` is used here; the `test` part of each split is not read by this cell.
for split_number, (train, test) in SPLITS.items():
    print("Split %d. Please wait." % split_number)
    # First multi-site level: countries.
    for country in train.columns.get_level_values(0).unique():
        train_country = train[country]
        # Second multi-site level: provinces.
        for province in train_country.columns.get_level_values(0).unique():
            # One creator per province, reused for every prediction horizon.
            creator = LagsCreator(train_country[province], lags_dictionary = lags_dict, target = TARGET, delay = True)
            for horizon in range(1, TEST_SIZE + 1):
                # Samples for this horizon (the returned `features` is not used afterwards).
                X_train, y_train, X_test, features = creator.to_supervised(
                    h = horizon,
                    step = STEP_BETWEEN_POINTS,
                    single_step = True,
                    return_dataframe = True,
                    feature_time = FEATURE_TIMES,
                    dtype = float
                )

                # Values shared by the three output file names.
                name_parts = (country, province, split_number, horizon)
                # Train input and output.
                X_train.to_csv(dir_data + "/train/%s/%s/X_train_split%d_h%d.csv" % name_parts, index_label = False)
                y_train.to_csv(dir_data + "/train/%s/%s/y_train_split%d_h%d.csv" % name_parts, index_label = False)
                # Test input.
                X_test.to_csv(dir_data + "/test/%s/%s/X_test_split%d_h%d.csv" % name_parts, index_label = False)

print("Complete!")

Split 1. Please wait.
Split 2. Please wait.
Split 3. Please wait.
Split 4. Please wait.
Split 5. Please wait.
Complete!
