In [31]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import argparse
import pickle
import shutil
import click
import os

from _gui import *
from _utils import *
from _default import *

# Add the python path to the folder containing some custom packages.
import sys
sys.path.insert(0, "../packages/")
from LagsCreator.LagsCreator import LagsCreator
from NestedCV.NestedCV import NestedCV
from TsIP.TsIP import TsIP

# Read the time-series CSV of every listed country and join them column-wise.
dfs = []
indicators = []
for country in ["Yemen"]:
    # The file carries two header rows and the dates in its first column.
    df = pd.read_csv(
        f"../Dataset time-series/output_data/{country}/{country}.csv",
        header = [0, 1],
        index_col = 0,
    )
    # Turn the index into timestamps and declare it as daily.
    df.index = pd.to_datetime(df.index)
    df.index.freq = "D"
    # Prepend the country name as the outermost column level.
    country_columns = [(country, strata, indicator) for strata, indicator in df.columns]
    df.columns = pd.MultiIndex.from_tuples(country_columns,
                                           names = ["Country", "AdminStrata", "Indicator"])
    # Keep the rows from the first available date up to the end of the selected month.
    range_end = pd.to_datetime("2020-11") + pd.offsets.MonthEnd(1)
    df = df.loc[:range_end]
    # Record the sorted indicator names of each province.
    for province in df.columns.get_level_values("AdminStrata").unique():
        province_frame = df[country][province]
        indicators.append(sorted(province_frame.columns))
    # Collect this country's frame.
    dfs.append(df)

# Join the per-country frames side by side.
df = pd.concat(dfs, axis = 1)

In [32]:
# Keep only the "FCG" columns; level 2 is the "Indicator" level and
# drop_level = False retains all three column levels.
fcg_columns = df.xs("FCG", axis = 1, level = 2, drop_level = False)
# Discard the rows that contain missing values.
df = fcg_columns.dropna()

In [33]:
# Show the FCG-only frame as the cell output (pandas truncates the middle rows).
df

Country,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen,Yemen
AdminStrata,Abyan,Aden,Al Bayda,Al Dhale'e,Al Hudaydah,Al Jawf,Al Maharah,Al Mahwit,Amanat Al Asimah,Amran,Dhamar,Hajjah,Ibb,Lahj,Marib,Raymah,Sa'ada,Sana'a,Shabwah,Taizz
Indicator,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG,FCG
2018-07-02,31.1587,16.6195,38.3327,29.1948,20.7882,22.0857,2.89231,16.8151,17.0004,20.4467,26.9111,28.3613,25.0367,31.7624,32.5962,54.1218,17.6760,26.9177,16.7523,26.5634
2018-07-03,32.6752,16.3706,43.2921,31.3369,20.6925,23.4772,2.91383,19.8334,17.2082,21.6890,28.0716,28.8054,27.1024,32.0495,33.3333,56.7227,17.1346,25.7835,18.2575,29.1240
2018-07-04,33.4156,15.0892,45.1282,33.1352,21.8251,24.8640,2.95041,19.6143,18.0708,25.6111,28.9676,30.9937,29.8441,33.1063,36.8837,56.3063,17.4545,25.4360,19.0831,29.4280
2018-07-05,33.5315,15.7665,43.9593,34.5543,21.4817,27.8150,2.24791,19.5586,19.0220,27.9225,30.5548,32.9059,29.0254,32.9388,36.8976,56.2310,17.8463,27.4218,20.3495,30.4560
2018-07-06,33.9519,16.8701,44.5165,33.8786,21.0767,28.6882,2.26978,17.6081,19.8176,28.7122,32.5240,32.5982,30.6928,33.8379,36.6696,57.3248,18.4631,26.5472,21.1075,31.0125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-26,50.9886,29.3969,56.9619,58.9251,29.9524,53.7412,26.66870,40.1570,35.2689,49.2564,41.5017,37.2870,49.4759,60.8887,48.8427,63.1242,41.0149,38.7104,53.4447,43.8733
2020-11-27,50.6362,29.9349,55.7664,57.3678,30.8276,54.1917,26.81290,41.5936,34.6877,51.1352,41.6611,38.4107,48.6944,61.2863,49.4961,63.3617,41.2691,38.4491,52.2910,44.3046
2020-11-28,50.7676,31.5763,55.0458,56.6495,31.2197,53.1305,26.01600,40.5596,35.5904,50.8996,40.3168,38.4743,49.1870,61.0338,48.4315,63.6885,41.3086,38.7844,52.4090,44.2039
2020-11-29,51.5017,31.3202,54.2610,55.8773,31.6806,52.1938,25.59640,40.3598,34.6926,49.1650,40.1051,37.6882,49.2357,60.8310,48.8655,63.5812,42.0492,39.2922,53.5650,43.1553


In [34]:
# Build the nested cross-validation object and split the frame with it.
# NOTE(review): 5 and 30 presumably are the number of splits and the test length in days
# (the printed output shows 5 splits of 30 days each) - confirm against NestedCV.
cv = NestedCV(5, 30)
SPLITS = cv.get_splits(df)
# Report the span of dates covered by the test part of every split.
for split_number, split_parts in SPLITS.items():
    train, test = split_parts
    print(f"Split {split_number}: range of days to predict (test) between {test.index[0].date()} - {test.index[-1].date()}")

Split 1: range of days to predict (test) between 2020-07-01 - 2020-07-30
Split 2: range of days to predict (test) between 2020-08-01 - 2020-08-30
Split 3: range of days to predict (test) between 2020-09-01 - 2020-09-30
Split 4: range of days to predict (test) between 2020-10-01 - 2020-10-30
Split 5: range of days to predict (test) between 2020-11-01 - 2020-11-30


In [39]:
# Interactive comparison plot of the two parts (train, test) of split 5.
TsIP(*SPLITS[5]).interactive_plot_df(comparison = True)

interactive(children=(ToggleButtons(description='Country', options=('Yemen',), value='Yemen'), RadioButtons(de…