# In [None]:
"""Build training, validation, and testing hurricane data tensors.

Functions
---------
build_data(data_path, settings, verbose=0, iscaled=False)

"""
import pprint

import numpy as np
import pandas as pd
import copy

import toolbox

__author__ = "Elizabeth A. Barnes and Randal J Barnes"
__version__ = "12 November 2022"

def _take_rows(frame, positions):
    """Split `frame` into the rows at `positions` and all remaining rows.

    `positions` are integer row positions.  Both returned dataframes get a
    fresh 0..n-1 index.
    """
    taken = frame.iloc[positions]
    remaining = frame.drop(frame.index[positions])
    return taken.reset_index(drop=True), remaining.reset_index(drop=True)


def _features_and_labels(frame, x_names, y_name):
    """Return the feature array and the [n, 1] float label array of `frame`.

    The labels are reshaped rather than squeezed so that a split holding a
    single row still yields a [1, 1] array instead of a 0-d scalar.
    """
    x = frame[x_names].to_numpy()
    labels = frame[y_name].to_numpy().astype("float64").reshape(-1, 1)
    return x, labels


def build_data(data_path, settings, verbose=0, iscaled=False):
    """Build the training, validation and testing tensors for the shash model.

    The settings['target'] specifies which data set to build.  Only the
    "intensity" target is implemented here; any other target raises
    NotImplementedError.

    Arguments
    ---------
    data_path : str
        The input filepath, not including the file name.  It is joined to
        settings['filename'] by plain string concatenation, so it must end
        with a path separator.

    settings : dict
        The parameters defining the current experiment.  The keys read here
        are: 'x_names', 'target', 'filename', 'basin', 'leadtime',
        'rng_seed', 'test_condition', 'years_test' (only when
        'test_condition' is not None), 'val_condition', 'n_val' and
        'n_train'.

    verbose : int
        0 -> silent
        1 -> description only
        2 -> description and y statistics

    iscaled : boolean
        Only consulted when settings['x_names'] is None.
        True: use the predictor list meant for scaled intensity (I/MPI).
        False (default): use the default predictor list that includes VMAX0.

    Returns
    -------
    data_summary : dict
        A descriptive dictionary of the data.

    x_train : numpy.ndarray
        The training split of the x data.
        shape = [n_train, n_features].

    label_train : numpy.ndarray
        The training split of the predictand.
        shape = [n_train, 1].

    x_val : numpy.ndarray
        The validation split of the x data.
        shape = [n_val, n_features].

    label_val : numpy.ndarray
        The validation split of the predictand.
        shape = [n_val, 1].

    x_test : numpy.ndarray
        The test split of the x data.
        shape = [n_test, n_features].

    label_test : numpy.ndarray
        The test split of the predictand.
        shape = [n_test, 1].

    x_valtest : numpy.ndarray
        The union of the test and validation splits of the x data.
        shape = [n_val+n_test, n_features].

    label_valtest : numpy.ndarray
        The union of the test and validation splits of the predictand.
        shape = [n_val+n_test, 1].

    df_train : pandas dataframe
        A pandas dataframe containing training records.  The
        dataframe contains all columns from the original file.
        However, the dataframe contains only rows from the training
        data set that satisfy the specified basin and leadtime
        requirements, and were not eliminated due to missing values.
        The dataframe has the shuffled order of the rows.  In
        particular, the rows of df_train align with the rows of x_train
        and label_train.

    df_val : pandas dataframe
        A pandas dataframe containing validation records.  The
        dataframe contains all columns from the original file.
        However, the dataframe contains only rows from the validation
        data set that satisfy the specified basin and leadtime
        requirements, and were not eliminated due to missing values.
        The dataframe has the shuffled order of the rows.  In particular,
        the rows of df_val align with the rows of x_val and label_val.

    df_test : pandas dataframe
        A pandas dataframe containing testing records.  The
        dataframe contains all columns from the original file.
        However, the dataframe contains only rows from the testing
        data set that satisfy the specified basin and leadtime
        requirements, and were not eliminated due to missing values.
        The dataframe has the shuffled order of the rows.  In particular,
        the rows of df_test align with the rows of x_test and label_test.

    df_valtest : pandas dataframe
        A pandas dataframe containing union of the validation and testing
        records.  The dataframe contains all columns from the original
        file. However, the dataframe contains only rows from the union
        of the validation and testing data sets that satisfy the specified
        basin and leadtime requirements, and were not eliminated due to
        missing values. The dataframe has the shuffled order of the rows.
        In particular, the rows of df_valtest align with the rows of
        x_valtest and label_valtest.

    If no rows survive the basin/leadtime/missing-value filtering, a tuple
    of thirteen empty arrays of shape [0, 1] is returned instead.

    Raises
    ------
    NotImplementedError
        If settings['target'] is not "intensity", or settings['val_condition']
        is neither "random" nor "years".

    Warning
        If settings['val_condition'] is "random" and settings['n_val'] < 100.
        Note that this is raised as an exception, not issued as a warning.

    """
    # Setup for the selected features.
    if settings["x_names"] is not None:
        x_names = settings["x_names"]
    elif iscaled:
        x_names = [
            "DTL",
            "LAT",
            "SHRG",
            "D200",
            "Z850",
            "DELV",
            "RHMD",
            "DELV-12",
            "SST",
            "OHC",
        ]
    else:
        x_names = [
            "VMAX0",
            "VMXC",
            "NCI",
            "DSDV",
            "LGDV",
            "HWDV",
            "AVDV",
            "DV12",
            "SLAT",
            "SSTN",
            "SHDC",
            "DTL",
            "T200",
            "D200",
            "RHMD",
            # "SPDX",
            # "SPDY",
        ]

    # Setup for the selected target.
    if settings["target"] != "intensity":
        raise NotImplementedError(
            "unsupported target: " + str(settings["target"])
        )
    y_name = ["PREDICTAND"]
    missing = -9999

    # Get the data from the specified file and filter out the unwanted rows.
    datafile_path = data_path + settings["filename"]
    df_raw = pd.read_table(datafile_path, sep=r"\s+")
    df_raw = df_raw.rename(columns={'Date': 'year'})

    # Add predictand.
    df_raw["PREDICTAND"] = df_raw["OFDV"]   # OFDV = BestTrack - Official Forecast

    # Get predictors and storms of interest.
    df = df_raw[
        (df_raw["ATCF"].str.contains(settings["basin"])) &
        (df_raw["ftime(hr)"] == settings["leadtime"])
    ]

    # Replace missing values; a row with a missing value in ANY column is
    # dropped, not only rows missing one of the selected features.
    df = df.replace(missing, np.nan)
    df = df.dropna(axis=0)
    df = df.reset_index(drop=True)

    # Shuffle the rows in the df Dataframe, seeded for reproducibility.
    df = df.sample(frac=1, random_state=settings['rng_seed'])
    df = df.reset_index(drop=True)

    # Check that there is data for training.
    if len(df) == 0:
        return tuple(np.empty((0, 1)) for _ in range(13))

    # ---------------------------------
    # Train/Validation/Test Split

    # Get the testing data: all rows whose year is listed in years_test.
    has_test_split = settings["test_condition"] is not None
    if has_test_split:
        years = settings["years_test"]
        if verbose != 0:
            print('years' + str(years) + ' withheld for testing')
        test_positions = np.flatnonzero(df['year'].isin(years))
        df_test, df = _take_rows(df, test_positions)
        x_test, label_test = _features_and_labels(df_test, x_names, y_name)

    # Get the validation data.
    if settings["val_condition"] == "random":
        # The rows are already shuffled, so the first n_val rows are random.
        val_positions = np.arange(0, settings["n_val"])
        if len(val_positions) < 100:
            raise Warning("Are you sure you want n_val < 100?")

    elif settings["val_condition"] == "years":
        # In this mode n_val holds the collection of validation years.
        if verbose != 0:
            print(
                'years' + str(settings["n_val"]) + ' withheld for validation'
            )
        val_positions = np.flatnonzero(df['year'].isin(settings["n_val"]))

    else:
        # Without this branch an unknown condition would silently reuse the
        # row selection left over from the test split (or hit a NameError).
        raise NotImplementedError(
            "unsupported val_condition: " + str(settings["val_condition"])
        )

    df_val, df = _take_rows(df, val_positions)
    x_val, label_val = _features_and_labels(df_val, x_names, y_name)

    # With no separate test split, the test set is a copy of validation.
    if not has_test_split:
        df_test = df_val.copy()
        x_test = x_val.copy()
        label_test = label_val.copy()

    # Subsample training if desired.
    if settings["n_train"] == "max":
        df_train = df.copy()
    else:
        df_train = df.iloc[:settings["n_train"]]

    x_train, label_train = _features_and_labels(df_train, x_names, y_name)
    df_train = df_train.reset_index(drop=True)

    # Make a descriptive dictionary.
    data_summary = {
        "datafile_path": datafile_path,
        "x_train_shape": tuple(x_train.shape),
        "x_val_shape": tuple(x_val.shape),
        "x_test_shape": tuple(x_test.shape),
        "label_train_shape": tuple(label_train.shape),
        "label_val_shape": tuple(label_val.shape),
        "label_test_shape": tuple(label_test.shape),
        "x_names": x_names,
        "y_name": y_name,
    }

    # Report the results.
    if verbose >= 1:
        pprint.pprint(data_summary, width=80)

    if verbose >= 2:
        toolbox.print_summary_statistics(
            {
                "y_train": label_train[:, 0],
                "y_val": label_val[:, 0],
                "y_test": label_test[:, 0],
            },
            sigfigs=1
        )

    # Change dtype of label to 'float32' for consistency.
    label_train = label_train.astype('float32')
    label_val = label_val.astype('float32')
    label_test = label_test.astype('float32')

    # Create the combined valtest set.
    x_valtest = np.concatenate((x_val, x_test), axis=0)
    label_valtest = np.concatenate((label_val, label_test), axis=0)
    df_valtest = pd.concat([df_val, df_test])

    return (
        data_summary,
        x_train,
        label_train,
        x_val,
        label_val,
        x_test,
        label_test,
        x_valtest,
        label_valtest,
        df_train,
        df_val,
        df_test,
        df_valtest,
    )
