# Imports and Configs

## Imports

In [1]:
# Re-import modules that changed on disk before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures with the interactive notebook backend.
%matplotlib notebook

In [None]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import math

from sklearn.metrics import mean_squared_error, mean_absolute_error


from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import Process
import Load
import Eval
import Models
import Split

import tensorflow as tf

import pickle
import os.path as osp
import configparser

## Configs

In [None]:
# `display` and `HTML` are public API of `IPython.display`; importing `display` from
# `IPython.core.display` is deprecated and fails on recent IPython releases.
from IPython.display import display, HTML
# display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
# INI file that supplies the 'Path' and 'Pref' sections read in the Load Data section.
CONF_FILE_NAME = "run_conf.ini"
conf = configparser.ConfigParser()

# Load Data

In [None]:
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the run configuration was not found.
# Fail here with a clear message instead of a KeyError on the first section lookup.
if not conf.read(CONF_FILE_NAME):
    raise FileNotFoundError("Could not read configuration file: {}".format(CONF_FILE_NAME))

# Column-name prefix of each data source.
ADCP_PREF = conf['Pref']['ADCP_PREF']
BUOY_PREF = conf['Pref']['BUOY_PREF']
PHYS_DEEP_PREF = conf['Pref']['PHYS_DEEP_PREF']
PHYS_SHALLOW_PREF = conf['Pref']['PHYS_SHALLOW_PREF']

# One [path, prefix] pair per data source, in the order handed to Load.load_all_data.
files_and_pref = [
    [conf['Path']['ADCP'], ADCP_PREF],
    [conf['Path']['BUOY'], BUOY_PREF],
    [conf['Path']['MODEL_DEEP'], PHYS_DEEP_PREF],
    [conf['Path']['MODEL_SHALLOW'], PHYS_SHALLOW_PREF],
]

In [None]:
# Load every source listed in files_and_pref; the base data directory comes from the config file.
full_data = Load.load_all_data(files_and_pref, conf['Path']['DATA_BASE_DIR'])

In [None]:
# Display the loaded data for a quick visual check.
full_data

In [None]:
def downsample_data(data, ratio_for_downsample=2):
    """Return every ``ratio_for_downsample``-th row of ``data``, starting with the first.

    Rows are chosen by position (``.iloc``), so the original index labels of the
    kept rows are preserved.
    """
    kept_positions = range(0, len(data), ratio_for_downsample)
    return data.iloc[kept_positions]

# data = downsample_data(full_data, 1)

# Prepare Data

## Mutual information between data sources (AMI / NMI)

In [None]:
# Adjusted mutual information between the a_hs and b_hs columns.
x=full_data.a_hs
y=full_data.b_hs
from sklearn import metrics
# NOTE(review): this score compares label assignments; if a_hs / b_hs hold continuous
# measurements, nearly every value is its own label — consider binning first. Confirm.
metrics.adjusted_mutual_info_score(x, y,average_method='arithmetic')  

In [None]:
# Normalized mutual information between the a_* and b_* column of each variable.
# The results keep their *_ami names although the normalized (NMI) score is computed.
hs_ami, Tm_ami, dir_ami = (
    metrics.normalized_mutual_info_score(
        full_data["a_" + var_name], full_data["b_" + var_name], average_method='arithmetic')
    for var_name in ("hs", "Tm", "dir")
)


In [None]:
# Print the three scores, then draw one figure per variable with the b_* column in red
# and the a_* column in blue (labelled "Buoy" and "ADCP" respectively).
print("hs",hs_ami,"Tm",Tm_ami,"dir",dir_ami)
plt.figure(1)
plt.plot(full_data.b_hs,'r.')
plt.plot(full_data.a_hs,'b.')
plt.legend(["Buoy","ADCP"])
plt.title("hs")

plt.figure(2)
plt.plot(full_data.b_Tm,'r.')
plt.plot(full_data.a_Tm,'b.')
plt.legend(["Buoy","ADCP"])
plt.title("Tm")

plt.figure(3)
plt.plot(full_data.b_dir,'r.')
plt.plot(full_data.a_dir,'b.')
plt.legend(["Buoy","ADCP"])
plt.title("dir")

In [None]:
# Pairwise normalized mutual information between the three ADCP (a_*) variables.
adcp_hs_Tm_ami=metrics.normalized_mutual_info_score(full_data.a_hs, full_data.a_Tm,average_method='arithmetic')
adcp_hs_dir_ami=metrics.normalized_mutual_info_score(full_data.a_hs, full_data.a_dir,average_method='arithmetic')
adcp_Tm_dir_ami=metrics.normalized_mutual_info_score(full_data.a_Tm, full_data.a_dir,average_method='arithmetic')

print("adcp:","hs-Tm",adcp_hs_Tm_ami,"hs-dir",adcp_hs_dir_ami,"Tm-dir",adcp_Tm_dir_ami)

# Start a fresh figure: without it pyplot draws onto whichever figure is still current,
# which with the interactive notebook backend is a figure from an earlier cell.
plt.figure()
# Each series is divided by its own maximum so all three share one vertical scale.
plt.plot(full_data.a_hs/max(full_data.a_hs),'r.')
plt.plot(full_data.a_Tm/max(full_data.a_Tm),'b.')
plt.plot(full_data.a_dir/max(full_data.a_dir),'g.')
plt.legend(["hs","Tm","dir"])
plt.title("ADCP")

In [None]:
# Pairwise normalized mutual information between the three buoy (b_*) variables.
buoy_hs_Tm_ami=metrics.normalized_mutual_info_score(full_data.b_hs, full_data.b_Tm,average_method='arithmetic')
buoy_hs_dir_ami=metrics.normalized_mutual_info_score(full_data.b_hs, full_data.b_dir,average_method='arithmetic')
buoy_Tm_dir_ami=metrics.normalized_mutual_info_score(full_data.b_Tm, full_data.b_dir,average_method='arithmetic')

print("buoy:","hs-Tm",buoy_hs_Tm_ami,"hs-dir",buoy_hs_dir_ami,"Tm-dir",buoy_Tm_dir_ami)

# Start a fresh figure: without it pyplot draws onto whichever figure is still current,
# which with the interactive notebook backend is a figure from an earlier cell.
plt.figure()
# Each series is divided by its own maximum so all three share one vertical scale.
plt.plot(full_data.b_hs/max(full_data.b_hs),'r.')
plt.plot(full_data.b_Tm/max(full_data.b_Tm),'b.')
plt.plot(full_data.b_dir/max(full_data.b_dir),'g.')
plt.legend(["hs","Tm","dir"])
plt.title("Buoy")

In [None]:
# Cross-source score: a_dir against b_hs (normalized mutual information).
buoy_hs_adcp_dir_ami=metrics.normalized_mutual_info_score(full_data.a_dir, full_data.b_hs,average_method='arithmetic')
buoy_hs_adcp_dir_ami

In [None]:
# Scan 48 time lags in half-hour steps: b_hs is compared with a_hs shifted `lag` samples
# later, and the normalized mutual information of each pairing is stored in nmi_test.
dt = 6  # samples per hour (presumably 10-minute data — confirm)
nmi_test = np.zeros(48)
for i in range(nmi_test.size):
    hrs = (i + 1) / 2
    lag = int(dt * hrs)
    x = full_data.b_hs[:-lag]
    y = full_data.a_hs[lag:]
    nmi_test[i] = metrics.normalized_mutual_info_score(x, y, average_method='arithmetic')

In [None]:
print(nmi_test)
# New figure so the scan is not drawn on top of a figure left current by an earlier cell.
plt.figure()
plt.plot(nmi_test,'.')

## Test data relations (kernel density estimates)

In [None]:
# estimate probability density func (scott,1992)
from sklearn.neighbors import KernelDensity
# Gaussian KDE
# Fit a 1-D Gaussian KDE to a_Tm and evaluate its log-density on 1000 evenly spaced points.
X=np.array(full_data.a_Tm)[:, np.newaxis]
kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
X_plot = np.linspace(1, max(full_data.a_Tm),1000)[:, np.newaxis]
log_dens = kde.score_samples(X_plot)

# 2x2 grid with shared axes; only the top-left panel receives the density curve here.
fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
fig.subplots_adjust(hspace=0.05, wspace=0.05)

ax[0, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
# NOTE(review): the label sits at x=-3.5, outside the x-limits (1, 20) set below — confirm position.
ax[0, 0].text(-3.5, 0.31, "Gaussian Kernel Density")



# Mark every observation with a '+' at y=-0.01 in each panel and fix the axis ranges.
for axi in ax.ravel():
    axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
    axi.set_xlim(1, 20)
    axi.set_ylim(-0.02, 0.34)

# Label only the outer panels: y-labels on the left column, x-labels on the bottom row.
for axi in ax[:, 0]:
    axi.set_ylabel('Normalized Density')

for axi in ax[1, :]:
    axi.set_xlabel('x')

In [None]:
# Draw the a_Tm density on its own figure; without plt.figure() the fill would land on
# the last axes of the subplot grid created in the previous cell.
plt.figure()
plt.fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')

In [None]:
# estimate probability density func (scott,1992)
from sklearn.neighbors import KernelDensity
# Gaussian KDE
x=full_data.a_hs
y=full_data.b_hs

# Column vectors (n, 1), the layout KernelDensity.fit is given throughout this notebook.
X=np.array(x)[:, np.newaxis]
Y=np.array(y)[:, np.newaxis]
# Separate 1-D density estimates for each series, same kernel and bandwidth.
X_kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
Y_kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(Y)
# Evaluation grids: 1000 evenly spaced points from 1 up to each series' maximum.
X_plot = np.linspace(1, max(x),1000)[:, np.newaxis]
Y_plot = np.linspace(1, max(y),1000)[:, np.newaxis]
# score_samples returns log-density, hence the exp.
X_dens =  np.exp(X_kde.score_samples(X_plot))
Y_dens =  np.exp(Y_kde.score_samples(Y_plot))

In [None]:
def kde2D(x, y, bandwidth, xbins=100j, ybins=100j, **kwargs):
    """Estimate the joint density of ``x`` and ``y`` on a regular grid.

    The grid spans [x.min(), x.max()] by [y.min(), y.max()]; ``xbins`` / ``ybins`` are
    the ``np.mgrid`` steps (an imaginary value such as ``100j`` means "100 points").
    Extra keyword arguments are forwarded to ``KernelDensity``.

    Returns ``(xx, yy, zz)``: the two grid coordinate arrays and the density evaluated
    at each grid point, all of the same shape.
    """
    grid_x, grid_y = np.mgrid[x.min():x.max():xbins,
                              y.min():y.max():ybins]

    # Training and evaluation points use the same (y, x) column order.
    train_points = np.vstack((y, x)).T
    eval_points = np.vstack((grid_y.ravel(), grid_x.ravel())).T

    estimator = KernelDensity(bandwidth=bandwidth, **kwargs)
    estimator.fit(train_points)

    # score_samples() yields log-density; exponentiate, then restore the grid shape.
    density = np.exp(estimator.score_samples(eval_points))
    return grid_x, grid_y, density.reshape(grid_x.shape)

In [None]:

# Joint density of (x, y) on the default 100x100 grid, bandwidth 1.0.
xx, yy, zz = kde2D(x, y, 1.0)

In [None]:
# Fresh figure so the density map is not drawn over a figure left current by an earlier cell.
plt.figure()
# Joint density as a colour map with the raw samples overlaid as small white points.
plt.pcolormesh(xx, yy, zz)
plt.scatter(x, y, s=2, facecolor='white')

In [None]:
# zz lives on the 100x100 grid built by kde2D, whereas X_dens / Y_dens were sampled on
# separate 1000-point grids, so dividing by them directly cannot broadcast.
# Re-evaluate both marginal densities on the grid axes (x varies along axis 0 of xx,
# y along axis 1 of yy) and divide by their outer product: p(x, y) / (p(x) * p(y)).
X_dens_grid = np.exp(X_kde.score_samples(xx[:, :1]))
Y_dens_grid = np.exp(Y_kde.score_samples(yy[:1, :].T))
zz/np.outer(X_dens_grid, Y_dens_grid)

## Prepare data types and offsets

In [None]:
# Working frame for the modelling cells below. A ratio of 1 keeps every row, which
# matches the "6 bins per hour" assumption behind PRED_FORWARD.
data = downsample_data(full_data, 1)

# choose relevant columns, get data_df and model data
PRED_FORWARD = 6*12  # 6 bins per hour * 12 hours = 72 samples ahead
# Prediction target; switch to the commented line to predict the ADCP series instead.
# TARGET_COL = ADCP_PREF + "_hs"
TARGET_COL = BUOY_PREF + "_hs"
# The physical-model column compared against depends on which source is the target.
if TARGET_COL.startswith(ADCP_PREF):
    PHYS_COL = PHYS_DEEP_PREF + "_hs"
elif TARGET_COL.startswith(BUOY_PREF):
    PHYS_COL = PHYS_SHALLOW_PREF + "_hs"
else:
    raise IndexError("TARGET_COL {!r} starts with neither the ADCP nor the buoy prefix".format(TARGET_COL))
IS_TARGET_DATA_SOURCE_INCLUDED = True

# col_names_and_offsets = np.array([(ADCP_PREF+"_hs", 0), (PHYS_DEEP_PREF+"_hs", GAP_FORWARD), (PHYS_SHALLOW_PREF+"_hs", 12)])
# col_names_and_offsets = np.array([(ADCP_PREF+"_hs", 0), (PHYS_DEEP_PREF+"_hs", PRED_FORWARD)])
# note: took out a_dir because it has "inf"
# col_names_and_offsets = np.array(list(zip(['a_hs', 'b_hs', 'b_dir', 'ma_hs', 'ma_dir'], np.zeros(data.columns.shape, dtype=np.int8))))
# col_names_and_offsets = np.array(list(zip(['a_hs', 'a_dir'], np.zeros(data.columns.shape, dtype=np.int8))))
# col_names_and_offsets = np.array(list(zip(['a_hs', 'b_dir', 'b_hs'], np.zeros(data.columns.shape, dtype=np.int8))))
# col_names_and_offsets = np.array([(PHYS_DEEP_PREF+"_hs", PRED_FORWARD), (ADCP_PREF+"_hs", 0)])

# Active selection: every input column gets offset 0. The zero vector is sized from the
# column list itself rather than from however many columns `data` happens to have.
input_cols = ['b_hs', 'b_dir']
col_names_and_offsets = np.array(list(zip(input_cols, np.zeros(len(input_cols), dtype=np.int8))))
df = Load.get_df_for_model(data, col_names_and_offsets)

In [None]:
# assume that target is aligned as starting at 0 (with no offset)
# and then just cut off edge which exists in the case when some offsets aren't 0
# Keep only as many physical-model rows as df has.
phys_target = data[PHYS_COL].iloc[:df.shape[0]]

## Split into train-val-test

In [None]:
# Single train / validation / test split; phys_test is returned alongside the test split
# (presumably the matching slice of phys_target — confirm in Split).
train, val, test, phys_test = Split.split_train_test_val(df, phys_target=phys_target)

In [None]:
# Alternative split: fold number 2 of a k-fold partition. Running this cell replaces the
# train / val / test / phys_test produced by the previous split cell.
train, val, test, phys_test = Split.kfold_split_train_test(df, 2, phys_target=phys_target)

## Preprocess (scale, restructure into sequences and reshape)

In [None]:
def get_feature_and_target_data(data, target_col_name, is_target_in_features=True):
    """Split a frame, or a list of frames, into model features and a one-column target.

    Parameters
    ----------
    data : a DataFrame, or a list (or list subclass) of DataFrames.
    target_col_name : name of the target column.
    is_target_in_features : when True the features are ``data`` itself, unchanged;
        when False the target column is dropped from the features.

    Returns
    -------
    (features, target) : ``target`` is a one-column DataFrame selected with
        ``[[target_col_name]]``; for list input both elements are lists with one
        entry per input frame.
    """
    # isinstance rather than an exact type comparison, so list subclasses are handled too.
    if isinstance(data, list):
        targets = [frame[[target_col_name]] for frame in data]
        if is_target_in_features:
            return data, targets
        return [frame.drop(target_col_name, axis=1) for frame in data], targets

    target = data[[target_col_name]]
    if is_target_in_features:
        return data, target
    return data.drop(target_col_name, axis=1), target

In [None]:
# def get_feature_and_target_data(data, target_col_name, is_target_in_features=True):
#     if is_target_in_features:
#         return data, data[[target_col_name]]
#     else:
#         if type(data) == list:
#             data_no_target = [d.drop(target_col_name, axis=1) for d in data]
#             target = [d[[target_col_name]] for d in data]
#         else:
#             data_no_target = data.drop(target_col_name, axis=1)
#             target = data[[target_col_name]]
#         return data_no_target, target

In [None]:
# Windowing parameters handed to the preprocessor.
TRAIN_STEPS = 12
Y_LENGTH = 1
STEP_SIZE = 1
GAP_FORWARD = PRED_FORWARD # reuse the forecast horizon (in samples) defined with the target column
pre = Process.PreprocessData(steps_back=TRAIN_STEPS, y_length=Y_LENGTH, step_size=STEP_SIZE,
                          gap_forward=GAP_FORWARD)

In [None]:
# Fit the preprocessor on the training split only, then apply it to all three splits.
pre.fit(*get_feature_and_target_data(train, TARGET_COL, IS_TARGET_DATA_SOURCE_INCLUDED))
X_train, y_train = pre.transform(*get_feature_and_target_data(train, TARGET_COL, IS_TARGET_DATA_SOURCE_INCLUDED))
X_val, y_val = pre.transform(*get_feature_and_target_data(val, TARGET_COL, IS_TARGET_DATA_SOURCE_INCLUDED))
X_test, y_test = pre.transform(*get_feature_and_target_data(test, TARGET_COL, IS_TARGET_DATA_SOURCE_INCLUDED))

# Model

## Choose and build

In [None]:
# The size of X_train's third axis is passed to the model as its input dimension.
input_dim = X_train.shape[2]
model_structure_args = {"look_back": TRAIN_STEPS, "input_dimension": input_dim}
model_train_args = {"num_epochs" : 8, "batch_size": 50}
# Pick the model class by leaving exactly one of these lines uncommented.
# model_class = Models.LSTMModel
model_class = Models.FCNNModel
# model_class = Models.RandomForestModel


# Build the model under the CPU device context.
with tf.device("/cpu:0"):
    curr_model = model_class(**model_structure_args)

## Train

In [None]:
# Train under the CPU device context, passing the validation split as val_data.
with tf.device("/cpu:0"):
    curr_model.fit(X_train, y_train, val_data=(X_val, y_val), **model_train_args)

## Predict

In [None]:
# Predict on the test inputs and show the shape of the result.
y_pred = curr_model.predict(X_test)

y_pred.shape

### Get originally scaled data

In [None]:
# Undo the target scaling for both the predictions and the true test values.
pred_org = pre.inverse_scale_target(y_pred)
test_org = pre.inverse_scale_target(y_test.reshape(-1, 1))
# Skip the first TRAIN_STEPS + PRED_FORWARD physical-model values, presumably so the
# series lines up with the windowed predictions — confirm against Process.PreprocessData.
phys_org = phys_test.iloc[TRAIN_STEPS + PRED_FORWARD:].values.reshape(-1,1)

## Evaluate

In [None]:
# Evaluate from the true values, the model predictions and the physical-model values.
results = Eval.eval_pred_phys_const(test_org, pred_org, phys_org, pre)

In [None]:
# Show only the headline metric columns.
results[['rmse', 'r2', 'si', 'max_error']]

### Plot

In [None]:
# Same backend selection as in the first cell of the notebook.
%matplotlib notebook

In [None]:
# Fresh figure so the comparison is not drawn over a figure left current by an earlier cell.
plt.figure()
# True values (green dots), model prediction (blue) and physical model (red), by sample index.
plt.plot(range(pred_org.shape[0]), test_org, 'g.')
# plt.plot(range(preds_org.shape[0]), np.concatenate([test_org[18:],np.zeros((18, 1))]), 'y')
plt.plot(range(pred_org.shape[0]), pred_org, 'b')
plt.plot(range(pred_org.shape[0]), phys_org, 'r')
plt.legend(['Buoy/ADCP', '{} Prediction'.format(curr_model.name), 'WW3'])
plt.show()

# For Testing

## kfold functions

In [None]:
def run_single_fold_train_test(df, target_col, is_target_in_input,
                               phys_target, pre, model_class, model_train_args, fold_num, k=5):
    """Train and evaluate one model on a single fold of a k-fold split.

    The preprocessor ``pre`` is fitted on the fold's training part, applied to the
    train / validation / test parts, and a fresh ``model_class`` instance is trained
    and scored with ``Eval.eval_pred_phys_const``.

    Besides its arguments, the body reads the module-level names TRAIN_STEPS,
    PRED_FORWARD and DESC_STR, so these must be defined before the call.

    Returns whatever ``Eval.eval_pred_phys_const`` returns for this fold.
    """
    def features_and_target(frames):
        # Same feature/target split for every part of the fold.
        return get_feature_and_target_data(frames, target_col, is_target_in_input)

    train, val, test, phys_test = Split.kfold_split_train_test(
        df, fold_num, k=k, phys_target=phys_target)

    pre.fit(*features_and_target(train))
    X_train, y_train = pre.transform(*features_and_target(train))
    X_val, y_val = pre.transform(*features_and_target(val))
    X_test, y_test = pre.transform(*features_and_target(test))

    structure_args = {
        "look_back": TRAIN_STEPS,
        "input_dimension": X_train.shape[2],
        "description_string": DESC_STR + "_f{}".format(fold_num),
    }

    with tf.device("/cpu:0"):
        curr_model = model_class(**structure_args)

    with tf.device("/cpu:0"):
        curr_model.fit(X_train, y_train, val_data=(X_val, y_val), **model_train_args)

    y_pred = curr_model.predict(X_test)

    # Back to original units; the physical series skips the first
    # TRAIN_STEPS + PRED_FORWARD values before comparison.
    pred_org = pre.inverse_scale_target(y_pred)
    test_org = pre.inverse_scale_target(y_test.reshape(-1, 1))
    phys_org = phys_test.iloc[TRAIN_STEPS + PRED_FORWARD:].values.reshape(-1,1)

    return Eval.eval_pred_phys_const(test_org, pred_org, phys_org, pre)

def run_kfold_train_test(df, target_col, is_target_in_input, phys_target, pre,
                         model_class, model_train_args, k=5, num_folds_to_run=None):
    """Run ``run_single_fold_train_test`` on several folds and stack the results.

    All ``k`` folds are run unless ``num_folds_to_run`` is given, in which case only
    the last ``num_folds_to_run`` fold numbers are used. Each fold's result frame is
    tagged with its fold number, and the combined frame is indexed by
    (fold, original index).
    """
    fold_ids = list(range(k))
    if num_folds_to_run:
        # fewer folds requested than k: keep the highest-numbered ones
        fold_ids = fold_ids[-num_folds_to_run:]

    per_fold = []
    for fold_id in fold_ids:
        print("Running on fold {}".format(fold_id))
        fold_frame = run_single_fold_train_test(
            df, target_col, is_target_in_input, phys_target, pre,
            model_class, model_train_args, fold_num=fold_id, k=k)
        per_fold.append(fold_frame.assign(fold=fold_id))

    combined = pd.concat(per_fold)
    return combined.set_index(['fold', combined.index])

## One Cell to rule them all

### Old Cell

In [None]:

# PRED_FORWARD_HRS = 6
# LOOK_BACK_HRS = 12
# TIME_SAMPLE_RES_MINUTES = 10
# k = 5
# num_folds_to_run = 5


# samples_in_hr = 60 / TIME_SAMPLE_RES_MINUTES
# downsample_ratio = TIME_SAMPLE_RES_MINUTES / 10
# assert(samples_in_hr == int(samples_in_hr) and downsample_ratio == int(downsample_ratio))
# samples_in_hr = int(samples_in_hr)
# downsample_ratio = int(downsample_ratio)

# data = downsample_data(full_data, downsample_ratio)

# # col_names_and_offsets = np.array([(ADCP_PREF+"_hs", 0), (PHYS_DEEP_PREF+"_hs", GAP_FORWARD), (PHYS_SHALLOW_PREF+"_hs", 12)])
# # col_names_and_offsets = np.array([(ADCP_PREF+"_hs", 0), (PHYS_DEEP_PREF+"_hs", PRED_FORWARD)])
# # note: took out a_dir because it has "inf"
# # col_names_and_offsets = np.array(list(zip(['a_hs', 'b_hs', 'b_dir', 'ma_hs', 'ma_dir'], np.zeros(data.columns.shape, dtype=np.int8))))
# # col_names_and_offsets = np.array(list(zip(['a_hs', 'a_dir'], np.zeros(data.columns.shape, dtype=np.int8))))
# # col_names_and_offsets = np.array(list(zip(['a_hs', 'b_dir', 'b_hs'], np.zeros(data.columns.shape, dtype=np.int8))))
# col_names_and_offsets = np.array(list(zip(['b_hs', 'b_dir'], np.zeros(data.columns.shape, dtype=np.int8))))


# DESC_STR = "b{}h_{}msindirhb_lb{}h_lstm1".format(PRED_FORWARD_HRS,
#                                                 TIME_SAMPLE_RES_MINUTES, LOOK_BACK_HRS)

# PRED_FORWARD = samples_in_hr * PRED_FORWARD_HRS
# # TARGET_COL = ADCP_PREF + "_hs"
# TARGET_COL = BUOY_PREF + "_hs"
# if TARGET_COL.startswith(ADCP_PREF):
#     PHYS_COL = PHYS_DEEP_PREF + "_hs"
# elif TARGET_COL.startswith(BUOY_PREF):
#     PHYS_COL = PHYS_SHALLOW_PREF + "_hs"
# else:
#     raise(IndexError)
# IS_TARGET_DATA_SOURCE_INCLUDED = True



# # col_names_and_offsets = np.array([(PHYS_DEEP_PREF+"_hs", PRED_FORWARD), (ADCP_PREF+"_hs", 0)])
# df = Load.get_df_for_model(data, col_names_and_offsets)

# phys_target = data[PHYS_COL].iloc[:df.shape[0]]


# TRAIN_STEPS = samples_in_hr * LOOK_BACK_HRS
# Y_LENGTH = 1
# STEP_SIZE = 1
# GAP_FORWARD = PRED_FORWARD # for now instead just predict 3 hours forward, sounds good
# pre = Process.PreprocessData(steps_back=TRAIN_STEPS, y_length=Y_LENGTH, step_size=STEP_SIZE,
#                           gap_forward=GAP_FORWARD)

# model_train_args = {"num_epochs" : 15, "batch_size": 50}
# model_class = Models.LSTMModel
# # model_class = Models.FCNNModel
# # model_class = Models.RandomForestModel


# results_file_name = DESC_STR + "_res"

# results = run_kfold_train_test(df, phys_target, k=k, num_folds_to_run=num_folds_to_run)

# if k == num_folds_to_run:
#     results_file_name = results_file_name + ".h5"
# else:
#     results_file_name = "{}_{}of{}_folds.h5".format(results_file_name, num_folds_to_run, k)  

# results.to_hdf(osp.join("output", "results", results_file_name), key="a")

### Old cell, wrapped in a function

In [None]:
def org_multi_func_run(*, pred_fwd, input_cols, descr_str):
    """Run the legacy 5-fold LSTM experiment for one forecast horizon and input-column set.

    Parameters
    ----------
    pred_fwd : forecast horizon in hours.
    input_cols : names of the columns used as model input; each gets offset 0.
    descr_str : format string with three positional fields, filled with (forecast hours,
        sample resolution in minutes, look-back hours). The result names the run and
        its results file.

    Returns
    -------
    The combined per-fold results frame returned by ``run_kfold_train_test``.

    Side effects: writes the results to ``output/results/<description>_res.h5`` (HDF key
    "a"), sets ``pd.options.display.float_format``, prints the description and displays
    the metrics averaged over folds.

    NOTE(review): the fold helper defined with the seven-argument ``run_kfold_train_test``
    reads DESC_STR, TRAIN_STEPS and PRED_FORWARD as module-level names; the same-named
    locals computed here are not visible to it. Confirm the globals are set consistently
    before running. A later cell also redefines ``run_kfold_train_test`` with a
    ``(df, run_params)`` signature; the call below matches the seven-argument version.
    """
    PRED_FORWARD_HRS = pred_fwd
    LOOK_BACK_HRS = 12
    TIME_SAMPLE_RES_MINUTES = 10
    k = 5
    num_folds_to_run = 5

    # Both ratios must be whole numbers for the resolution to be usable.
    samples_in_hr = 60 / TIME_SAMPLE_RES_MINUTES
    downsample_ratio = TIME_SAMPLE_RES_MINUTES / 10
    assert(samples_in_hr == int(samples_in_hr) and downsample_ratio == int(downsample_ratio))
    samples_in_hr = int(samples_in_hr)
    downsample_ratio = int(downsample_ratio)

    data = downsample_data(full_data, downsample_ratio)

    # note: took out a_dir because it has "inf"
    # One zero offset per requested column; the vector is sized from the column list
    # itself rather than from however many columns `data` happens to have.
    input_cols = list(input_cols)
    col_names_and_offsets = np.array(list(zip(input_cols, np.zeros(len(input_cols), dtype=np.int8))))

    DESC_STR = descr_str.format(PRED_FORWARD_HRS,
                                TIME_SAMPLE_RES_MINUTES, LOOK_BACK_HRS)

    PRED_FORWARD = samples_in_hr * PRED_FORWARD_HRS
    # TARGET_COL = ADCP_PREF + "_hs"
    TARGET_COL = BUOY_PREF + "_hs"
    # The physical-model column compared against depends on which source is the target.
    if TARGET_COL.startswith(ADCP_PREF):
        PHYS_COL = PHYS_DEEP_PREF + "_hs"
    elif TARGET_COL.startswith(BUOY_PREF):
        PHYS_COL = PHYS_SHALLOW_PREF + "_hs"
    else:
        raise IndexError("TARGET_COL {!r} starts with neither the ADCP nor the buoy prefix".format(TARGET_COL))
    IS_TARGET_DATA_SOURCE_INCLUDED = True

    df = Load.get_df_for_model(data, col_names_and_offsets)

    # Keep only as many physical-model rows as df has.
    phys_target = data[PHYS_COL].iloc[:df.shape[0]]

    TRAIN_STEPS = samples_in_hr * LOOK_BACK_HRS
    Y_LENGTH = 1
    STEP_SIZE = 1
    GAP_FORWARD = PRED_FORWARD
    pre = Process.PreprocessData(steps_back=TRAIN_STEPS, y_length=Y_LENGTH, step_size=STEP_SIZE,
                              gap_forward=GAP_FORWARD)

    model_train_args = {"num_epochs" : 16, "batch_size": 50}
    model_class = Models.LSTMModel
    # model_class = Models.FCNNModel
    # model_class = Models.RandomForestModel

    results_file_name = DESC_STR + "_res"

    # The target column and the "target is also an input" flag are required positional
    # arguments of run_kfold_train_test; the original call omitted both.
    results = run_kfold_train_test(df, TARGET_COL, IS_TARGET_DATA_SOURCE_INCLUDED, phys_target, pre,
                        model_class, model_train_args, k=k, num_folds_to_run=num_folds_to_run)

    if k == num_folds_to_run:
        results_file_name = results_file_name + ".h5"
    else:
        results_file_name = "{}_{}of{}_folds.h5".format(results_file_name, num_folds_to_run, k)

    results.to_hdf(osp.join("output", "results", results_file_name), key="a")

    pd.options.display.float_format = '{:,.3f}'.format
    print(DESC_STR)
    # level=1 is the per-fold result index, so this averages each row over the folds.
    display(results.groupby(level=1).mean()[['rmse', 'r2', 'si', 'mae', 'max_error']])
    return results

### New Cells

#### Helper definitions (can be collapsed)

In [None]:
def run_single_fold_train_test(df, run_params, curr_fold_num):
    """Train and evaluate one model on fold ``curr_fold_num``, configured by ``run_params``.

    ``run_params`` supplies the target column, fold count, preprocessor (``.pre``, used
    directly and therefore expected to be set already), model class and arguments,
    look-back length and forecast offset.

    The physical-model series is read from the module-level name ``data`` (it is not
    an argument), so ``data`` must be defined before the call.

    Returns whatever ``Eval.eval_pred_phys_const`` returns for this fold.
    """
    pre = run_params.pre

    def features_and_target(frames):
        # Same feature/target split for every part of the fold.
        return get_feature_and_target_data(
            frames, run_params.target_col, run_params.is_target_in_input)

    # Keep only as many physical-model rows as df has.
    phys_target = data[run_params.phys_col].iloc[:df.shape[0]]
    train, val, test, phys_test = Split.kfold_split_train_test(
        df, curr_fold_num, k=run_params.k, phys_target=phys_target)

    pre.fit(*features_and_target(train))
    X_train, y_train = pre.transform(*features_and_target(train))
    X_val, y_val = pre.transform(*features_and_target(val))
    X_test, y_test = pre.transform(*features_and_target(test))

    structure_args = {
        "look_back": run_params.train_steps,
        "input_dimension": X_train.shape[2],
        "description_string": run_params.desc_str + "_f{}".format(curr_fold_num),
    }

    with tf.device("/cpu:0"):
        curr_model = run_params.model_class(**structure_args)

    with tf.device("/cpu:0"):
        curr_model.fit(X_train, y_train, val_data=(X_val, y_val), **run_params.model_args)

    y_pred = curr_model.predict(X_test)

    # Back to original units; the physical series skips the first
    # train_steps + pred_forward values before comparison.
    pred_org = pre.inverse_scale_target(y_pred)
    test_org = pre.inverse_scale_target(y_test.reshape(-1, 1))
    skip = run_params.train_steps + run_params.pred_forward
    phys_org = phys_test.iloc[skip:].values.reshape(-1,1)

    return Eval.eval_pred_phys_const(test_org, pred_org, phys_org, pre)

def run_kfold_train_test(df, run_params):
    """Run ``run_single_fold_train_test`` on the configured folds and stack the results.

    Fold numbers run from 0 to ``run_params.k - 1``; when ``run_params.num_folds_to_run``
    is set, only that many of the highest fold numbers are used. Each fold's result
    frame is tagged with its fold number, and the combined frame is indexed by
    (fold, original index).
    """
    fold_ids = list(range(run_params.k))
    if run_params.num_folds_to_run:
        # fewer folds requested than k: keep the highest-numbered ones
        fold_ids = fold_ids[-run_params.num_folds_to_run:]

    per_fold = []
    for fold_id in fold_ids:
        print("Running on fold {}".format(fold_id))
        fold_frame = run_single_fold_train_test(df, run_params, fold_id)
        per_fold.append(fold_frame.assign(fold=fold_id))

    combined = pd.concat(per_fold)
    return combined.set_index(['fold', combined.index])

In [None]:
class TestInstanceParams():
    """Settings for one k-fold experiment, plus the values derived from them.

    Derived attributes set by ``__init__``:
      * ``desc_str``         - run description built by ``build_desc_str``
      * ``phys_col``         - physical-model column chosen by ``find_phys_col``
      * ``samples_in_hr``    - ``60 / time_sample_res_minutes`` (must be a whole number)
      * ``downsample_ratio`` - ``time_sample_res_minutes / 10`` (must be a whole number)
      * ``pred_forward``     - forecast horizon in samples
      * ``train_steps``      - look-back length in samples

    ``pre`` starts as None and is supplied later through ``set_pre``.
    """
    def __init__(self, input_data_str_repr, model_str_repr, desc_str_addition='',
                 target_col="b_hs", is_target_in_input=True,  pred_forward_hrs=4,
                 look_back_hrs=12, time_sample_res_minutes=10, model_class=Models.LSTMModel,
                 model_args={"num_epochs" : 5, "batch_size": 50},
                 k=5, num_folds_to_run=None):
        self.pre = None
        self.target_col = target_col
        self.is_target_in_input = is_target_in_input
        self.pred_forward_hrs = pred_forward_hrs
        self.look_back_hrs = look_back_hrs
        self.time_res_min = time_sample_res_minutes

        self.model_class = model_class
        # The default dict in the signature is one object shared by every call; storing
        # a copy keeps instances independent of each other and of the caller's dict.
        self.model_args = dict(model_args)
        self.k = k
        # A missing (or zero) fold count means "run all k folds".
        self.num_folds_to_run = num_folds_to_run if num_folds_to_run else k

        self.desc_str = self.build_desc_str(input_data_str_repr, model_str_repr, desc_str_addition)
        self.phys_col = self.find_phys_col()

        samples_in_hr = 60 / time_sample_res_minutes
        downsample_ratio = time_sample_res_minutes / 10
        # Both must be whole numbers for the resolution to be usable.
        assert(samples_in_hr == int(samples_in_hr) and \
               downsample_ratio == int(downsample_ratio))
        self.samples_in_hr = int(samples_in_hr)
        self.downsample_ratio = int(downsample_ratio)
        self.pred_forward = self.pred_forward_hrs * self.samples_in_hr
        self.train_steps = self.look_back_hrs*self.samples_in_hr

    def _target_prefix(self):
        """Return the source prefix (ADCP_PREF or BUOY_PREF) that ``target_col`` starts with.

        Raises IndexError when it starts with neither; ADCP_PREF is checked first.
        """
        for prefix in (ADCP_PREF, BUOY_PREF):
            if self.target_col.startswith(prefix):
                return prefix
        raise IndexError("target_col {!r} starts with neither ADCP_PREF nor BUOY_PREF".format(
            self.target_col))

    def build_desc_str(self, input_data_str_repr, model_str_repr, addition):
        """Build the description string that names this run.

        Raises IndexError when ``target_col`` has an unknown prefix.
        """
        target_pref = self._target_prefix()
        # desc_str is of format:
        # <pred_target><forward hours>h_<time_res_min>m<input_data_repr>_lb<look_back hours>h_<model_name><optional addition>
        desc_str = "{}{}h_{}m{}_lb{}h_{}{}".format(target_pref, self.pred_forward_hrs,
                self.time_res_min, input_data_str_repr, self.look_back_hrs, model_str_repr, addition)
        return desc_str

    def set_pre(self, pre):
        """Attach the preprocessor used for this run."""
        self.pre = pre

    def get_pre(self):
        """Return the attached preprocessor; raise NameError if none was set."""
        # `is None` rather than truthiness, so a preprocessor that evaluates as falsy
        # is still treated as initialized.
        if self.pre is None:
            raise NameError('pre not initialized yet')
        return self.pre

    def find_phys_col(self):
        """Return the physical-model column paired with the target's data source.

        ADCP targets map to the deep model column, buoy targets to the shallow one.
        Raises IndexError when ``target_col`` has an unknown prefix.
        """
        if self._target_prefix() == ADCP_PREF:
            return PHYS_DEEP_PREF + "_hs"
        return PHYS_SHALLOW_PREF + "_hs"

    

#### For run

In [None]:
def multi_func_run(*, target_col, col_names_and_offsets, input_data_str_repr,
                  pred_forward_hrs=4, look_back_hrs=12, time_sample_res_minutes=10):
    """Train and evaluate one model configuration with k-fold splitting and save the results.

    The model class (``Models.LSTMModel``), its training arguments, the number
    of folds (5, all of them run) and the model tag (``'lstm1'``) are fixed
    inside the function. Input data come from the notebook-level ``full_data``.

    Args:
        target_col: Name of the column to predict.
        col_names_and_offsets: pandas Series indexed by input column name. Its
            values are multiplied by the number of samples per hour before
            being passed to ``Load.get_df_for_model``, i.e. they are expressed
            in hours. The multiplication yields a new Series, so the caller's
            object is not modified.
        input_data_str_repr: Short tag describing the inputs; it is passed to
            ``TestInstanceParams`` and ends up in the run description and in
            the results file name.
        pred_forward_hrs: Prediction horizon in hours.
        look_back_hrs: Length of the look-back window in hours.
        time_sample_res_minutes: Sampling resolution in minutes.
            NOTE(review): the run-parameter object appears to assert that this
            divides 60 and is a multiple of 10 - confirm against its __init__.

    Returns:
        Tuple ``(results, results_file_name)``: the object returned by
        ``run_kfold_train_test`` and the name of the HDF5 file it was written
        to inside ``output/results``.

    Side effects:
        Creates ``output/results`` if needed, writes the results file there,
        sets ``pd.options.display.float_format`` globally, and prints/displays
        a summary of the run.
    """
    import os  # local import: the notebook itself only imports os.path (as osp)

    model_str_repr = 'lstm1'
    k = 5
    num_folds_to_run = 5

    model_train_args = {"num_epochs" : 16, "batch_size": 50}
    model_class = Models.LSTMModel
    # model_class = Models.FCNNModel
    # model_class = Models.RandomForestModel

    # Scale the offsets by the number of samples per hour.
    samples_per_hour = int(60/time_sample_res_minutes)
    col_names_and_offsets = col_names_and_offsets*samples_per_hour

    # The target column is always needed in the model frame; when it is not one
    # of the inputs it is added with a zero offset and flagged as such.
    is_target_in_input = target_col in col_names_and_offsets.index
    if not is_target_in_input:
        col_names_and_offsets[target_col] = 0

    run_params = TestInstanceParams(input_data_str_repr=input_data_str_repr, \
        model_str_repr=model_str_repr, target_col=target_col, \
        is_target_in_input=is_target_in_input, pred_forward_hrs=pred_forward_hrs, \
        look_back_hrs=look_back_hrs, time_sample_res_minutes=time_sample_res_minutes, \
        k=k, num_folds_to_run=num_folds_to_run, \
        model_class=model_class, model_args=model_train_args, \
        desc_str_addition ='')

    # Create the output directory up front, so a missing folder cannot make the
    # save fail only after the whole k-fold training has finished.
    results_dir = osp.join("output", "results")
    os.makedirs(results_dir, exist_ok=True)

    # Note kept from the original author: a_dir was taken out of the input
    # columns because it has "inf".
    data = downsample_data(full_data, run_params.downsample_ratio)
    df = Load.get_df_for_model(data, col_names_and_offsets)

    pre = Process.PreprocessData(steps_back=run_params.train_steps, \
                                 y_length=1, step_size=1, \
                              gap_forward=run_params.pred_forward)
    run_params.set_pre(pre)

    results = run_kfold_train_test(df, run_params)

    # The file name records a partial run when not every fold was executed.
    if run_params.k == run_params.num_folds_to_run:
        results_file_name = run_params.desc_str + "_res.h5"
    else:
        results_file_name = "{}_res_{}of{}_folds.h5".format(run_params.desc_str,
                                            run_params.num_folds_to_run, run_params.k)
    results.to_hdf(osp.join(results_dir, results_file_name), key="a")

    pd.options.display.float_format = '{:,.3f}'.format
    print(run_params.desc_str)
    # Mean of each metric, grouped by the second level of the results index.
    display(results.groupby(level=1).mean()[['rmse', 'r2', 'si', 'mae', 'max_error']])
    return results, results_file_name

In [None]:
# run_1 = {"pred_fwd":4, "input_cols":['a_hs', 'b_hs', 'b_dir'], "descr_str":"b{}h_{}mfolddirhbha_lb{}h_lstm1"}
# run_0 = {"pred_fwd":4, "input_cols":['a_hs', 'b_hs'], "descr_str":"b{}h_{}mhahb_lb{}h_lstm1"}
# run_2 = {"pred_fwd":6, "input_cols":['b_hs'], "descr_str":"b{}h_{}mhb_lb{}h_lstm1"}
# run_3 = {"pred_fwd":6, "input_cols":['a_hs', 'b_hs'], "descr_str":"b{}h_{}mhahb_lb{}h_lstm1"}
# run_4 = {"pred_fwd":6, "input_cols":['b_dir', 'b_hs'], "descr_str":"b{}h_{}mfolddirhb_lb{}h_lstm1"}
# run_5 = {"pred_fwd":6, "input_cols":['a_hs', 'b_hs', 'b_dir'], "descr_str":"b{}h_{}mfolddirhbha_lb{}h_lstm1"}

In [None]:
# First batch of configurations: all predict 'b_hs' and rely on the default
# look-back and sampling resolution of multi_func_run. Each dict is unpacked
# as keyword arguments. (The names run_1.. are reassigned by a later cell.)
run_1 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0], index=['a_hs', 'b_hs']),
             pred_forward_hrs=6,
             input_data_str_repr='hahb')
run_2 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0, 0], index=['a_hs', 'b_hs', 'b_dir']),
             pred_forward_hrs=6,
             input_data_str_repr='folddirhbha')
run_3 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0, 0], index=['a_hs', 'b_hs', 'b_dir']),
             pred_forward_hrs=4,
             input_data_str_repr='folddirhbha')
run_4 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0], index=['a_hs', 'b_hs']),
             pred_forward_hrs=4,
             input_data_str_repr='hahb')
run_5 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0], index=['b_hs', 'b_dir']),
             pred_forward_hrs=6,
             input_data_str_repr='folddirhb')
run_6 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0, 0], index=['b_hs', 'b_dir']),
             pred_forward_hrs=4,
             input_data_str_repr='folddirhb')
run_7 = dict(target_col='b_hs',
             col_names_and_offsets=pd.Series([0], index=['b_hs']),
             pred_forward_hrs=6,
             input_data_str_repr='hb')

In [None]:
# Execute the first batch in order. Every list entry is the
# (results, results_file_name) tuple returned by multi_func_run.
separator = "-----------------------"
all_results_new = []
for run_number, all_args in enumerate([run_1, run_2, run_3, run_4, run_5, run_6, run_7], start=1):
    print(separator)
    print("RUN NUMBER {}".format(run_number))
    print(separator)
    run_output = multi_func_run(**all_args)
    all_results_new.append(run_output)

In [None]:
# Ad-hoc single configuration: predict b_hs six hours ahead from b_hs, b_dir
# and a_hs. The input tag is part of the run description and of the results
# file name, so it has to describe the actual inputs: 'folddirhbha' is the tag
# the other runs use for this set of three columns. Tagging it 'hb' would make
# it share a results file with the b_hs-only configuration.
my_run = {"target_col":'b_hs', "col_names_and_offsets": \
         pd.Series(index=['b_hs', 'b_dir', 'a_hs'], data=[0,0,0]), "pred_forward_hrs":6,
         "input_data_str_repr":'folddirhbha'}

In [None]:
# Execute the single ad-hoc configuration held in my_run; the returned
# (results, results_file_name) tuple becomes the cell output.
multi_func_run(**my_run)

In [None]:
# Inspect what multi_func_run returned for the first configuration of the batch.
all_results_new[0]

In [None]:
# Same expression as the previous cell: first entry of all_results_new.
all_results_new[0]

In [None]:
# Target column shared by all run configurations defined in the next cell.
target_col = "a_hs"

In [None]:
# All run configurations for the current `target_col`: 36 in total
# (run_1 .. run_34 plus run_190 and run_200).
def _run_config(input_cols, offsets, pred_forward_hrs, input_data_str_repr,
                look_back_hrs, time_sample_res_minutes):
    """Return the keyword-argument dict for one ``multi_func_run`` call.

    ``input_cols`` and ``offsets`` become the index and the data of the
    ``col_names_and_offsets`` Series; the target is the notebook-level
    ``target_col`` at the time of the call.
    """
    return {"target_col": target_col,
            "col_names_and_offsets": pd.Series(index=input_cols, data=offsets),
            "pred_forward_hrs": pred_forward_hrs,
            "input_data_str_repr": input_data_str_repr,
            "look_back_hrs": look_back_hrs,
            "time_sample_res_minutes": time_sample_res_minutes}

# Argument order below:
#   input columns, offsets, pred_forward_hrs, tag, look_back_hrs, resolution (min)

# b_hs only
run_1 = _run_config(['b_hs'], [0], 4, 'hb', 12, 10)
run_2 = _run_config(['b_hs'], [0], 2, 'hb', 12, 10)
run_3 = _run_config(['b_hs'], [0], 4, 'hb', 6, 10)
run_4 = _run_config(['b_hs'], [0], 2, 'hb', 6, 10)
run_5 = _run_config(['b_hs'], [0], 4, 'hb', 24, 30)
run_6 = _run_config(['b_hs'], [0], 2, 'hb', 24, 30)
run_7 = _run_config(['b_hs'], [0], 4, 'hb', 48, 60)
run_8 = _run_config(['b_hs'], [0], 2, 'hb', 48, 60)

# b_hs + b_dir, no offsets
run_9 = _run_config(['b_hs', 'b_dir'], [0, 0], 4, 'folddirhb', 12, 10)
run_10 = _run_config(['b_hs', 'b_dir'], [0, 0], 2, 'folddirhb', 12, 10)
run_11 = _run_config(['b_hs', 'b_dir'], [0, 0], 4, 'folddirhb', 6, 10)
run_12 = _run_config(['b_hs', 'b_dir'], [0, 0], 2, 'folddirhb', 6, 10)
run_13 = _run_config(['b_hs', 'b_dir'], [0, 0], 4, 'folddirhb', 48, 60)
run_14 = _run_config(['b_hs', 'b_dir'], [0, 0], 2, 'folddirhb', 48, 60)

# b_hs + b_dir, one of the two columns offset by 2
run_15 = _run_config(['b_hs', 'b_dir'], [0, 2], 6, 'folddir2hb', 12, 10)
run_16 = _run_config(['b_hs', 'b_dir'], [0, 2], 4, 'folddir2hb', 12, 10)
run_17 = _run_config(['b_hs', 'b_dir'], [2, 0], 6, 'folddirhb2', 12, 10)
run_18 = _run_config(['b_hs', 'b_dir'], [2, 0], 4, 'folddirhb2', 12, 10)

# b_hs + b_dir + offset ma_hs
run_19 = _run_config(['b_hs', 'b_dir', 'ma_hs'], [0, 0, 4], 4, 'folddirhbma4', 12, 10)
run_20 = _run_config(['b_hs', 'b_dir', 'ma_hs'], [0, 0, 4], 2, 'folddirhbma4', 12, 10)
run_190 = _run_config(['b_hs', 'b_dir', 'ma_hs'], [0, 0, 6], 4, 'folddirhbma6', 12, 10)
run_200 = _run_config(['b_hs', 'b_dir', 'ma_hs'], [0, 0, 6], 2, 'folddirhbma6', 12, 10)

# ma_hs only, 10-minute resolution
run_21 = _run_config(['ma_hs'], [2], 4, 'ma2', 12, 10)
run_22 = _run_config(['ma_hs'], [2], 2, 'ma2', 12, 10)
run_23 = _run_config(['ma_hs'], [4], 4, 'ma4', 12, 10)
run_24 = _run_config(['ma_hs'], [4], 2, 'ma4', 12, 10)
run_25 = _run_config(['ma_hs'], [6], 4, 'ma6', 12, 10)
run_26 = _run_config(['ma_hs'], [6], 2, 'ma6', 12, 10)
run_27 = _run_config(['ma_hs'], [4], 4, 'ma4', 6, 10)
run_28 = _run_config(['ma_hs'], [4], 2, 'ma4', 6, 10)

# ma_hs only, 30-minute resolution
run_29 = _run_config(['ma_hs'], [4], 4, 'ma4', 24, 30)
run_30 = _run_config(['ma_hs'], [4], 2, 'ma4', 24, 30)
run_31 = _run_config(['ma_hs'], [6], 4, 'ma6', 24, 30)
run_32 = _run_config(['ma_hs'], [6], 2, 'ma6', 24, 30)
run_33 = _run_config(['ma_hs'], [8], 4, 'ma8', 24, 30)
run_34 = _run_config(['ma_hs'], [8], 2, 'ma8', 24, 30)

In [None]:
# Every configuration of the grid, in execution order. Each run appears
# exactly once: run_3 takes the third slot, which previously held a second
# copy of run_2.
runs = [run_1, run_2, run_3, run_4, run_5, run_6, run_7, run_8, run_9, run_10, run_11,
       run_12, run_13, run_14, run_15, run_16, run_17, run_18, run_19, run_20, run_190, run_200, run_21,
       run_22, run_23, run_24, run_25, run_26, run_27, run_28, run_29, run_30, run_31,
       run_32, run_33, run_34]

In [None]:
# Run every configuration in `runs`, collecting the results and the names of
# the files they were saved to in two parallel lists.
separator = "-----------------------"
all_results, all_results_file_names = [], []
for run_number, all_args in enumerate(runs, start=1):
    print(separator)
    print("RUN NUMBER {}".format(run_number))
    print(separator)
    # Keep the notebook-level names `results` / `results_file_name` bound to
    # the most recent run, as later cells read them.
    results, results_file_name = multi_func_run(**all_args)
    all_results += [results]
    all_results_file_names += [results_file_name]

In [None]:
# Pick a single configuration (run_3) to be executed on its own by the next cell.
all_args = run_3

In [None]:
# Run the configuration currently held in all_args. This rebinds the
# notebook-level `results` and `results_file_name`; the outcome is not
# appended to all_results / all_results_file_names.
results, results_file_name = multi_func_run(**all_args)

In [None]:
# Snapshot the collected results and their file names to temporary pickle
# files. The context managers guarantee both files are flushed and closed,
# even if pickling raises.
with open("temp_all_results_names.pkl", "wb") as names_file:
    pickle.dump(all_results_file_names, names_file)
with open("temp_all_results.pkl", "wb") as results_file:
    pickle.dump(all_results, results_file)

In [None]:
# For every collected run: print its results file name, then show the mean of
# each metric grouped by the second level of the results index.
for name, res in zip(all_results_file_names, all_results):
    print(name)
    mean_by_level = res.groupby(level=1).mean()
    display(mean_by_level[['rmse', 'r2', 'si', 'mae', 'max_error']])

In [None]:
# Show floats with three decimals and a thousands separator in displayed frames.
pd.set_option('display.float_format', '{:,.3f}'.format)
# NOTE(review): DESC_STR is not defined in the part of the notebook visible
# here - confirm it exists before this cell is run on a fresh kernel.
print(DESC_STR)
# Mean of each metric for the most recent `results`, grouped by the second
# index level; as the last expression it is rendered as the cell output.
mean_by_level = results.groupby(level=1).mean()
mean_by_level[['rmse', 'r2', 'si', 'mae', 'max_error']]