In [None]:
from multitcn_components import TCNStack, DownsampleLayerWithAttention, LearningRateLogger
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping, ModelCheckpoint, CSVLogger
from sklearn import preprocessing
import numpy as np
import pandas as pd
from datetime import datetime,date,timedelta
import tensorflow_addons as tfa
import uuid
import sys
from scipy.signal import correlate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import matplotlib
import matplotlib.colors as colors
import eli5
from sklearn.linear_model import LinearRegression
from pygam import LinearGAM, s, f, te, GAM
from pygam import terms
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.model_selection import cross_val_score
import pydotplus
from IPython.display import Image
from scipy import stats
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def windowed_dataset(series, time_series_number, window_size):
    """
    Build every overlapping input window of length ``window_size`` from a 2-D series.

    Args:
        series: 2-D array-like with time steps on axis 0 and one column per time
            series. A pandas DataFrame is accepted and converted with ``np.asarray``.
        time_series_number: Ignored. Kept only so existing callers keep working;
            the number of series is always read from ``series.shape[1]``.
        window_size: Number of consecutive time steps in each window.

    Returns:
        float64 array of shape ``(rows - window_size + 1, window_size, columns)``;
        element ``i`` holds rows ``i`` to ``i + window_size - 1`` of ``series``.

    Raises:
        ValueError: if ``series`` has fewer than ``window_size - 1`` rows
            (the number of windows would be negative).
    """
    # Positional slicing below needs an ndarray; a DataFrame would reject `[a:b, :]`.
    values = np.asarray(series)
    available_examples = values.shape[0] - window_size + 1
    if available_examples < 0:
        raise ValueError(
            "window_size (%d) is larger than the number of rows (%d)" % (window_size, values.shape[0])
        )
    time_series_number = values.shape[1]
    inputs = np.zeros((available_examples, window_size, time_series_number))
    for i in range(available_examples):
        inputs[i, :, :] = values[i:i + window_size, :]
    return inputs

def windowed_forecast(series, forecast_horizon):
    """
    Build every overlapping target window of length ``forecast_horizon``.

    ``series`` is indexed as a 2-D array (time steps on axis 0). The result has
    shape ``(rows - forecast_horizon + 1, forecast_horizon, columns)`` and
    element ``k`` holds rows ``k`` to ``k + forecast_horizon - 1``.
    """
    n_rows, n_series = series.shape[0], series.shape[1]
    n_windows = n_rows - forecast_horizon + 1
    windows = np.zeros((n_windows, forecast_horizon, n_series))
    for start in range(n_windows):
        stop = start + forecast_horizon
        windows[start, :] = series[start:stop, :]
    return windows

def shuffle_arrays_together(a,b):
    """
    Shuffle ``a`` and ``b`` along axis 0 with one shared random permutation.

    The permutation is drawn from NumPy's global RNG (``np.random.permutation``)
    over ``a.shape[0]`` positions, so row ``k`` of both results comes from the
    same original row.
    """
    order = np.random.permutation(a.shape[0])
    shuffled_a = a[order]
    shuffled_b = b[order]
    return shuffled_a, shuffled_b

def remove_outliers_and_interpolate(dataframe, std_times = 3):
    """
    Replace implausible readings and statistical outliers with interpolated values.

    Steps, applied to a copy (the caller's frame is left untouched):
      1. ``Temp_degC`` values above 40 become NaN.
      2. Negative ``Turbidity_NTU`` / ``Chloraphylla_ugL`` values become NaN.
      3. In every column, values further than ``std_times`` standard deviations
         from the column mean become NaN. Mean and (population) standard deviation
         skip NaN, so the filter still works for columns that already contain
         gaps or had values removed in steps 1-2.
      4. NaN values are filled with ``DataFrame.interpolate()`` defaults.

    NOTE: earlier revisions used ``np.mean``/``np.std``, which return NaN as soon
    as a column holds a NaN and therefore skipped step 3 for such columns.
    Results for those columns differ from runs made with the old code.

    Args:
        dataframe: pandas DataFrame of numeric sensor columns. The three named
            columns are only checked when present.
        std_times: Width of the accepted band in standard deviations.

    Returns:
        A new DataFrame with outliers replaced by interpolated values.
    """
    cleaned = dataframe.copy()

    # Physically implausible readings (sensor faults).
    if 'Temp_degC' in cleaned.columns:
        cleaned.loc[cleaned['Temp_degC'] > 40, 'Temp_degC'] = np.nan

    for c in ['Turbidity_NTU','Chloraphylla_ugL']:
        if c in cleaned.columns:
            cleaned.loc[cleaned[c] < 0, c] = np.nan

    # Statistical outliers, column by column.
    for c in list(cleaned.columns):
        column = cleaned[c]
        mean = column.mean()        # skips NaN
        std = column.std(ddof=0)    # population std, matching np.std
        outliers = (column < (mean - std_times*std)) | (column > (mean + std_times*std))
        cleaned.loc[outliers, c] = np.nan

    return cleaned.interpolate()

def norm_cross_corr(a,b):
    """
    Cross-correlation of ``a`` and ``b`` (``scipy.signal.correlate`` defaults)
    divided by ``sqrt(sum(a**2) * sum(b**2))``.
    """
    energy_a = np.sum(np.power(a,2))
    energy_b = np.sum(np.power(b,2))
    scale = np.sqrt(energy_a*energy_b)
    return correlate(a,b)/scale

def symm_mape(true,prediction):
    """
    Symmetric MAPE in percent: mean over all elements of
    ``2*|prediction-true| / (|true|+|prediction|)``, multiplied by 100.
    """
    abs_error = np.abs(prediction-true)
    magnitude = np.abs(true)+np.abs(prediction)
    return 100*np.sum(2*abs_error/magnitude)/true.size

def get_metrics(true,prediction,print_metrics=False):
    """
    Compute the evaluation metrics for one predicted series.

    Args:
        true: 1-D array of ground-truth values.
        prediction: 1-D array of predicted values, same length as ``true``.
        print_metrics: When True, also print a one-line summary.

    Returns:
        ``[max_corr_point, mse, rmse, mae, smape, r2]`` where ``max_corr_point``
        is the lag (relative to the centre of the normalised cross-correlation)
        at which the correlation is largest.
    """
    c = norm_cross_corr(true,prediction)
    # Centre index of the correlation output; the lag is measured relative to it.
    extent = int((c.shape[0]-1)/2)
    max_corr_point = np.argmax(c)-extent
    max_v = np.max(prediction)
    mse = mean_squared_error(true,prediction)
    # `squared=False` was deprecated and later removed from scikit-learn, so the
    # root is taken here; for a single output this is the same value.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true,prediction)
    r2 = r2_score(true,prediction)
    smape = symm_mape(true,prediction)
    if print_metrics:
        print("Max %f - Autocorr %d - MSE %f - RMSE %f - MAE %f - sMAPE %f%% - R^2 %f"%(max_v,max_corr_point,mse,rmse,mae,smape,r2))
    return [max_corr_point,mse,rmse,mae,smape,r2]

def get_confidence_interval_series(sample_array,confidence_level=0.95):
    """
    Per-position mean and Student-t confidence interval over axis 0.

    ``sample_array`` holds one sample per row (axis 0). Returns
    ``(mean, lower_bound, upper_bound)``; the bounds are
    ``mean + t * std / sqrt(n)`` with ``n = sample_array.shape[0]``, the sample
    standard deviation (``ddof=1``) and ``t`` taken from
    ``stats.t.interval(confidence_level, n-1)``.
    """
    n_samples = sample_array.shape[0]
    t_low, t_high = stats.t.interval(confidence_level,n_samples-1)
    samples_mean = np.mean(sample_array,axis=0)
    samples_std = np.std(sample_array,axis=0,ddof=1)
    root_n = np.sqrt(n_samples)
    lower_bound = samples_mean + t_low*samples_std/root_n
    upper_bound = samples_mean + t_high*samples_std/root_n
    return samples_mean, lower_bound, upper_bound

def present_mean_metrics(metrics):
    """
    Print the column-wise mean and sample standard deviation (``ddof=1``) of a
    ``(runs, 6)`` metrics array under a fixed six-column header.
    """
    row_format = "\t ".join(["%10.4f"]*6)
    print("Autocorr\t\t MSE\t\t RMSE\t\t MAE\t\t sMAPE\t\t R^2")
    column_means = np.mean(metrics,axis=0)
    print(row_format % tuple(column_means))
    print("+-",)
    column_stds = np.std(metrics,axis=0,ddof=1)
    print(row_format % tuple(column_stds))
    

In [None]:
####### Set up experiment parameters ###############
# Global configuration read by the cells below.

#Training parameters
loss ='mse'

#Dataset parameters
# Number of past time steps in each model input window.
window_length = 192
# Number of future time steps predicted per window.
forecast_horizon = 48
# Separate scalers for model inputs and for the prediction targets.
preprocessor = preprocessing.MinMaxScaler()
out_preprocessor = preprocessing.MinMaxScaler()
# Shuffle the training windows after they are created.
shuffle_train_set = True
# When True the targets are scaled with out_preprocessor and predictions are
# mapped back to the original units after inference.
scale_output = True
training_percentage = 0.9
experiment_target = F"Forecasting,{forecast_horizon} steps ahead"
experiment_complete = False

In [None]:
############## Set up model ##########################
class MTCNAModel(tf.keras.Model):
    """
    Keras model that feeds the input through a ``TCNStack`` and then through a
    ``DownsampleLayerWithAttention`` (both imported from ``multitcn_components``).

    ``call`` returns a pair: a list with one tensor per output time series
    (slices along axis 1 of the attention layer's first output) and the second
    output of the attention layer, passed through unchanged.
    """

    def __init__(self, tcn_layer_num,tcn_kernel_size,tcn_filter_num,window_size,forecast_horizon,num_output_time_series, use_bias, kernel_initializer, tcn_dropout_rate,tcn_dropout_format,tcn_activation, tcn_final_activation, tcn_final_stack_activation):
        super().__init__()

        self.num_output_time_series = num_output_time_series

        # Sub-layers are created in this order on purpose: it is the order in
        # which they are assigned in the original model definition.
        # Note that TCNStack receives the filter count before the kernel size.
        self.lower_tcn = TCNStack(
            tcn_layer_num,
            tcn_filter_num,
            tcn_kernel_size,
            window_size,
            use_bias,
            kernel_initializer,
            tcn_dropout_rate,
            tcn_dropout_format,
            tcn_activation,
            tcn_final_activation,
            tcn_final_stack_activation,
        )

        self.downsample_att = DownsampleLayerWithAttention(
            num_output_time_series,
            window_size,
            tcn_kernel_size,
            forecast_horizon,
            kernel_initializer,
            None,
        )

    def call(self, input_tensor):
        tcn_features = self.lower_tcn(input_tensor)
        # The attention layer receives both the TCN features and the raw input.
        outputs, distribution = self.downsample_att([tcn_features,input_tensor])
        per_series_outputs = []
        for series_idx in range(self.num_output_time_series):
            per_series_outputs.append(outputs[:,series_idx,:])
        return per_series_outputs, distribution

In [None]:
################ Prepare dataset ###########################
# Loads the yearly CSV files, keeps the parsed timestamps aside and cleans the
# remaining columns with remove_outliers_and_interpolate.

### Note details for logging purposes
dataset_description = "Burnett river sensor data"
dataset_preprocessing = """Drop TIMESTAMP, Replace outliers more than 3*std on input data with Nan,
pd.interpolate() for NaN values"""

# Read csv in pandas
data_files = []

# One file per year, 2014 through 2018 inclusive.
for year in range(2014,2019):
    data_file = pd.read_csv(F"Datasets/burnett-river-trailer-quality-{year}.csv")
    data_files.append(data_file)
# NOTE(review): the concatenation keeps each file's own row index, so index
# labels may repeat across years - confirm this is intended.
data = pd.concat(data_files,axis=0)

# Change type of temp to avoid errors
data = data.astype({'Temp_degC':'float64'})

#Create date object for easy splitting according to dates
dateobj = pd.to_datetime(data['TIMESTAMP'])

### For now remove timestamp and output outliers
data = data.drop(columns=["TIMESTAMP","RECORD"],axis=1)

data = remove_outliers_and_interpolate(data, std_times=3)


In [None]:
## Add date object for splitting
data['DateObj'] = dateobj

#Split data based on dates
training_start_date = pd.Timestamp(year=2014,month=3,day=1)

# Preceding values used only for creating final graph and predicting first values of test set
holdout_preceding_date = pd.Timestamp(year=2017, month=3, day=1)
holdout_set_start_date = pd.Timestamp(year=2017, month=4, day=1)
holdout_set_end_date = pd.Timestamp(year=2018, month=4, day=1)

# Training: [training_start_date, holdout_set_start_date)
# Test:     [holdout_set_start_date, holdout_set_end_date)
# The pre-evaluation period is the tail of the training range, starting at
# holdout_preceding_date.
training_data = data.loc[(data['DateObj']>=training_start_date) & (data['DateObj'] < holdout_set_start_date)]
test_data = data.loc[(data['DateObj'] >= holdout_set_start_date) & (data['DateObj'] < holdout_set_end_date)]
pre_evaluation_period = data.loc[(data['DateObj'] >= holdout_preceding_date) & (data['DateObj'] < holdout_set_start_date)]


## Keep input variables
input_variables = ['Temp_degC', 'EC_uScm', 'pH', 'Turbidity_NTU', 'Chloraphylla_ugL'] # 'DO_mg', 'DO_Sat'
#input_variables = ['Temp_degC', 'EC_uScm', 'pH', 'DO_mg', 'Turbidity_NTU', 'Chloraphylla_ugL']

## Save for explainable models training
# DO_mg and the timestamps are copied before the frames are reduced to the
# input variables below.
do_and_date = training_data[["DO_mg","DateObj"]].copy()
do_and_date_test = test_data[["DO_mg","DateObj"]].copy()

training_data = training_data[input_variables]
test_data = test_data[input_variables+["DO_mg"]]



In [None]:
##Select prediction target
targets = ['Temp_degC', 'EC_uScm', 'pH', 'Turbidity_NTU','Chloraphylla_ugL']
labels = np.array(training_data[targets])

# Scale the targets. The scaler is fitted on the training targets only.
if scale_output:
    out_preprocessor.fit(labels)
    if "Normalizer" in str(out_preprocessor.__class__):
        ## Save norm so in case of normalizer we can scale the predictions correctly
        # The labels are normalised column by column (axis=0), so the saved norm
        # must be per column too; this also matches how in_norm is computed below.
        out_norm = np.linalg.norm(labels,axis=0)
        labels = preprocessing.normalize(labels,axis=0)
    else:
        labels= out_preprocessor.transform(labels)


num_input_time_series = training_data.shape[1]


### Make sure data are np arrays in case we skip preprocessing
training_data = np.array(training_data)


#### Fit preprocessor to training data
preprocessor.fit(training_data)

if "Normalizer" in str(preprocessor.__class__):
    ## Save norm so in case of normalizer we can scale the test_data correctly
    in_norm = np.linalg.norm(training_data,axis=0)
    training_data = preprocessing.normalize(training_data,axis=0)
else:
    training_data = preprocessor.transform(training_data)

In [None]:

### Create windows for all data
# Inputs stop forecast_horizon rows before the end and targets start
# window_length rows in, so input window k is paired with the forecast_horizon
# rows that directly follow it.
data_windows = windowed_dataset(training_data[:-forecast_horizon],num_input_time_series,window_length)
label_windows = windowed_forecast(labels[window_length:],forecast_horizon)

### Transpose outputs to agree with model output
# (windows, horizon, series) -> (windows, series, horizon)
label_windows = np.transpose(label_windows,[0,2,1])


samples = data_windows.shape[0]

# Chronological copy, taken before shuffling.
unshuffled_data_windows = data_windows.copy()

## Shuffle windows
if shuffle_train_set:
    data_windows, label_windows = shuffle_arrays_together(data_windows,label_windows)

### Create train and validation sets
train_x = data_windows
# One target array per output series, matching the list the model returns.
train_y = [label_windows[:,i,:] for i in range(len(targets))]


## In order to use all days of test set for prediction, append training window from preceding period
pre_test_train = pre_evaluation_period[test_data.columns][-window_length:]
test_data = pd.concat([pre_test_train,test_data])

test_data = test_data[input_variables]
## Create windowed test set with same process
test_labels = np.array(test_data[targets])

#### Preprocess data
test_data = np.array(test_data)

# Apply the scaling fitted on the training data.
if "Normalizer" in str(preprocessor.__class__):
    test_data = test_data/in_norm
else:
    test_data = preprocessor.transform(test_data)

test_x = windowed_dataset(test_data[:-forecast_horizon],num_input_time_series,window_length)
test_y = np.transpose(windowed_forecast(test_labels[window_length:],forecast_horizon),[0,2,1])



## Create pre test period for visualization
# NOTE(review): test_labels[:window_length] are the rows taken from the end of
# pre_evaluation_period above, so they appear twice in this stack - confirm
# whether that duplication is intended.
pre_test_target = np.vstack((np.array(pre_evaluation_period[targets]),test_labels[:window_length]))

total_samples = train_x.shape[0] + test_x.shape[0]

In [None]:
##################### Initialize model parameters ########################
## For simplicity all time series TCNs have the same parameters, though it is relatively easy to change this
# These values are passed to MTCNAModel when the evaluation model is built.
# NOTE(review): the evaluation model below is built with test_dropout, not with
# tcn_dropout_rate - confirm which value is intended.
tcn_kernel_size = 3
tcn_layer_num = 7
tcn_use_bias = True
tcn_filter_num = 64
tcn_kernel_initializer = 'random_normal'
tcn_dropout_rate = 0.5 # This may be with the old keep_prob setting, we should also try the 1 - dropout_rate
tcn_dropout_format = "channel"
tcn_activation = 'relu'
tcn_final_activation = 'linear'
tcn_final_stack_activation = 'relu'

In [None]:
# ### Check for GPU
# Asks how many GPUs to use and sets `device` (and `mirrored_strategy` when
# more than one GPU is requested). Falls back to the CPU when no GPU is found.

## Make only given GPU visible
gpus = tf.config.experimental.list_physical_devices('GPU')
mirrored_strategy = None

print("GPUs Available: ", gpus)
if len(gpus)==0:
    device = "CPU:0"
else:
    print("Enter number of gpus to use:")
    gpu_num = input().strip()
    # Empty or non-numeric answers used to stay strings and crash further down;
    # they now fall back to a single GPU. The count is kept within 1..len(gpus).
    gpu_num = int(gpu_num) if gpu_num.isdecimal() else 1
    gpu_num = max(1, min(gpu_num, len(gpus)))
    if gpu_num==1:
        print("Enter index of GPU to use:")
        gpu_idx = input().strip()
        # Same fallback for the index: invalid or out-of-range answers select GPU 0.
        gpu_idx = int(gpu_idx) if gpu_idx.isdecimal() else 0
        if gpu_idx >= len(gpus):
            print("GPU index out of range, using GPU 0")
            gpu_idx = 0
        tf.config.experimental.set_visible_devices(gpus[gpu_idx], 'GPU')
        # Only one GPU is visible after the call above, so it is always "GPU:0".
        device = "GPU:0"
    else:
        mirrored_strategy = tf.distribute.MirroredStrategy(devices=[F"GPU:{i}" for i in range(gpu_num)])
        # NOTE(review): this space-separated string is later passed to
        # tf.device(); confirm that multi-GPU runs actually work with it.
        device = " ".join([F"GPU:{i}" for i in range(gpu_num)])

In [None]:
### Set evaluation seed to affect dropout random execution
# An all-digit answer is used as the seed; anything else selects 192.
print("Enter a seed for the evaluation:")
seed = input()
seed = int(seed) if (len(seed)!=0 and seed.isdigit()) else 192
# Seed both NumPy's global RNG and TensorFlow.
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
## Set up test model
## From all the test samples keep individual, non overlapping days
# Taking every forecast_horizon-th window makes consecutive forecasts cover
# consecutive, non-overlapping stretches of the series.
unshuffled_train_days = unshuffled_data_windows[0::forecast_horizon]
test_x_days = test_x[0::forecast_horizon,:]
# (days, series, horizon) -> (days, horizon, series) -> (days*horizon, series)
true_y = np.transpose(test_y[0::forecast_horizon,:],(0,2,1)).reshape((-1,len(targets)))

# DO_mg targets for the explainable models, cut to the rows covered by the
# day-aligned forecasts above.
expl_train_data_y = np.array(do_and_date['DO_mg'])[window_length:unshuffled_train_days.shape[0]*forecast_horizon+window_length]
expl_test_data_y = np.array(do_and_date_test['DO_mg'])[:true_y.shape[0]]



In [None]:
# Dropout rate the evaluation model is built with.
test_dropout = 0.85

with tf.device(device):
    test_model = MTCNAModel(tcn_layer_num,tcn_kernel_size,tcn_filter_num,window_length,forecast_horizon,len(targets), tcn_use_bias, tcn_kernel_initializer, test_dropout, tcn_dropout_format, tcn_activation, tcn_final_activation, tcn_final_stack_activation)
# Call the model once so its weights are created before load_weights().
_ = test_model(train_x[0:1])


best_weight_name = "f08332bc-d654-4219-a7c1-e0e6854fb2b5-weights.95-0.0070.h5"

## Generate predictions for test set using best weight (first in list)
## Reset training phase to disable dropout
tf.keras.backend.set_learning_phase(0)
test_model.load_weights("SecondStageWeights-WaterQ/"+best_weight_name)

# Model output: one (days, horizon) tensor per target -> (days*horizon, targets)
best_pred = np.asarray(test_model(test_x_days)[0]).reshape((len(targets),-1)).T
# Undo the target scaling only when it was applied; with scale_output=False the
# scaler was never fitted and inverse_transform would fail.
if scale_output:
    if "Normalizer" in str(out_preprocessor.__class__):
        best_pred *= (out_norm)
    else:
        best_pred = out_preprocessor.inverse_transform(best_pred)

In [None]:
from os import listdir
# listdir() returns entries in arbitrary order; sorting keeps the sample index
# of every (weight file, dropout run) pair stable between runs.
weight_names = sorted(listdir("SecondStageWeights-WaterQ/"))
print(weight_names)
dropout_runs_per_weight = 20

metrics_number = 6
samples_per_prediction = dropout_runs_per_weight*len(weight_names)

## Enable dropout
tf.keras.backend.set_learning_phase(1)


def _to_original_scale(scaled_outputs):
    """Undo the target scaling; a no-op when scale_output is False."""
    if not scale_output:
        return scaled_outputs
    if "Normalizer" in str(out_preprocessor.__class__):
        return scaled_outputs*out_norm
    return out_preprocessor.inverse_transform(scaled_outputs)


# One row per (weight file, dropout run) sample.
dl_errors  = np.zeros((samples_per_prediction,test_x_days.shape[0]*forecast_horizon,len(targets)))
dl_predictions = np.zeros((samples_per_prediction,test_x_days.shape[0]*forecast_horizon,len(targets)))
dl_metrics = np.zeros((samples_per_prediction,metrics_number,len(targets)))

ml_train = np.zeros((samples_per_prediction,unshuffled_train_days.shape[0]*forecast_horizon,len(targets)))

for i in tqdm(range(len(weight_names))):
    test_model.load_weights("SecondStageWeights-WaterQ/"+weight_names[i])
    print(weight_names[i])
    for j in range(dropout_runs_per_weight):
        sample_idx = i*dropout_runs_per_weight+j
        ## Get DL test set predictions and metrics
        cur_pred = np.asarray(test_model(test_x_days)[0]).reshape((len(targets),-1)).T
        cur_pred = _to_original_scale(cur_pred)
        dl_predictions[sample_idx,:] = cur_pred
        dl_errors[sample_idx,:] = cur_pred - true_y
        for t in range(len(targets)):
            dl_metrics[sample_idx,:,t] = np.asarray(get_metrics(true_y[:,t],cur_pred[:,t],print_metrics=False))
        ## Get train set for explainable ML
        # Predictions on the chronological training days become the training
        # inputs of the explainable models.
        cur_ml_train = np.asarray(test_model(unshuffled_train_days)[0]).reshape((len(targets),-1)).T
        ml_train[sample_idx,:] = _to_original_scale(cur_ml_train)
        

In [None]:

np.set_printoptions(linewidth=100)
sns.set()
# For every target: print the mean/std of the metrics over all samples, then
# overlay two histograms - the prediction errors and the deviation of each
# sample's prediction from the per-position median prediction.
for var_idx in range(len(targets)):
    print(targets[var_idx])
    present_mean_metrics(dl_metrics[...,var_idx])
    plt.hist(dl_errors[...,var_idx].flatten(),alpha=0.5)
    plt.hist((dl_predictions[...,var_idx]-np.median(dl_predictions[...,var_idx],axis=0)).flatten(),alpha=0.5)
    plt.show()

In [None]:
# Mean prediction and confidence bounds across all samples in dl_predictions (axis 0).
pred_mean, dl_lower_bound, dl_upper_bound = get_confidence_interval_series(dl_predictions)

In [None]:
# Number of true values shown before the plotted forecast.
preceding_points = 192
# Forecast blocks [from_day, to_day) are plotted; one block is forecast_horizon steps.
from_day = 20
to_day = 21


d0 = holdout_set_start_date.to_pydatetime()
d1 = d0 + timedelta(days=from_day)

pred_plot_range = range(preceding_points,preceding_points+(to_day-from_day)*forecast_horizon)
pred_sp = from_day*forecast_horizon
pred_ep = to_day*forecast_horizon

# True values that precede the plotted forecast. Stitching the pre-test period
# in front of the test targets lets one slice serve every from_day; slicing
# true_y alone produced a negative start index (and an empty or wrong line)
# whenever 0 < pred_sp < preceding_points.
history = np.vstack((pre_test_target,true_y))
history_end = pre_test_target.shape[0] + pred_sp
history_start = max(0, history_end - preceding_points)

for i in range(len(targets)):
    fig = plt.figure(figsize=(20,10))
    plt.plot(pred_plot_range,pred_mean[pred_sp:pred_ep,i],marker="o",label="Prediction")
    plt.fill_between(pred_plot_range, dl_lower_bound[pred_sp:pred_ep,i], dl_upper_bound[pred_sp:pred_ep,i], alpha=0.3)

    plt.plot(history[history_start:history_end,i],label="Pretest period", marker="o")
    plt.plot(pred_plot_range,true_y[from_day*forecast_horizon:to_day*forecast_horizon,i],marker="o",label="True data")

    plt.grid(axis='x')
    plt.legend()
    plt.title(targets[i])
    plt.xlabel(d1.strftime("%d/%m/%Y"))
    plt.xticks([])
    plt.show()

In [None]:


# x positions of the test period: it is drawn directly after the pre-test rows.
sp = pre_test_target.shape[0]
ep = sp + true_y.shape[0]
# The x axis is limited to this many forecast blocks after the start of the test period.
days_ahead=30

# One figure per target: best-weight prediction against the true data.
for i in range(len(targets)):
    fig = plt.figure(figsize=(20,10))
    plt.plot(range(sp,ep),best_pred[:,i],label="Prediction")

    plt.plot(pre_test_target[:,i],label="Pretest period")
    plt.plot(range(sp,ep),true_y[:,i],label="True data")

    # The left limit is sp, so the pre-test line lies outside the visible range.
    plt.xlim(left=sp, right=sp+days_ahead*forecast_horizon)
    plt.title(targets[i])
    plt.grid(axis='x')
    plt.legend()
    plt.show()

In [None]:
## Present attention graphs for specific prediction output

input_variables = ['Temp_degC', 'EC_uScm', 'pH', 'Turbidity_NTU', 'Chloraphylla_ugL']

var_of_interest = "Temp_degC"

# The same index is used for the input column and for the output/target column
# below, which relies on input_variables and targets listing the variables in
# the same order.
var_idx = input_variables.index(var_of_interest)

# Index of the day-aligned test window to inspect.
test_idx = 20

## Reset training phase to disable dropout
tf.keras.backend.set_learning_phase(0)
# Same directory as every other load_weights() call in this notebook; this cell
# previously pointed at "SecondStageWeights/".
test_model.load_weights("SecondStageWeights-WaterQ/"+best_weight_name)


# `dist` is the second value returned by the model (from the attention layer).
o, dist = test_model(test_x_days[test_idx:test_idx+1])

o = np.asarray(o).reshape((len(targets),-1)).T
# Undo the target scaling only when it was applied.
if scale_output:
    if "Normalizer" in str(out_preprocessor.__class__):
        o *= (out_norm)
    else:
        o = out_preprocessor.inverse_transform(o)
# Input window of the chosen variable, mapped back to original units.
inp = preprocessor.inverse_transform(test_x_days[test_idx])[:,var_idx]

prediction= o[:,var_idx]
true_out = true_y[test_idx*forecast_horizon:(test_idx+1)*(forecast_horizon),var_idx]

In [None]:
# Input window followed by the predicted and true continuation for the sample
# selected in the previous cell.
fix, ax = plt.subplots(figsize=(20,10))
plt.plot(inp)
# Forecast steps are drawn right after the window_length input steps.
plt.plot(np.arange(window_length,window_length+forecast_horizon),prediction,marker="o",label="Prediction")
plt.plot(np.arange(window_length,window_length+forecast_horizon),true_out,marker="o",label="Ground truth")
plt.legend()
plt.show()

In [None]:
## Get value dense layer
# Take the first model weight whose name ends with the suffix below and keep the
# absolute values of the slice belonging to the selected variable.
weights = None
for w in test_model.weights:
    if w.name.endswith("sep_dense_value_weights:0"):
        weights = np.abs(w.numpy())[var_idx]
        # Signed alternative: weights = w.numpy()[var_idx]
        break
if weights is None:
    # Without this check a stale `weights` from an earlier run could be reused
    # silently, or a bare NameError would be raised on a fresh kernel.
    raise LookupError("No model weight name ends with 'sep_dense_value_weights:0'")

dist_var = dist.numpy()[0,var_idx,...]
full_dist = np.matmul(dist_var,weights.T)

In [None]:
sns.set()
def infl_to_out_elem(out_elem):
    """
    Draw the influence heatmap for one forecast step.

    Takes row ``out_elem`` of the global ``full_dist``, min-max scales it to
    [0, 1], shows it as a one-row heatmap and overlays (on a twin y axis) the
    input window ``inp``, the ``prediction`` and ``true_out`` series and a black
    marker on the selected forecast step. Reads its data from notebook globals.
    """
    elem_dist = full_dist[out_elem:out_elem+1,:]
    # The scaler works per column, so the row is transposed into a single column
    # before fitting and transposed back for plotting.
    prep = preprocessing.MinMaxScaler()
    prep.fit(elem_dist.T)
    elem_dist = prep.transform(elem_dist.T)

    fig, ax = plt.subplots(figsize=(20,10))
    sns.heatmap(elem_dist.T, cmap="Blues", cbar=True, yticklabels=False, xticklabels=10)
    ax2 = plt.twinx()
    ax2.plot(range(window_length,window_length+forecast_horizon),true_out,label="True data",marker="o")
    ax2.plot(range(window_length,window_length+forecast_horizon),prediction,label="Prediction",marker="o")
    plt.plot([window_length+out_elem], [prediction[out_elem]], marker='o', label= "Step "+str(out_elem+1), markersize=8, color="black")
    sns.lineplot(x=np.arange(0,window_length),y=inp, ax=ax2)
    ax.axis('tight')
    ax2.legend(fontsize=20)
    # NOTE(review): the unit label is fixed to degrees Celsius; it only fits
    # when var_of_interest is the temperature column.
    ax2.set_ylabel("°C")
    plt.show()
    #plt.savefig("%s-%02d.png"%(var_of_interest,out_elem))
    #plt.close(fig)
#infl_to_out_elem(22)
# Slider over every forecast step (0 .. forecast_horizon-1).
interact(infl_to_out_elem, out_elem=(0,forecast_horizon-1,1))
# for i in range(forecast_horizon):
#     infl_to_out_elem(i)





In [None]:
### Prepare dataset for explainable methods from output of stage 1, do_mg values and timestamp
## Extract timestamp information to periodic numbers

def _cyclic_date_features(timestamps):
    """Sine/cosine encodings of the calendar fields of a datetime Series."""
    dt = timestamps.dt
    # (column prefix, zero-based position inside the cycle, cycle length)
    cycles = [
        ('hour', dt.hour, 24),
        ('dayofyear', dt.dayofyear-1, 365),
        ('month', dt.month-1, 12),
        ('quarter', dt.quarter-1, 4),
        ('dayofweek', dt.dayofweek, 7),
        ('dayofmonth', dt.day-1, 31),
    ]
    features = pd.DataFrame()
    for prefix, position, period in cycles:
        features[prefix+'_sin'] = np.sin(position*(2.*np.pi/period))
        features[prefix+'_cos'] = np.cos(position*(2.*np.pi/period))
    return features


def _categorical_date_features(timestamps):
    """Zero-based integer calendar fields of a datetime Series."""
    dt = timestamps.dt
    features = pd.DataFrame()
    features['hour'] = dt.hour
    features['dayofyear'] = dt.dayofyear-1
    features['month'] = dt.month-1
    features['quarter'] = dt.quarter-1
    features['dayofweek'] = dt.dayofweek
    features['dayofmonth'] = dt.day-1
    return features


date_linear_df_train = _cyclic_date_features(do_and_date['DateObj'])
date_cat_df_train = _categorical_date_features(do_and_date['DateObj'])

# Training rows are cut to the same range as expl_train_data_y.
train_rows = slice(window_length, expl_train_data_y.shape[0]+window_length)
expl_train_linear_date_data = np.array(date_linear_df_train)[train_rows]
expl_train_cat_date_data = np.array(date_cat_df_train)[train_rows]

## Same process for test data
date_linear_df_test = _cyclic_date_features(do_and_date_test['DateObj'])
date_cat_df_test = _categorical_date_features(do_and_date_test['DateObj'])

expl_test_linear_date_data = np.array(date_linear_df_test)[:true_y.shape[0]]
expl_test_cat_date_data = np.array(date_cat_df_test)[:true_y.shape[0]]

# Add information of DO_mg value at last known step
# Every forecast block of forecast_horizon rows gets two constant features:
# the DO_mg value just before the block and the mean of the forecast_horizon
# DO_mg values just before the block.
expl_train_domg_last_step = np.zeros((expl_train_data_y.shape[0],1))
expl_train_domg_last_mean = np.zeros((expl_train_data_y.shape[0],1))
for i in range(0,expl_train_data_y.shape[0],forecast_horizon):
    block_start = window_length+i
    expl_train_domg_last_step[i:i+forecast_horizon] = do_and_date['DO_mg'].iloc[block_start-1]
    expl_train_domg_last_mean[i:i+forecast_horizon] = np.mean(do_and_date['DO_mg'].iloc[block_start-forecast_horizon:block_start])

expl_test_domg_last_step = np.zeros((expl_test_data_y.shape[0],1))
expl_test_domg_last_mean = np.zeros((expl_test_data_y.shape[0],1))
# The first test block is preceded by the end of the training data.
expl_test_domg_last_step[0:forecast_horizon] = do_and_date['DO_mg'].iloc[-1]
expl_test_domg_last_mean[0:forecast_horizon] = np.mean(do_and_date['DO_mg'].iloc[-forecast_horizon:])
for i in range(forecast_horizon,expl_test_data_y.shape[0],forecast_horizon):
    expl_test_domg_last_step[i:i+forecast_horizon] = do_and_date_test['DO_mg'].iloc[i-1]
    expl_test_domg_last_mean[i:i+forecast_horizon] = np.mean(do_and_date_test['DO_mg'].iloc[i-forecast_horizon:i])

In [None]:
# Simple linear regression
# Fits one LinearRegression per stage-1 sample: inputs are the stage-1
# predictions plus date features and the last known DO_mg values, target is DO_mg.
lr_dtypes = ["linear","categorical"]

lr_dtype_idx = 0

lr_dtype = lr_dtypes[lr_dtype_idx]

print("Approach with %s data."%lr_dtype)


# Feature names must follow the selected date encoding (as in the decision tree
# and GAM cells); previously the linear names were used in both modes.
if lr_dtype=="linear":
    feature_names=input_variables+list(date_linear_df_train.columns) +["last_DO_mg_step"]+["last_DO_mg_mean"]
else:
    feature_names=input_variables+list(date_cat_df_train.columns) +["last_DO_mg_step"]+["last_DO_mg_mean"]
print(len(feature_names))
print(feature_names)
linreg= LinearRegression()
linear_reg_predictions = np.zeros((ml_train.shape[0],expl_test_data_y.shape[0]))
linear_reg_metrics = np.zeros((ml_train.shape[0],metrics_number))
## One fit per stage-1 sample
for i in range(ml_train.shape[0]):

    expl_train_data_x = ml_train[i,...]
    expl_test_data_x = dl_predictions[i,...]

    # Column order: stage-1 outputs, date features, last DO_mg step, last DO_mg mean.
    if lr_dtype=="linear":
        expl_linear_train_data_x = np.hstack((expl_train_data_x,expl_train_linear_date_data,expl_train_domg_last_step,expl_train_domg_last_mean))
        expl_linear_test_data_x = np.hstack((expl_test_data_x,expl_test_linear_date_data,expl_test_domg_last_step,expl_test_domg_last_mean))
        expl_train_x = expl_linear_train_data_x.copy()
        expl_test_x = expl_linear_test_data_x.copy()
    else:
        expl_cat_train_data_x = np.hstack((expl_train_data_x,expl_train_cat_date_data,expl_train_domg_last_step,expl_train_domg_last_mean))
        expl_cat_test_data_x = np.hstack((expl_test_data_x,expl_test_cat_date_data,expl_test_domg_last_step,expl_test_domg_last_mean))
        expl_train_x = expl_cat_train_data_x.copy()
        expl_test_x = expl_cat_test_data_x.copy()

    linreg.fit(expl_train_x,expl_train_data_y)
    cur_pred = linreg.predict(expl_test_x)
    linear_reg_predictions[i,...] = cur_pred
    linear_reg_metrics[i,...] = np.asarray(get_metrics(expl_test_data_y,cur_pred))

# After the loop `linreg` holds the fit of the last sample.
lrmean, lrlb, lrup = get_confidence_interval_series(linear_reg_predictions)

present_mean_metrics(linear_reg_metrics)

In [None]:
# Plot the linear-regression DO_mg forecast (mean and confidence band) for the
# forecast blocks [from_day, to_day), preceded by the last true values.
d0 = holdout_set_start_date.to_pydatetime()
preceding_points = 24
from_day =20
to_day = 21
d1 = d0 + timedelta(days=from_day)

pred_plot_range = range(preceding_points,preceding_points+(to_day-from_day)*forecast_horizon)
pred_sp = from_day*forecast_horizon
pred_ep = to_day*forecast_horizon


fig = plt.figure(figsize=(20,20))
plt.plot(pred_plot_range,lrmean[pred_sp:pred_ep],marker="o",label="Prediction")
plt.fill_between(pred_plot_range, lrlb[pred_sp:pred_ep], lrup[pred_sp:pred_ep], alpha=0.3)

# For the first block the preceding values come from the end of the training data.
# NOTE(review): the else branch assumes pred_sp >= preceding_points; a smaller
# pred_sp gives a negative slice start.
if from_day==0:
    plt.plot(np.array(do_and_date['DO_mg'])[-preceding_points:],label="Pretest period", marker="o")
else:
    plt.plot(expl_test_data_y[pred_sp-preceding_points:pred_sp],label="Pretest period", marker="o")
plt.plot(pred_plot_range,expl_test_data_y[from_day*forecast_horizon:to_day*forecast_horizon],marker="o",label="True data")

plt.grid(axis='x')
plt.ylim(top=7.65)
plt.legend(fontsize=35)
plt.tick_params(axis="y", labelsize=35)
plt.xlabel(d1.strftime("%d/%m/%Y"),fontsize=35)
plt.ylabel("mg/L",fontsize=35)
plt.xticks([])
plt.savefig("linear_reg.png")
plt.show()

In [None]:

# Explain the weights of `linreg` as left by the regression cell (the fit of the
# last sample), labelled with `feature_names`.
eli5.sklearn.explain_linear_regressor_weights(linreg,feature_names=feature_names,top=25)

In [None]:
## Decision tree regressor
# Fits one depth-limited tree per stage-1 sample on the same feature layout as
# the linear regression: stage-1 outputs, date features, last DO_mg step/mean.
dt_dtypes = ["linear","categorical"]

dt_dtype_idx = 0

dt_dtype = dt_dtypes[dt_dtype_idx]

print("Approach with %s data."%dt_dtype)

date_columns = date_linear_df_train.columns if dt_dtype=="linear" else date_cat_df_train.columns
feature_names=input_variables+list(date_columns) +["last_DO_mg_step"]+["last_DO_mg_mean"]

dec_tree = DecisionTreeRegressor(max_depth=6)
dec_tree_predictions = np.zeros((ml_train.shape[0],expl_test_data_y.shape[0]))
dec_tree_metrics = np.zeros((ml_train.shape[0],metrics_number))
## One fit per stage-1 sample
for i in range(ml_train.shape[0]):
    expl_train_data_x = ml_train[i,...]
    expl_test_data_x = dl_predictions[i,...]

    if dt_dtype=="linear":
        expl_linear_train_data_x = np.hstack((expl_train_data_x,expl_train_linear_date_data,expl_train_domg_last_step,expl_train_domg_last_mean))
        expl_linear_test_data_x = np.hstack((expl_test_data_x,expl_test_linear_date_data,expl_test_domg_last_step,expl_test_domg_last_mean))
        expl_train_x = expl_linear_train_data_x.copy()
        expl_test_x = expl_linear_test_data_x.copy()
    else:
        expl_cat_train_data_x = np.hstack((expl_train_data_x,expl_train_cat_date_data,expl_train_domg_last_step,expl_train_domg_last_mean))
        expl_cat_test_data_x = np.hstack((expl_test_data_x,expl_test_cat_date_data,expl_test_domg_last_step,expl_test_domg_last_mean))
        expl_train_x = expl_cat_train_data_x.copy()
        expl_test_x = expl_cat_test_data_x.copy()

    dec_tree.fit(expl_train_x,expl_train_data_y)
    cur_pred = dec_tree.predict(expl_test_x)
    dec_tree_predictions[i,...] = cur_pred
    dec_tree_metrics[i,...] = np.asarray(get_metrics(expl_test_data_y,cur_pred))

# After the loop `dec_tree` holds the fit of the last sample.
dtmean, dtlb, dtup = get_confidence_interval_series(dec_tree_predictions)

present_mean_metrics(dec_tree_metrics)

In [None]:
# Plot the decision-tree DO_mg forecast. from_day, to_day, preceding_points and
# d1 are reused from the linear-regression plot cell.
pred_plot_range = range(preceding_points,preceding_points+(to_day-from_day)*forecast_horizon)
pred_sp = from_day*forecast_horizon
pred_ep = to_day*forecast_horizon


fig = plt.figure(figsize=(20,20))
plt.plot(pred_plot_range,dtmean[pred_sp:pred_ep],marker="o",label="Prediction")
plt.fill_between(pred_plot_range, dtlb[pred_sp:pred_ep], dtup[pred_sp:pred_ep], alpha=0.3)

if from_day==0:
    # The target here is DO_mg, so the preceding values come from the end of the
    # training DO_mg series (as in the linear-regression plot). pre_test_target
    # holds the stage-1 input variables and would draw one line per variable.
    plt.plot(np.array(do_and_date['DO_mg'])[-preceding_points:],label="Pretest period", marker="o")
else:
    plt.plot(expl_test_data_y[pred_sp-preceding_points:pred_sp],label="Pretest period", marker="o")

plt.plot(pred_plot_range,expl_test_data_y[from_day*forecast_horizon:to_day*forecast_horizon],marker="o",label="True data")
plt.grid(axis='x')
plt.legend(fontsize=35)
plt.tick_params(axis="y", labelsize=35)
plt.xlabel(d1.strftime("%d/%m/%Y"),fontsize=35)
plt.ylabel("mg/L",fontsize=35)
plt.ylim(top=7.65)
plt.xticks([])
plt.savefig("dec_tree.png")
plt.show()

# Explains `dec_tree` as left by the previous cell (the fit of the last sample).
eli5.sklearn.explain_decision_tree(dec_tree,feature_names=feature_names)
#dot_data = export_graphviz(dec_tree, max_depth=6,feature_names=feature_names)
#graph = pydotplus.graph_from_dot_data(dot_data)  
#Image(graph.create_png())

In [None]:
#### Simple linear GAM
# Fits one LinearGAM per row of ml_train, predicts on the matching row of
# dl_predictions, and collects the predictions and metrics of every run.

# Which date encoding is appended to the model inputs.
gam_dtypes = ["linear","categorical"]
gam_dtype_idx = 0
gam_dtype = gam_dtypes[gam_dtype_idx]
print("Approach with %s data."%gam_dtype)

# Feature labels: model inputs, date features, then the two last-DO columns.
# The order mirrors the np.hstack calls in the loop below
# (assumes input_variables labels the columns of ml_train[i] -- TODO confirm).
date_feature_columns = (date_linear_df_train.columns if gam_dtype=="linear"
                        else date_cat_df_train.columns)
feature_names = input_variables + list(date_feature_columns) + ["last_DO_mg_step", "last_DO_mg_mean"]


##Gam hyperparameters, as a result of Grid search on training data
lam = 0.004
spline_order=5
n_splines=7

# One spline term per feature column, all sharing the same hyperparameters.
term_splines = [
    s(feature_idx, spline_order=spline_order, n_splines=n_splines, lam=lam)
    for feature_idx, _ in enumerate(feature_names)
]
termlist = terms.TermList(*term_splines)

# Per-run outputs: one row of predictions and one row of metrics per fit.
n_runs = ml_train.shape[0]
gam_predictions = np.zeros((n_runs, expl_test_data_y.shape[0]))
gam_metrics = np.zeros((n_runs, metrics_number))

for i in tqdm(range(n_runs)):
    expl_train_data_x = ml_train[i,...]
    expl_test_data_x = dl_predictions[i,...]

    # Append the date features and the two last-DO columns to this run's inputs.
    if gam_dtype=="linear":
        expl_linear_train_data_x = np.hstack((expl_train_data_x,
                                              expl_train_linear_date_data,
                                              expl_train_domg_last_step,
                                              expl_train_domg_last_mean))
        expl_linear_test_data_x = np.hstack((expl_test_data_x,
                                             expl_test_linear_date_data,
                                             expl_test_domg_last_step,
                                             expl_test_domg_last_mean))
        expl_train_x = expl_linear_train_data_x.copy()
        expl_test_x = expl_linear_test_data_x.copy()
    else:
        expl_cat_train_data_x = np.hstack((expl_train_data_x,
                                           expl_train_cat_date_data,
                                           expl_train_domg_last_step,
                                           expl_train_domg_last_mean))
        expl_cat_test_data_x = np.hstack((expl_test_data_x,
                                          expl_test_cat_date_data,
                                          expl_test_domg_last_step,
                                          expl_test_domg_last_mean))
        expl_train_x = expl_cat_train_data_x.copy()
        expl_test_x = expl_cat_test_data_x.copy()

    # `gam` is rebound on every iteration; after the loop it holds the last fit.
    gam = LinearGAM(termlist,fit_intercept = True).fit(expl_train_x, expl_train_data_y)
    cur_pred = gam.predict(expl_test_x)
    gam_predictions[i,...] = cur_pred
    gam_metrics[i,...] = np.asarray(get_metrics(expl_test_data_y,cur_pred))

# Summarise the runs into the series plotted later, then report the metrics.
gammean, gamlb, gamup = get_confidence_interval_series(gam_predictions)

present_mean_metrics(gam_metrics)

In [None]:
# GAM forecast plot for the test window [from_day, to_day).
# Number of context points drawn before the forecast.
preceding_points = 24

# The "Pretest period" series is plotted without explicit x values, so matplotlib
# places it at x = 0..len-1; the forecast therefore starts at x = preceding_points.
pred_sp = from_day*forecast_horizon
pred_ep = to_day*forecast_horizon
pred_plot_range = range(preceding_points, preceding_points + pred_ep - pred_sp)

# One slice object reused for every series restricted to the plotted window.
day_window = slice(pred_sp, pred_ep)

fig = plt.figure(figsize=(20,20))

# Mean prediction and the band between the gamlb and gamup series.
plt.plot(pred_plot_range, gammean[day_window], label="Prediction", marker="o")
plt.fill_between(pred_plot_range, gamlb[day_window], gamup[day_window], alpha=0.3)

# Context points drawn before the forecast. With from_day == 0 the slice of
# expl_test_data_y below would be empty, so the tail of the DO_mg column is used.
if from_day==0:
    pretest_values = np.array(do_and_date['DO_mg'])[-preceding_points:]
else:
    pretest_values = expl_test_data_y[pred_sp-preceding_points:pred_sp]
plt.plot(pretest_values, label="Pretest period", marker="o")

# Ground truth over the same window as the prediction.
plt.plot(pred_plot_range, expl_test_data_y[day_window], label="True data", marker="o")

# Axes cosmetics: large fonts, x ticks hidden, the date d1 used as the x label.
plt.grid(axis='x')
plt.legend(fontsize=35)
plt.tick_params(axis="y", labelsize=35)
plt.xlabel(d1.strftime("%d/%m/%Y"),fontsize=35)
plt.ylabel("mg/L",fontsize=35)
plt.ylim(top=7.65)
plt.xticks([])
#plt.savefig("gam.png")
plt.show()

In [None]:
# Partial-dependence plot of the current `gam` for a single feature.
print(feature_names)

# Feature to inspect, looked up by name to get its column / term index.
term = 'Temp_degC'
term_idx = feature_names.index(term)

# Evaluation grid for that term, and the partial dependence with its
# width=0.95 interval evaluated on the grid.
XX = gam.generate_X_grid(term=term_idx)
pdep, confi = gam.partial_dependence(term=term_idx, X=XX, width=0.95)

plt.figure(figsize=(15,10))

# NOTE: `term` is rebound here from the feature name to the GAM's term object.
term = gam.terms[term_idx]
grid_values = XX[:, term.feature]

# Solid line: partial dependence; dashed red: interval bounds.
plt.plot(grid_values, pdep)
plt.plot(grid_values, confi, c='r', ls='--')
plt.title(feature_names[term_idx])
# plt.savefig("gampdplot.png")
plt.show()