# GRU-DF — 6m observation window
### Prediction Imputation

- For this model we do not mask the training set's missing instances. We impute them.
- We do, however, perform masking when evaluating on the testing data.

# Updates
- Remove all test operations
- mask_list.append((train, test)) went to mask_list.append(train)

# Libraries

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
import tensorflow as tf
import sklearn as sk
from matplotlib import pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold

from keras import callbacks
import keras.layers as L
import keras.models as M
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_absolute_error as mae
import os, shutil

from keras.regularizers import l1_l2

### Basic Functions

In [None]:
def select_columns(col_list, n_months):
    """
    Build the interleaved, time-stepped column names for a set of base columns.

    Each base name in ``col_list`` is expanded with ``time_step_names`` and the
    resulting per-column lists are merged with ``dovetail_names``, so the
    output is grouped by time step first and by base column second.

    col_list : iterable of base column names (strings).
    n_months : forwarded unchanged to ``time_step_names``.

    Returns a flat list of column names.
    """
    per_column = [time_step_names(column, n_months) for column in col_list]
    return dovetail_names(*per_column)
        
def time_step_names(name, n_months):
    """
    Expand one base column name into its time-stepped variants.

    The suffixes are the multiples of 6 from 0 up to and including
    ``n_months * 6``; e.g. ``time_step_names('x', 2)`` gives
    ``['x_0', 'x_6', 'x_12']``.

    Returns a list of ``name + '_<offset>'`` strings.
    """
    last_offset = n_months * 6
    return [name + '_%d' % offset for offset in range(0, last_offset + 1, 6)]

def dovetail_names(*kwargs):
    """
    Interleave several name lists position by position.

    Takes the first element of every list, then the second element of every
    list, and so on. Because the lists are combined with ``zip``, the result
    stops at the end of the shortest list.

    Returns a single flat list.
    """
    return [name for time_slice in zip(*kwargs) for name in time_slice]

def reshape_data(X, y, n_time_steps, n_features = 115):
    """
    Turn flat feature/target frames into 3-D float arrays.

    X : DataFrame whose values are reshaped to
        (-1, n_time_steps, n_features).
    y : DataFrame whose values are reshaped to (-1, n_time_steps, 1).

    The resulting shapes are printed as a progress aid.

    Returns ``(X, y)`` as float NumPy arrays.
    """
    features_3d = X.values.reshape(-1, n_time_steps, n_features)
    targets_3d = y.values.reshape(-1, n_time_steps, 1)
    print("X reshaped is " + str(features_3d.shape))
    print("y reshaped is " + str(targets_3d.shape))

    # Cast after reshaping so both arrays come back as floats.
    targets_3d = targets_3d.astype(float)
    features_3d = features_3d.astype(float)

    print(features_3d.shape, targets_3d.shape)
    return features_3d, targets_3d

def provide_data(X,y,roll,n_features=115):
    """
    Split the first ``roll`` time steps into fully-observed and incomplete rows.

    X : DataFrame of features; only the first ``n_features * roll`` columns
        are used.
    y : DataFrame of targets; only the first ``roll`` columns are used.

    A row counts as "full" when its index label appears among the rows of the
    truncated ``y`` that contain no NaN. Both subsets are reshaped with
    ``reshape_data``.

    Returns ``(X_full, X_nan, y_full, y_nan, mask)`` where ``mask`` is the
    boolean array marking the "full" rows of ``X``.
    """
    X = X.iloc[:, :n_features*roll]
    y = y.iloc[:, :roll]

    # Rows of y with every kept target present.
    y_full = y.dropna()

    # Membership is decided by index label, not by row position.
    mask = X.index.isin(y_full.index.tolist())
    incomplete = ~mask

    X_full = X[mask]
    X_nan = X[incomplete]
    y_nan = y[incomplete]

    print('NaN')
    X_nan, y_nan = reshape_data(X_nan, y_nan, roll)
    print('Full')
    X_full, y_full = reshape_data(X_full, y_full, roll)

    return X_full, X_nan, y_full, y_nan, mask

def provide_all_data(X, y, roll, n_features=115):
    """
    Truncate X and y to the first ``roll`` time steps and reshape them.

    X : DataFrame of features; the first ``n_features * roll`` columns are kept.
    y : DataFrame of targets; the first ``roll`` columns are kept.
    roll : number of time steps to keep.
    n_features : features per time step. This is now an explicit parameter
        (default 115, matching ``provide_data`` and ``reshape_data``) rather
        than a lookup of a module-level ``n_features`` variable, so the
        function no longer depends on a later cell having been run.

    Returns ``(X_all, y_all)`` as produced by ``reshape_data``.
    """
    X = X.iloc[:, :n_features * roll]
    y = y.iloc[:, :roll]

    # Pass n_features through so slicing and reshaping always agree.
    X_all, y_all = reshape_data(X, y, roll, n_features)

    return X_all, y_all
    
def prepare_for_mask(X, y, mask_value = -99):
    """
    Overwrite unusable time steps with ``mask_value``, in place.

    X : float array indexed as [sample, time step, feature]; it must have at
        least 115 features because column 114 is inspected.
    y : float array indexed as [sample, time step, 0].

    A (sample, time step) pair is masked when any of these holds:
      * its target is NaN or already equals ``mask_value``;
      * any of its features already equals ``mask_value``;
      * its feature at index 114 is NaN (114 is the last column for the
        115-feature default used by the other helpers in this file).

    For masked pairs every feature and the target are set to ``mask_value``.
    The arrays are modified in place and also returned as ``(X, y)``.
    """
    targets = y[:, :, 0]

    # One boolean per (sample, time step), built with array operations
    # instead of a Python-level double loop.
    to_mask = np.isnan(targets) | (targets == mask_value)
    to_mask |= (X == mask_value).any(axis=-1)
    to_mask |= np.isnan(X[:, :, 114])

    X[to_mask] = mask_value
    y[to_mask] = mask_value

    return X, y

def round_off_EDSS(number):
    """
    Round a value (or array of values) to the nearest multiple of 0.5.

    Examples: 1.3 -> 1.5, 2.6 -> 2.5, 3.0 -> 3.0, 4.1 -> 4.0.

    The rounding is done by ``np.round`` on the doubled value, so inputs that
    land exactly between two half-steps follow NumPy's round-half-to-even
    rule (e.g. 1.25 -> 1.0).
    """
    doubled = number * 2
    return np.round(doubled) / 2
   
def def_train_name(f_ix):
    """
    Build the four CSV paths for one cross-validation fold.

    f_ix : zero-based fold index; the file names use ``f_ix + 1``.

    Returns ``(X_train_name, X_test_name, y_train_name, y_test_name)``;
    note that both X paths come before both y paths.
    """
    fold_suffix = str(f_ix + 1) + ".csv"

    X_train_name = "data_folds/X_train_f" + fold_suffix
    X_test_name = "data_folds/X_test_f" + fold_suffix
    y_train_name = "data_folds/y_train_f" + fold_suffix
    y_test_name = "data_folds/y_test_f" + fold_suffix

    return X_train_name, X_test_name, y_train_name, y_test_name

### Model Functions

In [None]:
# Weight-file locations for the 6-month-window model.
# NOTE(review): weight_file_path and final_file_path are not referenced by any
# other code visible in this notebook - confirm whether they are still needed.
weight_file_path = "weights/my_model_weights_6m.h5"
final_file_path = "final_weights/my_model_weights_6m.h5"
best_file_path = "best/best_weights_6m.hdf5"

# reduce learning rate on plateau: scale it by 0.2 after 5 epochs without
# val_loss improvement, with min_lr as the floor.
# NOTE(review): rnn_model compiles with optimizer='adam'. If that optimizer
# starts at a learning rate of 0.001 (believed to be the Keras default -
# confirm), the floor equals the starting rate and this callback can never
# lower it. Its patience also equals the early-stopping patience below, so
# both would trigger on the same epoch. Verify this is intended.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                               patience=5, min_lr=0.001)

# stop training if there isn't a significant improvement (at least 0.0001 in
# val_loss) in the course of 5 epochs; the best weights seen are restored.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0001, 
                              patience=5, verbose=0, mode='auto', 
                              baseline=None, restore_best_weights=True)

# model check point: write weights to best_file_path whenever val_loss reaches
# a new minimum.
model_checkpoint = ModelCheckpoint(best_file_path, monitor='val_loss', 
                                   verbose=1, 
                                   save_best_only=True, 
                                   mode='min')

# Callback sets without / with checkpointing.
# NOTE(review): optimal_test_model builds its own local list and checkpoint,
# so these two module-level lists appear unused in the visible code - confirm.
callbacks_list = [reduce_lr, early_stopping]
callbacks_list_final = [reduce_lr, early_stopping, model_checkpoint]

In [None]:
def rnn_model(n_time_steps, n_inputs):
    """
    Build and compile the GRU sequence regressor.

    Architecture, in order: a Masking layer with mask value -99 over inputs
    of shape (n_time_steps, n_inputs), a 128-unit GRU that returns the full
    sequence, Dropout(0.2), and a single-unit Dense layer with ReLU
    activation applied at every time step.

    The model is compiled with the Adam optimizer and mean-absolute-error
    loss, then returned.
    """
    layer_stack = [
        L.Masking(mask_value=-99, input_shape=(n_time_steps, n_inputs)),
        L.GRU(128, return_sequences=True),
        L.Dropout(0.2),
        L.Dense(1, activation='relu'),
    ]

    model = M.Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mean_absolute_error')

    return model

### Eval Functions

In [None]:
def optimal_test_model(X_train, y_train, n_features=115, pot = 1):
    """
    STEP 3, PART A
    Train on the (already imputed) training data and return the model
    carrying the best checkpointed weights.

    X_train, y_train : DataFrames for one fold. The number of time steps is
        taken from the number of columns of ``y_train``.
    n_features, pot : their product is the per-time-step input width handed
        to ``rnn_model``.

    Side effects: the directory ``optimal_weights_6m/`` is deleted if present
    and recreated, and the checkpoint callback writes the best weights
    (lowest val_loss) into it during training.

    Returns a freshly built model with those weights loaded.
    """
    checkpoint_dir = 'optimal_weights_6m/'
    optimal_file_path = "optimal_weights_6m/weights_6m.hdf5"

    # Delete any previous checkpoint so stale weights can never be loaded.
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)
    os.makedirs(checkpoint_dir)

    checkpoint_cb = ModelCheckpoint(optimal_file_path,
                                    monitor='val_loss',
                                    verbose=1,
                                    save_best_only=True, mode='min')
    fit_callbacks = [reduce_lr, early_stopping, checkpoint_cb]

    # Run the model now that the whole feature space has been imputed.
    n_time_steps = len(y_train.columns)
    n_inputs = n_features*pot

    X_train_all, y_train_all = provide_all_data(X_train, y_train, n_time_steps)

    K.clear_session()
    trainer = rnn_model(n_time_steps, n_inputs)
    trainer.fit(X_train_all, y_train_all,
                validation_split=0.2,
                batch_size=32,
                epochs=100,
                shuffle=True,
                callbacks=fit_callbacks)

    # Rebuild a clean model and load the checkpointed weights into it.
    K.clear_session()
    best_model = rnn_model(n_time_steps, n_inputs)
    best_model.load_weights(optimal_file_path)

    return best_model

def evaluate_model(X_train, X_test, y_train, y_test): 
    """
    STEP 3 PART B
    Train on one fold's training data and score the held-out test data.

    The model comes from ``optimal_test_model``. The test frames are reshaped
    with ``provide_all_data`` and passed through ``prepare_for_mask``, so
    unusable time steps carry the value -99. Predictions are rounded with
    ``round_off_EDSS`` and the mean absolute error is computed only over
    positions whose masked target is not -99.

    The score is printed and returned.
    """
    model = optimal_test_model(X_train, y_train)

    n_time_steps = len(y_train.columns)
    X_eval, y_eval = provide_all_data(X_test, y_test, n_time_steps)
    X_eval, y_eval = prepare_for_mask(X_eval, y_eval)

    rounded_pred = round_off_EDSS(model.predict(X_eval))

    # Score only the positions that were not masked out.
    flat_truth = y_eval.reshape(-1)
    observed = np.where(flat_truth != -99)
    res = mae(rounded_pred.reshape(-1)[observed], flat_truth[observed])

    print(res)

    return res

def retrieve_fold(f_ix):
    """
    Load the four CSV files of one cross-validation fold.

    f_ix : zero-based fold index, forwarded to ``def_train_name``.

    Each file is read with its first column as the index.

    Returns ``(X_train, X_test, y_train, y_test)`` as DataFrames.
    """
    # def_train_name yields paths in the order X_train, X_test, y_train, y_test.
    fold_paths = def_train_name(f_ix)

    X_train, X_test, y_train, y_test = [
        pd.read_csv(path, index_col=0) for path in fold_paths
    ]

    return X_train, X_test, y_train, y_test
    
def kfold_eval(f_ix, ahead):
    """
    Run ``evaluate_model`` on folds 0-4 and return the mean score.

    f_ix : accepted but not used by this function; all five folds are
        always evaluated.
    ahead : when greater than 1, every fold is trimmed before evaluation:
        the last ``(ahead - 1) * 115`` feature columns and the first
        ``ahead - 1`` target columns are dropped. Otherwise the fold is
        used as loaded.

    Each fold's score is printed. Returns ``np.mean`` of the five scores.
    """
    less = ahead - 1
    fold_scores = []

    for fold in range(5):
        fold_frames = retrieve_fold(fold)

        if ahead > 1:
            X_train, X_test, y_train, y_test = fold_frames
            n_trimmed = less*115
            fold_frames = (X_train.iloc[:, :-n_trimmed],
                           X_test.iloc[:, :-n_trimmed],
                           y_train.iloc[:, less:],
                           y_test.iloc[:, less:])

        res = evaluate_model(*fold_frames)

        print("Fold #", str(fold+1), ": ", str(res))
        fold_scores.append(res)

    return np.mean(fold_scores)

### Basic Parameters

In [None]:
# Features per time step; provide_all_data reads this module-level name.
n_features = 115
# Multiplier applied to n_features to obtain the model input width.
pot = 1
n_inputs = n_features * pot
# NOTE(review): rnn_model hard-codes 128 GRU units and does not read n_units -
# confirm whether this variable is still needed.
n_units = 128

# Evaluate models 

In [None]:
# ahead=1: mean test MAE over the 5 folds, using each fold untrimmed.
# The first argument (f_ix) is not used by kfold_eval.
result1 = kfold_eval(1,1)

In [None]:
result1

In [None]:
# ahead=2: mean test MAE over the 5 folds; kfold_eval drops the last 1*115
# feature columns and the first target column of every fold.
result2 = kfold_eval(1,2)

In [None]:
result2

In [None]:
# ahead=3: mean test MAE over the 5 folds; kfold_eval drops the last 2*115
# feature columns and the first 2 target columns of every fold.
result3 = kfold_eval(1,3)

In [None]:
result3

In [None]:
# ahead=4: mean test MAE over the 5 folds; kfold_eval drops the last 3*115
# feature columns and the first 3 target columns of every fold.
result4 = kfold_eval(1,4)

In [None]:
result4

In [None]:
# ahead=5: mean test MAE over the 5 folds; kfold_eval drops the last 4*115
# feature columns and the first 4 target columns of every fold.
result5 = kfold_eval(1,5)

In [None]:
result5

In [None]:
# ahead=6: mean test MAE over the 5 folds; kfold_eval drops the last 5*115
# feature columns and the first 5 target columns of every fold.
result6 = kfold_eval(1,6)

In [None]:
result6

In [None]:
# ahead=7: mean test MAE over the 5 folds; kfold_eval drops the last 6*115
# feature columns and the first 6 target columns of every fold.
result7 = kfold_eval(1,7)

In [None]:
result7

In [None]:
# ahead=8: mean test MAE over the 5 folds; kfold_eval drops the last 7*115
# feature columns and the first 7 target columns of every fold.
result8 = kfold_eval(1,8)

In [None]:
result8

In [None]:
# ahead=9: mean test MAE over the 5 folds; kfold_eval drops the last 8*115
# feature columns and the first 8 target columns of every fold.
result9 = kfold_eval(1,9)

In [None]:
result9

In [None]:
# ahead=10: mean test MAE over the 5 folds; kfold_eval drops the last 9*115
# feature columns and the first 9 target columns of every fold.
result10 = kfold_eval(1,10)

In [None]:
result10

In [None]:
# ahead=11: mean test MAE over the 5 folds; kfold_eval drops the last 10*115
# feature columns and the first 10 target columns of every fold.
result11 = kfold_eval(1,11)

In [None]:
result11

In [None]:
# ahead=12: mean test MAE over the 5 folds; kfold_eval drops the last 11*115
# feature columns and the first 11 target columns of every fold.
result12 = kfold_eval(1,12)

In [None]:
result12