# GRU-FF
### One-and-a-half-year observation window

- Train
  - Forward Fill For Target and Feature
  
- Test 
  - Mask Target and Features

### Base Parameters

In [None]:
# Number of feature columns in one time slice. This starting value is
# recomputed from the loaded data in the "Import Data" cell.
# NOTE: several functions below use n_features / pot as default-argument
# values; Python evaluates those defaults once, when each `def` runs.
n_features = 115
# Patient Observation Time: how many consecutive time slices stretch_input
# concatenates into one model input step.
pot = 3

### Functions

In [None]:
from __future__ import print_function
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
import sklearn as sk

import tensorflow as tf
from matplotlib import pyplot as plt

from keras import callbacks
from keras import regularizers
import keras.layers as L
import keras.models as M
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_absolute_error as mae

In [None]:
def select_columns(col_list, n_months):
    """
    Build the interleaved, time-stepped column names for a set of base columns.

    Every name in col_list is expanded with time_step_names(name, n_months);
    the per-column lists are then merged by dovetail_names, so the result is
    ordered by time step first and by position in col_list second.
    """
    stepped_per_column = [time_step_names(base, n_months) for base in col_list]
    return dovetail_names(*stepped_per_column)

def time_step_names(name, n_months):
    """
    Return `name` suffixed with '_0', '_6', ..., '_<6 * n_months>'.

    The suffix advances in steps of 6, so a non-negative n_months yields
    n_months + 1 names.
    """
    stepped = []
    for suffix in range(0, n_months * 6 + 1, 6):
        stepped.append(name + '_%d' % suffix)
    return stepped

def dovetail_names(*kwargs):
    """
    Dovetails column names across time slices according to the order in which
    the lists are passed.

    Takes the first name of every list, then the second name of every list,
    and so on. Stops at the end of the shortest list (zip semantics).
    """
    return [name for time_slice in zip(*kwargs) for name in time_slice]

def stretch_input(Xtr, n_inputs, time_steps, pot = pot):
    """
    Widen every time step of Xtr so it also carries the following slices.

    pot stands for Patient Observation Time. For each subject and each step t
    in range(time_steps), slices t, t+1, ..., t+pot-1 of Xtr are concatenated
    into one vector of length n_inputs * pot. Only the X input needs this.

    Returns a new object-dtype array of shape
    (Xtr.shape[0], time_steps, n_inputs * pot).
    """
    n_subjects = Xtr.shape[0]
    stretched = np.zeros(shape=[n_subjects, time_steps, n_inputs*pot], dtype = object)

    for s in range(n_subjects):
        for t in range(time_steps):
            # Glue together the `pot` consecutive slices that start at step t.
            stretched[s][t] = np.concatenate([Xtr[s][t + k] for k in range(pot)])

    return stretched

def reshape_data(X, y, n_time_steps, pot, n_features = n_features):
    """
    Turn the flat feature/target tables into 3-D float arrays for the model.

    Parameters
    ----------
    X : DataFrame whose columns hold (n_time_steps + pot - 1) consecutive
        slices of n_features columns each.
    y : DataFrame with n_time_steps target columns.
    n_time_steps : number of model time steps.
    pot : number of consecutive slices concatenated per step (see
        stretch_input).
    n_features : number of columns in one slice.

    Returns
    -------
    (X, y) as float arrays of shape (rows, n_time_steps, n_features * pot)
    and (rows, n_time_steps, 1).
    """
    extra_ts = pot - 1

    X_reshaped = X.values.reshape(-1, n_time_steps+extra_ts, n_features)
    y_reshaped = y.values.reshape(-1, n_time_steps, 1)

    if pot > 1:
        X_out = stretch_input(X_reshaped, n_features, n_time_steps, pot)
    else:
        # Without stretching, the reshaped array itself is the model input.
        # (Previously the un-reshaped 2-D DataFrame was returned here.)
        X_out = X_reshaped

    y_out = y_reshaped.astype(float)
    X_out = X_out.astype(float)

    print("X reshaped is " + str(X_out.shape))
    print("y reshaped is " + str(y_out.shape))

    return X_out, y_out

def provide_data(X, y, roll, n_features = n_features, pot = pot):
    """
    Split the data into rows with a complete target and rows with gaps.

    X is cut to the first n_features * (roll + pot - 1) columns and y to its
    first `roll` columns. Rows whose y has no NaN form the "full" set; all
    other rows form the "NaN" set. Both sets are reshaped with reshape_data.

    Returns
    -------
    X_full, X_nan, y_full, y_nan : reshaped arrays for the two sets.
    mask : boolean array over the rows of X, True for rows in the full set.
    """
    X = X.iloc[:,:(n_features*(roll+(pot-1)))]
    y = y.iloc[:,:roll]

    y_full = y.dropna()

    mask = X.index.isin(y_full.index.tolist())

    X_full = X[mask]

    y_nan = y[~mask]
    X_nan = X[~mask]

    # reshape_data requires `pot`; forward n_features too so both functions
    # agree on the slice width instead of relying on separate defaults.
    print('NaN')
    X_nan, y_nan = reshape_data(X_nan, y_nan, roll, pot, n_features)
    print('Full')
    X_full, y_full = reshape_data(X_full, y_full, roll, pot, n_features)

    return X_full, X_nan, y_full, y_nan, mask

def provide_all_data(X, y, roll, n_features = n_features, pot = pot):
    """
    Reshape every row of X and y, without splitting on missing targets.

    X is cut to the first n_features * (roll + pot - 1) columns, which is the
    width reshape_data needs for `roll` time steps at the given `pot`, and y
    to its first `roll` columns.

    Returns the (X_all, y_all) arrays produced by reshape_data.
    """
    # Keep the extra (pot - 1) slices, matching provide_data; reshape_data
    # cannot fold the columns into (roll + pot - 1) slices without them.
    X = X.iloc[:, :n_features*(roll+(pot-1))]
    y = y.iloc[:, :roll]

    # `pot` is a required argument of reshape_data.
    X_all, y_all = reshape_data(X, y, roll, pot, n_features)

    return X_all, y_all
    
def prepare_for_mask(X, y, mask_value = -99):
    """
    Overwrite unusable time steps of X and y with mask_value, in place.

    A time step (i, j) is unusable when its target y[i, j, 0] is NaN or
    already equals mask_value, or when any feature in X[i, j] is NaN or
    equals mask_value. For every such step, all of X[i, j] and y[i, j] are
    set to mask_value.

    Parameters
    ----------
    X : 3-D numeric array (rows, time steps, features).
    y : 3-D numeric array (rows, time steps, 1) with the same first two
        dimensions as X.
    mask_value : sentinel written into unusable steps.

    Returns
    -------
    The same X and y objects that were passed in.
    """
    target = y[:, :, 0]
    # One boolean per (row, time step); computed in a single vectorized pass
    # instead of nested Python loops.
    unusable = (
        np.isnan(target)
        | (target == mask_value)
        | np.isnan(X).any(axis=2)
        | (X == mask_value).any(axis=2)
    )

    X[unusable] = mask_value
    y[unusable] = mask_value

    return X, y

def round_off_EDSS(number):
    """Round a number (or array of numbers) to the closest half integer.

    Examples: 1.3 -> 1.5, 2.6 -> 2.5, 3.0 -> 3.0, 4.1 -> 4.0.

    The value is doubled, rounded with np.round, then halved.
    """
    doubled = number * 2
    nearest_whole = np.round(doubled)
    return nearest_whole / 2

# Keras Callbacks

In [None]:
import os 
import shutil
# reduce learning rate on plateau: multiply it by 0.2 after 5 epochs without
# improvement in the training loss, but never go below min_lr.
# NOTE(review): run_model compiles with optimizer='adam'. If that resolves to
# Keras' default learning rate of 0.001, min_lr=0.001 leaves no room for any
# reduction — confirm the intended floor.
# NOTE(review): patience and monitor are the same as early_stopping below, so
# the two callbacks would trigger together — confirm this is intended.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2,
                               patience=5, min_lr=0.001)

# stop training if there isn't a significant improvement in the course of 5 epochs
# (improvement = training loss dropping by at least min_delta); the best
# weights seen are restored when training stops.
early_stopping = callbacks.EarlyStopping(monitor='loss', min_delta=0.0001, 
                              patience=5, verbose=0, mode='auto', 
                              baseline=None, restore_best_weights=True)

# model_checkpoint = ModelCheckpoint(mcp_file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# Module-level default list; run_model builds its own local list that also
# contains a ModelCheckpoint.
callbacks_list = [reduce_lr, early_stopping]
# callbacks_list_final = [reduce_lr, early_stopping, model_checkpoint]

In [None]:
# Directory that run_model deletes and recreates on every call; it holds the
# checkpoint written during training.
weights_folder = 'weights1y6m/'
# File name of the checkpoint inside weights_folder.
weights_file = 'weights_1y6m.hdf5'

In [None]:
def run_model(X_train, X_test, y_train, y_test, batch_sz = 50, n_epochs = 500, mask_value = -99):
    """
    Train a masked GRU regressor and return its MAE on the test set.

    Parameters
    ----------
    X_train, X_test : 3-D arrays (rows, time steps, inputs) in which unusable
        steps are filled with mask_value.
    y_train, y_test : 3-D arrays (rows, time steps, 1) using the same
        mask_value convention.
    batch_sz : batch size passed to fit.
    n_epochs : maximum number of epochs (early stopping may end sooner).
    mask_value : sentinel skipped by the Masking layer and excluded from the
        test score.

    Returns
    -------
    Mean absolute error between y_test and the predictions rounded to the
    nearest half integer, computed only on test steps whose target is not
    mask_value.

    Side effects: deletes and recreates weights_folder, writes the checkpoint
    file there, clears the Keras session, and prints the score.
    """
    # Start from an empty checkpoint directory so weights from a previous run
    # can never be reloaded by mistake.
    if os.path.exists(weights_folder):
        shutil.rmtree(weights_folder)
    os.makedirs(weights_folder)

    weights_file_path = os.path.join(weights_folder, weights_file)

    # Keep only the weights of the epoch with the lowest validation loss.
    model_checkpoint = ModelCheckpoint(weights_file_path,
                                       monitor='val_loss',
                                       verbose=1,
                                       save_best_only=True, mode='min')

    callbacks_list = [reduce_lr, early_stopping, model_checkpoint]

    K.clear_session()

    n_time_steps = X_train.shape[1]
    n_inputs = X_train.shape[2]

    m = M.Sequential()
    m.add(L.Masking(mask_value=mask_value, input_shape=(n_time_steps, n_inputs)))
    m.add(L.GRU(128, return_sequences=True))
    m.add(L.Dropout(0.2))
    # The input shape is declared once, on the first layer; the former
    # `input_dim = 3` here did not describe this layer's real input.
    m.add(L.Dense(1, activation='relu'))
    m.compile(optimizer = 'adam', loss = 'mean_absolute_error')

    m.fit(X_train, y_train,
          batch_size = batch_sz,
          epochs=n_epochs,
          validation_split = 0.2,
          callbacks = callbacks_list)

    # Score with the best-validation checkpoint rather than the final epoch.
    m.load_weights(weights_file_path)

    y_pred = round_off_EDSS(m.predict(X_test))

    # Masked test steps carry no real target, so leave them out of the score.
    y_true_flat = y_test.reshape(-1)
    observed = y_true_flat != mask_value

    test_result = mae(y_true_flat[observed], y_pred.reshape(-1)[observed])

    print(test_result)

    return test_result

def five_fold(X_og, y_og, ahead, pot=pot):
    """
    Cross-validate run_model and return the mean test MAE over the folds.

    Parameters
    ----------
    X_og : feature DataFrame, n_features columns per time slice.
    y_og : target DataFrame, one column per time step.
    ahead : for ahead > 1, the last (ahead - 1) feature slices of X_og and the
        first (ahead - 1) columns of y_og are dropped before splitting.
    pot : number of consecutive slices concatenated per input step.

    For each fold produced by the global `skf`:
      * train: targets are forward-filled along the columns, columns that
        X shares with y are overwritten with the filled values, and rows that
        still contain a NaN feature are dropped;
      * test: nothing is imputed or dropped; gaps are masked instead;
      * both sets are reshaped, passed through prepare_for_mask and scored
        with run_model.
    """
    less = ahead - 1

    if ahead > 1:
        X_og = X_og.iloc[:, :-(n_features*less)].copy()
        y_og = y_og.iloc[:, less:].copy()

    fold_scores = []

    for train, test in skf.split(X_og.index):

        X_train_og = X_og.iloc[train, :].copy()
        X_test_og = X_og.iloc[test, :].copy()
        y_train_og = y_og.iloc[train, :].copy()
        y_test_og = y_og.iloc[test, :].copy()

        # Training data only: forward-fill targets along the time axis, then
        # copy the filled values into the identically named feature columns.
        # DataFrame.ffill replaces the deprecated fillna(method='ffill').
        y_train_og = y_train_og.ffill(axis=1)
        for col in y_train_og.columns:
            if col in X_train_og.columns:
                X_train_og[col] = y_train_og[col]

        # Keep only training rows with no missing feature. A positional
        # boolean array is used so the selection does not depend on index
        # labels. Test rows are deliberately left untouched.
        complete_rows = X_train_og.notna().all(axis=1).values
        X_train_og, y_train_og = X_train_og[complete_rows], y_train_og[complete_rows]

        n_time_steps = len(y_train_og.columns)

        print('TRAIN:')
        X_train, y_train = reshape_data(X_train_og, y_train_og, n_time_steps, pot, n_features)
        print('TEST:')
        X_test, y_test = reshape_data(X_test_og, y_test_og, n_time_steps, pot, n_features)

        # Prepare for the Masking layer
        X_train, y_train = prepare_for_mask(X_train, y_train)
        X_test, y_test = prepare_for_mask(X_test, y_test)

        res = run_model(X_train, X_test, y_train, y_test)
        print(res)

        fold_scores.append(res)

    return np.mean(fold_scores)

# Import Data

In [None]:
# Load the feature table and the target table; the first CSV column becomes
# the row index.
# NOTE(review): the target file name spells "yearS" with a capital S while the
# feature file uses "years" — confirm it matches the file on disk.
X_og = pd.read_csv('../../data/pre_imputation_data/X_1.5_years|6_months.csv', index_col = 0)
y_og = pd.read_csv('../../data/pre_imputation_data/y_1.5_yearS|6_months.csv', index_col = 0)

#n_time_steps = len(y.columns)
# One time slice = every column up to and including "EDSS_0".
# NOTE(review): functions declared with `n_features = n_features` or
# `pot = pot` as defaults keep the value those globals had when the `def`
# ran, not the values rebound here.
n_features = X_og.columns.tolist().index("EDSS_0")+1
pot = 3
# Width of one model input step once `pot` slices are concatenated.
n_inputs = n_features * pot

# Fold generator that five_fold reads as a global.
# NOTE(review): despite the name `skf`, this is a plain KFold, not a
# StratifiedKFold.
skf = KFold(n_splits=5, shuffle=True, random_state=42)
 
print("The input length of the training data will be", pot, "time slices, separated by 6 month intervals")
print(n_features, "features comprise one time slice")

### Data Processing 

In [None]:
# Collects the mean cross-validated MAE returned by each five_fold call, in
# the order the cells are run.
results = []

In [None]:
# ahead=1: five_fold uses X_og and y_og without dropping any columns.
result1 = five_fold(X_og, y_og, 1)
results.append(result1)

In [None]:
result1 

In [None]:
# ahead=2: five_fold drops the last feature slice of X_og and the first
# column of y_og.
result2 = five_fold(X_og, y_og, 2)
results.append(result2)

In [None]:
result2

In [None]:
# ahead=3: five_fold drops the last 2 feature slices of X_og and the first
# 2 columns of y_og.
result3 = five_fold(X_og, y_og, 3)
results.append(result3)

In [None]:
result3

In [None]:
# ahead=4: five_fold drops the last 3 feature slices of X_og and the first
# 3 columns of y_og.
result4 = five_fold(X_og, y_og, 4)
results.append(result4)

In [None]:
result4

In [None]:
# ahead=5: five_fold drops the last 4 feature slices of X_og and the first
# 4 columns of y_og.
result5 = five_fold(X_og, y_og, 5)
results.append(result5)

In [None]:
result5

In [None]:
# ahead=6: five_fold drops the last 5 feature slices of X_og and the first
# 5 columns of y_og.
result6 = five_fold(X_og, y_og, 6)
results.append(result6)

In [None]:
result6

In [None]:
# ahead=7: five_fold drops the last 6 feature slices of X_og and the first
# 6 columns of y_og.
result7 = five_fold(X_og, y_og, 7)
results.append(result7)

In [None]:
result7

In [None]:
# ahead=8: five_fold drops the last 7 feature slices of X_og and the first
# 7 columns of y_og.
result8 = five_fold(X_og, y_og, 8)
results.append(result8)

In [None]:
result8

In [None]:
# ahead=9: five_fold drops the last 8 feature slices of X_og and the first
# 8 columns of y_og.
result9 = five_fold(X_og, y_og, 9)
results.append(result9)

In [None]:
result9

In [None]:
# ahead=10: five_fold drops the last 9 feature slices of X_og and the first
# 9 columns of y_og.
result10 = five_fold(X_og, y_og, 10)
results.append(result10)

In [None]:
result10

In [None]:
# ahead=11: five_fold drops the last 10 feature slices of X_og and the first
# 10 columns of y_og.
result11 = five_fold(X_og, y_og, 11)
results.append(result11)

In [None]:
result11

In [None]:
# ahead=12: five_fold drops the last 11 feature slices of X_og and the first
# 11 columns of y_og.
result12 = five_fold(X_og, y_og, 12)
results.append(result12)

In [None]:
result12

In [None]:
results