# Set up environment

In [1]:
# Data reading in Dataframe format and data preprocessing
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Linear algebra operations
import numpy as np

# Machine learning models and preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
import tensorflow.keras.backend as K

# Epiweek
from epiweeks import Week, Year

# Date
from datetime import date as convert_to_date

In [2]:
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so deprecation and chained-assignment warnings raised by later cells stay hidden.
warnings.filterwarnings('ignore')

In [3]:
# Relative paths to the two input CSV files: per-date embedding features and the tabular labels.
embeddings = 'Embeddings/embeddings_contrastive_learning_1024features.csv'
labels = 'Tabular_data/dengue_tabular.csv'
# Municipality to model. The same value is compared against the 'Municipality Code'
# column of the features file and the 'Municipality' column of the labels file.
Municipality='Ibagué'

# Read Data

In [4]:
def epiweek_from_date(image_date):
    """Convert a '-'-separated date string into an integer epidemiological week.

    The string is split on '-'; the first three pieces are read as year, month
    and day, with every non-digit character in each piece discarded before the
    integer conversion. The resulting calendar date is mapped to its epiweek and
    the week's string form is returned as an int (e.g. 201544).

    Parameters
    ----------
    image_date : str
        Date text whose first three '-'-separated pieces hold year, month, day.

    Returns
    -------
    int
        Integer form of ``str(Week.fromdate(...))``.
    """
    pieces = image_date.split('-')

    def only_digits_as_int(text):
        # Keep the digit characters only, then parse what is left.
        return int(''.join(ch for ch in text if ch.isdigit()))

    # Evaluated in order: year, month, day.
    year, month, day = [only_digits_as_int(pieces[position]) for position in range(3)]

    calendar_day = convert_to_date(year, month, day)
    return int(str(Week.fromdate(calendar_day)))

### 1. Features

In [5]:
def read_features(path, Municipality = None):
    """Load the embedding features CSV and index it by epiweek.

    Parameters
    ----------
    path : str
        Path of the CSV file. It must contain a 'Date' column and, when a
        municipality is requested, a 'Municipality Code' column.
    Municipality : optional
        When truthy, only rows whose 'Municipality Code' equals this value are
        kept and that column is dropped from the result.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by epiweek, with the (unnamed) index holding the integer
        epiweek computed by ``epiweek_from_date``.
    """
    df = pd.read_csv(path)

    if Municipality:
        # Report the municipality actually requested instead of a fixed city name.
        print(f'Obtaining dataframe for the city of {Municipality} only...')
        # Copy so later column assignments act on an independent frame,
        # not on a filtered view of the full table.
        df = df[df['Municipality Code'] == Municipality].copy()

    # Replace the date strings with integer epiweeks and order rows chronologically.
    df['Date'] = df['Date'].apply(epiweek_from_date)
    df = df.sort_values(by=['Date']).set_index('Date')

    if Municipality:
        # The column is constant after filtering, so it carries no information.
        df = df.drop(columns=['Municipality Code'])

    df.index.name = None
    return df

In [6]:
# Load the embedding features of the configured municipality, indexed by epiweek,
# and display the resulting frame.
features_df = read_features(path=embeddings, Municipality=Municipality)
features_df

Obtaining dataframe for the city of Medellin only...


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
201544,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,60.914200,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944
201545,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,60.914200,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944
201546,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,60.914200,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944
201547,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,60.914200,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944
201548,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,60.914200,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201848,106.780690,0.0,32.777737,18.379940,105.80126,94.180880,0.0,51.702873,37.520320,0.0,...,108.435520,88.691740,0.0,0.0,0.0,0.0,0.0,0.0,33.226227,71.349020
201849,40.236237,0.0,12.437395,6.630134,39.60790,34.530903,0.0,18.765072,14.190913,0.0,...,38.860176,31.293947,0.0,0.0,0.0,0.0,0.0,0.0,11.149647,26.095455
201850,110.027596,0.0,33.642426,18.998648,108.86624,97.187550,0.0,53.270990,38.915524,0.0,...,112.165210,91.725960,0.0,0.0,0.0,0.0,0.0,0.0,34.233818,73.739680
201851,76.441780,0.0,23.284721,12.792800,75.25716,66.796850,0.0,36.122963,27.116320,0.0,...,76.345436,62.035313,0.0,0.0,0.0,0.0,0.0,0.0,22.364012,50.547592


### 2. Labels

In [7]:
def get_epiweek(name):
    """Turn a '<year>/w<week>' label into an integer epiweek.

    The text is split on '/'; the second piece is the week (any 'w' characters
    are removed before parsing) and the first piece is the year. The pair is
    wrapped in a ``Week`` and its string form is returned as an int.

    Parameters
    ----------
    name : str
        Label with the year before the '/' and the week after it.

    Returns
    -------
    int
        Integer form of ``str(Week(year, week))``.
    """
    pieces = name.split('/')

    # The week is parsed first, then the year.
    week_number = int(pieces[1].replace('w', ''))
    year_number = int(pieces[0])

    return int(str(Week(year_number, week_number)))

In [8]:
def read_labels(path, Municipality = None):
    """Load the labels CSV and, optionally, reduce it to one municipality's series.

    Three independent steps are applied in order:

    1. Wide files (more than 678 columns): keep the two municipality columns plus
       the last 676 columns, and rename those columns with ``get_epiweek``.
    2. Files whose path contains 'Label_CSV_All_Municipality': keep four columns,
       convert ``epiweek`` with ``get_epiweek``, drop duplicated rows, encode the
       text label as 0/1/2 and pivot to one row per municipality.
    3. When ``Municipality`` is truthy: keep the rows whose 'Municipality' equals
       it, transpose, and return a single 'Labels' column with a numeric index.

    Parameters
    ----------
    path : str
        Path of the CSV file; it is also inspected as text in step 2.
    Municipality : optional
        Value matched against the 'Municipality' column.

    Returns
    -------
    pandas.DataFrame
        With ``Municipality`` set, a one-column frame named 'Labels'; otherwise
        the frame as left by steps 1 and 2.
    """
    df = pd.read_csv(path)
    # NOTE(review): 678 and 676 are magic numbers; presumably 676 weekly columns
    # plus 2 identifier columns - TODO confirm against the data file.
    if df.shape[1] > 678:
        df = pd.concat([df[['Municipality code', 'Municipality']], df.iloc[:,-676:]], axis=1)
        # Every column after the two identifiers is renamed to its integer epiweek.
        cols = df.iloc[:, 2:].columns
        new_cols = df.iloc[:, 2:].columns.to_series().apply(get_epiweek)
        df = df.rename(columns=dict(zip(cols, new_cols))) 
        
    if 'Label_CSV_All_Municipality' in path:
        # Get Columns
        df = df[['epiweek', 'Municipality code', 'Municipality', 'final_cases_label']]
        
        # change epiweek format
        df.epiweek = df.epiweek.apply(get_epiweek)
        
        # Remove duplicates (the first occurrence of each key is kept)
        df = df[df.duplicated(['epiweek','Municipality code','Municipality']) == False]
        
        # Replace Increase, decrease, stable to numerical:
        """
        - Stable = 0
        - Increased = 1 
        - Decreased = 2
        """
        df.final_cases_label = df.final_cases_label.replace({'Stable': 0, 'Increased': 1, 'Decreased': 2})
        
        # Create table: one row per municipality, one column per epiweek
        df = df.pivot(index=['Municipality code', 'Municipality'], columns='epiweek', values='final_cases_label')

        # Reset Index:
        df = df.reset_index()
    
    if Municipality:
        df = df[df['Municipality'] == Municipality]
        df.drop(columns=['Municipality code'], inplace=True)
        df.rename(columns={'Municipality': 'Municipality Code'}, inplace=True)
    
        # After the transpose the epiweek column labels become the row index.
        df = df.set_index('Municipality Code')
        df = df.T

        df.columns.name = None
        df.index.name = None
        
        # Assigning a single name requires exactly one column, i.e. exactly one matching row above.
        df.columns = ['Labels']
        
        df.index = pd.to_numeric(df.index)
    
    return df

In [9]:
# Load the label series of the configured municipality: a single 'Labels'
# column indexed by integer epiweek.
labels_df = read_labels(path=labels, Municipality=Municipality)

# Data preparation

In [10]:
# Merge the two dataframes based on the date values.
# Inner join on the epiweek index: only weeks present in both frames are kept,
# and 'Labels' ends up as the last column.
dengue_df = features_df.merge(labels_df, how='inner', left_index=True, right_index=True)
dengue_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Labels
201544,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944,132
201545,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944,115
201546,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944,140
201547,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944,112
201548,61.768227,0.0,18.816113,10.067684,60.65894,53.635525,0.0,28.799873,21.906380,0.0,...,49.309160,0.0,0.0,0.0,0.0,0.0,0.0,17.391779,40.532944,112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201848,106.780690,0.0,32.777737,18.379940,105.80126,94.180880,0.0,51.702873,37.520320,0.0,...,88.691740,0.0,0.0,0.0,0.0,0.0,0.0,33.226227,71.349020,22
201849,40.236237,0.0,12.437395,6.630134,39.60790,34.530903,0.0,18.765072,14.190913,0.0,...,31.293947,0.0,0.0,0.0,0.0,0.0,0.0,11.149647,26.095455,9
201850,110.027596,0.0,33.642426,18.998648,108.86624,97.187550,0.0,53.270990,38.915524,0.0,...,91.725960,0.0,0.0,0.0,0.0,0.0,0.0,34.233818,73.739680,12
201851,76.441780,0.0,23.284721,12.792800,75.25716,66.796850,0.0,36.122963,27.116320,0.0,...,62.035313,0.0,0.0,0.0,0.0,0.0,0.0,22.364012,50.547592,16


### Train Test split

In [11]:
def train_test_split(df, train_percentage = 80):
    """Split a frame into leading (train) and trailing (test) rows.

    The rows are a sequence, so they are not shuffled: the first
    ``train_percentage`` percent of the rows (rounded down) form the training
    set and the remaining rows form the test set. Both shapes are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose row order is the split order.
    train_percentage : number, default 80
        Share of rows, in percent, assigned to the training set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``.
    """
    n_rows = df.shape[0]
    cut = int(n_rows * (train_percentage / 100))

    # Rows before the cut train the model, rows from the cut onward test it.
    train_df, test_df = df.iloc[:cut, :], df.iloc[cut:, :]

    print(f'The train shape is: {train_df.shape}')
    print(f'The test shape is: {test_df.shape}')

    return train_df, test_df

In [12]:
# Chronological split: the first 80% of the rows train the model, the rest test it.
train_df, test_df = train_test_split(dengue_df, train_percentage = 80)

The train shape is: (132, 1025)
The test shape is: (33, 1025)


### Normalize features

In [13]:
# Normalize train data and create the scaler
def normalize_train_features(df, feature_range=(-1, 1), scaler=True):
    """Min-max scale each column independently and return the fitted scalers.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. It is not modified; a scaled copy is returned.
    feature_range : tuple, default (-1, 1)
        Target interval passed to every ``MinMaxScaler``.
    scaler : bool, default True
        When falsy, the last column is left unscaled and gets no scaler.

    Returns
    -------
    tuple
        ``(scaled_df, scalers)`` where ``scalers`` maps ``'scaler_<column>'``
        to the ``MinMaxScaler`` fitted on that column.
    """
    # Work on a copy: the input is typically a slice of a larger frame, and
    # writing into a slice is unreliable and alters the caller's data.
    df = df.copy()

    scalers = {}
    last_position = len(df.columns) - 1

    for i, column in enumerate(df.columns):
        # The flag is checked against its own name. The fitted scaler uses a
        # different name, so it cannot overwrite the flag and cause the last
        # column to be scaled after all.
        if not scaler and i == last_position:
            continue

        # scikit-learn expects a 2-D array: one column, one row per sample.
        values = df[column].values.reshape(-1, 1)
        column_scaler = MinMaxScaler(feature_range=feature_range)
        # Fit on this column only and store the result flattened back to 1-D.
        df[column] = np.ravel(column_scaler.fit_transform(values))

        # Keep the fitted scaler so the same mapping can be reused or inverted later.
        scalers[f'scaler_{column}'] = column_scaler

    print(f' Min values are: ')
    print(df.min())
    print(f' Max values are: ')
    print(df.max())

    return df, scalers


def normalize_test_features(df, scalers=None, scaler=True):
    """Scale a frame with scalers that were already fitted on the training data.

    Use this to apply exactly the same per-column mapping as the one used in
    training; nothing is refitted here.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scale. It is not modified; a scaled copy is returned.
    scalers : dict
        Maps ``'scaler_<column>'`` to an object exposing ``transform``.
    scaler : bool, default True
        When falsy, the last column is returned unscaled.

    Returns
    -------
    pandas.DataFrame
        The scaled copy of ``df``.

    Raises
    ------
    TypeError
        If ``scalers`` is missing or empty.
    """
    if not scalers:
        raise TypeError("You should provide a list of scalers.")

    # Work on a copy so the caller's frame (often a slice) is left untouched.
    df = df.copy()
    last_position = len(df.columns) - 1

    for i, column in enumerate(df.columns):
        # The flag keeps its own name: the per-column scaler below must not
        # overwrite it, otherwise the last column would be scaled regardless.
        if not scaler and i == last_position:
            continue

        # The scalers work on a 2-D array with a single column.
        values = df[column].values.reshape(-1, 1)
        column_scaler = scalers[f'scaler_{column}']
        df[column] = np.ravel(column_scaler.transform(values))

    print(f' Min values are: ')
    print(df.min())
    print(f' Max values are: ')
    print(df.max())

    return df

In [14]:
# Target interval of the min-max scaling.
feature_range = (-1, 1)

# Scale train: one scaler is fitted per column and kept in `scalers`
# (keys are 'scaler_<column>') for reuse on the test split.
train_df, scalers = normalize_train_features(train_df, feature_range=feature_range)

#print(f'The scalers are: {scalers}')

train_df.head()

 Min values are: 
0        -1.0
1        -1.0
2        -1.0
3        -1.0
4        -1.0
         ... 
1020     -1.0
1021     -1.0
1022     -1.0
1023     -1.0
Labels   -1.0
Length: 1025, dtype: float64
 Max values are: 
0         1.0
1        -1.0
2         1.0
3         1.0
4         1.0
         ... 
1020     -1.0
1021     -1.0
1022      1.0
1023      1.0
Labels    1.0
Length: 1025, dtype: float64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Labels
201544,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.412044,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.518853,-0.388338,0.643312
201545,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.412044,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.518853,-0.388338,0.426752
201546,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.412044,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.518853,-0.388338,0.745223
201547,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.412044,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.518853,-0.388338,0.388535
201548,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.412044,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.518853,-0.388338,0.388535


In [15]:
# Scale test with the scalers fitted on the training split (nothing is refitted),
# so test values may fall slightly outside feature_range.
test_df = normalize_test_features(test_df, scalers=scalers)
test_df.head()

 Min values are: 
0        -0.934354
1        -1.000000
2        -0.921211
3        -1.002218
4        -0.920316
            ...   
1020     -1.000000
1021     -1.000000
1022     -1.058290
1023     -0.932219
Labels   -0.974522
Length: 1025, dtype: float64
 Max values are: 
0         0.337786
1        -1.000000
2         0.325328
3         0.280800
4         0.325462
            ...   
1020     -1.000000
1021     -1.000000
1022      0.256944
1023      0.318877
Labels   -0.643312
Length: 1025, dtype: float64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Labels
201820,-0.0708,-1.0,-0.088457,-0.162871,-0.091236,-0.096924,-1.0,-0.146969,-0.075745,-1.0,...,-0.124182,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.217552,-0.103601,-0.88535
201821,-0.65507,-1.0,-0.66787,-0.785196,-0.680751,-0.69057,-1.0,-0.750535,-0.670693,-1.0,...,-0.728873,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.841691,-0.700669,-0.88535
201822,-0.034512,-1.0,-0.052641,-0.123214,-0.054633,-0.059581,-1.0,-0.109145,-0.037768,-1.0,...,-0.086426,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.178473,-0.06669,-0.745223
201823,-0.172582,-1.0,-0.180206,-0.215014,-0.18524,-0.187097,-1.0,-0.214373,-0.180567,-1.0,...,-0.201294,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.240884,-0.190443,-0.808917
201824,-0.934354,-1.0,-0.921211,-0.841198,-0.895381,-0.935771,-1.0,-0.896906,-0.893931,-1.0,...,-0.892583,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.715161,-0.918507,-0.643312


### Prepare data for time series supervised learning (function to create sliding window)

In [16]:
# prepare data for time series

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, no_autoregressive=None):
    """Frame a multivariate series as a sliding-window supervised dataset.

    Each output row holds ``n_in`` lagged copies of the inputs (oldest lag
    first) followed by ``n_out`` copies at t, t+1, ... Columns are named
    ``var<k>(t-<lag>)``, ``var<k>(t)`` and ``var<k>(t+<step>)``.

    Parameters
    ----------
    data : list or 2-D array-like
        Observations, one row per time step. A plain list counts as one variable.
    n_in : int, default 1
        Number of lagged steps used as input.
    n_out : int, default 1
        Number of steps, starting at t, appended after the lags.
    dropnan : bool, default True
        Drop the rows that contain NaN introduced by the shifts.
    no_autoregressive : optional
        When truthy, the last column is left out of every lagged block; it
        still appears in the blocks at t and later.

    Returns
    -------
    pandas.DataFrame
        The windowed frame.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    frame = DataFrame(data)

    # Lagged blocks carry one variable less when the last column is excluded.
    lag_width = n_vars - 1 if no_autoregressive else n_vars

    blocks = []
    labels = []

    # Input sequence: (t-n_in, ..., t-1)
    for lag in range(n_in, 0, -1):
        lagged = frame.shift(lag)
        if no_autoregressive:
            lagged = lagged.iloc[:, :-1]
        blocks.append(lagged)
        labels.extend('var%d(t-%d)' % (k + 1, lag) for k in range(lag_width))

    # Forecast sequence: (t, t+1, ..., t+n_out-1)
    for step in range(n_out):
        blocks.append(frame.shift(-step))
        suffix = '(t)' if step == 0 else '(t+%d)' % step
        labels.extend('var%d%s' % (k + 1, suffix) for k in range(n_vars))

    # Put the blocks side by side and attach the generated names.
    agg = concat(blocks, axis=1)
    agg.columns = labels

    # Rows at the edges have no complete window.
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [17]:
# length of window, counted in rows of the frame
# NOTE(review): the index holds epiweeks, so each step is presumably one week despite the name - confirm.
days = 10
# True: the lagged input blocks leave out the last column ('Labels').
no_autoregressive = True

# frame as supervised learning
train = series_to_supervised(train_df, n_in=days, no_autoregressive=no_autoregressive)
test = series_to_supervised(test_df, n_in=days, no_autoregressive=no_autoregressive)

DataFrame(train)

Unnamed: 0,var1(t-10),var2(t-10),var3(t-10),var4(t-10),var5(t-10),var6(t-10),var7(t-10),var8(t-10),var9(t-10),var10(t-10),...,var1016(t),var1017(t),var1018(t),var1019(t),var1020(t),var1021(t),var1022(t),var1023(t),var1024(t),var1025(t)
201602,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,0.444635,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.399208,0.454903,0.541401
201603,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.977271,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.941639,-0.983859,0.987261
201604,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,0.014250,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.063945,0.031219,0.515924
201605,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.295909,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.400062,-0.272705,0.707006
201606,-0.349643,-1.0,-0.367727,-0.466723,-0.372761,-0.380103,-1.0,-0.438428,-0.359381,-1.0,...,-0.295909,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.400062,-0.272705,0.477707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201815,-0.113176,-1.0,-0.131262,-0.209810,-0.134218,-0.140931,-1.0,-0.194730,-0.118822,-1.0,...,-0.094383,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.176988,-0.075835,-0.923567
201816,0.524182,-1.0,0.514909,0.482473,0.514881,0.512282,-1.0,0.488085,0.520630,-1.0,...,-0.918008,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.922319,-0.909929,-0.974522
201817,0.439760,-1.0,0.430217,0.391984,0.429475,0.425559,-1.0,0.398230,0.435679,-1.0,...,-0.338668,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.436189,-0.316921,-0.885350
201818,0.062085,-1.0,0.048184,-0.017455,0.045933,0.039646,-1.0,-0.002730,0.052554,-1.0,...,0.011623,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.059368,0.027197,-0.910828


### Features and Labels Set

In [18]:
def features_labels_set(timeseries_data, original_df):
    """Split a windowed frame into a feature matrix and a label column.

    The windowed frame ends with one block of ``n_features`` columns for the
    current step, where ``n_features`` is the column count of ``original_df``.
    Everything before that block becomes the features; the very last column
    becomes the label. Both shapes are printed.

    Parameters
    ----------
    timeseries_data : pandas.DataFrame
        Sliding-window frame (lagged blocks followed by the block at t).
    original_df : pandas.DataFrame
        Frame the windows were built from; only its column count is used.

    Returns
    -------
    tuple
        ``(features_set, labels_set, n_features)``: a 2-D array, an ``(n, 1)``
        array and the column count of ``original_df``.
    """
    # Width of the trailing block that belongs to the current step.
    n_features = original_df.shape[1]
    window_values = timeseries_data.values

    # Features: every column except the current-step block. It stays 2-D here.
    features_set = DataFrame(window_values[:, :-n_features]).to_numpy()
    print(f'The shape of the features is {features_set.shape}')

    # Labels: the last column of the current-step block, as a column vector.
    labels_set = DataFrame(window_values[:, -1]).to_numpy()
    print(f'The shape of the labels is {labels_set.shape}')

    return features_set, labels_set, n_features

In [19]:
# n_features is taken from dengue_df, i.e. the column count including 'Labels'.

# Train features and labels set
print('Train:')
train_X, train_y, n_features = features_labels_set(timeseries_data=train, original_df=dengue_df)

# Test features and labels set
print('Test:')
test_X, test_y, n_features = features_labels_set(timeseries_data=test, original_df=dengue_df)

Train:
The shape of the features is (122, 10240)
The shape of the labels is (122, 1)
Test:
The shape of the features is (23, 10240)
The shape of the labels is (23, 1)


# Modeling

In [20]:
def reshape_tensor(train_X, test_X, n_features, no_autoregressive=None, timesteps=None):
    """Reshape flat window matrices into 3-D tensors [samples, timesteps, features].

    Parameters
    ----------
    train_X, test_X : numpy.ndarray
        2-D arrays with one flattened window per row.
    n_features : int
        Column count of the original frame.
    no_autoregressive : optional
        When truthy, each time step holds ``n_features - 1`` values (the last
        column is not part of the windows); otherwise ``n_features``.
    timesteps : int, optional
        Window length. When omitted it is derived from the width of
        ``train_X``, so no module-level variable is needed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_X, test_X)`` reshaped to 3-D.

    Raises
    ------
    ValueError
        If there are no features per step, or the row width is not a whole
        number of time steps.
    """
    print('The initial shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    # Number of values that make up a single time step.
    per_step = n_features - 1 if no_autoregressive else n_features
    if per_step <= 0:
        raise ValueError(f'Each time step needs at least one feature, got {per_step}.')

    if timesteps is None:
        # Derive the window length from the data rather than from a global.
        timesteps, leftover = divmod(train_X.shape[1], per_step)
        if leftover:
            raise ValueError(
                f'Row width {train_X.shape[1]} is not a multiple of {per_step} features per step.'
            )

    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], timesteps, per_step))
    test_X = test_X.reshape((test_X.shape[0], timesteps, per_step))

    print('-----------------------')
    print('The Final shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    return train_X, test_X

In [21]:
# Reshape the flat window rows to 3D [samples, timesteps, features], the layout
# passed to the LSTM layers as input_shape=(timesteps, features).
train_X, test_X = reshape_tensor(train_X, test_X, n_features, no_autoregressive)

The initial shapes are:
The train shape is (122, 10240)
The test shape is (23, 10240)
-----------------------
The Final shapes are:
The train shape is (122, 10, 1024)
The test shape is (23, 10, 1024)


# Define the Model

In [22]:
# Set Seed
#tf.random.set_seed(0)

def smape(y_true, y_pred):
    """Element-wise symmetric absolute percentage error as a Keras metric.

    Computes ``2 * |y_pred - y_true| / max(|y_true| + |y_pred| + eps, 0.5 + eps)``
    with ``eps = 0.1``. The floor on the denominator keeps the ratio bounded
    when both values are close to zero. The result is a ratio, not multiplied
    by 100, and is not reduced here.

    Parameters
    ----------
    y_true, y_pred : tensors
        Targets and predictions.

    Returns
    -------
    tensor
        The per-element error.
    """
    epsilon = 0.1
    magnitude = K.abs(y_true) + K.abs(y_pred) + epsilon
    denominator = K.maximum(magnitude, 0.5 + epsilon)
    absolute_error = K.abs(y_pred - y_true)
    return absolute_error / denominator * 2.0


def create_model():
    """Build and compile the stacked-LSTM regressor.

    Architecture: LSTM(120, returning sequences) -> LSTM(240) -> Dense(60) ->
    Dense(1), with a dropout of 0.1 in both LSTM layers. The input shape is read
    from the module-level ``train_X`` tensor (timesteps, features). The model is
    compiled with MSE loss, a default Adam optimizer and the RMSE, MAPE and
    ``smape`` metrics.

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    timesteps, n_inputs = train_X.shape[1], train_X.shape[2]

    # Design the network as an ordered list of layers.
    model = Sequential([
        LSTM(120, dropout=0.1, input_shape=(timesteps, n_inputs), return_sequences=True),
        LSTM(240, dropout=0.1, input_shape=(timesteps, 120)),
        Dense(60),
        Dense(1),
    ])

    optimizer = keras.optimizers.Adam()

    # Metrics reported next to the loss during training and validation.
    tracked_metrics = [
        tf.keras.metrics.RootMeanSquaredError(name='rmse'),
        tf.keras.metrics.MeanAbsolutePercentageError(name='mape'),
        smape,
    ]

    model.compile(loss='mse', optimizer=optimizer, metrics=tracked_metrics)
    return model

### Train the model

In [23]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping: stop when val_loss has not improved by at least 1e-3 for
# 20 consecutive epochs, then restore the weights of the best epoch.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=20, 
        verbose=1, mode='auto', restore_best_weights=True)

In [24]:
# fit network
def train_model(model, monitor, plot=None, epochs=50):
    """Fit ``model`` on the module-level training tensors.

    Training uses ``train_X``/``train_y`` with a batch size of 16 and no
    shuffling (the samples are ordered in time). ``test_X``/``test_y`` are
    passed as validation data.

    NOTE(review): because the test split doubles as validation data, an early
    stopping callback selects weights using the data that is later reported on.

    Parameters
    ----------
    model : keras.Model
        Compiled model to train.
    monitor : keras callback or None
        Optional callback (e.g. early stopping); skipped when falsy.
    plot : optional
        When truthy, plot the training and validation loss curves.
    epochs : int, default 50
        Maximum number of epochs.

    Returns
    -------
    keras.callbacks.History
        The object returned by ``model.fit``.
    """
    # A single fit call: the callback list is the only thing that varies.
    active_callbacks = [monitor] if monitor else None
    history = model.fit(
        train_X,
        train_y,
        epochs=epochs,
        batch_size=16,
        validation_data=(test_X, test_y),
        verbose=2,
        shuffle=False,
        callbacks=active_callbacks,
    )

    if plot:
        # plot history
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='validation')
        plt.legend()
        plt.show()

    return history

# Test the model

In [25]:
from math import sqrt
from numpy import concatenate

def test_model(model, test_X, test_y, scaler, rnn = None):
    
    # If model is a classical machine learning model and test_X is a 3D tensor, then convert to 2D
    if not rnn and (len(test_X.shape) == 3):
        test_X = test_X.reshape((test_X.shape[0], -1))
    
    # do the prediction
    yhat = model.predict(test_X)
    
    # Invert scaling for forecast
    # Inverse Scaler
    
    # Predicted
    if not rnn:
        yhat = yhat.reshape(-1, 1)
        
    if not scaler:
        return yhat, test_y
    
    inv_yhat = scaler.inverse_transform(yhat)
    
    # Real:
    inv_y = scaler.inverse_transform(test_y)
    
    return inv_yhat, inv_y

### Mean Absolute Percentage Error (MAPE)

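$$
MAPE = \displaystyle\frac{100\%}{n}\sum_{t=1}^{n}\left|\frac{y_t-\hat{y}_t}{y_t}\right|
$$

where $y_t$ is the actual value and $\hat{y}_t$ is the predicted value at step $t$. MAPE is undefined when the test data contains zeros (division by zero), so other metrics are explored below.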

In [26]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Compute, print and return the mean absolute percentage error.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of matching shape.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    # Error of each prediction relative to the actual value.
    relative_errors = np.abs((y_true - y_pred) / y_true)
    mape = np.mean(relative_errors) * 100
    print('Test MAPE: %.3f' % mape)
    return mape

### Symmetric Mean Absolute Percentage Error (sMAPE)

$$
sMAPE = \displaystyle\frac{100\%}{n}\sum_{t=1}^{n} \frac{2\,|\hat{y}_t-y_t|}{|y_t|+|\hat{y}_t|}
$$

In [27]:
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Compute, print and return the symmetric mean absolute percentage error.

    Every term is ``2 * |y_pred - y_true| / (|y_true| + |y_pred|) * 100``; the
    terms are summed and divided by ``len(y_true)``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of matching shape.

    Returns
    -------
    float
        The sMAPE in percent.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    doubled_error = 2 * np.abs(y_pred - y_true)
    magnitude = np.abs(y_true) + np.abs(y_pred)
    terms = doubled_error / magnitude * 100

    smape = 1/len(y_true) * np.sum(terms)
    print('Test sMAPE: %.3f' % smape)
    return smape

### Root Mean Squared Error (RMSE)
$$
RMSE = \sqrt{(\frac{1}{n})\sum_{i=1}^{n}(x_i-y_i)^{2}}
$$

In [28]:
from sklearn.metrics import mean_squared_error

def root_mean_squared_error(y_true, y_pred):
    """Compute, print and return the root mean squared error.

    The value is computed with NumPy, so it does not depend on the ``squared``
    argument of scikit-learn's ``mean_squared_error``, which recent releases
    deprecate and remove. For 2-D input the RMSE is taken per output column and
    the columns are averaged uniformly.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values; both must have the same shape.

    Returns
    -------
    float
        The RMSE.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)

    # Fail loudly instead of letting NumPy broadcast mismatched inputs.
    if y_true.shape != y_pred.shape:
        raise ValueError(f'Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}.')
    if y_true.size == 0:
        raise ValueError('RMSE is undefined for empty input.')

    # Mean over samples (axis 0) gives one MSE per output; average their roots.
    per_output_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    rmse = float(np.mean(per_output_rmse))
    print('Test RMSE: %.3f' % rmse)
    return rmse

In [29]:
def plot_predictions(inv_y, inv_yhat, model_name = ''):
    """Plot the actual and the predicted series on one chart.

    Parameters
    ----------
    inv_y : numpy.ndarray
        Real target values of the test set.
    inv_yhat : numpy.ndarray
        Predicted target values.
    model_name : str, default ''
        Name shown in the figure title.
    """
    # Both series are drawn as single columns; the prediction is reshaped first.
    predicted_column = inv_yhat.reshape(len(inv_yhat), 1)
    actual_column = inv_y.reshape(len(inv_y), 1)

    plt.plot(actual_column, label = 'actual')
    plt.plot(predicted_column, label = 'predicted')
    plt.legend(loc="upper left")

    plt.suptitle(f'Time-Series Prediction with {model_name}')
    plt.show()

In [30]:
def evaluate(model, test_X, test_y, scaler):
    """Score a trained recurrent model on the test set in original label units.

    Parameters
    ----------
    model : keras.Model
        Trained model; it is evaluated with ``rnn=True``.
    test_X : array
        Test inputs shaped [samples, timesteps, features].
    test_y : array
        Scaled test targets.
    scaler : object with ``inverse_transform`` or falsy
        Scaler of the label column, used to undo the scaling before scoring.

    Returns
    -------
    dict
        Keys 'mape', 'smape' and 'rmse' with the corresponding scores.
    """
    # Use the scaler that was passed in, not a module-level variable.
    inv_yhat_lstm, inv_y_lstm = test_model(
        model=model, test_X=test_X, test_y=test_y, scaler=scaler, rnn=True
    )

    # The metrics are computed (and printed) in this order.
    return {
        'mape': mean_absolute_percentage_error(inv_y_lstm, inv_yhat_lstm),
        'smape': symmetric_mean_absolute_percentage_error(inv_y_lstm, inv_yhat_lstm),
        'rmse': root_mean_squared_error(inv_y_lstm, inv_yhat_lstm),
    }

# Calculate Mean and SD

In [31]:
# With LSTM:
#print(f'The scalers are: {scalers.keys()}')
# Scaler fitted on the training 'Labels' column; it is used to map scaled
# predictions and targets back to the original label units.
y_scaler = scalers['scaler_Labels']

def calculate_mean_std(n_runs=10):
    """Train and evaluate the model repeatedly and print the mean and std of each metric.

    Every run builds a fresh model with ``create_model``, trains it with
    ``train_model`` and scores it with ``evaluate``. The module-level
    ``monitor``, ``test_X``, ``test_y`` and ``y_scaler`` are used.

    Parameters
    ----------
    n_runs : int, default 10
        Number of independent train/evaluate repetitions.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1.')

    metrics = {
        "rmse": [],
        "mape": [],
        "smape": []
    }

    for _ in range(n_runs):
        # A new model per run, so the spread reflects the random initialisation.
        model = create_model()
        train_model(model=model, monitor=monitor)
        stored_results = evaluate(model, test_X, test_y, y_scaler)
        print(stored_results)

        for key in metrics:
            metrics[key].append(stored_results[key])

    for key, results in metrics.items():
        print(key, f": average={np.average(results):.3f}, std={np.std(results):.3f}")


In [32]:
# Run the repeated train/evaluate experiment; per-epoch logs and per-run metrics are printed.
calculate_mean_std()

Epoch 1/50
8/8 - 3s - loss: 1.3981 - rmse: 1.1824 - mape: 121.7983 - smape: 0.8671 - val_loss: 0.0537 - val_rmse: 0.2318 - val_mape: 25.0512 - val_smape: 0.2708
Epoch 2/50
8/8 - 0s - loss: 0.3362 - rmse: 0.5799 - mape: 71.4442 - smape: 0.8343 - val_loss: 0.0108 - val_rmse: 0.1040 - val_mape: 10.7108 - val_smape: 0.0950
Epoch 3/50
8/8 - 0s - loss: 0.1910 - rmse: 0.4370 - mape: 47.6743 - smape: 0.2871 - val_loss: 0.0095 - val_rmse: 0.0976 - val_mape: 9.9280 - val_smape: 0.0993
Epoch 4/50
8/8 - 0s - loss: 0.1829 - rmse: 0.4276 - mape: 50.1839 - smape: 0.4070 - val_loss: 0.0437 - val_rmse: 0.2089 - val_mape: 22.5688 - val_smape: 0.2406
Epoch 5/50
8/8 - 0s - loss: 0.1217 - rmse: 0.3488 - mape: 36.2427 - smape: 0.2867 - val_loss: 0.0037 - val_rmse: 0.0607 - val_mape: 5.8492 - val_smape: 0.0568
Epoch 6/50
8/8 - 0s - loss: 0.1501 - rmse: 0.3875 - mape: 41.1339 - smape: 0.2817 - val_loss: 0.0111 - val_rmse: 0.1056 - val_mape: 10.8278 - val_smape: 0.1088
Epoch 7/50
8/8 - 0s - loss: 0.1484 - rmse