# Setup environment

In [1]:
# Data reading in Dataframe format and data preprocessing
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Linear algebra operations
import numpy as np

# Machine learning models and preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional

# Epiweek
from epiweeks import Week, Year

# Date
from datetime import date as convert_to_date

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/Keras deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Input CSV locations (relative to the notebook's working directory)
features = 'Tabular_data/dengue_tabular.csv'
labels = 'Tabular_data/Label_CSV_All_Municipality.csv'
# Value matched against the 'Municipality' column when loading both files
MUNICIPALITY = 'Medellín'

# Read Data

In [4]:
def get_epiweek(name):
    """Turn a ``<year>/<week>`` label into an integer epiweek.

    The text after the first ``/`` (with every ``w`` removed) is parsed as the
    week number and the text before it as the year. Both are handed to
    ``epiweeks.Week`` and the string form of that object is converted to
    ``int`` and returned.
    """
    parts = name.split('/')

    # Week is parsed first, then the year, matching the original evaluation order
    week_number = int(parts[1].replace('w', ''))
    year_number = int(parts[0])

    return int(str(Week(year_number, week_number)))

In [5]:
def read_labels(path, Municipality = None):
    """Read a dengue CSV and optionally reduce it to one municipality's series.

    Parameters
    ----------
    path : str or path-like
        CSV file to read. Two layouts are handled:

        * more than 678 columns: the 'Municipality code' and 'Municipality'
          columns are kept together with the last 676 columns, whose headers
          are converted with ``get_epiweek``;
        * a path containing 'Label_CSV_All_Municipality': the long table
          (epiweek, Municipality code, Municipality, final_cases_label) is
          de-duplicated, the labels are coded Stable=0 / Increased=1 /
          Decreased=2, and the table is pivoted to one row per municipality
          and one column per epiweek.
    Municipality : str, optional
        When given (and truthy), only the row whose 'Municipality' equals this
        value is kept and the result is transposed.

    Returns
    -------
    pandas.DataFrame
        Without ``Municipality``: the (possibly reshaped) table. With
        ``Municipality``: a single-column frame named 'Cases' whose index is
        the remaining column labels of the table.

    Raises
    ------
    ValueError
        If ``Municipality`` does not match exactly one row.
    """
    df = pd.read_csv(path)

    # NOTE(review): 678 / 676 are fixed column counts of the expected wide file
    # (presumably 2 id columns + 676 weekly columns) — confirm against the data.
    if df.shape[1] > 678:
        df = pd.concat([df[['Municipality code', 'Municipality']], df.iloc[:, -676:]], axis=1)
        week_columns = df.columns[2:]
        df = df.rename(columns={col: get_epiweek(col) for col in week_columns})

    # str() so that pathlib.Path inputs are accepted as well as plain strings
    if 'Label_CSV_All_Municipality' in str(path):
        key_columns = ['epiweek', 'Municipality code', 'Municipality']

        # Work on an explicit copy so the column assignments below never touch
        # a slice of the frame returned by read_csv.
        df = df[key_columns + ['final_cases_label']].copy()

        # Change epiweek format
        df['epiweek'] = df['epiweek'].apply(get_epiweek)

        # Keep the first occurrence of every (epiweek, municipality) pair
        df = df.drop_duplicates(subset=key_columns)

        # Stable = 0, Increased = 1, Decreased = 2; other values are left as-is
        df['final_cases_label'] = df['final_cases_label'].replace(
            {'Stable': 0, 'Increased': 1, 'Decreased': 2}
        )

        # One row per municipality, one column per epiweek
        df = df.pivot(
            index=['Municipality code', 'Municipality'],
            columns='epiweek',
            values='final_cases_label',
        ).reset_index()

    if Municipality:
        selected = df[df['Municipality'] == Municipality]
        if selected.shape[0] != 1:
            # Previously surfaced as an opaque "Length mismatch" ValueError
            raise ValueError(
                f"Expected exactly one row for Municipality={Municipality!r}, "
                f"found {selected.shape[0]}."
            )

        # Transpose so the week labels become the index of a one-column frame
        df = selected.drop(columns=['Municipality code']).set_index('Municipality').T
        df.columns = ['Cases']
        df.columns.name = None
        df.index.name = None

    return df

### 1. Features

In [6]:
# Load the 'Cases' series for the selected municipality from the features CSV
features_df = read_labels(path=features, Municipality=MUNICIPALITY)
# Cast the index to int so it can be matched against the labels index when the
# two frames are merged on their indexes later on
features_df.index = features_df.index.astype(int)
features_df

Unnamed: 0,Cases
200701,1
200702,0
200703,0
200704,0
200705,0
...,...
201948,15
201949,20
201950,30
201951,14


### 2. Labels

In [7]:
# Load the integer-coded label series for the selected municipality
labels_df = read_labels(path=labels, Municipality=MUNICIPALITY)
# Keep a reference to the un-encoded labels; the class-weight cell counts them
labels_df_orig = labels_df
# One-hot encode the label codes: one column per distinct value of 'Cases'
labels_df = pd.get_dummies(labels_df['Cases'])
labels_df

Unnamed: 0,0,1,2
201601,1,0,0
201602,0,1,0
201603,0,0,1
201604,1,0,0
201605,1,0,0
...,...,...,...
201848,1,0,0
201849,0,0,1
201850,0,1,0
201851,1,0,0


# Data preparation

In [8]:
# Number of one-hot label columns; later cells use it to separate labels from features
n_labels = labels_df.shape[1]

In [9]:
# Inner-join features and labels on their indexes, keeping only index values
# present in both frames; label columns end up to the right of the features
dengue_df = features_df.merge(labels_df, how='inner', left_index=True, right_index=True)
dengue_df

Unnamed: 0,Cases,0,1,2
201601,235,1,0,0
201602,274,0,1,0
201603,252,0,0,1
201604,262,1,0,0
201605,274,1,0,0
...,...,...,...,...
201848,28,1,0,0
201849,12,0,0,1
201850,27,0,1,0
201851,17,1,0,0


### Train Test split

In [10]:
def train_test_split(df, train_percentage = 80):
    """Split a frame into leading (train) and trailing (test) rows.

    The split is positional rather than random so that the row order of the
    sequence is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split, already in the desired row order.
    train_percentage : int or float, default 80
        Share of rows, in percent, assigned to the train part. The row count
        is truncated with ``int``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` as independent copies of the input rows.
    """
    n_rows = df.shape[0]
    split = int(n_rows * (train_percentage / 100))

    # Copies, not iloc views: the normalisation step assigns new columns into
    # these frames, which must not act on a slice of the caller's frame.
    train_df = df.iloc[:split, :].copy()
    test_df = df.iloc[split:, :].copy()

    print(f'The train shape is: {train_df.shape}')
    print(f'The test shape is: {test_df.shape}')

    return train_df, test_df

In [11]:
# First 80% of the rows for training, the remaining rows for testing
train_df, test_df = train_test_split(dengue_df, train_percentage = 80)

The train shape is: (124, 4)
The test shape is: (32, 4)


### Normalize features

In [12]:
# Normalize train data and create the scalers
def normalize_train_features(df, feature_range=(-1, 1), n_labels=None):
    """Min-max scale the feature columns and return the fitted scalers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading columns are features and whose trailing
        ``n_labels`` columns are labels. The input frame is not modified.
    feature_range : tuple, default (-1, 1)
        Target range passed to every ``MinMaxScaler``.
    n_labels : int, optional
        Number of trailing columns to leave unscaled. When falsy, every column
        is scaled.

    Returns
    -------
    (pandas.DataFrame, dict)
        The scaled copy of ``df`` and a dict mapping ``'scaler_<column>'`` to
        the scaler fitted on that column.
    """
    # Scale a copy so the caller's frame (possibly a slice) is left untouched
    df = df.copy()

    n_features = df.shape[1] - n_labels if n_labels else df.shape[1]
    # max(..., 0): more labels than columns means "no feature columns"
    feature_columns = df.columns[:max(n_features, 0)]

    scalers = {}
    for column in feature_columns:
        # scikit-learn expects a 2-D (n_samples, 1) array
        values = df[column].values.reshape(-1, 1)
        # One scaler per column, fitted on that column only
        scaler = MinMaxScaler(feature_range=feature_range)
        df[column] = np.reshape(scaler.fit_transform(values), -1)

        # f-string so non-string column labels also produce a valid key
        scalers[f'scaler_{column}'] = scaler

    print(' Min values are: ')
    print(df.min())
    print(' Max values are: ')
    print(df.max())

    return df, scalers


""" If you want to use the same scaler used in train, you can use this function"""
def normalize_test_features(df, scalers=None, n_labels=None):
    """Scale the feature columns with scalers that were fitted beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading columns are features and whose trailing
        ``n_labels`` columns are labels. The input frame is not modified.
    scalers : dict
        Mapping ``'scaler_<column>'`` -> object exposing ``transform``, as
        returned by ``normalize_train_features``.
    n_labels : int, optional
        Number of trailing columns to leave unscaled. When falsy, every column
        is scaled.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df``.

    Raises
    ------
    TypeError
        If ``scalers`` is missing or empty.
    KeyError
        If no scaler is stored for one of the feature columns.
    """
    if not scalers:
        raise TypeError("You should provide a list of scalers.")

    # Scale a copy so the caller's frame (possibly a slice) is left untouched
    df = df.copy()

    n_features = df.shape[1] - n_labels if n_labels else df.shape[1]
    # max(..., 0): more labels than columns means "no feature columns"
    feature_columns = df.columns[:max(n_features, 0)]

    for column in feature_columns:
        # scikit-learn style transformers expect a 2-D (n_samples, 1) array
        values = df[column].values.reshape(-1, 1)
        # Reuse the scaler stored for this column (no re-fitting here)
        scaler = scalers[f'scaler_{column}']
        df[column] = np.reshape(scaler.transform(values), -1)

    print(' Min values are: ')
    print(df.min())
    print(' Max values are: ')
    print(df.max())

    return df

In [13]:
# Target range for the min-max scaling of the feature columns
feature_range = (-1, 1)

# Scale train:
# Fits one scaler per feature column; the trailing n_labels columns are skipped
train_df, scalers = normalize_train_features(train_df, feature_range=feature_range, n_labels=n_labels)

#print(f'The scalers are: {scalers}')

train_df.head()

 Min values are: 
Cases   -1.0
0        0.0
1        0.0
2        0.0
dtype: float64
 Max values are: 
Cases    1.0
0        1.0
1        1.0
2        1.0
dtype: float64


Unnamed: 0,Cases,0,1,2
201601,-0.235294,1,0,0
201602,-0.100346,0,1,0
201603,-0.176471,0,0,1
201604,-0.141869,1,0,0
201605,-0.100346,1,0,0


In [14]:
# Scale test:
# Reuses the scalers fitted on the train split, so scaled test values may fall
# outside feature_range
test_df = normalize_test_features(test_df, scalers=scalers, n_labels=n_labels)
test_df.head()

 Min values are: 
Cases   -1.00692
0        0.00000
1        0.00000
2        0.00000
dtype: float64
 Max values are: 
Cases   -0.906574
0        1.000000
1        1.000000
2        1.000000
dtype: float64


Unnamed: 0,Cases,0,1,2
201821,-0.989619,1,0,0
201822,-0.982699,1,0,0
201823,-0.968858,1,0,0
201824,-0.972318,1,0,0
201825,-0.968858,1,0,0


### Prepare data for time series supervised learning (function to create sliding window)

In [15]:
# prepare data for time series

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, no_autoregressive=None, n_label_columns=None):
    """Frame a (multivariate) series as a sliding-window supervised dataset.

    Parameters
    ----------
    data : list, array-like or pandas.DataFrame
        Observations in row order, one column per variable.
    n_in : int, default 1
        Number of lag steps. When ``no_autoregressive`` is truthy the current
        step counts as one of them, so only ``n_in - 1`` lagged blocks are
        created.
    n_out : int, default 1
        Number of forward blocks (t, t+1, ...), each holding every column.
    dropnan : bool, default True
        Drop the rows that contain NaN introduced by the shifting.
    no_autoregressive : bool, optional
        When truthy, lagged blocks exclude the trailing label columns so past
        labels are not used as inputs.
    n_label_columns : int, optional
        Number of trailing label columns. When omitted, the notebook-level
        ``n_labels`` variable is used, as in earlier versions of this function.

    Returns
    -------
    pandas.DataFrame
        Columns named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = DataFrame(data)
    n_vars = df.shape[1]

    if no_autoregressive:
        if n_label_columns is None:
            # Backward compatibility: fall back to the notebook global
            n_label_columns = n_labels
        n_in = n_in - 1
        # Explicit stop index instead of ':-n' so that 0 label columns keeps
        # every column rather than none
        n_lagged = max(n_vars - n_label_columns, 0)
    else:
        n_lagged = n_vars

    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag).iloc[:, :n_lagged])
        names += [f'var{j + 1}(t-{lag})' for j in range(n_lagged)]

    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        suffix = 't' if step == 0 else f't+{step}'
        names += [f'var{j + 1}({suffix})' for j in range(n_vars)]

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [16]:
# length of window
# NOTE(review): the unit is rows of the frame; the index looks like epiweeks
# (e.g. 201601), so "days" presumably means weeks here — confirm and rename.
days = 10
# True: lagged inputs exclude the label columns (see series_to_supervised)
no_autoregressive = True

# frame as supervised learning
# Train and test are windowed separately, so no window spans the split point
train = series_to_supervised(train_df, n_in=days, no_autoregressive=no_autoregressive)
test = series_to_supervised(test_df, n_in=days, no_autoregressive=no_autoregressive)

#DataFrame(train)

### Features and Labels Set

In [17]:
def features_labels_set(timeseries_data, original_df, n_labels, no_autoregressive):
    """Separate a windowed frame into model inputs and targets.

    The targets are always the last ``n_labels`` columns. The inputs are every
    column before the last ``n_labels`` columns when ``no_autoregressive`` is
    truthy, otherwise every column before the last ``original_df.shape[1]``
    columns (i.e. the whole block of the current step is excluded).

    Returns ``(features_set, labels_set, n_features)`` where both sets are
    NumPy arrays and ``n_features`` is the column count of ``original_df``
    (features and labels together).
    """
    # Column count of the un-windowed frame (features + labels)
    n_features = original_df.shape[1]

    # How many trailing columns are cut off to obtain the inputs
    n_trailing = n_labels if no_autoregressive else n_features

    inputs_frame = DataFrame(timeseries_data.values[:, :-n_trailing])
    features_set = inputs_frame.to_numpy()
    print(f'The shape of the features is {features_set.shape}')

    targets_frame = DataFrame(timeseries_data.values[:, -n_labels:])
    labels_set = targets_frame.to_numpy()
    print(f'The shape of the labels is {labels_set.shape}')

    return features_set, labels_set, n_features

In [18]:
# Train features and labels set
print('Train:')
train_X, train_y, n_features = features_labels_set(timeseries_data=train, original_df=dengue_df, n_labels=n_labels, no_autoregressive=no_autoregressive)

# Test features and labels set
# n_features is assigned again here with the same value (column count of dengue_df)
print('Test:')
test_X, test_y, n_features = features_labels_set(timeseries_data=test, original_df=dengue_df, n_labels=n_labels, no_autoregressive=no_autoregressive)

Train:
The shape of the features is (115, 10)
The shape of the labels is (115, 3)
Test:
The shape of the features is (23, 10)
The shape of the labels is (23, 3)


# Modeling

In [19]:
def reshape_tensor(train_X, test_X, n_features, n_timesteps=None):
    """Reshape 2-D window matrices into 3-D ``[samples, timesteps, features]``.

    Parameters
    ----------
    train_X, test_X : numpy.ndarray
        2-D arrays with ``n_timesteps * n_features`` columns.
    n_features : int
        Number of variables per time step.
    n_timesteps : int, optional
        Window length. When omitted, the notebook-level ``days`` variable is
        used, as in earlier versions of this function.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The reshaped train and test arrays.
    """
    if n_timesteps is None:
        # Backward compatibility: fall back to the notebook global
        n_timesteps = days

    print('The initial shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], n_timesteps, n_features))
    test_X = test_X.reshape((test_X.shape[0], n_timesteps, n_features))

    print('-----------------------')
    print('The Final shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    return train_X, test_X

In [20]:
# reshape input to be 3D [samples, timesteps, features]
if no_autoregressive:
    # Windows hold only the feature columns at every step
    train_X, test_X = reshape_tensor(train_X, test_X, n_features-n_labels)
else:
    # Autoregressive windows hold every column (features and lagged labels) at
    # every step, so the per-step width is the full column count
    train_X, test_X = reshape_tensor(train_X, test_X, n_features)

The initial shapes are:
The train shape is (115, 10)
The test shape is (23, 10)
-----------------------
The Final shapes are:
The train shape is (115, 10, 1)
The test shape is (23, 10, 1)


# Define the Model

In [21]:
# Set Seed
#tf.random.set_seed(0)

def create_model():
    """Build and compile the stacked-LSTM classifier.

    The input shape is taken from the notebook-level ``train_X`` array
    (its second and third dimensions). The network is two LSTM layers
    (120 and 240 units, dropout 0.1), a 60-unit dense layer and a 3-unit
    softmax output. It is compiled with Adam and categorical cross-entropy and
    tracks the metrics named 'auc', 'acc' and 'entropy'.
    """
    timesteps, n_inputs = train_X.shape[1], train_X.shape[2]

    # design network
    model = Sequential([
        # return_sequences=True so the second LSTM receives the full sequence
        LSTM(120, dropout=0.1, input_shape=(timesteps, n_inputs), return_sequences=True),
        LSTM(240, dropout=0.1, input_shape=(timesteps, 120)),
        Dense(60),
        Dense(3, activation='softmax'),
    ])

    # Compile the model:
    model.compile(
        loss='categorical_crossentropy',
        optimizer=keras.optimizers.Adam(),
        metrics=[
            tf.keras.metrics.AUC(name='auc', multi_label=True, num_labels=3),
            tf.keras.metrics.CategoricalAccuracy(name='acc'),
            tf.keras.metrics.CategoricalCrossentropy(name='entropy'),
        ],
    )

    return model

### Train the model

In [22]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping:
# Stop when val_loss has not improved by at least 1e-3 for 20 consecutive
# epochs, then restore the weights of the best epoch.
# NOTE(review): train_model defaults to epochs=20, so with patience=20 this
# callback can hardly trigger before training ends — confirm the intent.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=20, 
        verbose=1, mode='auto', restore_best_weights=True)

In [23]:
### Imbalanced data
# Count how often each integer class code occurs in the un-encoded labels
label_values = labels_df_orig.to_numpy()
n_zeros, n_ones, n_twos = [(label_values == code).sum() for code in (0, 1, 2)]
n_total = n_zeros + n_ones + n_twos

# Weight of a class is total / class count, so rarer classes weigh more
weights = {code: n_total/count for code, count in zip((0, 1, 2), (n_zeros, n_ones, n_twos))}
print(f'zeros: {n_zeros}, ones: {n_ones}, twos: {n_twos}, total: {n_total}')
weights

zeros: 113, ones: 20, twos: 23, total: 156


{0: 1.3805309734513274, 1: 7.8, 2: 6.782608695652174}

In [24]:
# fit network
def train_model(model, monitor, weights, plot=None, epochs=20):
    """Fit ``model`` on the notebook-level train arrays.

    Training uses ``train_X`` / ``train_y`` with ``test_X`` / ``test_y`` as
    validation data, batch size 16 and ``shuffle=False``.

    Parameters
    ----------
    model : compiled Keras model
    monitor : Keras callback or None
        Passed as the only callback when truthy.
    weights : dict or None
        Passed as ``class_weight`` when truthy.
    plot : bool, optional
        When truthy, plot the train and validation loss curves.
    epochs : int, default 20

    Returns
    -------
    The ``History`` object returned by ``model.fit``.
    """
    # Optional arguments are only forwarded when supplied, so Keras defaults
    # apply otherwise
    fit_kwargs = {}
    if monitor:
        fit_kwargs['callbacks'] = [monitor]
    if weights:
        fit_kwargs['class_weight'] = weights

    history = model.fit(
        train_X, train_y,
        epochs=epochs,
        batch_size=16,
        validation_data=(test_X, test_y),
        verbose=2,
        shuffle=False,
        **fit_kwargs,
    )

    if plot:
        # plot history
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='validation')
        plt.legend()
        plt.show()

    return history
        

# Test the model

# AUC

In [25]:
# You can also evaluate or predict on a dataset.
def evaluate(model, verbose = None):
    """Evaluate ``model`` on the notebook-level ``test_X`` / ``test_y`` arrays.

    Parameters
    ----------
    model : compiled Keras model
    verbose : bool, optional
        When truthy, print a header and one ``name: value`` line per metric.

    Returns
    -------
    dict
        Metric name -> value, as reported by ``model.evaluate``.
    """
    if verbose:
        print('Evaluate: ')

    # return_dict=True lets Keras pair names with values itself, instead of
    # relying on model.metrics_names lining up with a positional result list
    stored_results = model.evaluate(test_X, test_y, return_dict=True)

    if verbose:
        for metric, value in stored_results.items():
            print(f'{metric}: {value}')
    return stored_results

# Calculate Mean and SD

In [26]:
def calculate_mean_std(weights, n_runs=3):
    """Train and evaluate fresh models repeatedly and print metric statistics.

    Each run builds a new model with ``create_model``, trains it with the
    notebook-level ``monitor`` callback and the given class ``weights``, and
    evaluates it. The mean and standard deviation of the 'auc', 'acc' and
    'entropy' metrics over all runs are printed.

    Parameters
    ----------
    weights : dict or None
        Class weights forwarded to ``train_model``; ``None`` disables them.
    n_runs : int, default 3
        Number of independent train/evaluate repetitions.
    """
    metrics = {
        "auc": [],
        "acc": [],
        "entropy": []
    }

    for _ in range(n_runs):
        model = create_model()
        train_model(model=model, monitor=monitor, weights=weights)
        stored_results = evaluate(model=model)

        for key in metrics:
            metrics[key].append(stored_results[key])

    for key, results in metrics.items():
        print(key, f": average={np.average(results):.3f}, std={np.std(results):.3f}")


In [27]:
# Baseline: repeated train/evaluate runs without class weighting
calculate_mean_std(weights=None)

Epoch 1/20
8/8 - 3s - loss: 1.0312 - auc: 0.5602 - acc: 0.4696 - entropy: 1.0312 - val_loss: 0.5699 - val_auc: 0.5667 - val_acc: 0.8696 - val_entropy: 0.5699
Epoch 2/20
8/8 - 0s - loss: 0.6910 - auc: 0.7788 - acc: 0.6609 - entropy: 0.6910 - val_loss: 0.5226 - val_auc: 0.5000 - val_acc: 0.8696 - val_entropy: 0.5226
Epoch 3/20
8/8 - 0s - loss: 0.7130 - auc: 0.7501 - acc: 0.6696 - entropy: 0.7130 - val_loss: 0.4925 - val_auc: 0.5333 - val_acc: 0.8696 - val_entropy: 0.4925
Epoch 4/20
8/8 - 0s - loss: 0.6983 - auc: 0.7651 - acc: 0.6783 - entropy: 0.6983 - val_loss: 0.5020 - val_auc: 0.4545 - val_acc: 0.8696 - val_entropy: 0.5020
Epoch 5/20
8/8 - 0s - loss: 0.7013 - auc: 0.7676 - acc: 0.6783 - entropy: 0.7013 - val_loss: 0.5015 - val_auc: 0.4848 - val_acc: 0.8696 - val_entropy: 0.5015
Epoch 6/20
8/8 - 0s - loss: 0.6888 - auc: 0.7863 - acc: 0.6957 - entropy: 0.6888 - val_loss: 0.5136 - val_auc: 0.5000 - val_acc: 0.8696 - val_entropy: 0.5136
Epoch 7/20
8/8 - 0s - loss: 0.7104 - auc: 0.7358 - a

In [28]:
# Same experiment with the class weights computed above
calculate_mean_std(weights=weights)

Epoch 1/20
8/8 - 3s - loss: 3.4792 - auc: 0.5696 - acc: 0.5217 - entropy: 1.0260 - val_loss: 0.6800 - val_auc: 0.4917 - val_acc: 0.8696 - val_entropy: 0.6800
Epoch 2/20
8/8 - 0s - loss: 3.2457 - auc: 0.7666 - acc: 0.6783 - entropy: 0.7476 - val_loss: 0.6208 - val_auc: 0.4750 - val_acc: 0.8696 - val_entropy: 0.6208
Epoch 3/20
8/8 - 0s - loss: 3.1442 - auc: 0.7757 - acc: 0.7043 - entropy: 0.8361 - val_loss: 0.6402 - val_auc: 0.5106 - val_acc: 0.8696 - val_entropy: 0.6402
Epoch 4/20
8/8 - 0s - loss: 3.1129 - auc: 0.7701 - acc: 0.7043 - entropy: 0.8199 - val_loss: 0.5885 - val_auc: 0.5099 - val_acc: 0.8696 - val_entropy: 0.5885
Epoch 5/20
8/8 - 0s - loss: 3.0543 - auc: 0.7858 - acc: 0.6870 - entropy: 0.8641 - val_loss: 0.5984 - val_auc: 0.5444 - val_acc: 0.8696 - val_entropy: 0.5984
Epoch 6/20
8/8 - 0s - loss: 3.1393 - auc: 0.7348 - acc: 0.6783 - entropy: 0.9300 - val_loss: 0.6917 - val_auc: 0.5405 - val_acc: 0.8696 - val_entropy: 0.6917
Epoch 7/20
8/8 - 0s - loss: 3.0356 - auc: 0.7819 - a