# Setup environment

In [1]:
# Data reading in Dataframe format and data preprocessing
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Linear algebra operations
import numpy as np

# Machine learning models and preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional

# Epiweek
from epiweeks import Week, Year

# Date
from datetime import date as convert_to_date

In [2]:
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides pandas SettingWithCopyWarning and library
# deprecation notices, which can mask real problems in the cells below.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Input files (paths are relative to the notebook's working directory).
# features1: precipitation table; features2: temperature table.
# NOTE(review): the temperature file name contains a space and a trailing "2" —
# presumably a duplicated download; confirm this is the intended file.
features1 = 'Tabular_data/precipitation_all.csv'
features2 = 'Tabular_data/temperature_all 2.csv'
# Label table; read_labels() switches on this file name to pick its parsing branch.
labels = 'Tabular_data/Label_CSV_All_Municipality.csv'
# Municipality analysed in this run; must be one of the values in `cities`.
MUNICIPALITY = 'Cali'

In [4]:
# Lookup table: municipality code -> municipality name.
# Codes are kept as strings so leading zeros (e.g. "05001") are preserved;
# get_code() performs the reverse lookup (name -> code).
cities =  {
  "76001":	"Cali",
  "05001":	"Medellín",
  "50001":	"Villavicencio",
  "54001":	"Cúcuta",
  "73001":	"Ibagué",
  "68001":	"Bucaramanga",
  "05360":	"Itagüí",
  "08001":	"Barranquilla",
  "41001":	"Neiva",
  "23001":	"Montería"
}

# Read Data

In [5]:
def epiweek_from_date(image_date):
    """Convert a dash-separated date string into an integer epiweek.

    The string is split on '-'; the first three pieces are read as year,
    month and day after stripping every non-digit character from each piece.
    The resulting calendar date is mapped to its epidemiological week and the
    week's string form is returned as an int (e.g. 200701 in the outputs of
    this notebook).

    Parameters
    ----------
    image_date : str
        Date text with at least three '-'-separated pieces.

    Returns
    -------
    int
        Epiweek identifier.
    """
    def _only_digits(text):
        # Keep just the digit characters, then parse them as one integer.
        return int(''.join(ch for ch in text if ch.isdigit()))

    pieces = image_date.split('-')
    year = _only_digits(pieces[0])
    month = _only_digits(pieces[1])
    day = _only_digits(pieces[2])

    calendar_day = convert_to_date(year, month, day)
    return int(str(Week.fromdate(calendar_day)))

In [6]:
def get_epiweek(name):
    """Convert a '<year>/<week>' label into an integer epiweek.

    The text is split on '/': the second piece is the week (any 'w'
    characters are removed before parsing) and the first piece is the year.
    Presumably labels look like '2016/w01' — verify against the label CSV.

    Parameters
    ----------
    name : str
        Label containing year and week separated by '/'.

    Returns
    -------
    int
        Integer form of str(Week(year, week)).
    """
    fields = name.split('/')

    # Week is parsed first, then the year (same order as before).
    week_number = int(fields[1].replace('w', ''))
    year_number = int(fields[0])

    return int(str(Week(year_number, week_number)))

In [7]:
def read_labels(path, Municipality = None):
    """Read the label CSV and optionally extract one municipality's series.

    Two input layouts are handled:

    * Wide layout (more than 678 columns): the two identifier columns plus
      the last 676 columns are kept and those 676 column names are converted
      with ``get_epiweek``.
    * Long layout (file name contains 'Label_CSV_All_Municipality'): rows of
      (epiweek, municipality, final_cases_label) are de-duplicated, the text
      label is mapped to a number (Stable = 0, Increased = 1, Decreased = 2)
      and the table is pivoted to one row per municipality and one column
      per epiweek.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    Municipality : str, optional
        Municipality name to select. When falsy the full table is returned.

    Returns
    -------
    pandas.DataFrame
        Without ``Municipality``: one row per municipality.
        With ``Municipality``: a single column named 'Cases' indexed by the
        remaining (epiweek) column labels.

    Raises
    ------
    ValueError
        If ``Municipality`` matches no row or more than one row.
    """
    df = pd.read_csv(path)

    if df.shape[1] > 678:
        # Keep the identifier columns and the trailing 676 weekly columns.
        df = pd.concat([df[['Municipality code', 'Municipality']], df.iloc[:, -676:]], axis=1)
        week_columns = df.columns[2:]
        df = df.rename(columns={col: get_epiweek(col) for col in week_columns})

    # str() so that pathlib.Path inputs work with the substring check.
    if 'Label_CSV_All_Municipality' in str(path):
        key_columns = ['epiweek', 'Municipality code', 'Municipality']

        # Explicit copy: the column assignments below must not act on a view.
        df = df[key_columns + ['final_cases_label']].copy()

        # Change epiweek format
        df['epiweek'] = df['epiweek'].apply(get_epiweek)

        # Keep the first occurrence of every (epiweek, municipality) pair.
        df = df.drop_duplicates(subset=key_columns, keep='first')

        # Stable = 0, Increased = 1, Decreased = 2
        df['final_cases_label'] = df['final_cases_label'].replace(
            {'Stable': 0, 'Increased': 1, 'Decreased': 2}
        )

        # One row per municipality, one column per epiweek.
        df = df.pivot(index=['Municipality code', 'Municipality'], columns='epiweek', values='final_cases_label')
        df = df.reset_index()

    if Municipality:
        selected = df.loc[df['Municipality'] == Municipality]
        if selected.empty:
            raise ValueError(f'Municipality {Municipality!r} was not found in {path}')
        if len(selected) > 1:
            raise ValueError(
                f'Municipality {Municipality!r} matches {len(selected)} rows in {path}; expected exactly one'
            )

        # Drop both identifier columns and turn the single row into a column.
        df = selected.drop(columns=['Municipality code', 'Municipality']).T
        df.columns = ['Cases']
        df.index.name = None

    return df

### 1. Features

In [8]:
def get_code(MUNICIPALITY):
    """Return the code whose entry in ``cities`` equals the given name.

    Parameters
    ----------
    MUNICIPALITY : str
        Municipality name exactly as written in ``cities``.

    Returns
    -------
    str or None
        The first matching code, or None when the name is not listed.
    """
    matching_codes = (code for code, city in cities.items() if city == MUNICIPALITY)
    return next(matching_codes, None)

In [9]:
code = get_code(MUNICIPALITY)
if code is None:
    # get_code returns None for names missing from `cities`; fail here with a
    # clear message instead of a TypeError inside the column search.
    raise ValueError(f'Unknown municipality {MUNICIPALITY!r}: it is not listed in `cities`.')


def _find_city_column(columns, code, source):
    """Return the column name containing `code`; raise KeyError if none does."""
    matches = [col for col in columns if code in col]
    if not matches:
        raise KeyError(f'No column containing code {code!r} was found in {source}')
    # The previous loop never stopped early, so the last match was the one used.
    return matches[-1]


# Precipitation (the CSV is read once and reused for the column search)
precipitation_raw = pd.read_csv(features1)
column = _find_city_column(precipitation_raw.columns, code, features1)
precipitation_df = precipitation_raw[['LastDayWeek', column]]

# Temperature
temperature_raw = pd.read_csv(features2)
column = _find_city_column(temperature_raw.columns, code, features2)
temperature_df = temperature_raw[['LastDayWeek', column]]

# Keep only the weeks present in both tables.
features_df = temperature_df.merge(precipitation_df, how='inner', on='LastDayWeek')

# Index the features by integer epiweek so they can be joined with the labels.
features_df['LastDayWeek'] = features_df['LastDayWeek'].apply(epiweek_from_date)

features_df = features_df.set_index('LastDayWeek')
features_df.index.name = None

features_df

Unnamed: 0,temperature_76001,precipitation_76001
200701,26.798994,2.351892
200702,27.445338,4.404588
200703,26.663907,9.273447
200704,28.156202,6.406617
200705,26.317674,0.016323
...,...,...
201949,22.310196,3.819546
201950,23.566368,2.027720
201951,26.581760,9.874010
201952,26.583045,7.529475


### 2. Labels

In [10]:
# Single 'Cases' column (0 / 1 / 2) indexed by epiweek for the chosen municipality.
labels_df = read_labels(path=labels, Municipality=MUNICIPALITY)
# Keep the integer-coded labels; they are used later to compute class weights.
labels_df_orig = labels_df
# One-hot encode the classes. The dtype is pinned because pandas >= 2.0 returns
# boolean dummies by default, which would turn the merged feature/label matrix
# into an object array; 'uint8' is what older pandas produced.
labels_df = pd.get_dummies(labels_df['Cases'], dtype='uint8')
labels_df

Unnamed: 0,0,1,2
201601,1,0,0
201602,0,1,0
201603,0,1,0
201604,0,1,0
201605,0,1,0
...,...,...,...
201848,1,0,0
201849,1,0,0
201850,1,0,0
201851,1,0,0


# Data preparation

In [11]:
# Number of one-hot label columns in labels_df (one per class present in the data).
# Used below to separate the trailing label columns from the feature columns.
n_labels = labels_df.shape[1]

In [12]:
# Join features and one-hot labels on their shared epiweek index.
# The inner join keeps only the weeks that appear in both frames; the label
# columns end up as the last n_labels columns of dengue_df.
dengue_df = features_df.merge(labels_df, how='inner', left_index=True, right_index=True)
dengue_df

Unnamed: 0,temperature_76001,precipitation_76001,0,1,2
201601,23.840761,13.893864,1,0,0
201602,24.552385,1.694592,0,1,0
201603,22.452635,3.308565,0,1,0
201604,28.260753,0.000000,0,1,0
201605,26.890879,0.626154,0,1,0
...,...,...,...,...,...
201848,22.129020,10.406382,1,0,0
201849,24.505446,1.638980,1,0,0
201850,28.386873,4.215898,1,0,0
201851,25.932081,3.315602,1,0,0


### Train Test split

In [13]:
def train_test_split(df, train_percentage = 80):
    """Split a frame chronologically into a leading train part and a trailing test part.

    The rows are a sequence, so the split is positional rather than random:
    the first ``train_percentage`` percent of the rows form the train set and
    the remaining rows form the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in chronological order.
    train_percentage : int or float, default 80
        Percentage (0-100) of rows assigned to the train set.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Independent copies ``(train_df, test_df)``.
    """
    size = df.shape[0]
    # Multiply before dividing: size * (29 / 100) evaluates to 28.999... for
    # size == 100 and would be truncated to 28 instead of 29.
    split = int(size * train_percentage / 100)

    # Copies, not views: later steps assign scaled columns into these frames
    # and must not act on slices of the original data.
    train_df = df.iloc[:split, :].copy()
    test_df = df.iloc[split:, :].copy()

    print(f'The train shape is: {train_df.shape}')
    print(f'The test shape is: {test_df.shape}')

    return train_df, test_df

In [14]:
# Chronological split: the first 80 % of the weeks train the model, the rest test it.
train_df, test_df = train_test_split(dengue_df, train_percentage = 80)

The train shape is: (124, 5)
The test shape is: (32, 5)


### Normalize features

In [15]:
# Normalize train data and create the scaler
def normalize_train_features(df, feature_range=(-1, 1), n_labels=None):
    """Fit one MinMaxScaler per feature column and return the scaled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Train data. When ``n_labels`` is given, the last ``n_labels`` columns
        are treated as labels and left unscaled.
    feature_range : tuple, default (-1, 1)
        Target range passed to every MinMaxScaler.
    n_labels : int, optional
        Number of trailing label columns. When falsy, every column is scaled.

    Returns
    -------
    (pandas.DataFrame, dict)
        A scaled copy of ``df`` and the fitted scalers keyed by
        ``'scaler_' + column``. The input frame is not modified.
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is never modified through chained assignment.
    df = df.copy()

    # Feature columns are the leading ones; labels (if any) are trailing.
    n_features = max(df.shape[1] - n_labels, 0) if n_labels else df.shape[1]

    scalers = {}
    for column in df.columns[:n_features]:
        # Scalers expect a 2D (n_samples, 1) array.
        values = df[column].values.reshape(-1, 1)
        # A dedicated scaler per column, fitted on train values only.
        scaler = MinMaxScaler(feature_range=feature_range)
        df[column] = scaler.fit_transform(values).ravel()

        # Save the scaler so the same transform can be applied to test data.
        scalers['scaler_' + column] = scaler

    print(' Min values are: ')
    print(df.min())
    print(' Max values are: ')
    print(df.max())

    return df, scalers


""" If you want to use the same scaler used in train, you can use this function"""
def normalize_test_features(df, scalers=None, n_labels=None):
    """Scale feature columns with scalers that were already fitted on train data.

    Parameters
    ----------
    df : pandas.DataFrame
        Test data. When ``n_labels`` is given, the last ``n_labels`` columns
        are treated as labels and left unscaled.
    scalers : dict
        Fitted scalers keyed by ``'scaler_' + column``; each must provide a
        ``transform`` method accepting an (n_samples, 1) array.
    n_labels : int, optional
        Number of trailing label columns. When falsy, every column is scaled.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of ``df``. The input frame is not modified.

    Raises
    ------
    TypeError
        If ``scalers`` is missing or empty.
    KeyError
        If a feature column has no entry in ``scalers``.
    """
    if not scalers:
        raise TypeError("You should provide the dict of fitted scalers.")

    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is never modified through chained assignment.
    df = df.copy()

    # Feature columns are the leading ones; labels (if any) are trailing.
    n_features = max(df.shape[1] - n_labels, 0) if n_labels else df.shape[1]

    for column in df.columns[:n_features]:
        # Scalers expect a 2D (n_samples, 1) array.
        values = df[column].values.reshape(-1, 1)
        # Reuse the scaler fitted on the train column of the same name.
        scaler = scalers['scaler_' + column]
        scaled_column = scaler.transform(values)
        df[column] = np.reshape(scaled_column, len(scaled_column))

    print(' Min values are: ')
    print(df.min())
    print(' Max values are: ')
    print(df.max())

    return df

In [16]:
# Target range for every scaled feature column.
feature_range = (-1, 1)

# Scale train:
# Fits one scaler per feature column on the train split only; the fitted
# scalers are reused for the test split in the next cell.
# NOTE(review): train_df is overwritten with its scaled version, so re-running
# this cell refits the scalers on already-scaled data — re-run from the split
# cell instead.
train_df, scalers = normalize_train_features(train_df, feature_range=feature_range, n_labels=n_labels)

#print(f'The scalers are: {scalers}')

train_df.head()

 Min values are: 
temperature_76001     -1.0
precipitation_76001   -1.0
0                      0.0
1                      0.0
2                      0.0
dtype: float64
 Max values are: 
temperature_76001      1.0
precipitation_76001    1.0
0                      1.0
1                      1.0
2                      1.0
dtype: float64


Unnamed: 0,temperature_76001,precipitation_76001,0,1,2
201601,0.30242,0.720816,1,0,0
201602,0.376799,-0.790117,0,1,0
201603,0.157332,-0.59022,0,1,0
201604,0.764398,-1.0,0,1,0
201605,0.621219,-0.922448,0,1,0


In [17]:
# Scale test:
# Applies the scalers fitted on the train split, so test values may fall
# outside feature_range.
# NOTE(review): test_df is overwritten with its scaled version, so re-running
# this cell alone applies the transform a second time.
test_df = normalize_test_features(test_df, scalers=scalers, n_labels=n_labels)
test_df.head()

 Min values are: 
temperature_76001      0.123508
precipitation_76001   -1.000000
0                      1.000000
1                      0.000000
2                      0.000000
dtype: float64
 Max values are: 
temperature_76001      0.892130
precipitation_76001    0.288876
0                      1.000000
1                      0.000000
2                      0.000000
dtype: float64


Unnamed: 0,temperature_76001,precipitation_76001,0,1,2
201821,0.333954,-0.172734,1,0,0
201822,0.359457,-0.351993,1,0,0
201823,0.564495,-0.953846,1,0,0
201824,0.578864,-0.406403,1,0,0
201825,0.60958,-0.897911,1,0,0


### Prepare data for time series supervised learning (function to create sliding window)

In [18]:
# prepare data for time series

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, no_autoregressive=None, n_labels=None):
    """Frame a multivariate series as a sliding-window supervised table.

    Each output row holds the lagged observations (t-n_in ... t-1) followed by
    the observations at t ... t+n_out-1. Columns are named 'var<k>(t-<i>)',
    'var<k>(t)' and 'var<k>(t+<i>)'.

    Parameters
    ----------
    data : pandas.DataFrame, 2D array or list
        Observations in time order, one row per time step.
    n_in : int, default 1
        Number of lag steps. In non-autoregressive mode one lag fewer is
        generated (``n_in - 1``), because the feature columns at time t are
        also part of the output.
    n_out : int, default 1
        Number of forecast steps, starting at t.
    dropnan : bool, default True
        Drop the rows made incomplete by shifting.
    no_autoregressive : bool, optional
        When truthy, the lagged blocks exclude the trailing ``n_labels``
        columns, so past labels are not used as inputs.
    n_labels : int, optional
        Number of trailing label columns. Only used in non-autoregressive
        mode; when omitted, the notebook-level ``n_labels`` variable is used.

    Returns
    -------
    pandas.DataFrame
        The framed table.

    Raises
    ------
    ValueError
        In non-autoregressive mode when ``n_labels`` is neither passed nor
        defined at notebook level.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)

    n_lag_vars = n_vars
    if no_autoregressive:
        if n_labels is None:
            # Backward compatibility: earlier versions read the notebook-level variable.
            n_labels = globals().get('n_labels')
        if n_labels is None:
            raise ValueError('n_labels is required when no_autoregressive is set.')
        n_in = n_in - 1
        # Lagged blocks keep only the leading (feature) columns.
        n_lag_vars = max(n_vars - n_labels, 0)

    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        lagged = df.shift(i)
        if no_autoregressive:
            lagged = lagged.iloc[:, :n_lag_vars]
        cols.append(lagged)
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_lag_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]

    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [19]:
# length of window
# NOTE(review): despite its name, `days` counts rows of the epiweek-indexed
# frames, i.e. presumably weeks — confirm. reshape_tensor() reads this global.
days = 10
# True: past labels are excluded from the inputs (features only).
no_autoregressive = True

# frame as supervised learning
# Train and test are windowed separately, so no window spans the split point.
train = series_to_supervised(train_df, n_in=days, no_autoregressive=no_autoregressive)
test = series_to_supervised(test_df, n_in=days, no_autoregressive=no_autoregressive)

#DataFrame(train)

### Features and Labels Set

In [20]:
def features_labels_set(timeseries_data, original_df, n_labels, no_autoregressive):
    """Split a windowed table into an input matrix and a label matrix.

    Parameters
    ----------
    timeseries_data : pandas.DataFrame
        Windowed table whose last ``n_labels`` columns are the labels at time t.
    original_df : pandas.DataFrame
        Un-windowed frame; its column count (features + labels) is returned
        as ``n_features``.
    n_labels : int
        Number of trailing label columns.
    no_autoregressive : bool
        Truthy: inputs are every column except the trailing labels.
        Falsy: inputs are every column except the whole block for time t
        (the trailing ``n_features`` columns).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, int)
        ``(features_set, labels_set, n_features)``.
    """
    # Column count of the un-windowed data (features and labels together).
    n_features = original_df.shape[1]

    window_matrix = timeseries_data.values

    # How many trailing columns must be kept out of the inputs.
    excluded_tail = n_labels if no_autoregressive else n_features

    # Inputs: everything before the excluded tail, as a 2D array.
    features_set = DataFrame(window_matrix[:, :-excluded_tail]).to_numpy()
    print(f'The shape of the features is {features_set.shape}')

    # Targets: the labels of the last week in each window.
    labels_set = DataFrame(window_matrix[:, -n_labels:]).to_numpy()
    print(f'The shape of the labels is {labels_set.shape}')

    return features_set, labels_set, n_features

In [21]:
# Train features and labels set
# n_features returned here is the column count of dengue_df (features + labels).
print('Train:')
train_X, train_y, n_features = features_labels_set(timeseries_data=train, original_df=dengue_df, n_labels=n_labels, no_autoregressive=no_autoregressive)

# Test features and labels set
print('Test:')
test_X, test_y, n_features = features_labels_set(timeseries_data=test, original_df=dengue_df, n_labels=n_labels, no_autoregressive=no_autoregressive)

Train:
The shape of the features is (115, 20)
The shape of the labels is (115, 3)
Test:
The shape of the features is (23, 20)
The shape of the labels is (23, 3)


# Modeling

In [22]:
def reshape_tensor(train_X, test_X, n_features, timesteps=None):
    """Reshape flat window matrices into 3D [samples, timesteps, features] tensors.

    Parameters
    ----------
    train_X, test_X : numpy.ndarray
        2D arrays with ``timesteps * n_features`` columns each.
    n_features : int
        Number of variables per time step.
    timesteps : int, optional
        Window length. When omitted, the notebook-level ``days`` variable is
        used (the behaviour of earlier versions of this function).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The reshaped ``(train_X, test_X)``.

    Raises
    ------
    ValueError
        If ``timesteps`` is neither passed nor available as ``days``, or if
        the arrays cannot be reshaped to the requested shape.
    """
    if timesteps is None:
        # Backward compatibility: fall back to the notebook-level window length.
        timesteps = globals().get('days')
        if timesteps is None:
            raise ValueError('timesteps was not given and no notebook-level `days` is defined.')

    print('The initial shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], timesteps, n_features))
    test_X = test_X.reshape((test_X.shape[0], timesteps, n_features))

    print('-----------------------')
    print('The Final shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')

    return train_X, test_X

In [23]:
# Variables per time step depend on how the windows were built:
# - non-autoregressive: every step holds the feature columns only;
# - autoregressive: every lagged step holds features AND labels, i.e. all
#   n_features columns of dengue_df (both branches previously passed
#   n_features - n_labels, which cannot match that layout).
if no_autoregressive:
    # reshape input to be 3D [samples, timesteps, features]
    train_X, test_X = reshape_tensor(train_X, test_X, n_features-n_labels)
else:
    # reshape input to be 3D [samples, timesteps, features + labels]
    train_X, test_X = reshape_tensor(train_X, test_X, n_features)

The initial shapes are:
The train shape is (115, 20)
The test shape is (23, 20)
-----------------------
The Final shapes are:
The train shape is (115, 10, 2)
The test shape is (23, 10, 2)


# Define the Model

In [24]:
# Set Seed
#tf.random.set_seed(0)

def create_model():
    """Build and compile the stacked-LSTM classifier.

    The input shape is read from the notebook-level ``train_X`` tensor
    (timesteps, features). The network is two LSTM layers (120 and 240 units,
    dropout 0.1), a 60-unit dense layer without activation and a 3-unit
    softmax output. It is compiled with Adam, categorical cross-entropy and
    three metrics named 'auc', 'acc' and 'entropy'.

    Returns
    -------
    A compiled Sequential model.
    """
    timesteps, n_inputs = train_X.shape[1], train_X.shape[2]

    # design network
    model = Sequential([
        # First recurrent layer returns the full sequence for the next LSTM.
        LSTM(120, dropout=0.1, input_shape=(timesteps, n_inputs), return_sequences=True),
        # NOTE(review): input_shape on a non-first layer is presumably ignored — confirm.
        LSTM(240, dropout=0.1, input_shape=(timesteps, 120)),
        Dense(60),
        Dense(3, activation='softmax'),
    ])

    # Compile the model:
    optimizer = keras.optimizers.Adam()
    tracked_metrics = [
        tf.keras.metrics.AUC(name='auc', multi_label=True, num_labels=3),
        tf.keras.metrics.CategoricalAccuracy(name='acc'),
        tf.keras.metrics.CategoricalCrossentropy(name='entropy'),
    ]
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=tracked_metrics)

    return model

### Train the model

In [25]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping:
# Watches the validation loss and stops once it has not improved by at least
# 1e-3 for 20 consecutive epochs, restoring the best weights seen.
# NOTE(review): train_model() defaults to epochs=20, so with patience=20 this
# callback presumably never fires in the runs below — confirm.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=20, 
        verbose=1, mode='auto', restore_best_weights=True)

In [26]:
### Imbalanced data
# Count how often each integer class (0, 1, 2) occurs in the un-encoded labels.
label_values = labels_df_orig.to_numpy()
n_zeros, n_ones, n_twos = [(label_values == cls).sum() for cls in (0, 1, 2)]
n_total = n_zeros + n_ones + n_twos

# Weight of a class = total count / class count, so rarer classes weigh more.
weights = {
    cls: n_total / count
    for cls, count in zip((0, 1, 2), (n_zeros, n_ones, n_twos))
}
print(f'zeros: {n_zeros}, ones: {n_ones}, twos: {n_twos}, total: {n_total}')
weights

zeros: 127, ones: 12, twos: 17, total: 156


{0: 1.2283464566929134, 1: 13.0, 2: 9.176470588235293}

In [27]:
# fit network
def train_model(model, monitor, weights, plot=None, epochs=20):
    """Fit ``model`` on the notebook-level train tensors.

    Training uses ``train_X`` / ``train_y`` with batch size 16 and no
    shuffling, and validates on ``test_X`` / ``test_y`` after every epoch.

    Parameters
    ----------
    model : compiled Keras model
        Model to train (modified in place by ``fit``).
    monitor : Keras callback or None
        Passed to ``fit`` as the only callback when truthy.
    weights : dict or None
        Class weights passed to ``fit`` as ``class_weight`` when truthy.
    plot : bool, optional
        When truthy, plot train and validation loss per epoch.
    epochs : int, default 20
        Maximum number of epochs.

    Returns
    -------
    The History object returned by ``model.fit``.
    """
    # Arguments shared by every run; optional ones are added only when given.
    # NOTE(review): the validation data is the test split, so any callback
    # monitoring val_loss selects weights using test data — confirm intended.
    fit_kwargs = dict(
        epochs=epochs,
        batch_size=16,
        validation_data=(test_X, test_y),
        verbose=2,
        shuffle=False,
    )
    if monitor:
        fit_kwargs['callbacks'] = [monitor]
    if weights:
        fit_kwargs['class_weight'] = weights

    history = model.fit(train_X, train_y, **fit_kwargs)

    if plot:
        # plot history
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='validation')
        plt.legend()
        plt.show()

    return history
        

# Test the model

# AUC

In [28]:
# You can also evaluate or predict on a dataset.
def evaluate(model, verbose = None):
    """Evaluate ``model`` on the notebook-level test tensors.

    Parameters
    ----------
    model : compiled Keras model
        Model to evaluate on ``test_X`` / ``test_y``.
    verbose : bool, optional
        When truthy, print a header and every metric value.

    Returns
    -------
    dict
        Metric name (from ``model.metrics_names``) -> value.
    """
    if verbose:
        print('Evaluate: ')

    scores = model.evaluate(test_X, test_y)

    # Pair every reported metric name with the score at the same position.
    stored_results = {}
    for position, metric_name in enumerate(model.metrics_names):
        score = scores[position]
        stored_results[metric_name] = score
        if verbose:
            print(f'{metric_name}: {score}')

    return stored_results

# Calculate Mean and SD

In [29]:
def calculate_mean_std(weights):
    """Train and evaluate three fresh models and print metric mean / std.

    For each of three runs a new model is created, trained with the
    notebook-level ``monitor`` callback and the given class weights, and
    evaluated. The 'auc', 'acc' and 'entropy' values are collected and their
    average and standard deviation are printed.

    Parameters
    ----------
    weights : dict or None
        Class weights forwarded to ``train_model``.
    """
    tracked = ("auc", "acc", "entropy")
    metrics = {name: [] for name in tracked}

    for _ in range(3):
        model = create_model()
        train_model(model=model, monitor=monitor, weights=weights)
        stored_results = evaluate(model=model)

        # Record this run's value for every tracked metric.
        for name, collected in metrics.items():
            collected.append(stored_results[name])

    for name, collected in metrics.items():
        print(name, f": average={np.average(collected):.3f}, std={np.std(collected):.3f}")


In [30]:
# Baseline: three training runs without class weights.
calculate_mean_std(weights=None)

Epoch 1/20
8/8 - 3s - loss: 0.7380 - auc: 0.8211 - acc: 0.8087 - entropy: 0.7380 - val_loss: 0.0014 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.0014
Epoch 2/20
8/8 - 0s - loss: 1.1145 - auc: 0.1937 - acc: 0.8087 - entropy: 1.1145 - val_loss: 0.2925 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.2925
Epoch 3/20
8/8 - 0s - loss: 0.7015 - auc: 0.4622 - acc: 0.8087 - entropy: 0.7015 - val_loss: 0.3136 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.3136
Epoch 4/20
8/8 - 0s - loss: 0.6243 - auc: 0.6353 - acc: 0.8087 - entropy: 0.6243 - val_loss: 0.1384 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.1384
Epoch 5/20
8/8 - 0s - loss: 0.6097 - auc: 0.5859 - acc: 0.8087 - entropy: 0.6097 - val_loss: 0.0837 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.0837
Epoch 6/20
8/8 - 0s - loss: 0.6653 - auc: 0.3928 - acc: 0.8087 - entropy: 0.6653 - val_loss: 0.1269 - val_auc: 0.0000e+00 - val_acc: 1.0000 - val_entropy: 0.1269
Epoch 7/20
8/8 - 0s - loss: 

In [31]:
# Same experiment with the class weights computed above.
calculate_mean_std(weights=weights)

Epoch 1/20
8/8 - 5s - loss: 3.3759 - auc: 0.3516 - acc: 0.0783 - entropy: 1.2726 - val_loss: 1.1521 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 1.1521
Epoch 2/20
8/8 - 0s - loss: 3.1201 - auc: 0.6440 - acc: 0.1391 - entropy: 1.0865 - val_loss: 0.9897 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 0.9897
Epoch 3/20
8/8 - 0s - loss: 3.1222 - auc: 0.5446 - acc: 0.1478 - entropy: 1.0288 - val_loss: 0.9398 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 0.9398
Epoch 4/20
8/8 - 0s - loss: 3.1070 - auc: 0.5213 - acc: 0.1652 - entropy: 1.0127 - val_loss: 0.9529 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 0.9529
Epoch 5/20
8/8 - 0s - loss: 3.1042 - auc: 0.5211 - acc: 0.2174 - entropy: 1.0197 - val_loss: 0.9644 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 0.9644
Epoch 6/20
8/8 - 0s - loss: 3.0817 - auc: 0.5074 - acc: 0.2000 - entropy: 1.0224 - val_loss: 0.9752 - val_auc: 0.0000e+00 - val_acc: 0.0000e+00 - val_entropy: 0.9752
Epoc