# Setup enviorment

In [1]:
# Data reading in Dataframe format and data preprocessing
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Linear algebra operations
import numpy as np

# Machine learning models and preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

# Deep learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, layers, callbacks
from tensorflow.keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional

# Epiweek
from epiweeks import Week, Year

# Date
from datetime import date as convert_to_date

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
embeddings = 'Embeddings/embeddings_vae_1024features_augmented.csv'
labels = 'Tabular_data/Label_CSV_All_Municipality.csv'
Municipalities = ['Medellín', 'Cali', 'Villavicencio', 'Cúcuta', 'Ibagué']

# Read Data

In [4]:
def epiweek_from_date(image_date):
    date = image_date.split('-')
    
    # Get year as int
    year = ''.join(filter(str.isdigit, date[0]))
    year = int(year)
    
    # Get month as int
    month = ''.join(filter(str.isdigit, date[1]))
    month = int(month)
    
    # Get day as int
    day = ''.join(filter(str.isdigit, date[2]))
    day = int(day)
    
    # Get epiweek:
    date = convert_to_date(year, month, day)
    epiweek = str(Week.fromdate(date))
    epiweek = int(epiweek)
    
    return epiweek

### 1. Features

In [5]:
def read_features(path, Municipality = None):
    df = pd.read_csv(path)
    #df.Date = pd.to_datetime(df.Date)
    
    if Municipality:
        print('Obtaining dataframe for the city of Medellin only...')
        df = df[df['Municipality Code'] == Municipality]
        
    df.Date = df.Date.apply(epiweek_from_date)
    
    df = df.sort_values(by=['Date'])
    
    df = df.set_index('Date')
    
    if Municipality:
        df.drop(columns=['Municipality Code'], inplace=True)
        
    df.index.name = None
    return df

In [6]:
features_df = [read_features(path=embeddings, Municipality=municipality) for municipality in Municipalities]
#features_df[0]

Obtaining dataframe for the city of Medellin only...
Obtaining dataframe for the city of Medellin only...
Obtaining dataframe for the city of Medellin only...
Obtaining dataframe for the city of Medellin only...
Obtaining dataframe for the city of Medellin only...


### 2. Labels

In [7]:
def get_epiweek(name):
    
    # Get week
    week = name.split('/')[1]
    week = week.replace('w','')
    week = int(week)
    
    # Year
    year = name.split('/')[0]
    year = int(year)
    
    epiweek = Week(year, week)
    
    epiweek = str(epiweek)
    epiweek = int(epiweek)

    return epiweek

In [8]:
def read_labels(path, Municipality = None):
    df = pd.read_csv(path)
    if df.shape[1] > 678:
        df = pd.concat([df[['Municipality code', 'Municipality']], df.iloc[:,-676:]], axis=1)
        cols = df.iloc[:, 2:].columns
        new_cols = df.iloc[:, 2:].columns.to_series().apply(get_epiweek)
        df = df.rename(columns=dict(zip(cols, new_cols))) 
        
    if 'Label_CSV_All_Municipality' in path:
        # Get Columns
        df = df[['epiweek', 'Municipality code', 'Municipality', 'final_cases_label']]
        
        # change epiweek format
        df.epiweek = df.epiweek.apply(get_epiweek)
        
        # Remove duplicates
        df = df[df.duplicated(['epiweek','Municipality code','Municipality']) == False]
        
        # Replace Increase, decrease, stable to numerical:
        """
        - Stable = 0
        - Increased = 1 
        - Decreased = 2
        """
        df.final_cases_label = df.final_cases_label.replace({'Stable': 0, 'Increased': 1, 'Decreased': 2})
        
        # Create table
        df = df.pivot(index=['Municipality code', 'Municipality'], columns='epiweek', values='final_cases_label')

        # Reset Index:
        df = df.reset_index()
    
    if Municipality:
        df = df[df['Municipality'] == Municipality]
        df.drop(columns=['Municipality code'], inplace=True)
        df.rename(columns={'Municipality': 'Municipality Code'}, inplace=True)
    
        df = df.set_index('Municipality Code')
        df = df.T

        df.columns.name = None
        df.index.name = None
        
        df.columns = ['Labels']
    
    return df

In [9]:
labels_df = [read_labels(path=labels, Municipality=municipality) for municipality in Municipalities]
labels_df = [pd.get_dummies(df['Labels']) for df in labels_df]

#labels_df[0]

# Data preparation

In [10]:
n_labels = labels_df[0].shape[1]

In [11]:
# Merge the two dataframes based on the date values
dengue_df = [features_df[i].merge(labels_df[i], how='inner', left_index=True, right_index=True) for i in range(len(labels_df))]
#dengue_df[1]

### Train Test split

In [12]:
def train_test_split(df, train_percentage = 80):
    # We need a sequence so we can't split randomly
    # To divide into Train and test we have to calculate the train percentage of the dataset:
    size = df.shape[0]
    split = int(size*(train_percentage/100))
    
    """ Train """
    # We will train with 1st percentage % of data and test with the rest
    train_df = df.iloc[:split,:] ## percentage % train
    
    """ Test """
    test_df = df.iloc[split:,:] # 100 - percentage % test
    
    print(f'The train shape is: {train_df.shape}')
    print(f'The test shape is: {test_df.shape}')
    
    return train_df, test_df

In [13]:
train_df = []
test_df = []

for i in range(len(dengue_df)):
    train_df_aux, test_df_aux = train_test_split(dengue_df[i], train_percentage = 80)
    train_df.append(train_df_aux)
    test_df.append(test_df_aux)
#test_df

The train shape is: (0, 1027)
The test shape is: (0, 1027)
The train shape is: (0, 1027)
The test shape is: (0, 1027)
The train shape is: (0, 1027)
The test shape is: (0, 1027)
The train shape is: (0, 1027)
The test shape is: (0, 1027)
The train shape is: (0, 1027)
The test shape is: (0, 1027)


### Normalize features

In [14]:
# Normalize train data and create the scaler
def normalize_train_features(df, feature_range=(-1, 1), n_labels=None):
    
    if n_labels:
        n_features = df.shape[1] - n_labels
    
    scalers = {}
    # For each column in the dataframe
    for i, column in enumerate(df.columns):
        if n_labels:
            if i >= n_features:
                break
        # Get values of the column
        values = df[column].values.reshape(-1,1)
        # Generate a new scaler
        scaler = MinMaxScaler(feature_range=feature_range)
        # Fit the scaler just for that column
        scaled_column = scaler.fit_transform(values)
        # Add the scaled column to the dataframe
        scaled_column = np.reshape(scaled_column, len(scaled_column))
        df[column] = scaled_column
        
        # Save the scaler of the column
        scalers['scaler_' + column] = scaler
        
    print(f' Min values are: ')
    print(df.min())
    print(f' Max values are: ')
    print(df.max())
        
    return df, scalers


""" If you want to use the same scaler used in train, you can use this function"""
def normalize_test_features(df, scalers=None, n_labels=None):
    
    if not scalers:
        raise TypeError("You should provide a list of scalers.")
    
    if n_labels:
        n_features = df.shape[1] - n_labels
    
    for i, column in enumerate(df.columns):
        if n_labels:
            if i >= n_features:
                break
        # Get values of the column
        values = df[column].values.reshape(-1,1)
        # Take the scaler of that column
        scaler = scalers['scaler_' + column]
        # Scale values
        scaled_column = scaler.transform(values)
        scaled_column = np.reshape(scaled_column,len(scaled_column))
        # Add the scaled values to the df
        df[column] = scaled_column
        
    print(f' Min values are: ')
    print(df.min())
    print(f' Max values are: ')
    print(df.max())
        
    return df 

In [15]:
# Merge:
train_df = pd.concat([train_df[0], train_df[1], train_df[2], train_df[3], train_df[4]], keys=Municipalities)
test_df = pd.concat([test_df[0], test_df[1], test_df[2], test_df[3], test_df[4]], keys=Municipalities)

In [16]:
feature_range = (-1, 1)


# Scale train:
train_df, scalers = normalize_train_features(train_df, feature_range=feature_range, n_labels=n_labels)
train_df = [train_df[train_df.index.get_level_values(0) == municipality] for municipality in Municipalities]

#print(f'The scalers are: {scalers}')

train_df[0].head()

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required by MinMaxScaler.

In [None]:
feature_range = (-1, 1)

# Scale test:
test_df = normalize_test_features(test_df, scalers=scalers, n_labels=n_labels)
test_df = [test_df[test_df.index.get_level_values(0) == municipality] for municipality in Municipalities]

test_df[0].head()

### Prepare data for time series supervised learning (function to create sliding window)

In [None]:
# prepare data for time series

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True, no_autoregressive=None):
    
    if no_autoregressive:
        n_in = n_in - 1
    
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        
        if no_autoregressive:
            cols.append(df.shift(i).iloc[:,:-3])
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars-3)]
        else:
            cols.append(df.shift(i))
            names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
            
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
# length of window
days = 10

# length of window
days = 10
no_autoregressive = True

# frame as supervised learning
train = [series_to_supervised(df, n_in=days, no_autoregressive=no_autoregressive) for df in train_df]
test = [series_to_supervised(df, n_in=days, no_autoregressive=no_autoregressive) for df in test_df]

DataFrame(train[0])

### Merge train data

In [None]:
# Merge:
train = pd.concat([train[0], train[1], train[2], train[3], train[4]], keys=Municipalities)
test = pd.concat([test[0], test[1], test[2], test[3], test[4]], keys=Municipalities)

In [None]:
train.shape

### Features and Labels Set

In [None]:
def features_labels_set(timeseries_data, original_df, n_labels):
    
    """ Features """
    # We define the number of features as (features and labels)
    n_features = original_df.shape[1]

    # The features to train the model will be all except the values of the actual week 
    # We can't use other variables in week t because whe need to resample a a 3D Array
    features_set = DataFrame(timeseries_data.values[:,:-n_labels])
    # Convert pandas data frame to np.array to reshape as 3D Array
    features_set = features_set.to_numpy()
    print(f'The shape of the features is {features_set.shape}')
    
    """ Labels """
    # We will use labels in last week 
    labels_set = DataFrame(timeseries_data.values[:,-n_labels:])
    # Convert pandas data frame to np.array
    labels_set = labels_set.to_numpy()
    print(f'The shape of the labels is {labels_set.shape}')
    
    return features_set, labels_set, n_features

In [None]:
# Train features and labels set
print('Train:')
train_X, train_y, n_features = features_labels_set(timeseries_data=train, original_df=dengue_df[0], n_labels=n_labels)

# Test features and labels set
print('Test:')
test_X, test_y, n_features = features_labels_set(timeseries_data=test, original_df=dengue_df[0], n_labels=n_labels)

# Modeling

In [None]:
def reshape_tensor(train_X, test_X, n_features):
    print('The initial shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')
    
    # reshape input to be 3D [samples, timesteps, features]
    train_X = train_X.reshape((train_X.shape[0], days, n_features-n_labels))
    test_X = test_X.reshape((test_X.shape[0], days, n_features-n_labels))
    
    print('-----------------------')
    print('The Final shapes are:')
    print(f'The train shape is {train_X.shape}')
    print(f'The test shape is {test_X.shape}')
    
    return train_X, test_X

In [None]:
# reshape input to be 3D [samples, timesteps, features]
train_X, test_X = reshape_tensor(train_X, test_X, n_features)

# Define the Model

In [None]:
# Set Seed
#tf.random.set_seed(0)

def create_model():
    # design network
    model = Sequential()
    model.add(LSTM(120, dropout=0.1, input_shape=(train_X.shape[1], train_X.shape[2]), return_sequences=True))
    model.add(LSTM(240, dropout=0.1, input_shape=(train_X.shape[1], 120)))
    model.add(Dense(60))
    model.add(Dense(3, activation='softmax'))
    
    # Compile the model:
    opt = keras.optimizers.Adam()
    metrics = [
        tf.keras.metrics.AUC(name='auc', multi_label=True, num_labels=3),
        tf.keras.metrics.CategoricalAccuracy(name='acc'),
        tf.keras.metrics.CategoricalCrossentropy(name='entropy')
    ]
    
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=metrics)

    return model

### Train the model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# EarlyStopping:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=20, 
        verbose=1, mode='auto', restore_best_weights=True)

In [None]:
### Imbalanced data
n_zeros = train_y[:,0].sum()
n_ones = train_y[:,1].sum()
n_twos = train_y[:,2].sum()
n_total = n_zeros + n_ones + n_twos

weights = {0: n_total/n_zeros, 1: n_total/n_ones, 2: n_total/n_twos}
print(f'zeros: {n_zeros}, ones: {n_ones}, twos: {n_twos}, total: {n_total}')
weights

In [None]:
# fit network
def train_model(model, monitor, weights, plot=None, epochs=20):
    if monitor and weights:
        history = model.fit(train_X, train_y, epochs=epochs, batch_size=16, validation_data=(test_X, test_y), verbose=2, shuffle=False, callbacks=[monitor], class_weight=weights)
    elif monitor:
        history = model.fit(train_X, train_y, epochs=epochs, batch_size=16, validation_data=(test_X, test_y), verbose=2, shuffle=False, callbacks=[monitor])
    elif weights:
        history = model.fit(train_X, train_y, epochs=epochs, batch_size=16, validation_data=(test_X, test_y), verbose=2, shuffle=False, class_weight=weights)
    else:
        history = model.fit(train_X, train_y, epochs=epochs, batch_size=16, validation_data=(test_X, test_y), verbose=2, shuffle=False)
    
    if plot:
        # plot history
        plt.plot(history.history['loss'], label='train')
        plt.plot(history.history['val_loss'], label='validation')
        plt.legend()
        plt.show()
        

# Test the model

# AUC

In [None]:
# You can also evaluate or predict on a dataset.
def evaluate(model, verbose = None):
    if verbose:
        print('Evaluate: ')
    result = model.evaluate(test_X, test_y)
    stored_results = {}
    for i, metric in enumerate(model.metrics_names):
        stored_results[metric] = result[i]
        if verbose:
            print(f'{metric}: {result[i]}')
    return stored_results

# Calculate Mean and SD

In [None]:
def calculate_mean_std(weights):
    
    metrics = {
        "auc": [],
        "acc": [],
        "entropy": []
    }
    
    for i in range(3):
        model = create_model()
        train_model(model=model, monitor=monitor, weights=weights)
        stored_results = evaluate(model=model)
        
        for key in metrics.keys():
            metrics[key].append(stored_results[key])
            
    for key in metrics.keys():
        results = metrics[key]
        print(key, f": average={np.average(results):.3f}, std={np.std(results):.3f}")


In [None]:
#calculate_mean_std(weights=None)
calculate_mean_std(weights=weights)

In [None]:
#model = create_model()
#train_model(model=model, monitor=monitor, weights=weights)
#stored_results = evaluate(model=model)
#print(stored_results)