In [192]:
import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline

In [193]:
# from jupyterthemes import jtplot
# jtplot.style()

In [194]:
# Load the raw CSVs. fillna('NaN') replaces every missing value, in any column,
# with the literal string 'NaN'; the per-region filter further down compares
# Province_State with `==`, which would never match a real NaN.
# The training file's 'Id' column is dropped; the test frame keeps all its columns.
train_data = pd.read_csv('data/in/train.csv').fillna('NaN').drop(columns=['Id'])
test_data = pd.read_csv('data/in/test.csv').fillna('NaN')

In [195]:
# Date boundaries used to split the training file.
# NOTE(review): start_training is not referenced by any cell visible here.
start_training = pd.to_datetime('2020-01-19')
# Last date (inclusive) of the training split; later training rows are validation.
train_up_to = pd.to_datetime('2020-03-25')
# NOTE(review): presumably the end of the public test window; unused in the visible cells.
public_test_up_to = pd.to_datetime('2020-04-08')

In [196]:
# Parse the Date column so it can be compared with the Timestamp cut-offs.
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Training split: rows dated on or before train_up_to. The literal True operand
# makes logical_and a pass-through, so no lower date bound is applied here.
public_mask = np.logical_and(True, train_data['Date'] <= train_up_to)
# Validation split: rows dated strictly after train_up_to.
validation_mask = train_up_to < train_data['Date']

# train_data = train_data
# NOTE: the trailing backslash below joins the following commented-out line to
# this statement, so as written it only assigns the two target columns back to
# themselves; the log1p transform is disabled. Do not insert anything between
# the backslash line and the comment line that follows it.
train_data.loc[:, ('ConfirmedCases', 'Fatalities')] = train_data[['ConfirmedCases', 'Fatalities']] \
#                                                      .apply(lambda x: np.log1p(x))
# train_data.replace([np.inf, -np.inf], 0, inplace=True)

In [197]:
# Tag the training-file rows with sentinel ForecastId values so the splits can
# be recovered after the frames are concatenated:
#   -1 -> training rows, 0 -> validation rows.
# get_masks() treats ForecastId > 0 as test rows.
train_data.loc[public_mask, 'ForecastId'] = -1
train_data.loc[validation_mask, 'ForecastId'] = 0

# Placeholder targets so the test frame has the same target columns as train.
test_data['ConfirmedCases'] = 0.0
test_data['Fatalities'] = 0.0

In [198]:
# Stack train and test rows into one frame; ignore_index=True renumbers the
# rows 0..n-1 instead of keeping each frame's own index.
data = pd.concat([train_data, test_data], ignore_index=True)

In [199]:
# Map each country to the array of distinct Province_State values that occur
# on its rows (countries and provinces both in order of first appearance).
country_column = data['Country_Region']
states = {
    country: data.loc[country_column == country, 'Province_State'].unique()
    for country in country_column.unique()
}

In [200]:
def get_masks(df):
    """Return boolean row masks for the train / validation / test splits.

    The split is encoded in the ``ForecastId`` column: ``-1`` marks training
    rows, ``0`` marks validation rows and any positive value marks test rows.

    The masks are computed from the frame passed in as ``df`` (the previous
    version ignored the argument and always read the global ``data``).

    Args:
        df: DataFrame with a numeric ``ForecastId`` column.

    Returns:
        Tuple ``(train_mask, validation_mask, test_mask)`` of boolean Series
        aligned with ``df``'s index.
    """
    forecast_id = df['ForecastId']
    train_mask = forecast_id == -1
    validation_mask = forecast_id == 0
    test_mask = forecast_id > 0
    return train_mask, validation_mask, test_mask

# Masks over the combined frame. This rebinds validation_mask, which earlier
# held a mask aligned with train_data rather than with data.
train_mask, validation_mask, test_mask = get_masks(data)

In [201]:
from sklearn.preprocessing import LabelEncoder
def add_time_features(data):
    """Add calendar features derived from the ``Date`` column.

    Columns written:
        Day_num: dense 0-based rank of the date among the distinct dates in
            the frame (earliest date -> 0). Equal dates share a code.
        Day:     day of the month.
        Month:   month number.

    The ranking uses ``pd.factorize(..., sort=True)``, which yields the same
    sorted-unique integer codes as a label encoder without needing
    scikit-learn for this step.

    Args:
        data: DataFrame with a datetime64 ``Date`` column.

    Returns:
        The same DataFrame object; the columns are added in place.
    """
    # sort=True orders the unique dates before numbering them, so the codes
    # are chronological rather than in order of first appearance.
    day_codes, _unique_dates = pd.factorize(data['Date'], sort=True)
    data['Day_num'] = day_codes
    data['Day'] = data['Date'].dt.day
    data['Month'] = data['Date'].dt.month
    return data

# Adds the Day_num / Day / Month columns; the function writes them into `data`
# itself and returns that same frame.
data = add_time_features(data)

In [202]:
# Preview the last rows of the combined frame, including the new time columns.
data.tail(7)

Unnamed: 0,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,ForecastId,Day_num,Day,Month
35489,,Zimbabwe,2020-05-01,0.0,0.0,13152.0,100,1,5
35490,,Zimbabwe,2020-05-02,0.0,0.0,13153.0,101,2,5
35491,,Zimbabwe,2020-05-03,0.0,0.0,13154.0,102,3,5
35492,,Zimbabwe,2020-05-04,0.0,0.0,13155.0,103,4,5
35493,,Zimbabwe,2020-05-05,0.0,0.0,13156.0,104,5,5
35494,,Zimbabwe,2020-05-06,0.0,0.0,13157.0,105,6,5
35495,,Zimbabwe,2020-05-07,0.0,0.0,13158.0,106,7,5


In [271]:
# Build one feature series and one target series per (country, province),
# separately for the training and validation splits.
lstm_features = ['Day_num', 'Day', 'Month', 'ConfirmedCases', 'Fatalities']
train_mask, val_mask, test_mask = get_masks(data)
X_train = []
X_val = []
y_train = []
y_val = []
for country in tqdm(states.keys()): #tqdm(['Italy', 'China', 'US', 'Spain', 'Germany', 'Canada', 'Albania']):
    for state in states[country]:
        # Rows of this region; provinces that were missing in the CSV carry
        # the string 'NaN', so the equality test still matches them.
        state_mask = (data.Country_Region == country) & (data.Province_State == state)
        state_train_mask = train_mask & state_mask
        state_val_mask = val_mask & state_mask
        
        train_df = data.loc[state_train_mask, lstm_features]
        val_df = data.loc[state_val_mask, lstm_features]
        
        # shift(1) moves every row down one position and dropna() removes the
        # resulting all-NaN first row (and any other row containing NaN), so
        # X row k holds the features of the region's k-th row.
        X_train.append(train_df.shift(1).dropna().to_numpy())
        X_val.append(val_df.shift(1).dropna().to_numpy())
        
        # Targets start at the second row, so y row k is the target of row
        # k + 1 and y has the same length as X.
        # NOTE(review): if the windowing step pairs a window X[i:i+L] with
        # y[i+L] (as TimeseriesGenerator does), the target ends up two rows
        # after the window's last row, not one -- confirm this is intended.
        y_train.append(train_df[['ConfirmedCases', 'Fatalities']].iloc[1:].to_numpy())
        y_val.append(val_df[['ConfirmedCases', 'Fatalities']].iloc[1:].to_numpy())
    
# X_train = np.array(X_train)
# y_train = np.array(y_train)

HBox(children=(IntProgress(value=0, max=180), HTML(value='')))




In [272]:
def get_timeseries_dataset(X_train, y_train, length=4):
    """Cut per-region series into fixed-length windows with one target each.

    For every pair ``(xs, ys)`` the window ``xs[i:i + length]`` is paired with
    the target row ``ys[i + length]``, for ``i = 0 .. len(xs) - length - 1``.
    This is the same pairing that
    ``TimeseriesGenerator(xs, ys, length, batch_size=1)`` produces.

    It is implemented with plain NumPy slicing so the function does not depend
    on ``keras.preprocessing.sequence``, which this notebook imports only in a
    later cell (a fresh Restart & Run All raised ``NameError`` here).

    Args:
        X_train: iterable of 2-D arrays, one ``(n_rows, n_features)`` array
            per region.
        y_train: iterable of arrays with the matching targets, one per region
            and with the same number of rows as the corresponding ``xs``.
        length: number of consecutive rows in each window (default 4).

    Returns:
        Tuple ``(x, y)`` of NumPy arrays: ``x`` stacks all windows
        (``(n_windows, length, n_features)``) and ``y`` the matching target
        rows. Series with ``length`` rows or fewer contribute no windows
        instead of raising.

    Raises:
        ValueError: if a feature series and its target series differ in length.
    """
    x_res = []
    y_res = []
    for xs, ys in zip(X_train, y_train):
        if len(xs) != len(ys):
            raise ValueError(
                'Data and targets have to be of same length. '
                'Data length is {} while target length is {}'.format(len(xs), len(ys))
            )
        # `end` is the index of the target row; the window is the `length`
        # rows immediately before it.
        for end in range(length, len(xs)):
            x_res.append(xs[end - length:end])
            y_res.append(ys[end])

    return np.array(x_res), np.array(y_res)

# Replace the per-region lists with stacked window arrays.
# NOTE: this rebinds X_train / y_train / X_val / y_val from lists of series to
# arrays, so the cell is not idempotent -- re-run the cell that builds the
# per-region lists before running this one again.
X_train, y_train = get_timeseries_dataset(X_train, y_train)
X_val, y_val = get_timeseries_dataset(X_val, y_val)

In [273]:
# Sanity check of the validation windows: (n_windows, window_length, n_features).
X_val.shape

(1224, 4, 5)

In [274]:
from keras import Model
from keras.layers import Input, Dense, LSTM, BatchNormalization, Embedding
from keras.preprocessing import sequence

In [275]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler

#### TODO:
    - Add features
    - Add Dropout

In [276]:
# Stacked-LSTM regressor over windows of 4 time steps x 5 features:
# four LSTM(50) layers, each followed by BatchNormalization, then a small
# dense head that outputs two non-negative values (ReLU).
net = {}
input_layer = Input(shape=(4, 5))

# The first three recurrent layers return the full sequence so the next LSTM
# receives one vector per time step.
x = input_layer
for block_idx in range(3):
    x = LSTM(50, return_sequences=True)(x)
    x = BatchNormalization()(x)

# The last recurrent layer keeps only its final state.
x = LSTM(50)(x)
x = BatchNormalization()(x)

x = Dense(10, activation='relu')(x)
output_layer = Dense(2, activation='relu')(x)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

Model: "model_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_28 (InputLayer)        (None, 4, 5)              0         
_________________________________________________________________
lstm_87 (LSTM)               (None, 4, 50)             11200     
_________________________________________________________________
batch_normalization_39 (Batc (None, 4, 50)             200       
_________________________________________________________________
lstm_88 (LSTM)               (None, 4, 50)             20200     
_________________________________________________________________
batch_normalization_40 (Batc (None, 4, 50)             200       
_________________________________________________________________
lstm_89 (LSTM)               (None, 4, 50)             20200     
_________________________________________________________________
batch_normalization_41 (Batc (None, 4, 50)             200

In [277]:
import keras.backend as K

def rmsle(y, y0):
    """Root mean squared logarithmic error between two tensors.

    Computes ``sqrt(mean((log(y + 1) - log(y0 + 1)) ** 2))`` with Keras
    backend ops, so it can be passed to ``model.compile`` as a loss. The
    value is symmetric in ``y`` and ``y0``.
    """
    log_gap = K.log(y + 1) - K.log(y0 + 1)
    mean_squared_gap = K.mean(K.pow(log_gap, 2))
    return K.sqrt(mean_squared_gap)

In [278]:
# Optimizer given by name ('adam'); the loss is the custom rmsle function.
model.compile('adam', rmsle)

In [279]:
def lr_schedule(epoch):
    """Step-wise learning-rate schedule.

    The rate starts at 1e-2 and drops to 1e-3, 5e-4 and 1e-5 once the epoch
    index exceeds 10, 20 and 70 respectively. Meant to be wrapped in
    ``LearningRateScheduler``, which calls it at the start of every epoch.

    # Arguments
        epoch (int): index of the epoch about to start (the logged
            "Epoch 12/100" corresponds to ``epoch == 11``).

    # Returns
        lr (float): learning rate to use for that epoch
    """
    lr = 1e-2
    # (threshold, rate) pairs in increasing threshold order; the last pair
    # whose threshold has been passed wins.
    for threshold, rate in ((10, 1e-3), (20, 5e-4), (70, 1e-5)):
        if epoch > threshold:
            lr = rate
    # Printed on every call, so the message reports the current rate rather
    # than claiming a reduction.
    print('Learning rate: ', lr)
    return lr

In [280]:
# Wrap lr_schedule in a Keras callback; `callbacks` is the list handed to fit().
lr_scheduler = LearningRateScheduler(lr_schedule)
callbacks = [lr_scheduler]

In [281]:
# Train for up to 100 epochs in batches of 128, evaluating on the validation
# windows after each epoch; the scheduler callback sets the learning rate.
model.fit(X_train, y_train, validation_data=(X_val, y_val), 
          batch_size=128, epochs=100, callbacks=callbacks)

Train on 18054 samples, validate on 1224 samples
Epoch 1/100
Learning rate reduced:  0.01
Epoch 2/100
Learning rate reduced:  0.01
Epoch 3/100
Learning rate reduced:  0.01
Epoch 4/100
Learning rate reduced:  0.01
Epoch 5/100
Learning rate reduced:  0.01
Epoch 6/100
Learning rate reduced:  0.01
Epoch 7/100
Learning rate reduced:  0.01
Epoch 8/100
Learning rate reduced:  0.01
Epoch 9/100
Learning rate reduced:  0.01
Epoch 10/100
Learning rate reduced:  0.01
Epoch 11/100
Learning rate reduced:  0.01
Epoch 12/100
Learning rate reduced:  0.001
Epoch 13/100
Learning rate reduced:  0.001
Epoch 14/100
Learning rate reduced:  0.001
Epoch 15/100
Learning rate reduced:  0.001
Epoch 16/100
Learning rate reduced:  0.001
Epoch 17/100
Learning rate reduced:  0.001
Epoch 18/100
Learning rate reduced:  0.001
Epoch 19/100
Learning rate reduced:  0.001
Epoch 20/100
Learning rate reduced:  0.001
Epoch 21/100
Learning rate reduced:  0.001
Epoch 22/100
Learning rate reduced:  0.0005
Epoch 23/100
Learning ra

Epoch 60/100
Learning rate reduced:  0.0005
Epoch 61/100
Learning rate reduced:  0.0005
Epoch 62/100
Learning rate reduced:  0.0005
Epoch 63/100
Learning rate reduced:  0.0005
Epoch 64/100
Learning rate reduced:  0.0005
Epoch 65/100
Learning rate reduced:  0.0005
Epoch 66/100
Learning rate reduced:  0.0005
Epoch 67/100
Learning rate reduced:  0.0005
Epoch 68/100
Learning rate reduced:  0.0005
Epoch 69/100
Learning rate reduced:  0.0005
Epoch 70/100
Learning rate reduced:  0.0005
Epoch 71/100
Learning rate reduced:  0.0005
Epoch 72/100
Learning rate reduced:  1e-05
Epoch 73/100
Learning rate reduced:  1e-05
Epoch 74/100
Learning rate reduced:  1e-05
Epoch 75/100
Learning rate reduced:  1e-05
Epoch 76/100
Learning rate reduced:  1e-05
Epoch 77/100
Learning rate reduced:  1e-05
Epoch 78/100
Learning rate reduced:  1e-05
Epoch 79/100
Learning rate reduced:  1e-05
Epoch 80/100
Learning rate reduced:  1e-05
Epoch 81/100
Learning rate reduced:  1e-05
Epoch 82/100
Learning rate reduced:  1e-05

<keras.callbacks.callbacks.History at 0x17a91d390>