In [None]:
import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def get_masks(df):
    """Split a frame's rows into train / validation / test by ``ForecastId``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``ForecastId`` column using this notebook's convention:
        ``-1`` for training rows, ``0`` for validation rows and any positive
        value for test rows.

    Returns
    -------
    tuple
        ``(train_mask, validation_mask, test_mask)`` - three boolean Series
        aligned with ``df``'s index. Rows whose ``ForecastId`` is NaN or any
        other negative value are False in all three masks.
    """
    # Use the argument rather than the notebook-global `data`, so the masks
    # always describe the frame that was actually passed in.
    forecast_id = df['ForecastId']
    train_mask = forecast_id == -1
    validation_mask = forecast_id == 0
    test_mask = forecast_id > 0
    return train_mask, validation_mask, test_mask

In [None]:
from sklearn.preprocessing import LabelEncoder
def add_time_features(data):
    """Add calendar-derived columns to ``data`` in place and return it.

    Columns written, in this order:

    * ``Day_num`` - integer code assigned to each row's ``Date`` by
      ``LabelEncoder``; rows sharing a date share a code.
    * ``Day``     - ``.dt.day`` of ``Date``.
    * ``Month``   - ``.dt.month`` of ``Date``.

    ``Date`` must be a datetime-like column because the ``.dt`` accessor is
    used. The frame passed in is modified and the same object is returned.
    """
    dates = data['Date']
    data['Day_num'] = LabelEncoder().fit_transform(dates)
    data['Day'] = dates.dt.day
    data['Month'] = dates.dt.month
    return data

In [None]:
# from jupyterthemes import jtplot
# jtplot.style()

In [None]:
from tensorflow.keras.preprocessing import sequence

def extract_data(data, mask, features, targets):
    """Build one-step-lagged inputs and aligned targets for a row selection.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame.
    mask : boolean indexer accepted by ``DataFrame.loc``
        Selects the rows to use.
    features : list of column labels
        Columns used as model inputs; must include every label in ``targets``.
    targets : list of column labels
        Columns to predict.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X[i]`` holds the feature values of selected row ``i`` and ``y[i]``
        the target values of selected row ``i + 1``. Both arrays have one row
        fewer than the selection.
    """
    selected = data.loc[mask, features]
    # After shifting down by one, the first row contains only NaNs, so it is
    # dropped from the inputs; the matching first target row is dropped too.
    inputs = selected.shift(1).iloc[1:].to_numpy()
    outputs = selected[targets].iloc[1:].to_numpy()
    return inputs, outputs


def get_timeframes_dataset(X_train, y_train, window_size=4):
    """Cut several time series into fixed-length sliding windows.

    Parameters
    ----------
    X_train : iterable of array-like
        One 2-D array of per-step features for each series.
    y_train : iterable of array-like
        One array of per-step targets for each series, the same length as the
        matching entry of ``X_train``.
    window_size : int, default 4
        Number of consecutive steps in every input window.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` stacks every window ``xs[i - window_size:i]`` and ``y`` the
        target ``ys[i]`` that follows it, over all series in input order.
        A series with ``window_size`` rows or fewer contributes no samples.

    Raises
    ------
    ValueError
        If a feature series and its target series differ in length.

    Notes
    -----
    This yields the same windows as
    ``TimeseriesGenerator(xs, ys, window_size, batch_size=1)`` without using
    that deprecated Keras helper. The helper raised ``ValueError`` for series
    that were too short; here such series are simply skipped.
    """
    x_res = []
    y_res = []
    for xs, ys in zip(X_train, y_train):
        if len(xs) != len(ys):
            raise ValueError(
                'Data and targets have to be of same length. '
                f'Data length is {len(xs)} while target length is {len(ys)}'
            )
        # `end` is the index of the step being predicted; the window is the
        # `window_size` steps immediately before it.
        for end in range(window_size, len(xs)):
            x_res.append(xs[end - window_size:end])
            y_res.append(ys[end])

    return np.array(x_res), np.array(y_res)


def generate_dataset(data, features, targets, window_size, train_mask, val_mask=None):
    """Build windowed training (and optionally validation) arrays per region.

    For every ``(country, state)`` pair listed in the notebook-global
    ``states`` mapping, the rows of ``data`` for that region are turned into
    lagged inputs/targets with ``extract_data``. All regions are then cut
    into sliding windows with ``get_timeframes_dataset``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Country_Region`` and ``Province_State`` columns plus all
        of ``features``.
    features : list of column labels
        Input columns, used for both the training and the validation split.
    targets : list of column labels
        Columns to predict.
    window_size : int
        Number of time steps per input window.
    train_mask : boolean Series
        Rows of ``data`` belonging to the training split.
    val_mask : boolean Series, optional
        Rows belonging to the validation split. When omitted, only training
        arrays are produced.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` when ``val_mask`` is None, otherwise
        ``(X_train, y_train, X_val, y_val)``.

    Notes
    -----
    Relies on the global ``states`` dict (country -> iterable of states)
    being defined before this function is called.
    """
    use_validation = val_mask is not None

    X_train, y_train = [], []
    X_val, y_val = [], []
    for country, provinces in states.items():
        for state in provinces:
            state_mask = (data.Country_Region == country) & (data.Province_State == state)

            X, y = extract_data(data, train_mask & state_mask, features, targets)
            X_train.append(X)
            y_train.append(y)

            if use_validation:
                # Use the `features` argument here as well. Reading the global
                # `lstm_features` instead would silently give the validation
                # set different columns whenever a caller passes other features.
                X, y = extract_data(data, val_mask & state_mask, features, targets)
                X_val.append(X)
                y_val.append(y)

    X_train, y_train = get_timeframes_dataset(X_train, y_train, window_size)
    if use_validation:
        X_val, y_val = get_timeframes_dataset(X_val, y_val, window_size)
        return X_train, y_train, X_val, y_val

    return X_train, y_train

#### TODO

- Add country-specific features

In [None]:
import tensorflow as tf

from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

In [None]:
def create_model(n_features, n_targets, window_size, verbose=0):
    """Assemble the (uncompiled) stacked-LSTM regression network.

    Architecture, in order:

    * four LSTM layers of ``10 * n_features`` units, each followed by batch
      normalisation; the 2nd, 3rd and 4th are additionally followed by
      dropout with rates 0.05, 0.1 and 0.2. Only the last LSTM collapses the
      time axis (``return_sequences`` left at its default);
    * three ReLU dense layers of ``10 * n_features``, ``2 * n_features`` and
      ``n_targets`` units.

    Parameters
    ----------
    n_features : int
        Number of input features per time step.
    n_targets : int
        Number of output values.
    window_size : int
        Number of time steps per input sample.
    verbose : int, default 0
        When greater than 0, print ``model.summary()``.

    Returns
    -------
    tf.keras.models.Sequential
    """
    layers = tf.keras.layers
    width = 10 * n_features

    # First recurrent block also declares the input shape; it has no dropout.
    stack = [
        layers.LSTM(width, input_shape=(window_size, n_features), return_sequences=True),
        layers.BatchNormalization(),
    ]

    # Two intermediate recurrent blocks that keep the full sequence.
    for rate in (0.05, 0.1):
        stack.append(layers.LSTM(width, return_sequences=True))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(rate))

    # Last recurrent block returns only the final step's output.
    stack.append(layers.LSTM(width))
    stack.append(layers.BatchNormalization())
    stack.append(layers.Dropout(0.2))

    # Dense head narrowing down to the target dimension.
    for units in (width, 2 * n_features, n_targets):
        stack.append(layers.Dense(units, activation='relu'))

    model = tf.keras.models.Sequential(stack)
    if verbose > 0:
        model.summary()
    return model

In [None]:
import tensorflow.keras.backend as K

def rmsle(y, y0):
    """Root mean squared logarithmic error between two tensors.

    Computes ``sqrt(mean((log(y + 1) - log(y0 + 1)) ** 2))`` with Keras
    backend ops; the result is symmetric in ``y`` and ``y0``.

    The function name matters: ``train_model`` checkpoints on the
    ``val_rmsle`` metric, which is derived from this name.
    """
    log_gap = K.log(y + 1) - K.log(y0 + 1)
    return K.sqrt(K.mean(K.pow(log_gap, 2)))

In [None]:
def lr_schedule(epoch):
    """Return the step-wise learning rate for ``epoch`` and print it.

    ========  =============
    epoch     learning rate
    ========  =============
    <= 10     1e-2
    11 - 20   1e-3
    21 - 70   5e-4
    > 70      1e-5
    ========  =============

    The rate is printed on every call, not only when it changes.
    """
    if epoch > 70:
        lr = 1e-5
    elif epoch > 20:
        lr = 5e-4
    elif epoch > 10:
        lr = 1e-3
    else:
        lr = 1e-2
    print('Learning rate reduced: ', lr)
    return lr

In [None]:
def train_model(model, model_title, loss, metrics):
    """Compile ``model`` with Adam and fit it on the notebook's global data.

    Parameters
    ----------
    model : Keras model
        Uncompiled model, e.g. the result of ``create_model``.
    model_title : str
        Name used for the TensorBoard log directory (``logs/<title>``) and
        as the prefix of checkpoint files written under ``models/``.
    loss : str or callable
        Loss passed to ``model.compile``.
    metrics : list
        Metrics passed to ``model.compile``. Checkpointing monitors
        ``val_rmsle``, so a metric named ``rmsle`` must be included.

    Returns
    -------
    The same ``model`` object after training.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_val`` and ``y_val`` from the notebook
    globals; they must exist before this function is called.
    """
    # Imported locally so this cell does not depend on an import elsewhere.
    import os

    model.compile('adam', loss, metrics=metrics)
    reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=10, min_lr=0.0001, verbose=1)

    # Build paths with os.path.join: a hard-coded '\\' separator only forms a
    # directory on Windows and becomes part of the file name elsewhere.
    tnsboard = TensorBoard(log_dir=os.path.join('logs', model_title))

    # Left as a template: ModelCheckpoint fills in {val_rmsle} and {epoch}
    # each time it saves.
    name_template = 'rmsle_{val_rmsle:.3f}_epoch_{epoch:02d}'
    os.makedirs('models', exist_ok=True)
    checkpoint_path = os.path.join('models', f'{model_title}_{name_template}.hdf5')
    checkout = ModelCheckpoint(checkpoint_path, monitor='val_rmsle', save_best_only=True)

    callbacks = [tnsboard, reduce_lr, checkout]
    model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=128, epochs=400, callbacks=callbacks)
    return model

## Prepare data

In [None]:
# Load the raw CSVs. Missing values are replaced by the literal string 'NaN'
# so that equality filters on those columns (such as the Province_State
# comparison in generate_dataset) can match them.
train_data = pd.read_csv('data/in/train.csv')
train_data = train_data.fillna('NaN').drop(columns=['Id'])
test_data = pd.read_csv('data/in/test.csv')
test_data = test_data.fillna('NaN')

In [None]:
# Date cut-offs used to split the training file: rows strictly after
# `start_training` and up to `train_up_to` are fitted on; later rows are
# held out for validation.
start_training = pd.Timestamp('2020-01-19')
train_up_to = pd.Timestamp('2020-03-25')
public_test_up_to = pd.Timestamp('2020-04-08')

In [None]:
# Convert the Date columns to datetime values so they can be compared with
# the cut-off timestamps.
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Fitting period: start_training < Date <= train_up_to.
public_mask = (train_data['Date'] > start_training) & (train_data['Date'] <= train_up_to)
# Held-out period: everything after train_up_to.
validation_mask = train_data['Date'] > train_up_to

In [None]:
# Tag training rows with a pseudo ForecastId so get_masks can split the
# combined frame later: -1 marks the fitting period, 0 the held-out period.
# Rows matched by neither mask are not assigned a value here.
train_data.loc[public_mask, 'ForecastId'] = -1
train_data.loc[validation_mask, 'ForecastId'] = 0

# Placeholder target values for the test rows (presumably so that both frames
# share the target columns before they are concatenated - confirm).
test_data['ConfirmedCases'] = 0.0
test_data['Fatalities'] = 0.0

In [None]:
# Stack train and test rows into a single frame with a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Inspect the last 20 training rows.
train_data.tail(20)

In [None]:
# Display the test frame.
test_data

In [None]:
# Map every country to the array of Province_State values that occur with it.
# generate_dataset reads this mapping as a global.
states = {}
for c in data['Country_Region'].unique():
    states[c] = data.loc[data['Country_Region'] == c, 'Province_State'].unique()

In [None]:
# Boolean row masks keyed on ForecastId (-1 train, 0 validation, > 0 test).
# NOTE: this rebinds `validation_mask`, which previously indexed `train_data`.
train_mask, validation_mask, test_mask = get_masks(data)
# Adds the Day_num, Day and Month columns to `data`.
data = add_time_features(data)

### Format data for LSTM

In [None]:
# Display the combined frame, now including the time features.
data

In [None]:
# Columns fed to the network at every time step, and the columns it predicts.
lstm_features = ['Day_num', 'Day', 'Month', 'ConfirmedCases', 'Fatalities']
lstm_targets = ['ConfirmedCases', 'Fatalities']
# Number of consecutive time steps in each input sample.
window_size = 4

# Masks are recomputed here under the names used by the call below.
train_mask, val_mask, test_mask = get_masks(data)
X_train, y_train, X_val, y_val = generate_dataset(data, lstm_features, lstm_targets, window_size, train_mask, val_mask)

In [None]:
# Input and output widths of the network follow the feature/target lists.
n_features = len(lstm_features)
n_targets = len(lstm_targets)

model = create_model(n_features, n_targets, window_size, verbose=0)

In [None]:
# Train with MSLE loss and track RMSLE; the title names the log directory and
# the checkpoint files.
train_model(model, 'lstm_3xlstm_batchnorm_dropout_3xdense', 'msle', [rmsle])

In [None]:
# Scratch calculation: log(1 + 1.37).
x = np.log1p(1.37)
x

In [None]:
def predict(test_df, model):
    """Roll a model forward over ``test_df``, feeding predictions back as lags.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Rows to predict, in order. Must contain ``Day_num``, ``Day`` and
        ``Month`` columns; the last 20 columns of the first row seed the lag
        queue.
    model : object with ``predict(X)``
        Called once per row with an array of shape ``(1, 23)`` (3 date
        features followed by the 20 current lag values). Its flattened output
        must contain at least two values.
        NOTE(review): this flat input does not match the
        ``(window_size, n_features)`` input of ``create_model`` - presumably
        it targets a different, non-recurrent model; confirm before use.

    Returns
    -------
    list of numpy.ndarray
        The flattened prediction for every row of ``test_df``.
    """
    # `deque` is not imported anywhere else in the notebook; import it here
    # so the function does not fail with a NameError.
    from collections import deque

    # Fixed-length queue: pushing on the left drops the oldest value on the right.
    lags_queue = deque(test_df.iloc[0, -20:], maxlen=20)
    res = []
    for i, df_row in test_df.iterrows():
        X_test = df_row[['Day_num', 'Day', 'Month']].to_numpy().tolist()
        X_test = np.array([X_test + list(lags_queue)])

        y_pred = model.predict(X_test).flatten()

        # Push in reverse so the queue starts with y_pred[0], then y_pred[1].
        lags_queue.appendleft(y_pred[1])
        lags_queue.appendleft(y_pred[0])

        res.append(y_pred)

    return res