In [6]:
import os
import requests
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [7]:
import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand instead of configuring only the
# first device. Looping over the device list keeps this cell working on
# CPU-only hosts (where indexing physical_devices[0] raises IndexError) and
# applies the same setting to every visible GPU.
physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

Num GPUs Available:  1


In [8]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
DATA_FILE = 'data/OxCGRT_latest.csv'

# Download the data set. Fail fast on HTTP errors so an error page is never
# written to disk as if it were the CSV, and bound the wait with a timeout so
# the cell cannot hang indefinitely on a stalled connection.
data = requests.get(DATA_URL, timeout=60)
data.raise_for_status()

# Persist the data set locally in order to use it after submission to make predictions,
# as the sandbox won't have access to the internet anymore.
# makedirs(exist_ok=True) avoids the check-then-create race, and the context
# manager guarantees the file is flushed and closed before later cells read it.
os.makedirs(os.path.dirname(DATA_FILE), exist_ok=True)
with open(DATA_FILE, 'wb') as data_fh:
    data_fh.write(data.content)

19520881

In [9]:
# Column names picked out of the OxCGRT data set; they are appended to the
# country / region / date keys when the historical-IP frame is built.
NPI_COLUMNS = [
    'C1_School closing',
    'C2_Workplace closing',
    'C3_Cancel public events',
    'C4_Restrictions on gatherings',
    'C5_Close public transport',
    'C6_Stay at home requirements',
    'C7_Restrictions on internal movement',
    'C8_International travel controls',
    'H1_Public information campaigns',
    'H2_Testing policy',
    'H3_Contact tracing',
    'H6_Facial Coverings',
]

In [145]:
# Evaluation window (both ends inclusive where it is applied as a filter) and
# the intervention-plan CSV that is passed to every predictor's predict() call.
start_date, end_date = "2020-08-01", "2020-08-31"
NPIS_INPUT_FILE = "../../../validation/data/2020-12-16_historical_ip.csv"

In [11]:
# Load the snapshot that was persisted to DATA_FILE instead of downloading the
# data set a second time: the predictors are constructed from DATA_FILE, so the
# actuals computed from `df` then come from exactly the same data, and the cell
# keeps working without network access.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0 (requires pandas >= 1.3).
df = pd.read_csv(DATA_FILE,
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 on_bad_lines="skip")

In [12]:
# Keep only the country / region / date keys followed by the NPI columns.
new_df = df.loc[:, ['CountryName', 'RegionName', 'Date', *NPI_COLUMNS]]

In [13]:
# Write the reduced frame to the current working directory, without the index.
# NOTE(review): NPIS_INPUT_FILE points at a file with this same name under
# ../../../validation/data/, not at this working-directory copy — confirm
# which file the predictors are meant to read.
new_df.to_csv('2020-12-16_historical_ip.csv', index=False)

## Baseline LSTM

In [146]:
# Reload the module to get the latest changes
import xprize_predictor
from importlib import reload
reload(xprize_predictor)
from xprize_predictor import XPrizePredictor

In [147]:
# "Baseline LSTM" section: build the predictor from its weights file and the
# locally persisted OxCGRT CSV (DATA_FILE).
model_weights_file = "models/trained_model_weights.h5"
predictor = XPrizePredictor(model_weights_file, DATA_FILE)

In [148]:
%%time
# Baseline predictions for [start_date, end_date] using NPIS_INPUT_FILE.
# `predictor` is rebound by every model section of this notebook, so this call
# uses whichever instance was assigned most recently in the kernel.
preds_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

CPU times: user 2min 11s, sys: 3.28 s, total: 2min 14s
Wall time: 2min 5s


## 60 Test Days

In [149]:
# "60 Test Days" section: rebinds `predictor` to a model built from a different
# weights file; the baseline instance is no longer reachable under that name.
model_60_weights_file = "models/trained_model_weights_60_test_days.h5"
predictor = XPrizePredictor(model_60_weights_file, DATA_FILE)

In [150]:
%%time
# NOTE: the recorded run of this cell ended in KeyboardInterrupt, so
# preds_60_df was never assigned in the saved session.
preds_60_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

KeyboardInterrupt: 

## Use Prediction Ratio to sort countries to use for training + 60 test days

In [None]:
# "Prediction Ratio ... + 60 test days" section: rebinds `predictor` to a model
# built from the prediction-ratio weights file.
model_weights_pred_ratio_file = "models/trained_model_weights_prediction_ratio_weights.h5"
predictor = XPrizePredictor(model_weights_pred_ratio_file, DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
preds_ratio_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Don't Shuffle

In [None]:
# "Don't Shuffle" section: `no_shuffle` holds the weights-file path (not a
# flag); `predictor` is rebound to a model built from it.
no_shuffle = "models/no_shuffle.h5"
predictor = XPrizePredictor(no_shuffle, DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
no_shuffle_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## More training countries + no shuffle

In [None]:
# "More training countries + no shuffle" section: `more_training` holds the
# weights-file path; `predictor` is rebound to a model built from it.
more_training = "models/14_day_more_data.h5"
predictor = XPrizePredictor(more_training, DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
more_training_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## More training countries, shuffle, 14 day window, 14 test days, 30 lookback

In [None]:
# "14 day window ... 30 lookback" section: `longer` holds the weights-file
# path; `predictor` is rebound to a model built from it.
longer = "models/14_day_window_30_lookback.h5"
predictor = XPrizePredictor(longer, DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
longer_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Dropout

In [None]:
# "Dropout" section: `dropout` holds the weights-file path (not a dropout
# rate); `predictor` is rebound to a model built from it.
dropout = "models/dropout.h5"
predictor = XPrizePredictor(dropout, DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
dropout_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Baseline Lasso

In [None]:
# Import the linear predictor module, force a reload, and only then import the
# class so XPrizeLinearPredictor is bound to the freshly reloaded definition.
import xprize_linear_predictor
from importlib import reload
reload(xprize_linear_predictor)
from xprize_linear_predictor import XPrizeLinearPredictor

In [None]:
# "Baseline Lasso" section: rebinds `predictor` to a linear predictor built
# from a .pkl model file and DATA_FILE.
# NOTE(review): the .pkl file is presumably unpickled inside
# XPrizeLinearPredictor — only load model files from a trusted source; confirm.
predictor = XPrizeLinearPredictor("../linear/models/model.pkl", DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
preds_linear_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Lasso CV

In [None]:
# "Lasso CV" section: rebinds `predictor` to a linear predictor built from
# model_cv.pkl and DATA_FILE.
predictor = XPrizeLinearPredictor("../linear/models/model_cv.pkl", DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
preds_lasso_cv_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Bayesian Ridge

In [None]:
# "Bayesian Ridge" section: rebinds `predictor` to a linear predictor built
# from model_bayesian.pkl and DATA_FILE.
predictor = XPrizeLinearPredictor("../linear/models/model_bayesian.pkl", DATA_FILE)

In [None]:
%%time
# Uses the most recently bound `predictor`; run the construction cell of this
# section first. This cell has no execution count in the saved notebook.
preds_bayesian_ridge_df = predictor.predict(start_date, end_date, NPIS_INPUT_FILE)

## Actual Values

In [None]:
# Tag each row with a location key ("Country" when there is no region,
# otherwise "Country / Region"), derive the day-over-day change in confirmed
# cases per location (first row of each location becomes 0), and keep the rows
# whose date lies inside the evaluation window (both ends included).
df["GeoID"] = np.where(
    df["RegionName"].isna(),
    df["CountryName"],
    df["CountryName"] + ' / ' + df["RegionName"],
)
df["DailyChangeConfirmedCases"] = df.groupby("GeoID")["ConfirmedCases"].diff().fillna(0)
date_df = df.loc[df["Date"].between(start_date, end_date)]

In [None]:
# Location to inspect. Leave region_name as None for a country-level series;
# otherwise the key takes the "Country / Region" form.
country_name = "United States"
region_name = "California"
if region_name is None:
    geo_id = country_name
else:
    geo_id = f"{country_name} / {region_name}"

In [None]:
# Ground truth for the selected location inside the evaluation window.
true_df = date_df[(date_df.GeoID == geo_id)]
y_true = true_df.DailyChangeConfirmedCases.values
# 'Date' is parsed as datetime when the CSV is loaded, so convert it directly.
# The previous round trip through str() produced "YYYY-MM-DD 00:00:00" strings,
# which do not match format="%Y-%m-%d" and are rejected by pandas versions that
# enforce the format strictly. pd.to_datetime is a no-op on datetime data and
# still parses ISO date strings if the column was loaded as text.
labels = pd.to_datetime(true_df.Date).values

In [None]:
# Display name -> prediction frame for every model included in the comparison.
# Only the baseline entry is active; enable another entry once the cell that
# produces its frame has been run.
predictions = dict([
    ("Baseline LSTM", preds_df),
    # ("60 Test Days LSTM", preds_60_df),
    # ("All of the above + Prediction Ratio Sorting LSTM", preds_ratio_df),
    # ("All above + no shuffle LSTM", no_shuffle_df),
    # ("All above + more training data LSTM", more_training_df),
    # ("14 test, 14 window, 30 lookback, more training LSTM", longer_df),
    # ("Dropout LSTM", dropout_df),
    # ("Baseline Lasso", preds_linear_df),
    # ("Lasso CV", preds_lasso_cv_df),
    # ("Bayesian Ridge", preds_bayesian_ridge_df),
])

In [None]:
# Plot the actual daily new cases for geo_id (points) against each model's
# predictions (lines) over the evaluation window.
fig, ax = plt.subplots()
ax.scatter(labels, y_true, label="Actual")
for name, pred_df in predictions.items():
    # Build the row filter from boolean masks on pred_df itself. The earlier
    # version added a "GeoID" column to a filtered slice, which triggers
    # pandas' SettingWithCopyWarning.
    in_window = (pred_df['Date'] >= start_date) & (pred_df['Date'] <= end_date)
    pred_geo_ids = pred_df["CountryName"].where(
        pred_df["RegionName"].isnull(),
        pred_df["CountryName"] + ' / ' + pred_df["RegionName"])
    y_pred = pred_df[in_window & (pred_geo_ids == geo_id)]
    # Use the prediction rows' own dates on the x axis so the line is placed
    # correctly even if its rows do not line up one-to-one with the actuals.
    ax.plot(pd.to_datetime(y_pred["Date"]).values,
            y_pred.PredictedDailyNewCases.values,
            label=name)
ax.set_title(f"Daily new cases: {geo_id}")
ax.set_xlabel("Date")
ax.set_ylabel("Daily new confirmed cases")
fig.autofmt_xdate()
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.);

## Calculate MAE

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of each model's predictions for geo_id against y_true.
# Values are compared by position, so the prediction rows must cover the same
# dates, in the same order, as the rows behind y_true.
for name, pred_df in predictions.items():
    # Build the row filter from boolean masks on pred_df itself. The earlier
    # version added a "GeoID" column to a filtered slice, which triggers
    # pandas' SettingWithCopyWarning.
    in_window = (pred_df['Date'] >= start_date) & (pred_df['Date'] <= end_date)
    pred_geo_ids = pred_df["CountryName"].where(
        pred_df["RegionName"].isnull(),
        pred_df["CountryName"] + ' / ' + pred_df["RegionName"])
    y_pred_values = pred_df.loc[in_window & (pred_geo_ids == geo_id),
                                "PredictedDailyNewCases"].values
    mae = mean_absolute_error(y_true, y_pred_values)
    print(f"{name}: {mae}")