# Predicting COVID-19 in European Countries with Random Forest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
from datetime import datetime
from pandas.plotting import lag_plot
from matplotlib import pyplot as plt
from matplotlib.dates import date2num
%matplotlib inline
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from category_encoders import TargetEncoder
import json
plt.close("all")

# Load the country/date table; the first CSV column is used as the row index.
euro_data = pd.read_csv("data/euro_countries_filled.csv", index_col=0)

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open("data/iso_country_codes.json", "r", encoding="utf-8") as read_file:
    country_codes = json.load(read_file)

euro_data


Bad key savefig.frameon in file /home/alpakkan/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 421 ('savefig.frameon : True')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.2/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.level in file /home/alpakkan/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 472 ('verbose.level  : silent      # one of silent, helpful, debug, debug-annoying')
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.3.2/matplotlibrc.template
or from the matplotlib source distribution

Bad key verbose.fileo in file /home/alpakkan/.local/lib/python3.6/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle, line 473 ('verbose.fileo  : sys.stdout  # a log filename, sys.stdout or sys.stderr')
You probably need to get an upd

Unnamed: 0,iso_code,location,date,new_cases,new_cases_smoothed,population,total_tests,new_tests,new_tests_smoothed,stringency_index,latitude,longitude
0,ALB,Albania,2019-12-31,0.0,0.000,2877800.0,0.0,0.0,0.0,0.0,41.0,20.0
1,ALB,Albania,2020-01-01,0.0,0.000,2877800.0,0.0,0.0,0.0,0.0,41.0,20.0
2,ALB,Albania,2020-01-02,0.0,0.000,2877800.0,0.0,0.0,0.0,0.0,41.0,20.0
3,ALB,Albania,2020-01-03,0.0,0.000,2877800.0,0.0,0.0,0.0,0.0,41.0,20.0
4,ALB,Albania,2020-01-04,0.0,0.000,2877800.0,0.0,0.0,0.0,0.0,41.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...
10100,UKR,Ukraine,2020-11-06,9850.0,8578.571,43733759.0,3521175.0,7460.0,42174.0,58.8,49.0,32.0
10101,UKR,Ukraine,2020-11-07,9721.0,8779.857,43733759.0,3572433.0,51258.0,43312.0,58.8,49.0,32.0
10102,UKR,Ukraine,2020-11-08,10746.0,9064.714,43733759.0,3613899.0,41466.0,43050.0,58.8,49.0,32.0
10103,UKR,Ukraine,2020-11-09,9397.0,9270.143,43733759.0,3648292.0,34393.0,43440.0,58.8,49.0,32.0


## Preprocessing

In [7]:
# Keep only rows dated 2020-03-01 through 2020-11-07 (both bounds inclusive)
start_date, end_date = '2020-03-01', '2020-11-07'

# Parse the date strings. NOTE: the conversion is applied to `euro_data`
# itself, so the raw frame also carries datetimes afterwards.
euro_data['date'] = pd.to_datetime(euro_data['date'])

# A row is discarded only when it lies strictly outside the window.
outside_window = (euro_data['date'] < start_date) | (euro_data['date'] > end_date)

# Drop unneeded columns
df = euro_data[~outside_window].drop(
    columns=['location', 'latitude', 'longitude', 'new_tests',
             'new_tests_smoothed', 'total_tests', 'stringency_index'],
)

# One frame per country. Explicit copies are taken because later cells add
# columns to these frames; writing into raw groupby slices is what triggers
# pandas' SettingWithCopyWarning.
dfs = [country_df.copy() for _, country_df in df.groupby('iso_code')]

dfs[0]

Unnamed: 0,iso_code,date,new_cases,new_cases_smoothed,population
61,ALB,2020-03-01,0.0,0.000,2877800.0
62,ALB,2020-03-02,0.0,0.000,2877800.0
63,ALB,2020-03-03,0.0,0.000,2877800.0
64,ALB,2020-03-04,0.0,0.000,2877800.0
65,ALB,2020-03-05,0.0,0.000,2877800.0
...,...,...,...,...,...
308,ALB,2020-11-03,321.0,296.857,2877800.0
309,ALB,2020-11-04,381.0,310.714,2877800.0
310,ALB,2020-11-05,396.0,322.857,2877800.0
311,ALB,2020-11-06,421.0,343.714,2877800.0


In [8]:
# Add calendar features to every country frame.
# The values are derived from the frame's own `date` column, so nothing is
# recomputed over the full table or re-aligned by index on each iteration.
# `assign` returns a new frame, which is stored back in the list.
for i, country_df in enumerate(dfs):
    dates = country_df['date']
    dfs[i] = country_df.assign(
        day=dates.dt.day,
        month=dates.dt.month,
        year=dates.dt.year,
        weekday=dates.dt.strftime('%A'),  # full weekday name, e.g. "Sunday"
    )

dfs[0]

Unnamed: 0,iso_code,date,new_cases,new_cases_smoothed,population,day,month,year,weekday
61,ALB,2020-03-01,0.0,0.000,2877800.0,1,3,2020,Sunday
62,ALB,2020-03-02,0.0,0.000,2877800.0,2,3,2020,Monday
63,ALB,2020-03-03,0.0,0.000,2877800.0,3,3,2020,Tuesday
64,ALB,2020-03-04,0.0,0.000,2877800.0,4,3,2020,Wednesday
65,ALB,2020-03-05,0.0,0.000,2877800.0,5,3,2020,Thursday
...,...,...,...,...,...,...,...,...,...
308,ALB,2020-11-03,321.0,296.857,2877800.0,3,11,2020,Tuesday
309,ALB,2020-11-04,381.0,310.714,2877800.0,4,11,2020,Wednesday
310,ALB,2020-11-05,396.0,322.857,2877800.0,5,11,2020,Thursday
311,ALB,2020-11-06,421.0,343.714,2877800.0,6,11,2020,Friday


## Detrend the Time Series

### Differencing

In [9]:
# Weekly differencing for every country:
# new_cases_diff[t] = new_cases[t] - new_cases[t - 7 rows].
# The first 7 rows of each country therefore hold NaN.
for country_df in dfs:
    country_df['new_cases_diff'] = country_df['new_cases'].diff(periods=7)
dfs[0]

Unnamed: 0,iso_code,date,new_cases,new_cases_smoothed,population,day,month,year,weekday,new_cases_diff
61,ALB,2020-03-01,0.0,0.000,2877800.0,1,3,2020,Sunday,
62,ALB,2020-03-02,0.0,0.000,2877800.0,2,3,2020,Monday,
63,ALB,2020-03-03,0.0,0.000,2877800.0,3,3,2020,Tuesday,
64,ALB,2020-03-04,0.0,0.000,2877800.0,4,3,2020,Wednesday,
65,ALB,2020-03-05,0.0,0.000,2877800.0,5,3,2020,Thursday,
...,...,...,...,...,...,...,...,...,...,...
308,ALB,2020-11-03,321.0,296.857,2877800.0,3,11,2020,Tuesday,33.0
309,ALB,2020-11-04,381.0,310.714,2877800.0,4,11,2020,Wednesday,97.0
310,ALB,2020-11-05,396.0,322.857,2877800.0,5,11,2020,Thursday,85.0
311,ALB,2020-11-06,421.0,343.714,2877800.0,6,11,2020,Friday,146.0


## Feature Engineering

In [10]:
# The raw timestamp is no longer needed once day/month/year/weekday exist,
# so replace each country frame with a copy that lacks the `date` column.
for idx, country_df in enumerate(dfs):
    dfs[idx] = country_df.drop(columns='date')

In [11]:
lag = 3  # number of weekly lags (7, 14, 21 rows) built for every series


def build_lagged_features(s, lag=3, dropna=True):
    """Return ``s`` together with weekly-lagged copies of each of its columns.

    For every column ``c`` the result contains ``c`` itself followed by
    ``c_lag7``, ``c_lag14``, ..., ``c_lag{7 * lag}``, where ``c_lagK`` is
    ``c`` shifted down by ``K`` rows (the value found ``K`` rows earlier).

    Parameters
    ----------
    s : pandas.DataFrame or pandas.Series
        Series to lag. A Series is treated as a one-column frame.
    lag : int, default 3
        Number of weekly lags to create per column.
    dropna : bool, default True
        When true, rows containing any NaN are removed from the result;
        this includes the leading rows for which a lagged value does not exist.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``s`` (minus dropped rows when ``dropna`` is true).

    Raises
    ------
    TypeError
        If ``s`` is neither a DataFrame nor a Series.
    """
    if isinstance(s, pd.Series):
        s = s.to_frame()
    elif not isinstance(s, pd.DataFrame):
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise TypeError(
            'build_lagged_features expects a pandas DataFrame or Series, got %s'
            % type(s).__name__
        )

    lagged_columns = {}
    for col_name in s:
        lagged_columns[col_name] = s[col_name]
        # One shifted copy per weekly step: 7, 14, ... rows back
        for week in range(1, lag + 1):
            shift_by = week * 7
            lagged_columns['%s_lag%d' % (col_name, shift_by)] = s[col_name].shift(shift_by)
    res = pd.DataFrame(lagged_columns, index=s.index)

    return res.dropna() if dropna else res

def build_all(features):
    """Build the lagged feature table for each per-country frame.

    For every frame in ``features`` this:
      1. drops rows whose ``new_cases_diff`` is NaN,
      2. creates weekly lags (via ``build_lagged_features`` with the global
         ``lag``) for all columns except the calendar, population and
         ``iso_code`` columns,
      3. re-attaches day/month/year/weekday/population (assignment aligns on
         the row index, so only rows that survived the lagging get values),
      4. one-hot encodes the weekday names with ``pd.get_dummies``,
      5. attaches ``iso_code`` afterwards, so it stays a single column.

    Returns a list with one feature frame per input frame, in the same order.
    """
    passthrough_cols = ['day', 'month', 'year', 'weekday', 'population']
    excluded_from_lagging = passthrough_cols + ['iso_code']

    def _build_one(frame):
        frame = frame.dropna(subset=['new_cases_diff'])
        lagged = build_lagged_features(frame.drop(excluded_from_lagging, axis=1), lag=lag)
        for col in passthrough_cols:
            lagged[col] = frame[col][lag:]
        # One-hot encode weekdays
        lagged = pd.get_dummies(lagged)
        lagged['iso_code'] = frame['iso_code'][lag:]
        return lagged

    return [_build_one(frame) for frame in features]

# Build lagged features per country and keep only countries with more than
# 7 usable rows (the last 7 rows of each country are held out for testing
# in the split cell, so fewer rows would leave nothing to train on).
features = [country_features for country_features in build_all(dfs) if len(country_features) > 7]

features[0]

Unnamed: 0,new_cases,new_cases_lag7,new_cases_lag14,new_cases_lag21,new_cases_smoothed,new_cases_smoothed_lag7,new_cases_smoothed_lag14,new_cases_smoothed_lag21,new_cases_diff,new_cases_diff_lag7,...,year,population,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,iso_code
89,11.0,6.0,5.0,0.0,17.286,5.429,5.429,0.000,5.0,1.0,...,2020,2877800.0,0,0,0,1,0,0,0,ALB
90,15.0,13.0,4.0,2.0,17.571,6.714,5.714,0.000,2.0,9.0,...,2020,2877800.0,0,1,0,0,0,0,0,ALB
91,11.0,11.0,9.0,4.0,17.571,7.000,6.429,0.000,0.0,2.0,...,2020,2877800.0,0,0,0,0,0,1,0,ALB
92,20.0,23.0,4.0,4.0,17.143,9.714,6.429,0.000,-3.0,19.0,...,2020,2877800.0,0,0,0,0,0,0,1,ALB
93,16.0,23.0,4.0,1.0,16.143,12.429,6.857,0.000,-7.0,19.0,...,2020,2877800.0,0,0,0,0,1,0,0,ALB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,321.0,288.0,295.0,171.0,296.857,299.286,254.286,165.714,33.0,-7.0,...,2020,2877800.0,0,0,0,0,0,1,0,ALB
309,381.0,284.0,301.0,182.0,310.714,296.857,271.286,169.143,97.0,-17.0,...,2020,2877800.0,0,0,0,0,0,0,1,ALB
310,396.0,311.0,297.0,203.0,322.857,298.857,284.714,175.000,85.0,14.0,...,2020,2877800.0,0,0,0,0,1,0,0,ALB
311,421.0,275.0,302.0,257.0,343.714,295.000,291.143,187.571,146.0,-27.0,...,2020,2877800.0,1,0,0,0,0,0,0,ALB


In [12]:
# Separate targets from predictors and split each country into train/test.
# - training target: the weekly-differenced series (new_cases_diff)
# - test target: the raw new_cases, compared after inverse differencing
labels, test_features, test_labels = {}, {}, {}
labels_diff, train_features, train_labels = [], [], []

for i, country_frame in enumerate(features):
    iso_code = country_frame['iso_code'].iloc[0]
    labels[iso_code] = country_frame['new_cases']
    labels_diff.append(country_frame['new_cases_diff'])

    # Strip the target columns; `features[i]` is overwritten with predictors only.
    features[i] = country_frame.drop(columns=['new_cases', 'new_cases_smoothed', 'new_cases_diff'])

    # The last 7 rows of every country form the test set (intended to be the
    # first 7 days of November - assumes each country has data up to the end
    # of the selected date window; TODO confirm).
    train_features.append(features[i].iloc[:-7])
    train_labels.append(labels_diff[i].iloc[:-7])
    test_features[iso_code] = features[i].iloc[-7:]
    test_labels[iso_code] = labels[iso_code].iloc[-7:]

feature_list = list(features[0].columns)

In [13]:
# Target encode ISO code.
# The encoder is fitted on the pooled training rows only; the fitted encoder
# is then applied to each country's training frame and test frame.
pooled_train_features = pd.concat(train_features)
pooled_train_labels = pd.concat(train_labels)
target_encoder = TargetEncoder(cols=['iso_code'], smoothing=8, min_samples_leaf=5).fit(
    pooled_train_features, pooled_train_labels
)

for idx, train_frame in enumerate(train_features):
    train_features[idx] = target_encoder.transform(train_frame)

for iso_code in list(test_features):
    test_features[iso_code] = target_encoder.transform(test_features[iso_code])

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical

## Training

In [14]:
# Stack all countries' training rows into one table / one target vector,
# then convert both to numpy arrays for the estimator.
stacked_train_features = pd.concat(train_features)
stacked_train_labels = pd.concat(train_labels)
np_train_features, np_train_labels = np.array(stacked_train_features), np.array(stacked_train_labels)

In [15]:
# Fit a random forest of 1000 trees; the fixed random_state makes runs repeatable.
forest_params = {'n_estimators': 1000, 'random_state': 49}
rf = RandomForestRegressor(**forest_params).fit(np_train_features, np_train_labels)
rf

RandomForestRegressor(n_estimators=1000, random_state=49)

In [16]:
# Predict the weekly-differenced target for each country's 7 held-out rows.
diff_predictions = {
    iso_code: rf.predict(np.array(country_test))
    for iso_code, country_test in test_features.items()
}

In [17]:
from pmdarima.utils import diff_inv
predictions, errors = {}, {}
for iso_code, actual in test_labels.items():
    # Actual new cases of the 7 rows preceding the test window ...
    previous_week = np.array(labels[iso_code].iloc[-14:-7])
    # ... followed by the 7 predicted weekly differences.
    diff_series = np.append(previous_week, diff_predictions[iso_code])
    # Undo the lag-7 differencing; the last 7 values are taken as the
    # reconstructed daily predictions (presumably previous-week value plus
    # predicted change - TODO confirm against pmdarima's diff_inv defaults).
    predictions[iso_code] = diff_inv(diff_series, lag=7, differences=1)[-7:]
    # Absolute error per test day; its mean is the Mean Absolute Error (MAE)
    errors[iso_code] = abs(predictions[iso_code] - np.array(actual))
    print('MAE:', iso_code, round(np.mean(errors[iso_code]), 2), 'new cases')

MAE: ALB 70.39 new cases
MAE: AUT 751.35 new cases
MAE: BEL 9315.27 new cases
MAE: BGR 477.63 new cases
MAE: BIH 297.47 new cases
MAE: BLR 94.42 new cases
MAE: CHE 1584.18 new cases
MAE: CZE 2464.57 new cases
MAE: DEU 1712.14 new cases
MAE: DNK 292.75 new cases
MAE: ESP 5239.33 new cases
MAE: EST 62.2 new cases
MAE: FIN 66.15 new cases
MAE: FRA 9566.19 new cases
MAE: GRC 574.77 new cases
MAE: HRV 685.0 new cases
MAE: HUN 735.11 new cases
MAE: ITA 2839.59 new cases
MAE: LTU 368.89 new cases
MAE: LVA 39.5 new cases
MAE: MDA 98.06 new cases
MAE: NLD 2791.6 new cases
MAE: NOR 130.0 new cases
MAE: POL 4541.14 new cases
MAE: PRT 1214.98 new cases
MAE: ROU 1758.66 new cases
MAE: RUS 1348.12 new cases
MAE: SRB 469.76 new cases
MAE: SVK 1247.95 new cases
MAE: SVN 1278.0 new cases
MAE: SWE 633.1 new cases
MAE: UKR 1280.56 new cases


In [18]:
# Get MAE for entire Europe:
# sum predictions and actual values over all countries for each test day,
# then average the absolute error of those daily totals.
stacked_predictions = np.array(list(predictions.values()), dtype=object)
stacked_labels = np.array(list(test_labels.values()), dtype=object)

predictions_eur = np.sum(stacked_predictions, axis=0)
labels_eur = np.sum(stacked_labels, axis=0)
errors_eur = abs(predictions_eur - labels_eur)

print('MAE:', round(np.mean(errors_eur), 2), 'new cases')

MAE: 17557.16 new cases


In [19]:
# Pair every feature name with its importance from the fitted forest,
# rounded to two decimals.
importances = list(rf.feature_importances_)
feature_importances = [
    (feature, round(importance, 2))
    for feature, importance in zip(feature_list, importances)
]
# Most important first; the sort is stable, so ties keep feature_list order.
feature_importances.sort(key=lambda pair: pair[1], reverse=True)

for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

Variable: new_cases_diff_lag7  Importance: 0.34
Variable: month                Importance: 0.09
Variable: new_cases_smoothed_lag7 Importance: 0.08
Variable: new_cases_smoothed_lag21 Importance: 0.08
Variable: new_cases_lag7       Importance: 0.06
Variable: iso_code             Importance: 0.06
Variable: new_cases_diff_lag14 Importance: 0.05
Variable: new_cases_lag14      Importance: 0.04
Variable: new_cases_lag21      Importance: 0.04
Variable: new_cases_smoothed_lag14 Importance: 0.04
Variable: new_cases_diff_lag21 Importance: 0.04
Variable: day                  Importance: 0.04
Variable: population           Importance: 0.02
Variable: weekday_Monday       Importance: 0.01
Variable: year                 Importance: 0.0
Variable: weekday_Friday       Importance: 0.0
Variable: weekday_Saturday     Importance: 0.0
Variable: weekday_Sunday       Importance: 0.0
Variable: weekday_Thursday     Importance: 0.0
Variable: weekday_Tuesday      Importance: 0.0
Variable: weekday_Wednesday    Impo