In [5]:
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Download the Our World in Data COVID table straight from its public URL.
df = pd.read_csv(
    'https://covid.ourworldindata.org/data/owid-covid-data.csv',
    skipinitialspace=True,
)

# The prediction target is a single column; every other column is a feature.
# NOTE(review): the features therefore still include other death-related
# columns (e.g. total_deaths) — confirm that this is intended.
y = df['total_deaths_per_million']
x_cols = df.columns.drop('total_deaths_per_million').tolist()
X = df[x_cols]

# Preview the first rows of the raw table.
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [8]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

# Impute gaps in the training features with each column's most frequent value.
# mode() can return several rows when values tie, so only its first row is kept.
train_mode = {name: value for name, value in X_train.mode().iloc[0].items()}
X_train = X_train.fillna(value=train_mode)
print(train_mode)

{'iso_code': 'OWID_UMC', 'continent': 'Africa', 'location': 'Upper middle income', 'date': '2021-07-16', 'total_cases': 1.0, 'new_cases': 0.0, 'new_cases_smoothed': 0.0, 'total_deaths': 1.0, 'new_deaths': 0.0, 'new_deaths_smoothed': 0.0, 'total_cases_per_million': 33251.232, 'new_cases_per_million': 0.0, 'new_cases_smoothed_per_million': 0.0, 'new_deaths_per_million': 0.0, 'new_deaths_smoothed_per_million': 0.0, 'reproduction_rate': 0.99, 'icu_patients': 0.0, 'icu_patients_per_million': 0.0, 'hosp_patients': 0.0, 'hosp_patients_per_million': 0.0, 'weekly_icu_admissions': 0.0, 'weekly_icu_admissions_per_million': 0.0, 'weekly_hosp_admissions': 1.0, 'weekly_hosp_admissions_per_million': 0.0, 'new_tests': 1.0, 'total_tests': 2.0, 'total_tests_per_thousand': 0.001, 'new_tests_per_thousand': 0.0, 'new_tests_smoothed': 955.0, 'new_tests_smoothed_per_thousand': 1.208, 'positive_rate': 0.0, 'tests_per_case': 4.0, 'tests_units': 'tests performed', 'total_vaccinations': 0.0, 'people_vaccinated':

In [None]:
# Fit two tree ensembles on the imputed training data and persist them together
# with the preprocessing objects (imputation values and label encoders).

# Rows without a target value cannot be used for fitting: scikit-learn rejects
# NaN in y. Work on a filtered copy so X_train / y_train stay untouched and the
# cell gives the same result when it is re-run.
has_target = y_train.notna()
X_fit = X_train.loc[has_target].copy()
y_fit = y_train.loc[has_target]

# convert categoricals
# Every non-numeric column is label-encoded, not only iso_code / continent /
# location: the printed train_mode shows that `date` and `tests_units` hold
# strings as well, and the forests only accept numeric features.
# LabelEncoder sorts its classes, so ISO-formatted date strings receive codes
# in chronological order.
encoders = {}
for column in X_fit.select_dtypes(exclude="number").columns:
    categorical_convert = LabelEncoder()
    X_fit[column] = categorical_convert.fit_transform(X_fit[column])
    encoders[column] = categorical_convert

# train the Random Forest algorithm
# The target (total_deaths_per_million) is a continuous quantity, so the
# regressor variant is used; the classifier refuses continuous labels.
# The seed matches the one used for the train/test split so runs are repeatable.
rf = RandomForestRegressor(n_estimators=100, random_state=1234)
rf = rf.fit(X_fit, y_fit)

# train the Extra Trees algorithm
et = ExtraTreesRegressor(n_estimators=100, random_state=1234)
et = et.fit(X_fit, y_fit)

# save preprocessing objects and both fitted models
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)