# Covid data exploration



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot

In [2]:
dataset = pd.read_csv('dataframe_final.csv', index_col=0, low_memory = False)
dataset.shape

(422, 1857)

In [3]:
dataset.head()

Unnamed: 0,confirmados_novos,recuperados,obitos,internados_uci,ativos,internados_enfermaria,Max_Temp,Min_Temp,Temperature,Precipitation,...,people_vaccinated_per_hundred_Venezuela,people_fully_vaccinated_per_hundred_Venezuela,gdp_per_capita_Venezuela,cardiovasc_death_rate_Venezuela,diabetes_prevalence_Venezuela,female_smokers_Venezuela,male_smokers_Venezuela,handwashing_facilities_Venezuela,hospital_beds_per_thousand_Venezuela,life_expectancy_Venezuela
2020-02-26,0.0,0.0,0.0,0.0,0.0,0.0,16.4,10.0,12.9,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-27,0.0,0.0,0.0,0.0,0.0,0.0,20.4,10.4,14.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-28,0.0,0.0,0.0,0.0,0.0,0.0,19.1,10.3,13.3,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-29,0.0,0.0,0.0,0.0,0.0,0.0,18.0,13.1,15.1,3.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-03-01,0.0,0.0,0.0,0.0,0.0,0.0,16.1,14.0,15.3,0.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## pearson correlation

In [4]:
#Using Pearson Correlation
#plt.figure(figsize=(40,40))
cor = dataset.corr()
#sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
#plt.show()
#Correlation with output variable
cor_target = abs(cor['obitos'])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
len(relevant_features)

133

In [5]:
relevant_features

confirmados_novos                          0.851787
recuperados                                0.849360
obitos                                     1.000000
Max_Temp                                   0.537942
total_testes                               0.589853
                                             ...   
icu_patients_per_million_United States     0.671061
hosp_patients_United States                0.691360
hosp_patients_per_million_United States    0.691360
new_tests_United States                    0.503568
new_tests_per_thousand_United States       0.503569
Name: obitos, Length: 133, dtype: float64

Como entendemos que temos muitas features, após observar os resultados das correlações de pearson, fizemos um filtro preliminar de features, eliminando features cuja correlação fosse menor que +/-0.50.

In [6]:
col = [column for column in dataset.columns if (column not in relevant_features.index)]
dataset = dataset.drop(columns=col, axis=1)

## Feature selection com SelectKBest

In [7]:
df_data = dataset.drop(columns='obitos')
df_label = dataset['obitos']

In [8]:
# Feature extraction
columns = []
test = SelectKBest(f_regression, k=70)
fit = test.fit_transform(abs(df_data), df_label)
for i in range(0, len(test.get_support())):
    if test.get_support()[i]:
        columns.append(dataset.columns[i])

In [9]:
len(columns)

70

In [10]:
fit[:19]

array([[0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [3.400e+01, 0.000e+00, 1.551e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [5.700e+01, 1.000e+00, 1.253e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [7.600e+01, 1.000e+00, 9.510e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [11]:
df_data[:19]

Unnamed: 0,confirmados_novos,recuperados,Max_Temp,total_testes,testes_pcr,tests_per_case_Australia,new_deaths_Canada,new_deaths_per_million_Canada,icu_patients_Canada,icu_patients_per_million_Canada,...,new_cases_United States,new_deaths_United States,new_cases_per_million_United States,new_deaths_per_million_United States,icu_patients_United States,icu_patients_per_million_United States,hosp_patients_United States,hosp_patients_per_million_United States,new_tests_United States,new_tests_per_thousand_United States
2020-02-26,0.0,0.0,16.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-27,0.0,0.0,20.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-28,0.0,0.0,19.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-29,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,1.0,0.024,0.003,0.0,0.0,0.0,0.0,0.0,0.0
2020-03-01,0.0,0.0,16.1,25.0,25.0,0.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.021,0.0,0.0,0.0,0.0,0.0,347.0,0.001
2020-03-02,2.0,0.0,17.3,45.0,45.0,0.0,0.0,0.0,0.0,0.0,...,23.0,5.0,0.069,0.015,0.0,0.0,0.0,0.0,503.0,0.002
2020-03-03,2.0,0.0,19.5,61.0,61.0,0.0,0.0,0.0,0.0,0.0,...,19.0,1.0,0.057,0.003,0.0,0.0,0.0,0.0,604.0,0.002
2020-03-04,2.0,0.0,20.0,40.0,40.0,0.0,0.0,0.0,0.0,0.0,...,33.0,4.0,0.1,0.012,0.0,0.0,0.0,0.0,856.0,0.003
2020-03-05,3.0,0.0,17.8,80.0,80.0,0.0,0.0,0.0,0.0,0.0,...,77.0,1.0,0.233,0.003,0.0,0.0,0.0,0.0,1190.0,0.004
2020-03-06,4.0,0.0,16.6,81.0,81.0,0.0,0.0,0.0,0.0,0.0,...,53.0,2.0,0.16,0.006,0.0,0.0,0.0,0.0,1487.0,0.004


Resultados:



## Feature selection of Lag Variables com Random Forest Regressor

We can also use feature selection to automatically identify and select those input features that are most predictive.

A popular method for feature selection is called Recursive Feature Selection (RFE).

RFE works by creating predictive models, weighting features, and pruning those with the smallest weights, then repeating the process until a desired number of features are left.

The code below uses RFE with a random forest predictive model and sets the desired number of input features to 10.

(A bar graph is also created showing the feature selection rank (smaller is better) for each input feature.)

In [12]:
# separate into input and output variables
array = dataset.values
# perform feature selection
rfe = RFE(RandomForestRegressor(n_estimators=500, random_state=1), n_features_to_select=70)
fit = rfe.fit(df_data, df_label)
# report selected features
print('Selected Features:')
names = dataset.columns.values[0:-1]
columns_1 = []
for i in range(len(fit.support_)):
    if fit.support_[i]:
        columns_1.append(names[i])
        print(names[i])
# plot feature rank
#names = dataset.columns.values[0:-1]
#ticks = [i for i in range(len(names))]
#pyplot.bar(ticks, fit.ranking_)
#pyplot.xticks(ticks, names)
#pyplot.show()

Selected Features:
confirmados_novos
recuperados
obitos
testes_pcr
tests_per_case_Australia
new_deaths_Canada
icu_patients_per_million_Cyprus
new_deaths_Denmark
hosp_patients_per_million_Denmark
new_cases_Dominican Republic
new_cases_per_million_Dominican Republic
new_tests_Dominican Republic
new_deaths_per_million_Germany
icu_patients_Germany
new_deaths_Ireland
new_deaths_per_million_Ireland
icu_patients_Ireland
hosp_patients_per_million_Ireland
new_tests_Ireland
new_tests_per_thousand_Ireland
new_cases_Japan
new_cases_per_million_Japan
new_cases_Latvia
new_cases_per_million_Latvia
positive_rate_Lithuania
positive_rate_Malta
new_cases_Mexico
new_cases_per_million_Mexico
new_deaths_per_million_Mexico
new_tests_per_thousand_Mexico
new_cases_per_million_Montenegro
new_cases_Mozambique
new_cases_per_million_Mozambique
new_tests_Mozambique
new_tests_per_thousand_Mozambique
positive_rate_Mozambique
new_cases_Portugal
new_deaths_Portugal
new_cases_per_million_Portugal
new_deaths_per_million_

In [13]:
len(columns_1)

70

## Escolha final de features:
Após fazer estes dois métodos de selecção de features, e visto que o seus resultados fazem bastante overlap, decidimos escolher as features resultantes iguais, bem como manter as diferentes. 

In [14]:
df_data.columns

Index(['confirmados_novos', 'recuperados', 'Max_Temp', 'total_testes',
       'testes_pcr', 'tests_per_case_Australia', 'new_deaths_Canada',
       'new_deaths_per_million_Canada', 'icu_patients_Canada',
       'icu_patients_per_million_Canada',
       ...
       'new_cases_United States', 'new_deaths_United States',
       'new_cases_per_million_United States',
       'new_deaths_per_million_United States', 'icu_patients_United States',
       'icu_patients_per_million_United States', 'hosp_patients_United States',
       'hosp_patients_per_million_United States', 'new_tests_United States',
       'new_tests_per_thousand_United States'],
      dtype='object', length=132)

In [15]:
final_columns = columns.copy()

for i in columns_1:
    if i not in columns:
        final_columns.append(i)

In [16]:
columns_to_drop = []
for i in df_data.columns:
    if i not in final_columns:
        columns_to_drop.append(i)
df_data = df_data.drop(columns=columns_to_drop, axis=1)
df_data.shape

(422, 101)

In [17]:
df_data.columns

Index(['confirmados_novos', 'recuperados', 'total_testes', 'testes_pcr',
       'tests_per_case_Australia', 'new_deaths_Canada',
       'icu_patients_per_million_Canada', 'hosp_patients_Canada',
       'icu_patients_per_million_Cyprus', 'new_deaths_Denmark',
       ...
       'hosp_patients_United Kingdom',
       'hosp_patients_per_million_United Kingdom', 'new_cases_United States',
       'new_deaths_United States', 'new_cases_per_million_United States',
       'new_deaths_per_million_United States', 'icu_patients_United States',
       'icu_patients_per_million_United States', 'hosp_patients_United States',
       'new_tests_United States'],
      dtype='object', length=101)

In [18]:
df_data.to_csv("dataframe_explored.csv")