# Covid data exploration



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot

In [2]:
# Load the merged frame; the first CSV column becomes the row index.
dataset = pd.read_csv(
    'dataframe_final.csv',
    index_col=0,
    low_memory=False,
)
# Display (rows, columns) of the loaded frame.
dataset.shape

(478, 449)

In [3]:
# Preview the first rows of the loaded frame.
dataset.head()

Unnamed: 0,confirmados_novos,recuperados,obitos,internados_uci,ativos,internados_enfermaria,Max_Temp,Min_Temp,Temperature,Precipitation,...,new_deaths_United States,new_cases_per_million_United States,new_deaths_per_million_United States,gdp_per_capita_United States,cardiovasc_death_rate_United States,diabetes_prevalence_United States,female_smokers_United States,male_smokers_United States,hospital_beds_per_thousand_United States,life_expectancy_United States
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,13.9,6.9,9.6,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,12.0,8.0,10.1,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,16.9,11.0,13.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,15.1,9.6,12.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,13.0,7.4,10.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## pearson correlation

In [4]:
# Pairwise Pearson correlation between all columns of the frame.
cor = dataset.corr()
# Strength (sign discarded) of each column's correlation with the target 'obitos'.
cor_target = cor['obitos'].abs()
# Keep only the columns whose absolute correlation with the target exceeds 0.5;
# the target itself passes the filter with a correlation of 1.
relevant_features = cor_target[cor_target.gt(0.5)]
# Number of columns that pass the filter.
len(relevant_features)

33

In [5]:
# Show the retained columns with their absolute correlation to 'obitos'.
relevant_features

confirmados_novos                             0.858915
recuperados                                   0.856448
obitos                                        1.000000
AEM                                           0.512692
RES                                           0.511906
total_testes                                  0.613779
testes_pcr                                    0.692652
new_cases_Japan                               0.602637
new_deaths_Japan                              0.754107
new_cases_per_million_Japan                   0.602636
new_deaths_per_million_Japan                  0.754092
new_tests_Japan                               0.533131
new_tests_per_thousand_Japan                  0.533231
new_cases_Mexico                              0.658547
new_cases_per_million_Mexico                  0.658548
new_tests_Mexico                              0.610669
new_tests_per_thousand_Mexico                 0.610994
new_cases_Monaco                              0.672421
new_cases_

Como temos muitas features, após observar os resultados das correlações de Pearson fizemos um filtro preliminar, eliminando as features cuja correlação absoluta com `obitos` fosse menor ou igual a 0.50 (ou seja, mantivemos apenas as que têm correlação acima de +0.50 ou abaixo de -0.50).

In [6]:
# Names of the columns that did not pass the correlation filter.
col = list(dataset.columns[~dataset.columns.isin(relevant_features.index)])
# Keep only the columns listed in relevant_features (the target included).
dataset = dataset.drop(columns=col)

## Feature selection com SelectKBest

In [7]:
# Target series: the 'obitos' column.
df_label = dataset['obitos']
# Feature frame: every remaining column except the target.
df_data = dataset.drop(columns=['obitos'])

In [9]:
# Feature extraction
columns = []
test = SelectKBest(f_regression, k=30)
fit = test.fit_transform(abs(df_data), df_label)
for i in range(0, len(test.get_support())):
    if test.get_support()[i]:
        columns.append(dataset.columns[i])

In [10]:
# Number of feature names collected from the SelectKBest support mask.
len(columns)

30

In [11]:
# First 19 rows of the array returned by SelectKBest.fit_transform.
# Note: `fit` is rebound to the fitted RFE object in a later cell, so this cell
# only works when run before that one.
fit[:19]

array([[0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        2.60e+01, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        7.20e+01, 1.00e-03, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00],
       [0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        8.90e+01, 1.00e-03, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00,
        0.00e+00, 0.00e+00, 0.00e+00, 0.00e+00

In [12]:
# First 19 rows of the feature frame (target 'obitos' excluded), for comparison
# with the transformed array.
df_data[:19]

Unnamed: 0,confirmados_novos,recuperados,AEM,RES,total_testes,testes_pcr,new_cases_Japan,new_deaths_Japan,new_cases_per_million_Japan,new_deaths_per_million_Japan,...,new_cases_per_million_Spain,new_cases_United Arab Emirates,new_cases_per_million_United Arab Emirates,positive_rate_United Arab Emirates,new_cases_United Kingdom,new_cases_per_million_United Kingdom,new_cases_United States,new_deaths_United States,new_cases_per_million_United States,new_deaths_per_million_United States
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.0,366.0,375.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,384.0,375.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,366.0,367.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,350.0,338.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-06,0.0,0.0,407.0,428.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-07,0.0,0.0,395.0,412.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-08,0.0,0.0,439.0,386.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-09,0.0,0.0,419.0,383.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-10,0.0,0.0,439.0,401.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Resultados:



## Feature selection of Lag Variables com Random Forest Regressor

We can also use feature selection to automatically identify and select those input features that are most predictive.

A popular method for feature selection is called Recursive Feature Elimination (RFE).

RFE works by creating predictive models, weighting features, and pruning those with the smallest weights, then repeating the process until a desired number of features are left.

The code below uses RFE with a random forest predictive model and sets the desired number of input features to 30.

(The cell also contains code, currently commented out, that draws a bar graph of the feature selection rank (smaller is better) for each input feature.)

In [15]:
# separate into input and output variables
array = dataset.values
# perform feature selection
rfe = RFE(RandomForestRegressor(n_estimators=500, random_state=1), n_features_to_select=30)
fit = rfe.fit(df_data, df_label)
# report selected features
print('Selected Features:')
names = dataset.columns.values[0:-1]
columns_1 = []
for i in range(len(fit.support_)):
    if fit.support_[i]:
        columns_1.append(names[i])
        print(names[i])
# plot feature rank
#names = dataset.columns.values[0:-1]
#ticks = [i for i in range(len(names))]
#pyplot.bar(ticks, fit.ranking_)
#pyplot.xticks(ticks, names)
#pyplot.show()

Selected Features:
confirmados_novos
recuperados
RES
total_testes
testes_pcr
new_cases_Japan
new_deaths_Japan
new_cases_per_million_Japan
new_deaths_per_million_Japan
new_tests_Japan
new_tests_per_thousand_Japan
new_cases_Mexico
new_cases_per_million_Mexico
new_tests_Mexico
new_tests_per_thousand_Mexico
new_cases_Monaco
new_cases_per_million_Monaco
new_cases_Russia
new_cases_per_million_Russia
positive_rate_Slovenia
new_cases_Spain
new_cases_per_million_Spain
new_cases_United Arab Emirates
new_cases_per_million_United Arab Emirates
positive_rate_United Arab Emirates
new_cases_United Kingdom
new_cases_per_million_United Kingdom
new_cases_United States
new_deaths_United States
new_cases_per_million_United States


In [16]:
# Number of feature names collected from the RFE support mask.
len(columns_1)

30

## Escolha final de features:
Após aplicar estes dois métodos de selecção de features, e visto que os seus resultados se sobrepõem bastante, decidimos ficar com a união dos dois conjuntos: as features escolhidas por ambos os métodos e também as escolhidas por apenas um deles.

In [17]:
# Candidate feature columns before the final selection is applied.
df_data.columns

Index(['confirmados_novos', 'recuperados', 'AEM', 'RES', 'total_testes',
       'testes_pcr', 'new_cases_Japan', 'new_deaths_Japan',
       'new_cases_per_million_Japan', 'new_deaths_per_million_Japan',
       'new_tests_Japan', 'new_tests_per_thousand_Japan', 'new_cases_Mexico',
       'new_cases_per_million_Mexico', 'new_tests_Mexico',
       'new_tests_per_thousand_Mexico', 'new_cases_Monaco',
       'new_cases_per_million_Monaco', 'new_cases_Russia',
       'new_cases_per_million_Russia', 'positive_rate_Slovenia',
       'new_cases_Spain', 'new_cases_per_million_Spain',
       'new_cases_United Arab Emirates',
       'new_cases_per_million_United Arab Emirates',
       'positive_rate_United Arab Emirates', 'new_cases_United Kingdom',
       'new_cases_per_million_United Kingdom', 'new_cases_United States',
       'new_deaths_United States', 'new_cases_per_million_United States',
       'new_deaths_per_million_United States'],
      dtype='object')

In [18]:
# Union of both selections: every SelectKBest name, followed by the RFE names
# that SelectKBest did not already contribute (order preserved).
final_columns = columns + [name for name in columns_1 if name not in columns]

In [19]:
# Feature columns that neither selection method kept.
columns_to_drop = [name for name in df_data.columns if name not in final_columns]
# Restrict the feature frame to the final selection.
df_data = df_data.drop(columns=columns_to_drop)
# Display (rows, columns) of the reduced frame.
df_data.shape

(478, 30)

In [20]:
# Columns that remain after dropping the unselected features.
df_data.columns

Index(['confirmados_novos', 'recuperados', 'RES', 'total_testes', 'testes_pcr',
       'new_cases_Japan', 'new_deaths_Japan', 'new_cases_per_million_Japan',
       'new_deaths_per_million_Japan', 'new_tests_Japan',
       'new_tests_per_thousand_Japan', 'new_cases_Mexico',
       'new_cases_per_million_Mexico', 'new_tests_Mexico',
       'new_tests_per_thousand_Mexico', 'new_cases_Monaco',
       'new_cases_per_million_Monaco', 'new_cases_Russia',
       'new_cases_per_million_Russia', 'positive_rate_Slovenia',
       'new_cases_Spain', 'new_cases_per_million_Spain',
       'new_cases_United Arab Emirates',
       'new_cases_per_million_United Arab Emirates',
       'positive_rate_United Arab Emirates', 'new_cases_United Kingdom',
       'new_cases_per_million_United Kingdom', 'new_cases_United States',
       'new_deaths_United States', 'new_cases_per_million_United States'],
      dtype='object')

In [21]:
# Write the selected feature columns to disk. df_data was built without the
# target 'obitos', so the target is not part of this file.
df_data.to_csv("dataframe_explored.csv")