# Covid data exploration



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, chi2, f_regression
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot

In [2]:
dataset = pd.read_csv('dataframe_final.csv', index_col=0, low_memory = False)
dataset.shape

(478, 4146)

In [3]:
dataset.head()

Unnamed: 0,confirmados_novos,recuperados,obitos,internados_uci,ativos,internados_enfermaria,Max_Temp,Min_Temp,Temperature,Precipitation,...,Vaccinations_United Arab Emirates,Vaccinations_per_Million_United Arab Emirates,Vaccinations_United Kingdom,Vaccinations_per_Million_United Kingdom,Vaccinations_United States,Vaccinations_per_Million_United States,Vaccinations_Venezuela,Vaccinations_per_Million_Venezuela,Vaccinations_Wales,Vaccinations_per_Million_Wales
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,16.1,14.0,15.3,0.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-04,808.0,0.0,27.0,42.0,781.0,57.0,12.6,7.5,9.3,18.77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-05,295.0,128.0,18.0,-18.0,149.0,-58.0,22.0,15.7,17.9,0.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## pearson correlation

In [None]:
#Using Pearson Correlation
#plt.figure(figsize=(40,40))
cor = dataset.corr()
#sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
#plt.show()
#Correlation with output variable
cor_target = abs(cor['obitos'])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.5]
len(relevant_features)

In [None]:
relevant_features

Como entendemos que temos muitas features, após observar os resultados das correlações de pearson, fizemos um filtro preliminar de features, eliminando features cuja correlação fosse menor que +/-0.50.

In [None]:
col = [column for column in dataset.columns if (column not in relevant_features.index)]
dataset = dataset.drop(columns=col, axis=1)

## Feature selection com SelectKBest

In [None]:
df_data = dataset.drop(columns='obitos')
df_label = dataset['obitos']

In [None]:
# Feature extraction
columns = []
test = SelectKBest(f_regression, k=70)
fit = test.fit_transform(abs(df_data), df_label)
for i in range(0, len(test.get_support())):
    if test.get_support()[i]:
        columns.append(dataset.columns[i])

In [None]:
len(columns)

In [None]:
fit[:19]

In [None]:
df_data[:19]

Resultados:



## Feature selection of Lag Variables com Random Forest Regressor

We can also use feature selection to automatically identify and select those input features that are most predictive.

A popular method for feature selection is called Recursive Feature Selection (RFE).

RFE works by creating predictive models, weighting features, and pruning those with the smallest weights, then repeating the process until a desired number of features are left.

The code below uses RFE with a random forest predictive model and sets the desired number of input features to 10.

(A bar graph is also created showing the feature selection rank (smaller is better) for each input feature.)

In [None]:
# separate into input and output variables
array = dataset.values
# perform feature selection
rfe = RFE(RandomForestRegressor(n_estimators=500, random_state=1), n_features_to_select=70)
fit = rfe.fit(df_data, df_label)
# report selected features
print('Selected Features:')
names = dataset.columns.values[0:-1]
columns_1 = []
for i in range(len(fit.support_)):
    if fit.support_[i]:
        columns_1.append(names[i])
        print(names[i])
# plot feature rank
#names = dataset.columns.values[0:-1]
#ticks = [i for i in range(len(names))]
#pyplot.bar(ticks, fit.ranking_)
#pyplot.xticks(ticks, names)
#pyplot.show()

In [None]:
len(columns_1)

## Escolha final de features:
Após fazer estes dois métodos de selecção de features, e visto que o seus resultados fazem bastante overlap, decidimos escolher as features resultantes iguais, bem como manter as diferentes. 

In [13]:
df_data.columns

Index(['recuperados', 'Relative_Humidity', 'total_testes', 'testes_pcr',
       'total_cases_Andorra', 'new_cases_smoothed_Andorra',
       'total_cases_per_million_Andorra',
       'new_cases_smoothed_per_million_Andorra', 'new_tests_smoothed_Andorra',
       'new_tests_smoothed_per_thousand_Andorra',
       ...
       'new_tests_smoothed_per_thousand_United States',
       'positive_rate_United States', 'total_cases_Vatican',
       'total_cases_per_million_Vatican', 'Vaccinations_Gibraltar',
       'Vaccinations_per_Million_Gibraltar', 'Vaccinations_Monaco',
       'Vaccinations_per_Million_Monaco', 'Vaccinations_United Arab Emirates',
       'Vaccinations_per_Million_United Arab Emirates'],
      dtype='object', length=469)

In [59]:
final_columns = columns.copy()

for i in columns_1:
    if i not in columns:
        final_columns.append(i)

In [62]:
columns_to_drop = []
for i in df_data.columns:
    if i not in final_columns:
        columns_to_drop.append(i)
df_data = df_data.drop(columns=columns_to_drop, axis=1)
df_data.shape

(1249, 113)

In [63]:
df_data.columns

Index(['recuperados', 'new_cases_smoothed_per_million_Andorra',
       'new_tests_smoothed_Andorra', 'new_deaths_Austria',
       'new_deaths_per_million_Austria', 'new_tests_smoothed_Belgium',
       'new_cases_smoothed_per_million_Canada',
       'new_tests_smoothed_per_thousand_Croatia', 'new_cases_Czechia',
       'new_cases_per_million_Czechia',
       ...
       'new_cases_smoothed_United States', 'new_deaths_United States',
       'new_cases_per_million_United States',
       'new_cases_smoothed_per_million_United States',
       'new_deaths_per_million_United States',
       'new_deaths_smoothed_per_million_United States',
       'icu_patients_United States', 'icu_patients_per_million_United States',
       'hosp_patients_United States', 'new_tests_per_thousand_United States'],
      dtype='object', length=113)

In [None]:
df_data.to_csv("dataframe_explored.csv")