#Análise de Pacientes com Coronavírus 🦠

### Etapa 01: Importação do Conjunto de Dados

In [33]:
# Importação de bibliotecas
import pandas as pd
import numpy as np

In [34]:
# Importando o conjunto de dados dos pacientes
url = '/content/covid_data.csv'
dados = pd.read_csv(url)

In [35]:
# Verificando se a leitura foi feita com sucesso
dados.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


###Etapa 02: Exploração do Conjunto de Dados

In [36]:
# Obtendo as informações gerais do dataframe
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   USMER                 1048575 non-null  int64 
 1   MEDICAL_UNIT          1048575 non-null  int64 
 2   SEX                   1048575 non-null  int64 
 3   PATIENT_TYPE          1048575 non-null  int64 
 4   DATE_DIED             1048575 non-null  object
 5   INTUBED               1048575 non-null  int64 
 6   PNEUMONIA             1048575 non-null  int64 
 7   AGE                   1048575 non-null  int64 
 8   PREGNANT              1048575 non-null  int64 
 9   DIABETES              1048575 non-null  int64 
 10  COPD                  1048575 non-null  int64 
 11  ASTHMA                1048575 non-null  int64 
 12  INMSUPR               1048575 non-null  int64 
 13  HIPERTENSION          1048575 non-null  int64 
 14  OTHER_DISEASE         1048575 non-null  int64 
 15

In [37]:
# Descobrindo a quantidade de valores nulos (NaN)
dados.isnull().sum()

Unnamed: 0,0
USMER,0
MEDICAL_UNIT,0
SEX,0
PATIENT_TYPE,0
DATE_DIED,0
INTUBED,0
PNEUMONIA,0
AGE,0
PREGNANT,0
DIABETES,0


In [38]:
# Verificando os valores da única coluna do tipo 'object'
dados['DATE_DIED'].unique()

array(['03/05/2020', '03/06/2020', '09/06/2020', '12/06/2020',
       '21/06/2020', '9999-99-99', '26/02/2020', '05/04/2020',
       '08/05/2020', '20/05/2020', '17/07/2020', '13/01/2020',
       '22/01/2020', '29/01/2020', '13/02/2020', '18/02/2020',
       '19/02/2020', '20/02/2020', '24/02/2020', '04/03/2020',
       '07/03/2020', '12/03/2020', '14/03/2020', '18/03/2020',
       '27/03/2020', '28/03/2020', '29/03/2020', '02/04/2020',
       '06/04/2020', '07/04/2020', '08/04/2020', '09/04/2020',
       '10/04/2020', '11/04/2020', '12/04/2020', '13/04/2020',
       '14/04/2020', '15/04/2020', '16/04/2020', '17/04/2020',
       '18/04/2020', '20/04/2020', '21/04/2020', '22/04/2020',
       '23/04/2020', '24/04/2020', '25/04/2020', '26/04/2020',
       '27/04/2020', '28/04/2020', '29/04/2020', '30/04/2020',
       '01/05/2020', '02/05/2020', '04/05/2020', '05/05/2020',
       '06/05/2020', '07/05/2020', '09/05/2020', '10/05/2020',
       '11/05/2020', '12/05/2020', '13/05/2020', '14/05

###Etapa 03: Manipulação do Conjunto de Dados

In [39]:
# Substituindo os valores de 97 e 99 por valores nulos (NaN)
valores_trocar = [97, 98, 99]
dados.replace(valores_trocar, np.nan, inplace=True)

In [40]:
# Verificando a alteração
dados['INTUBED']

Unnamed: 0,INTUBED
0,
1,
2,1.0
3,
4,
...,...
1048570,
1048571,2.0
1048572,
1048573,


In [41]:
# Removendo os registros com valores nulos (NaN)
dados.dropna(inplace=True)

In [42]:
# Obtendo os valores da coluna de gênero
dados['SEX'].unique()

array([1])

In [43]:
# Trocando os valores da coluna de gênero por 'object' (string)
dados['SEX'] = dados['SEX'].apply(lambda x: 'Female' if x == 1 else 'Man')
dados.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
5,2,1,Female,2,9999-99-99,2.0,1.0,40.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
8,2,1,Female,2,9999-99-99,2.0,2.0,37.0,2.0,1.0,...,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,3,2.0
9,2,1,Female,2,9999-99-99,2.0,2.0,25.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
16,2,1,Female,2,9999-99-99,2.0,1.0,80.0,2.0,2.0,...,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0
56,2,1,Female,2,9999-99-99,1.0,1.0,58.0,2.0,2.0,...,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,7,1.0


In [44]:
# Removendo a coluna de estado do paciente
dados.drop(columns=['PATIENT_TYPE'], inplace=True)
dados.head()
# Motivo: a coluna apenas possui o valor numérico de dois

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
5,2,1,Female,9999-99-99,2.0,1.0,40.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
8,2,1,Female,9999-99-99,2.0,2.0,37.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,3,2.0
9,2,1,Female,9999-99-99,2.0,2.0,25.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
16,2,1,Female,9999-99-99,2.0,1.0,80.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0
56,2,1,Female,9999-99-99,1.0,1.0,58.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,7,1.0


In [45]:
# Obtendo os valores únicos da coluna de classificação
dados['CLASIFFICATION_FINAL'].unique()

array([3, 7, 5, 4, 2, 6, 1])

In [46]:
# Obtendo os registros (índices) com valores acima de três (inconclusivos)
filtro = dados['CLASIFFICATION_FINAL'] > 3
registros = dados[filtro].sort_values('CLASIFFICATION_FINAL', ascending=True).index
registros

Index([ 660054,  341959,  660483,  660461,  660454,  341913,  660385,  660384,
        408099,  660354,
       ...
        300733,  300718,  300701,  300647,  300622,  300569,  300565,  300554,
        300841, 1048569],
      dtype='int64', length=35455)

In [47]:
# Removendo os registros sem conclusão
dados.drop(registros, inplace=True)
dados.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
5,2,1,Female,9999-99-99,2.0,1.0,40.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
8,2,1,Female,9999-99-99,2.0,2.0,37.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,3,2.0
9,2,1,Female,9999-99-99,2.0,2.0,25.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
16,2,1,Female,9999-99-99,2.0,1.0,80.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0
162,1,2,Female,9999-99-99,2.0,1.0,10.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,3,2.0


In [48]:
# Verificando se o processo foi feito com sucesso
dados.sort_values('CLASIFFICATION_FINAL', ascending=True)

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
395106,2,9,Female,9999-99-99,2.0,1.0,36.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,2.0
432975,2,10,Female,9999-99-99,2.0,2.0,42.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,2.0
432978,2,10,Female,9999-99-99,2.0,2.0,58.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,2.0
432982,2,10,Female,9999-99-99,2.0,2.0,36.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1,2.0
394668,2,9,Female,9999-99-99,2.0,2.0,50.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95951,2,4,Female,9999-99-99,2.0,2.0,50.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
95969,2,4,Female,9999-99-99,2.0,1.0,31.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
95979,1,4,Female,9999-99-99,2.0,1.0,66.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
95925,2,4,Female,9999-99-99,2.0,1.0,50.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,1.0


In [49]:
# Alterando os valores da coluna de data da morte
dados['DATE_DIED'] = dados['DATE_DIED'].replace('9999-99-99', 'Not died')
dados.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
5,2,1,Female,Not died,2.0,1.0,40.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
8,2,1,Female,Not died,2.0,2.0,37.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,3,2.0
9,2,1,Female,Not died,2.0,2.0,25.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3,2.0
16,2,1,Female,Not died,2.0,1.0,80.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,3,1.0
162,1,2,Female,Not died,2.0,1.0,10.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,3,2.0


In [50]:
# Filtrando as colunas que possuem os valores de um e dois apenas
valores_booleanos = [1, 2]
colunas_booleanas = dados.columns[dados.apply(lambda coluna: coluna.isin(valores_booleanos).all())]
colunas_booleanas

Index(['USMER', 'INTUBED', 'PNEUMONIA', 'PREGNANT', 'DIABETES', 'COPD',
       'ASTHMA', 'INMSUPR', 'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR',
       'OBESITY', 'RENAL_CHRONIC', 'TOBACCO', 'ICU'],
      dtype='object')

In [53]:
# Trocando os valores de um e dois para True e False
for coluna in colunas_booleanas:
  dados[coluna] = dados[coluna].apply(lambda num: True if num == 1 else False)

In [56]:
# Verificando se o processo foi bem executado
dados.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
5,False,1,Female,Not died,False,True,40.0,False,False,False,False,False,False,False,False,False,False,False,3,False
8,False,1,Female,Not died,False,False,37.0,False,True,False,False,False,True,False,False,True,False,False,3,False
9,False,1,Female,Not died,False,False,25.0,False,False,False,False,False,False,False,False,False,False,False,3,False
16,False,1,Female,Not died,False,True,80.0,False,False,False,False,False,True,False,False,False,False,False,3,True
162,True,2,Female,Not died,False,True,10.0,False,False,False,False,False,False,True,True,False,False,False,3,False
