In [49]:
import pandas as pd


In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
#cola nos parenteses o conteúdo de "copiar caminho"
df = pd.read_csv("/content/drive/MyDrive/IIC - COVID dataset/Covid Data.csv")

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   USMER                 1048575 non-null  int64 
 1   MEDICAL_UNIT          1048575 non-null  int64 
 2   SEX                   1048575 non-null  int64 
 3   PATIENT_TYPE          1048575 non-null  int64 
 4   DATE_DIED             1048575 non-null  object
 5   INTUBED               1048575 non-null  int64 
 6   PNEUMONIA             1048575 non-null  int64 
 7   AGE                   1048575 non-null  int64 
 8   PREGNANT              1048575 non-null  int64 
 9   DIABETES              1048575 non-null  int64 
 10  COPD                  1048575 non-null  int64 
 11  ASTHMA                1048575 non-null  int64 
 12  INMSUPR               1048575 non-null  int64 
 13  HIPERTENSION          1048575 non-null  int64 
 14  OTHER_DISEASE         1048575 non-null  int64 
 15

## Column Definitions



*   USMER : This indicates whether the patient treated medical units of the first, second or third level
* MEDICAL_UNIT : The type of institution of the Natinal Health System that provided the care.
* SEX : 1-"Female", 2-"male".
* PATIENT_TYPE : The type of care the patient received in the unit. 1-"returned home", 2-"hospitalization". 97 and 99 are missing data.
* DATE_DIED : Indicated date of death means that the patient died, except for 9999-99-99 which means that the patient survived..
* INTUBED : Wether the patient was connected to a ventilator. 1-"yes", 2-"no"
* PNEUMONIA : Whether the patient already have air sacs inflamation or not.1-"yes", 2-"no", while 97 and 99 are missing data.
* AGE: The patient's age.
* PREGNANT: Whether the patient is pregnant or not.1-"yes", 2-"no", while 97 and 99 are missing data.
* DIABETES : Whether the patient has diabetes or not.
* COPD : Whether the patient has chronic obstructive pulmonary disease or not.
* ASTHMA : Whether the patient has asthma or not.
* INMSUPR : Whether the patient is immunosuppressed or not.
* HYPERTENSION : Whether the patient has hypertension or not.
* OTHER DISEASE : Whether the patient has other disease or not.
* CARDIOVASCULAR : Whether the patient has patient has heart or blood vessels related diseases.
* OBESITY : Whether the patient is obessed or not.
* RENAL CHRONIC : Whether the patient has chronic renal disease or not.
* TOBACCO : Whether the patient is a tobacco user or not.
* CLASIFFICATION_FINAL : Covid test findings. Values 1-3 means that the patient was diagnosed with covid in different degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.
* ICU : Whether the patient has been admitted into the intensive care unit.




In [53]:
df.columns


Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

# Pre-Processamento

## Excluindo as colunas "INTUBED", "PREGNANT" e "ICU" porque os dados nessas colunas têm muito valor ausente


In [54]:
df.drop(columns=["USMER","INTUBED", "PREGNANT","ICU", "PATIENT_TYPE", "INMSUPR", "MEDICAL_UNIT"], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 14 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   SEX                   1048575 non-null  int64 
 1   DATE_DIED             1048575 non-null  object
 2   PNEUMONIA             1048575 non-null  int64 
 3   AGE                   1048575 non-null  int64 
 4   DIABETES              1048575 non-null  int64 
 5   COPD                  1048575 non-null  int64 
 6   ASTHMA                1048575 non-null  int64 
 7   HIPERTENSION          1048575 non-null  int64 
 8   OTHER_DISEASE         1048575 non-null  int64 
 9   CARDIOVASCULAR        1048575 non-null  int64 
 10  OBESITY               1048575 non-null  int64 
 11  RENAL_CHRONIC         1048575 non-null  int64 
 12  TOBACCO               1048575 non-null  int64 
 13  CLASIFFICATION_FINAL  1048575 non-null  int64 
dtypes: int64(13), object(1)
memory usage: 112.0+ MB


## Removendo linhas com valores 97 ,98 e 99. Pois indicam a falta de dados

In [55]:
# Define a lista de valores que deseja buscar
valores = ['97', '98', '99']

# Verifica em todas as colunas se há pelo menos um elemento igual a qualquer um dos valores
mascara = df.apply(lambda coluna: coluna.isin(valores).any(), axis=0)

# Seleciona as linhas que atendem a essa condição
df = df.loc[~df.loc[:,mascara].isin(valores).any(axis=1)]

## Substituindo dados
Substitundo os dados 9999-99-99 por 2 que indica que o paciente está vivo, enquanto os dados da data de morte serão alterados para 1 que indica que o paciente morreu.


In [56]:
df.DATE_DIED.value_counts()


9999-99-99    971633
06/07/2020      1000
07/07/2020       996
13/07/2020       990
16/06/2020       979
               ...  
24/11/2020         1
17/12/2020         1
08/12/2020         1
16/03/2021         1
22/04/2021         1
Name: DATE_DIED, Length: 401, dtype: int64

In [57]:
df["DEATH"] = [2 if row=='9999-99-99' else 1 for row in df["DATE_DIED"]]
df["DEATH"].value_counts()

2    971633
1     76942
Name: DEATH, dtype: int64

In [58]:
df.drop("DATE_DIED", axis=1, inplace=True)


## Transformando coluna clasiffication_final

São os dados da pessoa se tem COVID positivo ou negativo. 
Até 3 POSITIVO
Acima de 3 NEGATIVO ou INCONCLUSIVO

Transformando dados para 1 positivo e 2 negativo

In [59]:
# Define a função que realiza a transformação
def transforma_valor(valor):
    if valor >= 1 and valor <= 3:
        return 1
    else:
        return 2

# Aplica a função à coluna desejada
df['CLASIFFICATION_FINAL'] = df['CLASIFFICATION_FINAL'].apply(transforma_valor)

In [60]:
df.head()

Unnamed: 0,SEX,PNEUMONIA,AGE,DIABETES,COPD,ASTHMA,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,DEATH
0,1,1,65,2,2,2,1,2,2,2,2,2,1,1
1,2,1,72,2,2,2,1,2,2,1,1,2,2,1
2,2,2,55,1,2,2,2,2,2,2,2,2,1,1
3,1,2,53,2,2,2,2,2,2,2,2,2,2,1
4,2,2,68,1,2,2,1,2,2,2,2,2,1,1


# Treino e teste


In [61]:
X = df.drop("DEATH", axis=1)
y = df["DEATH"]

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

# Classificação


## TreeClassifier

In [63]:
from sklearn.tree import DecisionTreeClassifier
modelo_tree = DecisionTreeClassifier(random_state=42)
modelo_tree.fit(X_train,y_train)
y_pred = modelo_tree.predict(X_test)
cm = confusion_matrix(y_test,y_pred)
print("Relatório de Classificação:\n", classification_report(y_test, y_pred, digits=4))

Relatório de Classificação:
               precision    recall  f1-score   support

           1     0.5295    0.3756    0.4394     15240
           2     0.9522    0.9738    0.9629    194475

    accuracy                         0.9304    209715
   macro avg     0.7408    0.6747    0.7012    209715
weighted avg     0.9214    0.9304    0.9248    209715



In [64]:
import pickle
with open('modelo.pkl', 'wb') as file:
    pickle.dump(modelo_tree, file)

In [65]:
from google.colab import files
files.download('modelo.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1048575 entries, 0 to 1048574
Data columns (total 14 columns):
 #   Column                Non-Null Count    Dtype
---  ------                --------------    -----
 0   SEX                   1048575 non-null  int64
 1   PNEUMONIA             1048575 non-null  int64
 2   AGE                   1048575 non-null  int64
 3   DIABETES              1048575 non-null  int64
 4   COPD                  1048575 non-null  int64
 5   ASTHMA                1048575 non-null  int64
 6   HIPERTENSION          1048575 non-null  int64
 7   OTHER_DISEASE         1048575 non-null  int64
 8   CARDIOVASCULAR        1048575 non-null  int64
 9   OBESITY               1048575 non-null  int64
 10  RENAL_CHRONIC         1048575 non-null  int64
 11  TOBACCO               1048575 non-null  int64
 12  CLASIFFICATION_FINAL  1048575 non-null  int64
 13  DEATH                 1048575 non-null  int64
dtypes: int64(14)
memory usage: 120.0 MB
