# Data analysis 

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the dataset from the working directory, using the ID column as the row index.
df = pd.read_csv("COVID19_data.csv", index_col="ID")
# Preview the first five rows.
df.head()

Unnamed: 0_level_0,AGE,SEX,DAYS_HOSPITAL,DAYS_ICU,EXITUS,DESTINATION,TEMP,HEART_RATE,GLUCOSE,SAT_O2,BLOOD_PRES_SYS,BLOOD_PRES_DIAS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,15.0,FEMALE,4,0,NO,,37.0,0,0,92,0,0
2,18.0,FEMALE,4,0,NO,ADMISSION,37.3,105,0,97,0,0
3,21.0,MALE,7,0,NO,,38.5,112,0,95,85,47
4,21.0,MALE,10,0,NO,ADMISSION,39.2,113,0,97,0,0
5,22.0,MALE,4,0,NO,,36.3,80,0,92,111,70


In [55]:
# DataFrame.size is the total number of cells (rows x columns), not the number
# of rows; df.shape reports the two dimensions separately.
print(df.size)

24648


## Preprocessing

### Null values

In [56]:
# Count missing values per column.
# NOTE(review): the preview of the first rows shows 0 for HEART_RATE, GLUCOSE
# and BLOOD_PRES_* in several records; these presumably stand for measurements
# that were not taken, which isnull() does not detect — confirm against the
# data dictionary.
df.isnull().sum() 

AGE                   4
SEX                   2
DAYS_HOSPITAL         0
DAYS_ICU              0
EXITUS               41
DESTINATION        1383
TEMP                  0
HEART_RATE            0
GLUCOSE               0
SAT_O2                0
BLOOD_PRES_SYS        0
BLOOD_PRES_DIAS       0
dtype: int64

#### DESTINATION

In [57]:
# Frequency of each distinct DESTINATION value (missing values are excluded by default).
df.DESTINATION.value_counts()

ADMISSION    671
Name: DESTINATION, dtype: int64

In [58]:
# Percentage of rows in which DESTINATION is missing.
n_missing_destination = df["DESTINATION"].isna().sum()
n_missing_destination * 100 / len(df)

67.33203505355404

Since about 67% of the values of this variable are null and every non-null value is the same (`ADMISSION`), the variable carries no usable information and is dropped.

In [59]:
# Remove the DESTINATION column from the working frame.
df = df.drop(columns="DESTINATION")

#### Remaining variables
Since the remaining variables contain very few null values (at most 41, in `EXITUS`), the rows that contain them are dropped.

In [60]:
# Keep only the rows in which AGE, SEX and EXITUS are all present.
df = df.dropna(subset=["AGE", "SEX", "EXITUS"])

# Re-count missing values per column to confirm none remain.
df.isnull().sum()

AGE                0
SEX                0
DAYS_HOSPITAL      0
DAYS_ICU           0
EXITUS             0
TEMP               0
HEART_RATE         0
GLUCOSE            0
SAT_O2             0
BLOOD_PRES_SYS     0
BLOOD_PRES_DIAS    0
dtype: int64

In [61]:
# Show the dtype of each remaining column.
df.dtypes

AGE                float64
SEX                 object
DAYS_HOSPITAL        int64
DAYS_ICU             int64
EXITUS              object
TEMP               float64
HEART_RATE           int64
GLUCOSE              int64
SAT_O2               int64
BLOOD_PRES_SYS       int64
BLOOD_PRES_DIAS      int64
dtype: object

### Dataset normalization

In [62]:
# Class balance of EXITUS, the column used as the prediction target.
df.EXITUS.value_counts()

NO     1678
YES     329
Name: EXITUS, dtype: int64

In [63]:
# Encode the target as integers: 'NO' -> 0, 'YES' -> 1.
# Both labels are replaced in a single pass, and the dtype is pinned to int64
# explicitly. Without the cast, the integer dtype relies on replace() silently
# downcasting the object column, a behaviour that pandas 2.2 deprecates; an
# object-dtype target would then be handed to the classifier.
# Values that are already 0/1 are left untouched, so the cell is safe to re-run.
df["EXITUS"] = df["EXITUS"].replace({"NO": 0, "YES": 1}).astype("int64")

# Target vector used for classification; it keeps the ID index of df.
Y = df["EXITUS"]

In [64]:

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer

# Feature matrix: every column of df except the target.
X = df.drop(columns=["EXITUS"])

# Numeric columns that are rescaled with MinMaxScaler.
numeric_columns = [
    'AGE',
    'DAYS_HOSPITAL',
    'DAYS_ICU',
    'TEMP',
    'HEART_RATE',
    'GLUCOSE',
    'SAT_O2',
    'BLOOD_PRES_SYS',
    'BLOOD_PRES_DIAS',
]

# One-hot encode SEX and min-max scale the numeric columns in a single transformer.
col_transformer = make_column_transformer(
    (OneHotEncoder(), ['SEX']),
    (MinMaxScaler(), numeric_columns),
)

# NOTE(review): the transformer is fitted on the full dataset, i.e. before the
# train/test split performed further down, so the scaling ranges also reflect
# the rows that later end up in the test set.
X = col_transformer.fit_transform(X)

In [65]:
# Wrap the transformed feature array in a DataFrame that reuses the ID index of
# df. Without an explicit index the frame would be numbered 0..n-1 while the
# target Y keeps the ID index, so any label-aligned operation between X and Y
# (concat, boolean masks, joins) would pair up the wrong rows.
X = pd.DataFrame(data=X, index=df.index)


# Classification

In [66]:
from sklearn.model_selection import train_test_split 

# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the recorded class counts
# are imbalanced (1678 NO vs 329 YES) — consider whether stratify=Y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=4815,
)

### Logistic regression

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic regression with default settings on the training split
# (fit() returns the estimator itself, so the calls can be chained).
LL = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out rows.
Y_pred_test = LL.predict(X_test)

# Confusion matrix first, then each score rounded to three decimals.
# Accuracy alone is not very informative here because the classes are
# imbalanced, so precision, recall and F1 are reported as well.
print(metrics.confusion_matrix(y_test, Y_pred_test))
for metric_label, metric_fn in (
    ("Accuracy", metrics.accuracy_score),
    ("Precision", metrics.precision_score),
    ("Recall", metrics.recall_score),
    ("F1", metrics.f1_score),
):
    print(metric_label + " : " + str(np.round(metric_fn(y_test, Y_pred_test), 3)))

[[491   1]
 [106   5]]
Accuracy : 0.823
Precision : 0.833
Recall : 0.045
F1 : 0.085
