# SC1015 Mini-Project: What pre-existing conditions increase a patient's vulnerability to mortality from Covid-19?

### Essential Libraries

> NumPy : Library for Numeric Computations in Python  
> Pandas : Library for Data Acquisition and Preparation  
> Matplotlib : Low-level library for Data Visualization  
> Seaborn : Higher-level library for Data Visualization  
> Sklearn : Machine Learning and Statistical Modeling

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import plot_tree
sb.set_theme()  # apply Seaborn's default theme; set_theme() is the preferred name, set() is only its alias
pd.set_option('display.max_columns', None)  # show every column of wide DataFrames instead of truncating

## Exploratory Data Analysis

In [2]:
# Load the raw Covid dataset from the CSV file (path is relative to the notebook)
covid_data = pd.read_csv(filepath_or_buffer='Dataset/Covid Data.csv')
covid_data.head(n=100)  # preview the first 100 rows

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,2,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,2,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,2,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,2,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,2,2,2,1,2,2,2,2,2,3,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,1,1,1,9999-99-99,97,2,49,2,2,2,2,2,2,2,2,2,2,2,7,97
96,2,1,1,1,9999-99-99,97,2,24,2,2,2,2,2,2,2,2,2,2,2,7,97
97,2,1,2,1,9999-99-99,97,2,28,97,2,2,2,2,2,2,2,2,2,2,7,97
98,2,1,2,1,9999-99-99,97,2,27,97,2,2,2,2,2,2,2,2,2,1,7,97


In [3]:
# (number of rows, number of columns) of the freshly loaded frame
covid_data.shape

(1048575, 21)

The data has 1,048,575 rows and 21 columns.

In [4]:
# Per-column non-null counts and dtypes of the freshly loaded frame
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   USMER                 1048575 non-null  int64 
 1   MEDICAL_UNIT          1048575 non-null  int64 
 2   SEX                   1048575 non-null  int64 
 3   PATIENT_TYPE          1048575 non-null  int64 
 4   DATE_DIED             1048575 non-null  object
 5   INTUBED               1048575 non-null  int64 
 6   PNEUMONIA             1048575 non-null  int64 
 7   AGE                   1048575 non-null  int64 
 8   PREGNANT              1048575 non-null  int64 
 9   DIABETES              1048575 non-null  int64 
 10  COPD                  1048575 non-null  int64 
 11  ASTHMA                1048575 non-null  int64 
 12  INMSUPR               1048575 non-null  int64 
 13  HIPERTENSION          1048575 non-null  int64 
 14  OTHER_DISEASE         1048575 non-null  int64 
 15

In [5]:
# Remove rows that are exact copies of an earlier row (the first occurrence is kept)
# NOTE(review): the columns shown contain no patient identifier, so identical rows may be
# distinct patients with the same attributes rather than repeated records -- confirm that
# discarding them is intended before relying on the reduced row count.
covid_data = covid_data.drop_duplicates(keep='first')
covid_data.info()  # non-null counts and dtypes after the drop
covid_data.describe()  # summary statistics of the numeric columns


<class 'pandas.core.frame.DataFrame'>
Index: 236526 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   USMER                 236526 non-null  int64 
 1   MEDICAL_UNIT          236526 non-null  int64 
 2   SEX                   236526 non-null  int64 
 3   PATIENT_TYPE          236526 non-null  int64 
 4   DATE_DIED             236526 non-null  object
 5   INTUBED               236526 non-null  int64 
 6   PNEUMONIA             236526 non-null  int64 
 7   AGE                   236526 non-null  int64 
 8   PREGNANT              236526 non-null  int64 
 9   DIABETES              236526 non-null  int64 
 10  COPD                  236526 non-null  int64 
 11  ASTHMA                236526 non-null  int64 
 12  INMSUPR               236526 non-null  int64 
 13  HIPERTENSION          236526 non-null  int64 
 14  OTHER_DISEASE         236526 non-null  int64 
 15  CARDIOVASCULAR       

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
count,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0,236526.0
mean,1.512337,7.63423,1.547521,1.577712,44.238401,4.525765,52.620274,54.794826,2.840855,2.966976,2.949325,3.13563,2.691319,3.687645,2.980767,2.768863,2.954956,2.973352,4.781263,44.371008
std,0.499849,3.661318,0.497738,0.493925,47.394072,16.731469,19.318869,47.225449,10.500621,9.882551,9.839247,10.619391,10.098528,12.930442,10.026841,9.959472,9.88831,10.289529,1.897253,47.337447
min,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,4.0,1.0,1.0,2.0,1.0,40.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,2.0,2.0,3.0,2.0
50%,2.0,6.0,2.0,2.0,2.0,2.0,54.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,5.0,2.0
75%,2.0,12.0,2.0,2.0,97.0,2.0,67.0,97.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,7.0,97.0
max,2.0,13.0,2.0,2.0,99.0,99.0,121.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0,7.0,99.0


In [6]:
# Discard every row whose ICU code is 97 or 99 (the codes this notebook treats as missing)
covid_data = covid_data.drop(
    index=covid_data.loc[covid_data['ICU'].isin([97, 99])].index
)
covid_data.head(50)  # preview the first 50 remaining rows

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,COPD,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
2,2,1,2,2,09/06/2020,1,2,55,97,1,2,2,2,2,2,2,2,2,2,3,2
5,2,1,1,2,9999-99-99,2,1,40,2,2,2,2,2,2,2,2,2,2,2,3,2
8,2,1,1,2,9999-99-99,2,2,37,2,1,2,2,2,1,2,2,1,2,2,3,2
9,2,1,1,2,9999-99-99,2,2,25,2,2,2,2,2,2,2,2,2,2,2,3,2
11,2,1,2,2,9999-99-99,2,2,24,97,2,2,2,2,2,2,2,2,2,2,3,2
12,2,1,2,2,9999-99-99,2,2,30,97,2,2,2,2,2,2,2,2,2,2,3,2
16,2,1,1,2,9999-99-99,2,1,80,2,2,2,2,2,1,2,2,2,2,2,3,1
20,2,1,2,2,9999-99-99,2,1,59,97,1,2,2,2,2,2,2,2,2,1,3,1
30,2,1,2,2,9999-99-99,2,1,45,97,2,2,2,2,2,2,2,2,2,2,3,2
41,2,1,2,2,9999-99-99,1,1,45,97,2,2,2,2,2,2,2,2,2,2,3,1
