# Diagnosis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
# Seed Python's and NumPy's global random number generators with the same
# fixed value so that any random draws in this notebook are repeatable.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

### Load Data

In [3]:
# Directory holding the eICU extract. The original hard-coded Windows location
# is kept as the default so existing setups behave exactly as before; set the
# EICU_DATA_DIR environment variable to run the notebook on another machine.
EICU_DATA_DIR = os.environ.get(
    "EICU_DATA_DIR", r"E:\EICU\eicu-collaborative-research-database-2.0"
)
file_path = os.path.join(EICU_DATA_DIR, "diagnosis.csv.gz")

# Read only the two columns this notebook uses: the unit-stay identifier and
# the '|'-delimited diagnosis string.
diagnosis_df = pd.read_csv(
    file_path,
    compression="gzip",
    low_memory=False,
    usecols=["patientunitstayid", "diagnosisstring"],
)

In [4]:
# Number of rows in the loaded diagnosis table.
diagnosis_df.shape[0]

2710672

In [5]:
# Show the first five rows of the raw table.
diagnosis_df.iloc[:5]

Unnamed: 0,patientunitstayid,diagnosisstring
0,141168,cardiovascular|chest pain / ASHD|coronary arte...
1,141168,cardiovascular|ventricular disorders|cardiomyo...
2,141168,pulmonary|disorders of the airways|COPD
3,141168,pulmonary|disorders of the airways|COPD
4,141168,cardiovascular|ventricular disorders|congestiv...


In [7]:
# Count missing values per column.
diagnosis_df.isna().sum()

patientunitstayid    0
diagnosisstring      0
dtype: int64

### Extract Diagnosis

In [9]:
# Frequency of each distinct full diagnosis string, most common first.
diagnosis_df["diagnosisstring"].value_counts()

diagnosisstring
pulmonary|respiratory failure|acute respiratory failure                     97836
renal|disorder of kidney|acute renal failure                                65313
endocrine|glucose metabolism|diabetes mellitus                              44491
neurologic|altered mental status / pain|change in mental status             41034
pulmonary|pulmonary infections|pneumonia                                    39729
                                                                            ...  
renal|disorder of acid base|metabolic acidosis|due to acetazolamide             1
surgery|renal issues|metabolic acidosis|normal anion gap                        1
cardiovascular|cardiac surgery|valve replacement >= 7 days|pulmonic             1
general|congenital anomalies|genital organ anomaly                              1
infectious diseases|head and neck infections|upper respiratory infection        1
Name: count, Length: 3933, dtype: int64

In [10]:
# Preview how each diagnosis string breaks into its '|'-separated levels.
def _split_levels(path):
    """Split one diagnosis string on the '|' separator."""
    return path.split('|')


diagnosis_df["diagnosisstring"].map(_split_levels).head()

0    [cardiovascular, chest pain / ASHD, coronary a...
1    [cardiovascular, ventricular disorders, cardio...
2          [pulmonary, disorders of the airways, COPD]
3          [pulmonary, disorders of the airways, COPD]
4    [cardiovascular, ventricular disorders, conges...
Name: diagnosisstring, dtype: object

In [11]:
# Smallest number of '|'-separated levels in any diagnosis string
# (number of levels = number of separators + 1).
level_counts = diagnosis_df["diagnosisstring"].map(lambda path: path.count('|') + 1)
level_counts.min()

3

In [12]:
# Largest number of '|'-separated levels in any diagnosis string
# (number of levels = number of separators + 1).
level_counts = diagnosis_df["diagnosisstring"].map(lambda path: path.count('|') + 1)
level_counts.max()

6

In [14]:
# Make sure the diagnosis column holds Python strings before the .str
# operations that follow; the other column keeps its dtype.
diagnosis_df = diagnosis_df.astype({"diagnosisstring": str})

In [15]:
# Break every diagnosis string into a list of its '|'-separated levels and
# store the lists in a helper column.
split_levels = diagnosis_df["diagnosisstring"].str.split(pat='|')
diagnosis_df["diagnosis_split"] = split_levels

In [16]:
def _level_at(parts, position):
    """Return the level at ``position`` of a split diagnosis, or None if it is absent."""
    return parts[position] if len(parts) > position else None


def _remaining_levels(parts, start=2):
    """Re-join every level from ``start`` onward with '|', or None if there are none."""
    return '|'.join(parts[start:]) if len(parts) > start else None


# Level 1 and level 2 get their own columns; everything from the third level
# onward is kept together as a single '|'-joined string.
split_levels = diagnosis_df["diagnosis_split"]
diagnosis_df["diagnosis_type_1"] = split_levels.map(lambda parts: _level_at(parts, 0))
diagnosis_df["diagnosis_disorder_2"] = split_levels.map(lambda parts: _level_at(parts, 1))
diagnosis_df["diagnosis_detailed_3"] = split_levels.map(_remaining_levels)


In [17]:
# The list-valued helper column is no longer needed once the level columns exist.
diagnosis_df = diagnosis_df.drop("diagnosis_split", axis="columns")

In [18]:
# Show the first five rows, now including the derived level columns.
diagnosis_df.iloc[:5]

Unnamed: 0,patientunitstayid,diagnosisstring,diagnosis_type_1,diagnosis_disorder_2,diagnosis_detailed_3
0,141168,cardiovascular|chest pain / ASHD|coronary arte...,cardiovascular,chest pain / ASHD,coronary artery disease|known
1,141168,cardiovascular|ventricular disorders|cardiomyo...,cardiovascular,ventricular disorders,cardiomyopathy
2,141168,pulmonary|disorders of the airways|COPD,pulmonary,disorders of the airways,COPD
3,141168,pulmonary|disorders of the airways|COPD,pulmonary,disorders of the airways,COPD
4,141168,cardiovascular|ventricular disorders|congestiv...,cardiovascular,ventricular disorders,congestive heart failure


In [19]:
# Row count per first-level diagnosis category, most common first.
diagnosis_df["diagnosis_type_1"].value_counts()

diagnosis_type_1
cardiovascular           705966
pulmonary                473721
neurologic               329192
renal                    304523
gastrointestinal         242437
endocrine                183125
infectious diseases      156811
hematology               138990
burns/trauma              80324
oncology                  31914
toxicology                26923
surgery                   22279
general                    8869
transplant                 5423
obstetrics/gynecology        94
genitourinary                44
musculoskeletal              37
Name: count, dtype: int64

In [20]:
# Row count per second-level value, most common first.
diagnosis_df["diagnosis_disorder_2"].value_counts()

diagnosis_disorder_2
respiratory failure                      246048
altered mental status / pain             190489
shock / hypotension                      187767
disorder of kidney                       144747
glucose metabolism                       142748
                                          ...  
adverse effect of external cause              6
signs/symptoms/ill-defined conditions         5
inflammatory                                  3
thymus                                        2
breast disorder                               1
Name: count, Length: 131, dtype: int64

In [21]:
# Row count per detailed value (third level onward, '|'-joined), most common first.
diagnosis_df["diagnosis_detailed_3"].value_counts()

diagnosis_detailed_3
acute respiratory failure                              97836
acute renal failure                                    65419
hypertension                                           59530
pneumonia                                              50955
diabetes mellitus                                      44491
                                                       ...  
hemorrhage|vital signs unstable                            1
brain tumor|with carcinomatous meningitis                  1
adverse effect: heat/light                                 1
neuosurgical bleeding|possible coagulopathy present        1
other adverse effect                                       1
Name: count, Length: 3197, dtype: int64

In [24]:
# Preview only: drop() returns a new frame, which is displayed here but never
# assigned, so diagnosis_df itself keeps all of its columns after this cell.
# NOTE(review): if the saved CSV is meant to contain only patientunitstayid and
# diagnosis_type_1, this result has to be assigned back — confirm the intent.
columns_to_hide = ["diagnosisstring", "diagnosis_disorder_2", "diagnosis_detailed_3"]
diagnosis_df.drop(columns=columns_to_hide)

Unnamed: 0,patientunitstayid,diagnosis_type_1
0,141168,cardiovascular
1,141168,cardiovascular
2,141168,pulmonary
3,141168,pulmonary
4,141168,cardiovascular
...,...,...
2710667,3353251,renal
2710668,3353251,cardiovascular
2710669,3353254,renal
2710670,3353254,gastrointestinal


In [25]:
# Write diagnosis_df to a CSV in the current working directory, leaving out
# the integer row index. The frame is saved with every column it holds at
# this point (identifier, original string and the three derived level columns).
output_path = 'diagnosis_cleaned.csv'
diagnosis_df.to_csv(output_path, index=False)