# MIMIC III Descriptive Statistics (per icd9 category)

* Input:
  * ./data/NOTEEVENTS-2.csv (contains clinical notes / text)
  * ./data/DIAGNOSES_ICD.csv (maps each hospital admission to its ICD9 diagnosis codes)
* Output: None
* Description:
  * Descriptive statistics of ICD9 categories

## Initialization and Data Loading

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
# Single-node ("local") Spark context and SQL session, both named "preprocess".
conf = SparkConf().setMaster("local").setAppName("preprocess")
sc = SparkContext.getOrCreate(conf)
spark = (SparkSession.builder
         .appName("preprocess")
         .master("local")
         .getOrCreate())

# Explicit schema for the note events CSV; every column is nullable.
ne_struct = StructType([
    StructField("row_id", IntegerType(), True),
    StructField("subject_id", IntegerType(), True),
    StructField("hadm_id", IntegerType(), True),
    StructField("chartdate", DateType(), True),
    StructField("category", StringType(), True),
    StructField("description", StringType(), True),
    StructField("cgid", IntegerType(), True),
    StructField("iserror", IntegerType(), True),
    StructField("text", StringType(), True),
])

# Alternative input for quick experiments: "./data/NOTEEVENTS-2sample.csv".
df_ne = spark.read.csv("./data/NOTEEVENTS-2.csv",
                       header=True,
                       schema=ne_struct)

# createOrReplaceTempView supersedes registerTempTable (deprecated in Spark 2.0).
# "noteevents"  -> every note
# "noteevents2" -> discharge summaries only
df_ne.createOrReplaceTempView("noteevents")
df_ne.filter(df_ne.category == "Discharge summary") \
    .createOrReplaceTempView("noteevents2")

# NOTE: noteevents is intentionally not cached -- it is too big.

# Diagnoses table, many-to-one: several ICD9 rows may share one hadm_id.
diag_struct = StructType([
    StructField("ROW_ID", IntegerType(), True),
    StructField("SUBJECT_ID", IntegerType(), True),
    StructField("HADM_ID", IntegerType(), True),
    StructField("SEQ_NUM", IntegerType(), True),
    StructField("ICD9_CODE", StringType(), True),
])
# Columns are renamed to lower case so they line up with the noteevents columns.
df_diag_m = spark.read.csv("./data/DIAGNOSES_ICD.csv",
                           header=True,
                           schema=diag_struct) \
            .selectExpr("ROW_ID as row_id",
                        "SUBJECT_ID as subject_id",
                        "HADM_ID as hadm_id",
                        "SEQ_NUM as seq_num",
                        "ICD9_CODE as icd9_code")

# Reduce every ICD9 code to its category, i.e. its first three characters.
# A null code must stay null: str(None)[:3] would otherwise yield the bogus
# category "Non" and inflate the distinct-category counts.
geticd9cat_udf = F.udf(
    lambda code: None if code is None else str(code)[:3],
    StringType())
df_diag_m = df_diag_m.withColumn("icd9_code", geticd9cat_udf("icd9_code"))
# createOrReplaceTempView supersedes registerTempTable (deprecated in Spark 2.0).
df_diag_m.createOrReplaceTempView("diagnoses_icd_m")
df_diag_m.cache()

# one icd to one hadm_id (take the smallest seq number as primary)
def _primary_diagnosis(left, right):
    """Return whichever of two diagnosis rows has the smaller seq_num.

    A row whose seq_num is null never beats a row that has one; on a tie the
    left row is kept.
    """
    # Ranking on (is_null, seq_num) avoids ever ordering None against an int.
    left_rank = (left.seq_num is None, left.seq_num)
    right_rank = (right.seq_num is None, right.seq_num)
    return left if left_rank <= right_rank else right


# Key each row by admission and keep the minimum-seq_num row per key. This
# replaces the previous sortBy/groupBy/first-element approach, which relied on
# the sort order surviving the shuffle and used a Python-2-only tuple-parameter
# lambda.
diag_o_rdd = df_diag_m.rdd \
    .keyBy(lambda row: row.hadm_id) \
    .reduceByKey(_primary_diagnosis) \
    .values()
df_diag_o = spark.createDataFrame(diag_o_rdd)
# createOrReplaceTempView supersedes registerTempTable (deprecated in Spark 2.0).
df_diag_o.createOrReplaceTempView("diagnoses_icd_o")
df_diag_o.cache()

# Distinct admissions (hadm_id) found in noteevents2, i.e. among the
# discharge summaries only -- not among all of noteevents.
df_hadm_id_list = spark.sql("""
SELECT DISTINCT hadm_id FROM noteevents2
""")
# createOrReplaceTempView supersedes registerTempTable (deprecated in Spark 2.0).
df_hadm_id_list.createOrReplaceTempView("hadm_id_list")
df_hadm_id_list.cache()

# Distinct patients (subject_id) found in noteevents2 (discharge summaries only).
df_subject_id_list = spark.sql("""
SELECT DISTINCT subject_id FROM noteevents2
""")
df_subject_id_list.createOrReplaceTempView("subject_id_list")
df_subject_id_list.cache()

def _restrict_to_listed_admissions(diag_table):
    """Return the rows of temp view `diag_table` whose hadm_id is in hadm_id_list.

    `diag_table` is the name of a registered diagnoses view; the result keeps
    the columns row_id, subject_id, hadm_id, seq_num and icd9_code.
    """
    return spark.sql("""
SELECT row_id, subject_id, {table}.hadm_id AS hadm_id,
seq_num, icd9_code
FROM {table} JOIN hadm_id_list
ON {table}.hadm_id = hadm_id_list.hadm_id
""".format(table=diag_table))


# One-diagnosis-per-admission table, limited to admissions in hadm_id_list.
df_diag_o2 = _restrict_to_listed_admissions("diagnoses_icd_o")
# createOrReplaceTempView supersedes registerTempTable (deprecated in Spark 2.0).
df_diag_o2.createOrReplaceTempView("diagnoses_icd_o2")
df_diag_o2.cache()

# Many-diagnoses-per-admission table, limited to admissions in hadm_id_list.
df_diag_m2 = _restrict_to_listed_admissions("diagnoses_icd_m")
df_diag_m2.createOrReplaceTempView("diagnoses_icd_m2")
df_diag_m2.cache()

# Show the column types of every DataFrame built above. print() with a single
# argument behaves identically on Python 2 and Python 3.
for frame in (df_ne, df_diag_m, df_diag_o,
              df_hadm_id_list, df_subject_id_list):
    print(frame.dtypes)

[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('chartdate', 'date'), ('category', 'string'), ('description', 'string'), ('cgid', 'int'), ('iserror', 'int'), ('text', 'string')]
[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('seq_num', 'int'), ('icd9_code', 'string')]
[('row_id', 'bigint'), ('subject_id', 'bigint'), ('hadm_id', 'bigint'), ('seq_num', 'bigint'), ('icd9_code', 'string')]
[('hadm_id', 'int')]
[('subject_id', 'int')]


In [3]:
# Preview ten rows of the many-to-one diagnoses view (diagnoses_icd_m).
spark.sql("""
SELECT * FROM diagnoses_icd_m
LIMIT 10
""").show()

+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
|  1297|       109| 172335|      1|      403|
|  1298|       109| 172335|      2|      486|
|  1299|       109| 172335|      3|      582|
|  1300|       109| 172335|      4|      585|
|  1301|       109| 172335|      5|      425|
|  1302|       109| 172335|      6|      276|
|  1303|       109| 172335|      7|      710|
|  1304|       109| 172335|      8|      276|
|  1305|       109| 172335|      9|      724|
|  1306|       109| 172335|     10|      458|
+------+----------+-------+-------+---------+



## Descriptive Statistics

### noteevents
Basic Counts:

In [6]:
# Row, patient and admission counts: first for all notes (noteevents), then
# for the noteevents2 view.
for notes_table in ("noteevents", "noteevents2"):
    spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), COUNT(DISTINCT hadm_id)
FROM {table}
""".format(table=notes_table)).show()

+--------+--------------------------+-----------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|
+--------+--------------------------+-----------------------+
| 2083180|                     46146|                  58361|
+--------+--------------------------+-----------------------+

+--------+--------------------------+-----------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|
+--------+--------------------------+-----------------------+
|   59652|                     41127|                  52726|
+--------+--------------------------+-----------------------+



In [7]:
# How many distinct admissions carry at least one of the k most frequent
# icd9_code values (ranked by distinct admissions), for k = 10, 50 and 100.
for top_k in (10, 50, 100):
    spark.sql("""
SELECT COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_m2
WHERE icd9_code IN
    (SELECT icd9_code
    FROM diagnoses_icd_m2
    GROUP BY icd9_code
    ORDER BY COUNT(DISTINCT hadm_id) DESC
    LIMIT {top_k})
""".format(top_k=top_k)).show()

+----------+
|hadm_count|
+----------+
|     44419|
+----------+

+----------+
|hadm_count|
+----------+
|     51034|
+----------+

+----------+
|hadm_count|
+----------+
|     52096|
+----------+



Categories:

In [7]:
# List the distinct note categories that occur in noteevents.
spark.sql("""
SELECT DISTINCT(category)
FROM noteevents
""").show()

+-----------------+
|         category|
+-----------------+
|              ECG|
|     Respiratory |
|          Nursing|
|          General|
|          Consult|
|             Echo|
|        Nutrition|
|       Physician |
|         Pharmacy|
|   Rehab Services|
| Case Management |
|        Radiology|
|    Nursing/other|
|Discharge summary|
|      Social Work|
+-----------------+



### diagnoses_icd: many (icd_code) to one (hadm_id)
Basic Counts:

In [8]:
# Basic counts for diagnoses_icd_m, run twice: once on the raw icd9_code and
# once on its lower-cased form, to reveal codes that differ only by case.
for code_expr in ("ICD9_CODE", "LOWER(ICD9_CODE)"):
    spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT {code})
FROM diagnoses_icd_m
""".format(code=code_expr)).show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|  651047|                     46520|                  58976|                      943|
+--------+--------------------------+-----------------------+-------------------------+

+--------+--------------------------+-----------------------+--------------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT lower(ICD9_CODE))|
+--------+--------------------------+-----------------------+--------------------------------+
|  651047|                     46520|                  58976|                             943|
+--------+--------------------------+-----------------------+--------------------------------+



### diagnoses_icd: one (icd_code) to one (hadm_id)
Basic Counts:

In [9]:
# Basic counts for diagnoses_icd_o, run twice: once on the raw icd9_code and
# once on its lower-cased form, to reveal codes that differ only by case.
for code_expr in ("ICD9_CODE", "LOWER(ICD9_CODE)"):
    spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT {code})
FROM diagnoses_icd_o
""".format(code=code_expr)).show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|   58976|                     46520|                  58976|                      652|
+--------+--------------------------+-----------------------+-------------------------+

+--------+--------------------------+-----------------------+--------------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT lower(ICD9_CODE))|
+--------+--------------------------+-----------------------+--------------------------------+
|   58976|                     46520|                  58976|                             652|
+--------+--------------------------+-----------------------+--------------------------------+



To verify that the one-to-one table kept only the primary diagnosis ("seq_num = 1") for every admission, the query below should return no rows.

In [10]:
# Sanity check: list any row of diagnoses_icd_o whose seq_num is not 1.
# NOTE(review): under SQL comparison semantics a NULL seq_num does not satisfy
# "<> 1", so rows with a missing seq_num would not show up here.
spark.sql("""
SELECT *
FROM diagnoses_icd_o
WHERE seq_num <> 1
""").show()

+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
+------+----------+-------+-------+---------+



### noteevents and diagnoses_icd (one to one)
Basic Counts:

In [11]:
# Distinct patients, admissions and icd9_code values in diagnoses_icd_o2.
spark.sql("""
SELECT COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM diagnoses_icd_o2
""").show()

+--------------------------+-----------------------+-------------------------+
|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------------------------+-----------------------+-------------------------+
|                     41127|                  52726|                      641|
+--------------------------+-----------------------+-------------------------+



Top 50 ICD 9 codes based on "subject_id" count

In [12]:
# Rank icd9_code values in diagnoses_icd_o2 by their number of distinct
# patients (subject_id) and keep the top 50; show(n=50) prints all of them.
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM diagnoses_icd_o2
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|      414|     3503|
|      410|     3137|
|      038|     2966|
|      V30|     2348|
|      424|     1691|
|      518|     1324|
|      428|     1248|
|      996|     1199|
|      V31|      981|
|      431|      948|
|      852|      903|
|      427|      900|
|      998|      726|
|      441|      724|
|      434|      690|
|      486|      654|
|      250|      631|
|      584|      611|
|      578|      606|
|      507|      583|
|      198|      513|
|      430|      491|
|      162|      455|
|      571|      427|
|      801|      419|
|      577|      394|
|      562|      377|
|      415|      363|
|      440|      361|
|      433|      342|
|      997|      324|
|      396|      311|
|      197|      308|
|      805|      278|
|      965|      277|
|      482|      277|
|      432|      268|
|      780|      265|
|      519|      260|
|      437|      254|
|      532|      251|
|      820|      244|
|      851

Top 50 ICD 9 codes based on "hadm_id" count

In [13]:
# Rank icd9_code values in diagnoses_icd_o2 by their number of distinct
# admissions (hadm_id) and keep the top 50; show(n=50) prints all of them.
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_o2
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|      414|      3540|
|      038|      3276|
|      410|      3228|
|      V30|      2348|
|      424|      1707|
|      518|      1510|
|      428|      1460|
|      996|      1373|
|      V31|       981|
|      431|       966|
|      427|       962|
|      852|       940|
|      250|       884|
|      441|       782|
|      998|       747|
|      486|       703|
|      434|       693|
|      578|       656|
|      507|       643|
|      584|       634|
|      198|       553|
|      430|       495|
|      571|       483|
|      162|       471|
|      577|       434|
|      801|       419|
|      562|       404|
|      440|       389|
|      415|       367|
|      433|       353|
|      997|       332|
|      197|       328|
|      519|       320|
|      396|       314|
|      291|       314|
|      437|       296|
|      482|       293|
|      432|       285|
|      491|       284|
|      805|       281|
|      965|

### noteevents and diagnoses_icd (many to one)
Basic Counts:

In [6]:
# Distinct patients, admissions and icd9_code values in diagnoses_icd_m2.
spark.sql("""
SELECT COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM diagnoses_icd_m2
""").show()

# The same counts, but with diagnoses_icd_m joined against every distinct
# hadm_id in noteevents (all notes) rather than against hadm_id_list.
spark.sql("""
SELECT COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM (
    SELECT row_id, subject_id, diagnoses_icd_m.hadm_id AS hadm_id,
    seq_num, icd9_code
    FROM diagnoses_icd_m JOIN (SELECT DISTINCT hadm_id FROM noteevents) AS a
    ON diagnoses_icd_m.hadm_id = a.hadm_id
)
""").show()

+--------------------------+-----------------------+-------------------------+
|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------------------------+-----------------------+-------------------------+
|                     41127|                  52726|                      942|
+--------------------------+-----------------------+-------------------------+

+--------------------------+-----------------------+-------------------------+
|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------------------------+-----------------------+-------------------------+
|                     46139|                  58361|                      943|
+--------------------------+-----------------------+-------------------------+



Top 50 ICD 9 codes based on "subject_id" count

In [15]:
# Rank icd9_code values in diagnoses_icd_m2 by their number of distinct
# patients (subject_id) and keep the top 50; show(n=50) prints all of them.
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|      401|    17551|
|      427|    13666|
|      276|    12326|
|      272|    12023|
|      414|    11693|
|      518|    11063|
|      285|    10479|
|      250|    10072|
|      428|     9974|
|      584|     9300|
|      V45|     6897|
|      599|     6355|
|      530|     6010|
|      E87|     5799|
|      V58|     5723|
|      038|     5355|
|      V10|     4980|
|      410|     4918|
|      424|     4871|
|      997|     4867|
|      995|     4818|
|      585|     4808|
|      780|     4803|
|      785|     4687|
|      998|     4595|
|      458|     4546|
|      403|     4510|
|      305|     4457|
|      486|     4329|
|      041|     3966|
|      244|     3941|
|      V15|     3925|
|      496|     3491|
|      287|     3487|
|      996|     3486|
|      790|     3400|
|      507|     3335|
|      E93|     3202|
|      V12|     3163|
|      511|     2972|
|      348|     2939|
|      765|     2905|
|      311

Top 50 ICD 9 codes based on "hadm_id" count

In [16]:
# Rank icd9_code values in diagnoses_icd_m2 by their number of distinct
# admissions (hadm_id) and keep the top 50; show(n=50) prints all of them.
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|      401|     20646|
|      427|     16774|
|      276|     14712|
|      272|     14212|
|      414|     14081|
|      250|     13818|
|      428|     13330|
|      518|     12997|
|      285|     12404|
|      584|     11147|
|      V45|      8846|
|      599|      7199|
|      530|      7191|
|      V58|      6998|
|      585|      6764|
|      E87|      6483|
|      403|      6297|
|      V10|      6204|
|      038|      6085|
|      995|      5480|
|      424|      5404|
|      410|      5301|
|      780|      5296|
|      244|      5101|
|      997|      5078|
|      785|      5048|
|      305|      5000|
|      998|      4948|
|      458|      4935|
|      486|      4732|
|      V15|      4420|
|      041|      4399|
|      496|      4296|
|      996|      4251|
|      287|      3881|
|      V12|      3782|
|      790|      3672|
|      507|      3608|
|      E93|      3473|
|      493|      3400|
|      311|

In [None]:
# Completion marker; print() with one argument works on Python 2 and 3 alike.
print("Done!")
# The Spark context is left running; uncomment to shut it down.
#sc.stop()