# MIMIC III Preprocessing

## Initialization and Data Loading

In [79]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *

In [100]:
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

ne_struct = StructType([StructField("row_id", IntegerType(), True),
                      StructField("subject_id", IntegerType(), True),
                      StructField("hadm_id", IntegerType(), True),
                      StructField("chartdate", DateType(), True),
                      StructField("category", StringType(), True),
                      StructField("description", StringType(), True),
                      StructField("cgid", IntegerType(), True),
                      StructField("iserror", IntegerType(), True),
                      StructField("text", StringType(), True)])
df_ne = spark.read.csv("./data/NOTEEVENTS-2.csv",
                       header=True,
                       schema=df_ne_struct)
df_ne.registerTempTable("noteevents")

# many icd to one hadm_id
diag_struct = StructType([StructField("ROW_ID", IntegerType(), True),
                          StructField("SUBJECT_ID", IntegerType(), True),
                          StructField("HADM_ID", IntegerType(), True),
                          StructField("SEQ_NUM", IntegerType(), True),
                          StructField("ICD9_CODE", StringType(), True)])
df_diag_m = spark.read.csv("./data/DIAGNOSES_ICD.csv",
                           header=True,
                           schema=df_diag_struct)
df_diag_m.registerTempTable("diagnoses_icd_m")

# one icd to one hadm_id (take the smallest seq number as primary)
diag_o_rdd = df_diag_m.rdd.sortBy(lambda x: (x.HADM_ID, x.SUBJECT_ID, x.SEQ_NUM)) \
    .groupBy(lambda x: x.HADM_ID) \
    .mapValues(list) \
    .reduceByKey(lambda x, y: x if x.SEQ_NUM < y.SEQ_NUM else y) \
    .map(lambda (hid, d): d[0])
df_diag_o = spark.createDataFrame(diag_o_rdd,
                                 schema=diag_struct)
df_diag_o.registerTempTable("diagnoses_icd_o")

# noteevents + many2one diagnoses_icd
df_ne_m = spark.sql("""
SELECT noteevents.subject_id AS subject_id, noteevents.hadm_id AS hadm_id,
noteevents.category AS category, noteevents.description AS description,
noteevents.iserror AS iserror, noteevents.text AS text,
diagnoses_icd_m.SEQ_NUM AS seq_num, diagnoses_icd_m.ICD9_CODE AS icd9_code
FROM noteevents
JOIN diagnoses_icd_m
ON noteevents.hadm_id = diagnoses_icd_m.hadm_id
AND noteevents.subject_id = diagnoses_icd_m.subject_id
""")
df_ne_m.registerTempTable("noteevents_m")

# noteevents + one2one diagnoses_icd
df_ne_o = spark.sql("""
SELECT noteevents.subject_id AS subject_id, noteevents.hadm_id AS hadm_id,
noteevents.category AS category, noteevents.description AS description,
noteevents.iserror AS iserror, noteevents.text AS text,
diagnoses_icd_o.SEQ_NUM AS seq_num, diagnoses_icd_o.ICD9_CODE AS icd9_code
FROM noteevents
JOIN diagnoses_icd_o
ON noteevents.hadm_id = diagnoses_icd_o.hadm_id
AND noteevents.subject_id = diagnoses_icd_o.subject_id
""")
df_ne_o.registerTempTable("noteevents_o")

print df_ne.dtypes
print df_diag_m.dtypes
print df_diag_o.dtypes
print df_ne_m.dtypes
print df_ne_o.dtypes

[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('chartdate', 'date'), ('category', 'string'), ('description', 'string'), ('cgid', 'int'), ('iserror', 'int'), ('text', 'string')]
[('ROW_ID', 'int'), ('SUBJECT_ID', 'int'), ('HADM_ID', 'int'), ('SEQ_NUM', 'int'), ('ICD9_CODE', 'string')]
[('ROW_ID', 'int'), ('SUBJECT_ID', 'int'), ('HADM_ID', 'int'), ('SEQ_NUM', 'int'), ('ICD9_CODE', 'string')]
[('subject_id', 'int'), ('hadm_id', 'int'), ('category', 'string'), ('description', 'string'), ('iserror', 'int'), ('text', 'string'), ('seq_num', 'int'), ('icd9_code', 'string')]
[('subject_id', 'int'), ('hadm_id', 'int'), ('category', 'string'), ('description', 'string'), ('iserror', 'int'), ('text', 'string'), ('seq_num', 'int'), ('icd9_code', 'string')]


## Descriptive Statistics

### noteevents
Basic Counts:

In [84]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), COUNT(DISTINCT hadm_id)
FROM noteevents
""").show()

+--------+--------------------------+-----------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|
+--------+--------------------------+-----------------------+
| 2083180|                     46146|                  58361|
+--------+--------------------------+-----------------------+



Categories:

In [85]:
spark.sql("""
SELECT DISTINCT(category)
FROM noteevents
""").show()

+-----------------+
|         category|
+-----------------+
|              ECG|
|     Respiratory |
|          Nursing|
|          General|
|          Consult|
|             Echo|
|        Nutrition|
|       Physician |
|         Pharmacy|
|   Rehab Services|
| Case Management |
|        Radiology|
|    Nursing/other|
|Discharge summary|
|      Social Work|
+-----------------+



### diagnoses_icd: many (icd_code) to one (hadm_id)
Basic Counts:

In [86]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT ICD9_CODE)
FROM diagnoses_icd_m
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|  651047|                     46520|                  58976|                     6984|
+--------+--------------------------+-----------------------+-------------------------+



### diagnoses_icd: one (icd_code) to one (hadm_id)
Basic Counts:

In [101]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT ICD9_CODE)
FROM diagnoses_icd_o
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|   58976|                     46520|                  58976|                     2789|
+--------+--------------------------+-----------------------+-------------------------+



Just to check if I really did get "seq_num = 1" for all diagnosis, the code below should return empty. 

In [103]:
# check code
spark.sql("""
SELECT *
FROM diagnoses_icd_o
WHERE seq_num <> 1
""").show()

+------+----------+-------+-------+---------+
|ROW_ID|SUBJECT_ID|HADM_ID|SEQ_NUM|ICD9_CODE|
+------+----------+-------+-------+---------+
+------+----------+-------+-------+---------+



### noteevents and diagnoses_icd (one to one)
Basic Counts:

In [105]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM noteevents_o
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------+--------------------------+-----------------------+-------------------------+
| 1851344|                     46139|                  58361|                     2769|
+--------+--------------------------+-----------------------+-------------------------+



Top 50 ICD 9 codes based on "subject_id" count

In [115]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM noteevents_o
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|    41401|     3463|
|    V3000|     3427|
|    V3001|     2695|
|     0389|     1898|
|    41071|     1698|
|     4241|     1130|
|    51881|     1012|
|    V3101|      993|
|      431|      988|
|      486|      672|
|     5070|      593|
|     4240|      557|
|     4280|      523|
|      430|      510|
|     5849|      506|
|    41041|      478|
|    41011|      476|
|     5789|      420|
|     5770|      347|
|    41519|      340|
|     1983|      339|
|    43411|      336|
|    43491|      331|
|    42731|      321|
|    99859|      302|
|    85221|      291|
|    03842|      291|
|    56212|      257|
|    42823|      248|
|    V3401|      239|
|     4373|      235|
|    42833|      232|
|    99662|      229|
|     4271|      228|
|    51884|      225|
|    43310|      222|
|     4321|      220|
|     5712|      217|
|    99811|      216|
|    49121|      212|
|    03849|      203|
|    85220|      200|
|    03811

Top 50 ICD 9 codes based on "hadm_id" count

In [114]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM noteevents_o
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|    41401|      3497|
|    V3000|      3427|
|    V3001|      2695|
|     0389|      2043|
|    41071|      1747|
|     4241|      1139|
|    51881|      1120|
|      431|      1007|
|    V3101|       993|
|      486|       721|
|     5070|       655|
|     4280|       568|
|     4240|       567|
|     5849|       524|
|      430|       516|
|    41041|       480|
|    41011|       479|
|     5789|       444|
|     1983|       369|
|     5770|       369|
|    41519|       343|
|    43411|       337|
|    42731|       332|
|    43491|       331|
|    99859|       312|
|    03842|       304|
|    85221|       302|
|    25013|       300|
|     4373|       296|
|    42823|       286|
|    99662|       285|
|    56212|       273|
|    42833|       271|
|    49121|       268|
|     4271|       264|
|     5712|       256|
|    51884|       250|
|    29181|       243|
|    V3401|       239|
|     4321|       237|
|    43310|

### noteevents and diagnoses_icd (many to one)
Basic Counts:

In [116]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM noteevents_m
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------+--------------------------+-----------------------+-------------------------+
|25736048|                     46139|                  58361|                     6967|
+--------+--------------------------+-----------------------+-------------------------+



Top ICD 9 codes based on "subject_id" count

In [117]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM noteevents_m
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|     4019|    17500|
|    41401|    10733|
|    42731|    10193|
|     4280|     9802|
|     5849|     7634|
|     2724|     7414|
|    25000|     7327|
|    51881|     6632|
|     5990|     5746|
|     V053|     5597|
|     V290|     5436|
|     2720|     5319|
|    53081|     5242|
|     2859|     4965|
|      486|     4391|
|     2851|     4231|
|     2762|     4120|
|     2449|     3788|
|      496|     3568|
|    99592|     3504|
|    V3000|     3427|
|     0389|     3387|
|     5070|     3362|
|    V5861|     3183|
|     3051|     2978|
|      311|     2902|
|    41071|     2902|
|     5859|     2886|
|    40390|     2811|
|     2761|     2789|
|     2875|     2783|
|      412|     2771|
|    V3001|     2696|
|     4240|     2642|
|     5119|     2554|
|    V1582|     2531|
|    78552|     2376|
|    V4581|     2316|
|     4241|     2302|
|     9971|     2299|
|    42789|     2297|
|    V4582|     2243|
|     7742

Top ICD 9 codes based on "hadm_id" count

In [118]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM noteevents_m
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|     4019|     20555|
|     4280|     13062|
|    42731|     12800|
|    41401|     12382|
|     5849|      9065|
|    25000|      9002|
|     2724|      8629|
|    51881|      7410|
|     5990|      6522|
|    53081|      6290|
|     2720|      5910|
|     V053|      5600|
|     V290|      5448|
|     2859|      5377|
|     2449|      4877|
|      486|      4806|
|     2851|      4542|
|     2762|      4471|
|      496|      4405|
|    99592|      3856|
|    V5861|      3774|
|     0389|      3679|
|     5070|      3646|
|    V3000|      3427|
|     5859|      3412|
|      311|      3405|
|    40390|      3394|
|     3051|      3333|
|      412|      3262|
|    41071|      3046|
|     2875|      3041|
|    V4581|      3031|
|     2761|      3023|
|     4240|      2912|
|    V1582|      2791|
|     5119|      2723|
|    V4582|      2705|
|    V3001|      2696|
|    40391|      2615|
|    78552|      2546|
|     4241|

## Data Preprocessing
Obtain a set of the top 10 and top 50 icd9 codes

In [122]:
df_icd9score = spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS score
FROM noteevents_m
GROUP BY icd9_code
ORDER BY score DESC
""")
df_icd9score.registerTempTable("icd9_score")

spark.sql("""
SELECT * FROM icd9_score
LIMIT 50
""").show(n=50)

+---------+-----+
|icd9_code|score|
+---------+-----+
|     4019|20555|
|     4280|13062|
|    42731|12800|
|    41401|12382|
|     5849| 9065|
|    25000| 9002|
|     2724| 8629|
|    51881| 7410|
|     5990| 6522|
|    53081| 6290|
|     2720| 5910|
|     V053| 5600|
|     V290| 5448|
|     2859| 5377|
|     2449| 4877|
|      486| 4806|
|     2851| 4542|
|     2762| 4471|
|      496| 4405|
|    99592| 3856|
|    V5861| 3774|
|     0389| 3679|
|     5070| 3646|
|    V3000| 3427|
|     5859| 3412|
|      311| 3405|
|    40390| 3394|
|     3051| 3333|
|      412| 3262|
|    41071| 3046|
|     2875| 3041|
|    V4581| 3031|
|     2761| 3023|
|     4240| 2912|
|    V1582| 2791|
|     5119| 2723|
|    V4582| 2705|
|    V3001| 2696|
|    40391| 2615|
|    78552| 2546|
|     4241| 2541|
|    V5867| 2525|
|    42789| 2435|
|    32723| 2365|
|     9971| 2334|
|     5845| 2257|
|     2760| 2250|
|     7742| 2249|
|    49390| 2184|
|     5180| 2155|
+---------+-----+



Obtain the RDD for the merged noteevents and diagnoses_icd, and then filter out the RDD such that only the relevant icd9 codes remain.

In [123]:
df_nedi_top10 = spark.sql("""
SELECT * FROM noteevents_m
WHERE icd9_code IN 
    (SELECT icd9_code FROM icd9_score LIMIT 10)
""")
df_nedi_top10.write.csv("./data/NOTEEVENTS-TOP10.csv",
                       header=True)
df_nedi_top50 = spark.sql("""
SELECT * FROM noteevents_m
WHERE icd9_code IN 
    (SELECT icd9_code FROM icd9_score LIMIT 50)
""")
df_nedi_top50.write.csv("./data/NOTEEVENTS-TOP50.csv",
                       header=True)

Name: org.apache.toree.interpreter.broker.BrokerException
Message: Traceback (most recent call last):
  File "/tmp/kernel-PySpark-4dc38fcb-a1e9-4f31-841d-40a773eb07ab/pyspark_runner.py", line 189, in <module>
    eval(compiled_code)
  File "<string>", line 14, in <module>
  File "/home/luke/Documents/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/readwriter.py", line 711, in csv
    self._jwrite.csv(path)
  File "/home/luke/Documents/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1131, in __call__
    answer = self.gateway_client.send_command(command)
  File "/home/luke/Documents/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 883, in send_command
    response = connection.send_command(command)
  File "/home/luke/Documents/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1028, in send_command

### Top 10 ICD 9 codes category (cleaned)
### Top 50 ICD 9 codes category (cleaned)

In [None]:
sc.stop()