# MIMIC III Preprocessing

## Initialization and Data Loading

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *

In [2]:
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

ne_struct = StructType([StructField("row_id", IntegerType(), True),
                      StructField("subject_id", IntegerType(), True),
                      StructField("hadm_id", IntegerType(), True),
                      StructField("chartdate", DateType(), True),
                      StructField("category", StringType(), True),
                      StructField("description", StringType(), True),
                      StructField("cgid", IntegerType(), True),
                      StructField("iserror", IntegerType(), True),
                      StructField("text", StringType(), True)])
df_ne = spark.read.csv("./data/NOTEEVENTS-2.csv",
# df_ne = spark.read.csv("./data/NOTEEVENTS-2sample.csv",
                       header=True,
                       schema=ne_struct)
df_ne.registerTempTable("noteevents")
df_ne.filter(df_ne.category=="Discharge summary") \
    .registerTempTable("noteevents2")
    
# i want to cache noteevents, but it's too big

# many icd to one hadm_id
diag_struct = StructType([StructField("ROW_ID", IntegerType(), True),
                          StructField("SUBJECT_ID", IntegerType(), True),
                          StructField("HADM_ID", IntegerType(), True),
                          StructField("SEQ_NUM", IntegerType(), True),
                          StructField("ICD9_CODE", StringType(), True)])
df_diag_m = spark.read.csv("./data/DIAGNOSES_ICD.csv",
                           header=True,
                           schema=diag_struct) \
            .selectExpr("ROW_ID as row_id", 
                        "SUBJECT_ID as subject_id",
                        "HADM_ID as hadm_id",
                        "SEQ_NUM as seq_num",
                        "ICD9_CODE as icd9_code")
df_diag_m.registerTempTable("diagnoses_icd_m")
df_diag_m.cache()

# one icd to one hadm_id (take the smallest seq number as primary)
diag_o_rdd = df_diag_m.rdd.sortBy(lambda x: (x.hadm_id, x.subject_id, x.seq_num)) \
    .groupBy(lambda x: x.hadm_id) \
    .mapValues(list) \
    .reduceByKey(lambda x, y: x if x.seq_num < y.seq_num else y) \
    .map(lambda (hid, d): d[0])
df_diag_o = spark.createDataFrame(diag_o_rdd)
df_diag_o.registerTempTable("diagnoses_icd_o")
df_diag_o.cache()

# get hadm_id list in noteevents
df_hadm_id_list = spark.sql("""
SELECT DISTINCT hadm_id FROM noteevents2
""")
df_hadm_id_list.registerTempTable("hadm_id_list")
df_hadm_id_list.cache()

# get subject_id list in noteevents
df_subject_id_list = spark.sql("""
SELECT DISTINCT subject_id FROM noteevents2
""")
df_subject_id_list.registerTempTable("subject_id_list")
df_subject_id_list.cache()

df_icd9desc = spark.read.csv("./data/D_ICD_DIAGNOSES.csv",
                       header=True, inferSchema=True)
df_icd9desc.registerTempTable("diagnoses_icd_desc")

df_diag_o2 = spark.sql("""
SELECT row_id, subject_id, diagnoses_icd_o.hadm_id AS hadm_id,
seq_num, icd9_code
FROM diagnoses_icd_o JOIN hadm_id_list
ON diagnoses_icd_o.hadm_id = hadm_id_list.hadm_id
""")
df_diag_o2.registerTempTable("diagnoses_icd_o2")
df_diag_o2.cache()

df_diag_m2 = spark.sql("""
SELECT row_id, subject_id, diagnoses_icd_m.hadm_id AS hadm_id,
seq_num, icd9_code
FROM diagnoses_icd_m JOIN hadm_id_list
ON diagnoses_icd_m.hadm_id = hadm_id_list.hadm_id
""")
df_diag_m2.registerTempTable("diagnoses_icd_m2")
df_diag_m2.cache()

print df_ne.dtypes
print df_diag_m.dtypes
print df_diag_o.dtypes
print df_hadm_id_list.dtypes
print df_subject_id_list.dtypes
print df_icd9desc.dtypes

[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('chartdate', 'date'), ('category', 'string'), ('description', 'string'), ('cgid', 'int'), ('iserror', 'int'), ('text', 'string')]
[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('seq_num', 'int'), ('icd9_code', 'string')]
[('row_id', 'bigint'), ('subject_id', 'bigint'), ('hadm_id', 'bigint'), ('seq_num', 'bigint'), ('icd9_code', 'string')]
[('hadm_id', 'int')]
[('subject_id', 'int')]
[('ROW_ID', 'int'), ('ICD9_CODE', 'string'), ('SHORT_TITLE', 'string'), ('LONG_TITLE', 'string')]


## Data Preprocessing (all icd9 codes)

Returns RDD[(hadm_id, list(icd9_codes))]

In [3]:
icd9_score_hadm = spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS score
FROM diagnoses_icd_m2
GROUP BY icd9_code
""").rdd.cache()

icd9_score_subj = spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS score
FROM diagnoses_icd_m2
GROUP BY icd9_code
""").rdd.cache()

def get_id_to_topicd9(id_type, topX):
    if id_type == "hadm_id":
        icd9_score = icd9_score_hadm
    else:
        icd9_score = icd9_score_subj
        
    icd9_topX = set([i.icd9_code for i in icd9_score.takeOrdered(topX, key=lambda x: -x.score)])
    
    id_to_topicd9 = df_diag_m2.rdd \
        .map(lambda x: (x.hadm_id if id_type=="hadm_id" else x.subject_id, x.icd9_code)) \
        .groupByKey() \
        .mapValues(lambda x: set(x) & icd9_topX) \
        .filter(lambda (x, y): y)
        
    return id_to_topicd9, list(icd9_topX)

# for i in get_id_to_topicd9("hadm_id", 10)[0].take(3):
#     print i
# for i in get_id_to_topicd9("subject_id", 50)[0].take(3):
#     print i

Obtain dataframe for the merged noteevents and ID-to-ICD9 mapping

In [4]:
def sparse2vec(mapper, data):
    out = [0] * len(mapper)
    if data != None:
        for i in data:
            out[mapper[i]] = 1
    return out

def get_id_to_texticd9(id_type, topX):
    id_to_topicd9, topicd9 = get_id_to_topicd9(id_type, topX)
    mapper = dict(zip(topicd9, range(topX)))
    
    ne_topX = df_ne.rdd \
        .filter(lambda x: x.category == "Discharge summary") \
        .map(lambda x: (x.hadm_id if id_type=="hadm_id" else x.subject_id, x.text)) \
        .groupByKey() \
        .mapValues(lambda x: " ".join(x)) \
        #.join(id_to_topicd9) \ # involve only data related to top10
        # involve all data, even those not related to top10
        .leftOuterJoin(id_to_topicd9) \
        .map(lambda (id_, (text, icd9)): \
             [id_]+sparse2vec(mapper, icd9)+[text])
#              list(Vectors.sparse(topX, dict.fromkeys(map(lambda x: mapper[x], icd9), 1))))
        
    return spark.createDataFrame(ne_topX, ["id"]+topicd9+["text"]), mapper

# get_id_to_texticd9("hadm_id", 10)[0].show()

Make list of unique ICD9CODES

In [5]:
import pickle

ICD9CODES = spark.sql("""
SELECT DISTINCT icd9_code FROM diagnoses_icd_m2
""").rdd.map(lambda x: x.icd9_code).collect()
ICD9CODES = [str(i).lower() for i in ICD9CODES]

pickle.dump(ICD9CODES, open( "./data/ICD9CODES.p", "wb" ))

Output to csv file

In [6]:
df_id2texticd9, topicd9_mapper = get_id_to_texticd9("hadm_id", 10)
df_id2texticd9.write.csv("./data/DATA_HADM_TOP10", header=True)

print topicd9_mapper
print df_id2texticd9.count()

{u'4019': 0, u'2724': 1, u'25000': 2, u'4280': 3, u'41401': 4, u'42731': 7, u'5849': 8, u'53081': 5, u'51881': 6, u'5990': 9}
52726


Test csv file

In [7]:
import pandas as pd
df = pd.read_csv("./data/DATA_HADM_TOP10.csv", escapechar='\\')
print df.head()

       id  4019  2724  25000  4280  41401  53081  51881  42731  5849  5990  \
0  117760     0     0      0     0      0      1      1      0     0     0   
1  129030     1     1      0     0      0      1      0      0     0     0   
2  172040     0     0      0     0      1      0      0      0     1     0   
3  156170     0     0      1     1      0      0      0      1     1     0   
4  199180     0     0      1     1      1      0      0      0     0     0   

                                                text  
0  "Admission Date:  [**2118-12-14**]            ...  
1  Admission Date:  [**2137-8-31**]              ...  
2  Admission Date:  [**2174-1-6**]              D...  
3  Admission Date:  [**2102-6-9**]              D...  
4  Admission Date:  [**2164-7-2**]       Discharg...  


Test by counting rows (depreciated)

In [8]:
spark.sql("""
SELECT icd9_code
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY COUNT(DISTINCT hadm_id) DESC
LIMIT 10
""").show()
    
# id_to_topicd9, topicd9 = get_id_to_topicd9("hadm_id", 10)
# print id_to_topicd9.count()

# spark.sql("""
# SELECT COUNT(DISTINCT hadm_id) AS hadm_count
# FROM diagnoses_icd_m2
# WHERE icd9_code IN
#     (SELECT icd9_code
#     FROM diagnoses_icd_m2
#     GROUP BY icd9_code
#     ORDER BY COUNT(DISTINCT hadm_id) DESC
#     LIMIT 10)
# """).show()

+---------+
|icd9_code|
+---------+
|     4019|
|     4280|
|    42731|
|    41401|
|     5849|
|    25000|
|     2724|
|    51881|
|     5990|
|    53081|
+---------+



In [9]:
#sc.stop()
print "Done!"

Done!
