# MIMIC III Preprocessing (per icd9 category)

## Initialization and Data Loading

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

ne_struct = StructType([StructField("row_id", IntegerType(), True),
                      StructField("subject_id", IntegerType(), True),
                      StructField("hadm_id", IntegerType(), True),
                      StructField("chartdate", DateType(), True),
                      StructField("category", StringType(), True),
                      StructField("description", StringType(), True),
                      StructField("cgid", IntegerType(), True),
                      StructField("iserror", IntegerType(), True),
                      StructField("text", StringType(), True)])
df_ne = spark.read.csv("./data/NOTEEVENTS-2.csv",
# df_ne = spark.read.csv("./data/NOTEEVENTS-2sample.csv",
                       header=True,
                       schema=ne_struct)
df_ne.registerTempTable("noteevents")
df_ne.filter(df_ne.category=="Discharge summary") \
    .registerTempTable("noteevents2")
    
# i want to cache noteevents, but it's too big

# many icd to one hadm_id
diag_struct = StructType([StructField("ROW_ID", IntegerType(), True),
                          StructField("SUBJECT_ID", IntegerType(), True),
                          StructField("HADM_ID", IntegerType(), True),
                          StructField("SEQ_NUM", IntegerType(), True),
                          StructField("ICD9_CODE", StringType(), True)])
df_diag_m = spark.read.csv("./data/DIAGNOSES_ICD.csv",
                           header=True,
                           schema=diag_struct) \
            .selectExpr("ROW_ID as row_id", 
                        "SUBJECT_ID as subject_id",
                        "HADM_ID as hadm_id",
                        "SEQ_NUM as seq_num",
                        "ICD9_CODE as icd9_code")
    
# added to filter out categories
geticd9cat_udf = F.udf(lambda x: str(x)[:3], StringType())
df_diag_m = df_diag_m.withColumn("icd9_code", geticd9cat_udf("icd9_code"))
df_diag_m.registerTempTable("diagnoses_icd_m")
df_diag_m.cache()

# one icd to one hadm_id (take the smallest seq number as primary)
diag_o_rdd = df_diag_m.rdd.sortBy(lambda x: (x.hadm_id, x.subject_id, x.seq_num)) \
    .groupBy(lambda x: x.hadm_id) \
    .mapValues(list) \
    .reduceByKey(lambda x, y: x if x.seq_num < y.seq_num else y) \
    .map(lambda (hid, d): d[0])
df_diag_o = spark.createDataFrame(diag_o_rdd)
df_diag_o.registerTempTable("diagnoses_icd_o")
df_diag_o.cache()

# get hadm_id list in noteevents
df_hadm_id_list = spark.sql("""
SELECT DISTINCT hadm_id FROM noteevents2
""")
df_hadm_id_list.registerTempTable("hadm_id_list")
df_hadm_id_list.cache()

# get subject_id list in noteevents
df_subject_id_list = spark.sql("""
SELECT DISTINCT subject_id FROM noteevents2
""")
df_subject_id_list.registerTempTable("subject_id_list")
df_subject_id_list.cache()

print df_ne.dtypes
print df_diag_m.dtypes
print df_diag_o.dtypes
print df_hadm_id_list.dtypes
print df_subject_id_list.dtypes

[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('chartdate', 'date'), ('category', 'string'), ('description', 'string'), ('cgid', 'int'), ('iserror', 'int'), ('text', 'string')]
[('row_id', 'int'), ('subject_id', 'int'), ('hadm_id', 'int'), ('seq_num', 'int'), ('icd9_code', 'string')]
[('row_id', 'bigint'), ('subject_id', 'bigint'), ('hadm_id', 'bigint'), ('seq_num', 'bigint'), ('icd9_code', 'string')]
[('hadm_id', 'int')]
[('subject_id', 'int')]


In [3]:
spark.sql("""
SELECT * FROM diagnoses_icd_m
LIMIT 10
""").show()

+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
|  1297|       109| 172335|      1|      403|
|  1298|       109| 172335|      2|      486|
|  1299|       109| 172335|      3|      582|
|  1300|       109| 172335|      4|      585|
|  1301|       109| 172335|      5|      425|
|  1302|       109| 172335|      6|      276|
|  1303|       109| 172335|      7|      710|
|  1304|       109| 172335|      8|      276|
|  1305|       109| 172335|      9|      724|
|  1306|       109| 172335|     10|      458|
+------+----------+-------+-------+---------+



In [4]:
df_diag_o2 = spark.sql("""
SELECT row_id, subject_id, diagnoses_icd_o.hadm_id AS hadm_id,
seq_num, icd9_code
FROM diagnoses_icd_o JOIN hadm_id_list
ON diagnoses_icd_o.hadm_id = hadm_id_list.hadm_id
""")
df_diag_o2.registerTempTable("diagnoses_icd_o2")
df_diag_o2.cache()

DataFrame[row_id: bigint, subject_id: bigint, hadm_id: bigint, seq_num: bigint, icd9_code: string]

In [5]:
df_diag_m2 = spark.sql("""
SELECT row_id, subject_id, diagnoses_icd_m.hadm_id AS hadm_id,
seq_num, icd9_code
FROM diagnoses_icd_m JOIN hadm_id_list
ON diagnoses_icd_m.hadm_id = hadm_id_list.hadm_id
""")
df_diag_m2.registerTempTable("diagnoses_icd_m2")
df_diag_m2.cache()

DataFrame[row_id: int, subject_id: int, hadm_id: int, seq_num: int, icd9_code: string]

## Descriptive Statistics

### noteevents
Basic Counts:

In [6]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), COUNT(DISTINCT hadm_id)
FROM noteevents
""").show()
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), COUNT(DISTINCT hadm_id)
FROM noteevents2
""").show()

+--------+--------------------------+-----------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|
+--------+--------------------------+-----------------------+
| 2083180|                     46146|                  58361|
+--------+--------------------------+-----------------------+

+--------+--------------------------+-----------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|
+--------+--------------------------+-----------------------+
|   59652|                     41127|                  52726|
+--------+--------------------------+-----------------------+



Categories:

In [7]:
spark.sql("""
SELECT DISTINCT(category)
FROM noteevents
""").show()

+-----------------+
|         category|
+-----------------+
|              ECG|
|     Respiratory |
|          Nursing|
|          General|
|          Consult|
|             Echo|
|        Nutrition|
|       Physician |
|         Pharmacy|
|   Rehab Services|
| Case Management |
|        Radiology|
|    Nursing/other|
|Discharge summary|
|      Social Work|
+-----------------+



### diagnoses_icd: many (icd_code) to one (hadm_id)
Basic Counts:

In [8]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT ICD9_CODE)
FROM diagnoses_icd_m
""").show()

spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT LOWER(ICD9_CODE))
FROM diagnoses_icd_m
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|  651047|                     46520|                  58976|                      943|
+--------+--------------------------+-----------------------+-------------------------+

+--------+--------------------------+-----------------------+--------------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT lower(ICD9_CODE))|
+--------+--------------------------+-----------------------+--------------------------------+
|  651047|                     46520|                  58976|                             943|
+--------+--------------------------+-----------------------+--------------------------------+



### diagnoses_icd: one (icd_code) to one (hadm_id)
Basic Counts:

In [9]:
spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT ICD9_CODE)
FROM diagnoses_icd_o
""").show()

spark.sql("""
SELECT COUNT(*), COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT LOWER(ICD9_CODE))
FROM diagnoses_icd_o
""").show()

+--------+--------------------------+-----------------------+-------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT ICD9_CODE)|
+--------+--------------------------+-----------------------+-------------------------+
|   58976|                     46520|                  58976|                      652|
+--------+--------------------------+-----------------------+-------------------------+

+--------+--------------------------+-----------------------+--------------------------------+
|count(1)|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT lower(ICD9_CODE))|
+--------+--------------------------+-----------------------+--------------------------------+
|   58976|                     46520|                  58976|                             652|
+--------+--------------------------+-----------------------+--------------------------------+



Just to check if I really did get "seq_num = 1" for all diagnosis, the code below should return empty. 

In [10]:
# check code
spark.sql("""
SELECT *
FROM diagnoses_icd_o
WHERE seq_num <> 1
""").show()

+------+----------+-------+-------+---------+
|row_id|subject_id|hadm_id|seq_num|icd9_code|
+------+----------+-------+-------+---------+
+------+----------+-------+-------+---------+



### noteevents and diagnoses_icd (one to one)
Basic Counts:

In [11]:
spark.sql("""
SELECT COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM diagnoses_icd_o2
""").show()

+--------------------------+-----------------------+-------------------------+
|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------------------------+-----------------------+-------------------------+
|                     41127|                  52726|                      641|
+--------------------------+-----------------------+-------------------------+



Top 50 ICD 9 codes based on "subject_id" count

In [12]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM diagnoses_icd_o2
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|      414|     3503|
|      410|     3137|
|      038|     2966|
|      V30|     2348|
|      424|     1691|
|      518|     1324|
|      428|     1248|
|      996|     1199|
|      V31|      981|
|      431|      948|
|      852|      903|
|      427|      900|
|      998|      726|
|      441|      724|
|      434|      690|
|      486|      654|
|      250|      631|
|      584|      611|
|      578|      606|
|      507|      583|
|      198|      513|
|      430|      491|
|      162|      455|
|      571|      427|
|      801|      419|
|      577|      394|
|      562|      377|
|      415|      363|
|      440|      361|
|      433|      342|
|      997|      324|
|      396|      311|
|      197|      308|
|      805|      278|
|      965|      277|
|      482|      277|
|      432|      268|
|      780|      265|
|      519|      260|
|      437|      254|
|      532|      251|
|      820|      244|
|      851

Top 50 ICD 9 codes based on "hadm_id" count

In [13]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_o2
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|      414|      3540|
|      038|      3276|
|      410|      3228|
|      V30|      2348|
|      424|      1707|
|      518|      1510|
|      428|      1460|
|      996|      1373|
|      V31|       981|
|      431|       966|
|      427|       962|
|      852|       940|
|      250|       884|
|      441|       782|
|      998|       747|
|      486|       703|
|      434|       693|
|      578|       656|
|      507|       643|
|      584|       634|
|      198|       553|
|      430|       495|
|      571|       483|
|      162|       471|
|      577|       434|
|      801|       419|
|      562|       404|
|      440|       389|
|      415|       367|
|      433|       353|
|      997|       332|
|      197|       328|
|      519|       320|
|      396|       314|
|      291|       314|
|      437|       296|
|      482|       293|
|      432|       285|
|      491|       284|
|      805|       281|
|      965|

### noteevents and diagnoses_icd (many to one)
Basic Counts:

In [14]:
spark.sql("""
SELECT COUNT(DISTINCT subject_id), 
COUNT(DISTINCT hadm_id), COUNT(DISTINCT icd9_code)
FROM diagnoses_icd_m2
""").show()

+--------------------------+-----------------------+-------------------------+
|count(DISTINCT subject_id)|count(DISTINCT hadm_id)|count(DISTINCT icd9_code)|
+--------------------------+-----------------------+-------------------------+
|                     41127|                  52726|                      942|
+--------------------------+-----------------------+-------------------------+



Top ICD 9 codes based on "subject_id" count

In [15]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS sid_count
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY sid_count DESC
LIMIT 50
""").show(n=50)

+---------+---------+
|icd9_code|sid_count|
+---------+---------+
|      401|    17551|
|      427|    13666|
|      276|    12326|
|      272|    12023|
|      414|    11693|
|      518|    11063|
|      285|    10479|
|      250|    10072|
|      428|     9974|
|      584|     9300|
|      V45|     6897|
|      599|     6355|
|      530|     6010|
|      E87|     5799|
|      V58|     5723|
|      038|     5355|
|      V10|     4980|
|      410|     4918|
|      424|     4871|
|      997|     4867|
|      995|     4818|
|      585|     4808|
|      780|     4803|
|      785|     4687|
|      998|     4595|
|      458|     4546|
|      403|     4510|
|      305|     4457|
|      486|     4329|
|      041|     3966|
|      244|     3941|
|      V15|     3925|
|      496|     3491|
|      287|     3487|
|      996|     3486|
|      790|     3400|
|      507|     3335|
|      E93|     3202|
|      V12|     3163|
|      511|     2972|
|      348|     2939|
|      765|     2905|
|      311

Top ICD 9 codes based on "hadm_id" count

In [16]:
spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY hadm_count DESC
LIMIT 50
""").show(n=50)

+---------+----------+
|icd9_code|hadm_count|
+---------+----------+
|      401|     20646|
|      427|     16774|
|      276|     14712|
|      272|     14212|
|      414|     14081|
|      250|     13818|
|      428|     13330|
|      518|     12997|
|      285|     12404|
|      584|     11147|
|      V45|      8846|
|      599|      7199|
|      530|      7191|
|      V58|      6998|
|      585|      6764|
|      E87|      6483|
|      403|      6297|
|      V10|      6204|
|      038|      6085|
|      995|      5480|
|      424|      5404|
|      410|      5301|
|      780|      5296|
|      244|      5101|
|      997|      5078|
|      785|      5048|
|      305|      5000|
|      998|      4948|
|      458|      4935|
|      486|      4732|
|      V15|      4420|
|      041|      4399|
|      496|      4296|
|      996|      4251|
|      287|      3881|
|      V12|      3782|
|      790|      3672|
|      507|      3608|
|      E93|      3473|
|      493|      3400|
|      311|

## Data Preprocessing (all icd9 codes)

Returns RDD[(hadm_id, list(icd9_codes))]

In [17]:
icd9_score_hadm = spark.sql("""
SELECT icd9_code, COUNT(DISTINCT hadm_id) AS score
FROM diagnoses_icd_m2
GROUP BY icd9_code
""").rdd.cache()

icd9_score_subj = spark.sql("""
SELECT icd9_code, COUNT(DISTINCT subject_id) AS score
FROM diagnoses_icd_m2
GROUP BY icd9_code
""").rdd.cache()

def get_id_to_topicd9(id_type, topX):
    if id_type == "hadm_id":
        icd9_score = icd9_score_hadm
    else:
        icd9_score = icd9_score_subj
        
    icd9_topX = set([i.icd9_code for i in icd9_score.takeOrdered(topX, key=lambda x: -x.score)])
    
    id_to_topicd9 = df_diag_m2.rdd \
        .map(lambda x: (x.hadm_id if id_type=="hadm_id" else x.subject_id, x.icd9_code)) \
        .groupByKey() \
        .mapValues(lambda x: set(x) & icd9_topX) \
        .filter(lambda (x, y): y)
        
    return id_to_topicd9, list(icd9_topX)

# for i in get_id_to_topicd9("hadm_id", 10)[0].take(3):
#     print i
# for i in get_id_to_topicd9("subject_id", 50)[0].take(3):
#     print i

Obtain dataframe for the merged noteevents and ID-to-ICD9 mapping

In [18]:
def sparse2vec(mapper, data):
    out = [0] * len(mapper)
    for i in data:
        out[mapper[i]] = 1
    return out

def get_id_to_texticd9(id_type, topX):
    id_to_topicd9, topicd9 = get_id_to_topicd9(id_type, topX)
    mapper = dict(zip(topicd9, range(topX)))
    
    ne_topX = df_ne.rdd \
        .filter(lambda x: x.category == "Discharge summary") \
        .map(lambda x: (x.hadm_id if id_type=="hadm_id" else x.subject_id, x.text)) \
        .groupByKey() \
        .mapValues(lambda x: " ".join(x)) \
        .join(id_to_topicd9) \
        .map(lambda (id_, (text, icd9)): \
             [id_, text]+sparse2vec(mapper, icd9))
#              list(Vectors.sparse(topX, dict.fromkeys(map(lambda x: mapper[x], icd9), 1))))
        
    return spark.createDataFrame(ne_topX, ["id", "text"]+topicd9), mapper

# get_id_to_texticd9("hadm_id", 10)[0].show()

## Feature Extraction

### TF-IDF
Input df must be RDD[(label, text)]

In [19]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover

def create_TFIDF(sentenceData, inputCol="text", outputCol="features", minDocFreq=3, numFeatures=20):
    tokenizer = RegexTokenizer(pattern="[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData = tokenizer.transform(sentenceData)
    
    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered")
    wordsDataFiltered = remover.transform(wordsData)
    
    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsDataFiltered)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    
    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)

In [20]:
from pyspark.mllib.util import Vectors
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DataType, StringType

def output_csv(df, path):
    udf = UserDefinedFunction(lambda x: Vectors.stringify(x), StringType())
    new_df = df.withColumn('features', udf(df.features))
    
    new_df.write.csv(path, header=True)
    
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)
    
    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    new_df = df.withColumn('features', udf(df.features))
    
    return new_df

Output to pickle file

In [21]:
df_id2texticd9, topicd9_mapper = get_id_to_texticd9("hadm_id", 10)
df_id2featurelabel = create_TFIDF(df_id2texticd9, numFeatures=40000)

print topicd9_mapper
print df_id2featurelabel.dtypes
df_id2featurelabel.show()

output_csv(df_id2featurelabel, "./data/DATA_TFIDF_HADM_TOP10CAT")

{u'584': 0, u'401': 6, u'428': 4, u'414': 3, u'518': 2, u'272': 5, u'276': 1, u'250': 7, u'285': 8, u'427': 9}
[('id', 'bigint'), ('584', 'bigint'), ('276', 'bigint'), ('518', 'bigint'), ('414', 'bigint'), ('428', 'bigint'), ('272', 'bigint'), ('401', 'bigint'), ('250', 'bigint'), ('285', 'bigint'), ('427', 'bigint'), ('features', 'vector')]
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|    id|584|276|518|414|428|272|401|250|285|427|            features|
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|117760|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|(40000,[69,372,69...|
|129030|  0|  0|  0|  0|  0|  1|  1|  0|  1|  0|(40000,[13,32,83,...|
|172040|  1|  1|  0|  1|  0|  0|  0|  0|  0|  0|(40000,[10,69,152...|
|156170|  1|  0|  0|  0|  1|  0|  0|  1|  1|  1|(40000,[3,78,130,...|
|199180|  0|  0|  0|  1|  1|  0|  0|  1|  0|  0|(40000,[48,62,80,...|
|167440|  0|  0|  1|  1|  0|  1|  0|  1|  0|  0|(40000,[207,264,2...|
|178710|  1|  1|  0|  0|  

[Test] Load csv file
count should be the same with the sql query

In [22]:
testdf = read_csv("./data/DATA_TFIDF_HADM_TOP10CAT")
print testdf.count()
testdf.show()

44419
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|    id|584|276|518|414|428|272|401|250|285|427|            features|
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|185344|  0|  0|  0|  1|  0|  0|  0|  0|  0|  1|(40000,[20,32,69,...|
|126464|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|(40000,[66,207,26...|
|169474|  0|  0|  0|  0|  0|  1|  1|  0|  0|  0|(40000,[63,80,207...|
|180054|  1|  0|  0|  0|  0|  0|  1|  0|  0|  0|(40000,[32,115,13...|
|137734|  0|  0|  0|  0|  0|  1|  0|  0|  0|  1|(40000,[48,148,20...|
|121864|  0|  0|  0|  0|  1|  0|  1|  0|  0|  1|(40000,[273,379,8...|
|115884|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|(40000,[100,361,5...|
|105994|  0|  0|  0|  0|  1|  0|  1|  0|  0|  0|(40000,[20,32,207...|
|110594|  0|  0|  0|  1|  0|  1|  0|  1|  0|  0|(40000,[78,107,14...|
|176144|  0|  0|  0|  1|  1|  1|  0|  1|  0|  0|(40000,[62,130,20...|
|134744|  0|  0|  0|  1|  1|  1|  0|  1|  0|  1|(40000,[2,207,307...|
|101394|  0|  

In [23]:
spark.sql("""
SELECT icd9_code
FROM diagnoses_icd_m2
GROUP BY icd9_code
ORDER BY COUNT(DISTINCT hadm_id) DESC
LIMIT 10
""").show()
    
id_to_topicd9, topicd9 = get_id_to_topicd9("hadm_id", 10)
print id_to_topicd9.count()

spark.sql("""
SELECT COUNT(DISTINCT hadm_id) AS hadm_count
FROM diagnoses_icd_m2
WHERE icd9_code IN
    (SELECT icd9_code
    FROM diagnoses_icd_m2
    GROUP BY icd9_code
    ORDER BY COUNT(DISTINCT hadm_id) DESC
    LIMIT 10)
""").show()

+---------+
|icd9_code|
+---------+
|      401|
|      427|
|      276|
|      272|
|      414|
|      250|
|      428|
|      518|
|      285|
|      584|
+---------+

44419
+----------+
|hadm_count|
+----------+
|     44419|
+----------+



In [None]:
#sc.stop()