In [64]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.functions import collect_list, count

from operator import add

In [49]:
# Spark Constants
APP_NAME = 'assignment1'
MASTER = 'local[*]'

# Column Constants
PATIENT_COLUMN = "PATIENT"
CODE_COLUMN = "CODE"

CONDITIONS_COLUMN = "CONDITIONS"

# Input Constants
INPUT_FILE = 'conditions-sample.csv'
SUPPORT_THRESHOLD = 1000

In [50]:
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.appName(APP_NAME).master(MASTER).getOrCreate()

23/03/04 20:21:29 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [51]:
ds = spark.read.csv(INPUT_FILE, header=True, inferSchema=True)
ds.schema

                                                                                

StructType([StructField('START', TimestampType(), True), StructField('STOP', TimestampType(), True), StructField('PATIENT', StringType(), True), StructField('ENCOUNTER', StringType(), True), StructField('CODE', LongType(), True), StructField('DESCRIPTION', StringType(), True)])

In [55]:
conditions = ds.select(CODE_COLUMN).distinct()
conditions.show(5)

+---------+
|     CODE|
+---------+
| 74400008|
|449868002|
| 38822007|
|254637007|
| 47693006|
+---------+
only showing top 5 rows



                                                                                

In [56]:
patient_conditions = ds.groupBy(PATIENT_COLUMN).agg(collect_list(CODE_COLUMN).alias(CONDITIONS_COLUMN))
patient_conditions.show(5)

[Stage 130:>                                                        (0 + 1) / 1]

+--------------------+--------------------+
|             PATIENT|          CONDITIONS|
+--------------------+--------------------+
|000157a3-5aca-4f2...|[367498001, 15777...|
|00016311-4402-47d...|[43878008, 728920...|
|00032f7e-ee14-409...|[59621000, 440540...|
|00036790-f159-4cd...|[59621000, 744000...|
|00047353-ffa1-4c9...|[162864005, 19169...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [57]:
elems_k1 = ds.groupBy(CODE_COLUMN).agg(count("*").alias("COUNT")).filter("count > 1000")
elems_k1.show(5)

[Stage 133:>                                                        (0 + 1) / 1]

+---------+-----+
|     CODE|COUNT|
+---------+-----+
| 74400008| 3234|
| 38822007| 1273|
| 75498004| 4099|
| 16114001| 2098|
|283371005| 2043|
+---------+-----+
only showing top 5 rows



                                                                                

In [68]:
rdd = patient_conditions.rdd \
  .flatMap(lambda v: [(c, 1) for c in v[CONDITIONS_COLUMN]]) \
  .reduceByKey(add)

rdd.collect()

                                                                                

[(367498001, 919),
 (15777000, 20702),
 (271737000, 20753),
 (239873007, 3609),
 (55822004, 7901),
 (10509002, 33816),
 (43878008, 9962),
 (72892002, 29779),
 (444814009, 72504),
 (198992004, 1395),
 (59621000, 17966),
 (44054006, 4522),
 (302870006, 4457),
 (237602007, 4364),
 (19169002, 11810),
 (254837009, 1261),
 (422034002, 1183),
 (1551000119108, 678),
 (97331000119101, 245),
 (368581000119106, 1552),
 (403190006, 1597),
 (44465007, 7477),
 (195662009, 41484),
 (74400008, 3234),
 (428251008, 3234),
 (83664006, 491),
 (64859006, 3216),
 (70704007, 3129),
 (58150001, 2293),
 (443165006, 1227),
 (162864005, 21513),
 (283371005, 2043),
 (40055000, 14933),
 (410429000, 2565),
 (429007001, 2565),
 (201834006, 1558),
 (36971009, 3990),
 (230265002, 424),
 (230690007, 3444),
 (16114001, 2098),
 (126906006, 1798),
 (92691004, 1648),
 (431855005, 1907),
 (127013003, 2041),
 (53741008, 4028),
 (398254007, 1469),
 (370247008, 2065),
 (307731004, 1018),
 (62106007, 4018),
 (79586000, 1712),
 