In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from operator import add

# Exercise 1.1

In [2]:
# Spark Constants
# Application name and master URL handed to SparkConf / SparkSession when the session is created.
APP_NAME = 'assignment1'
MASTER = 'local[*]'  # Spark local mode, one worker thread per available core

# Column Constants
# Names of the CSV columns read from each row when grouping condition codes by patient.
PATIENT_COLUMN = "PATIENT"
CODE_COLUMN = "CODE"

# NOTE(review): not referenced by any cell visible in this notebook - confirm whether it is still needed.
CONDITIONS_COLUMN = "CONDITIONS"

# Input Constants
INPUT_FILE = 'conditions.csv'  # read with spark.read.csv(header=True, inferSchema=True)
# Minimum number of patients an item set must appear in to be kept (the filters compare with >=).
SUPPORT_THRESHOLD = 1000

In [3]:
def combinations(elems, size, base=None):
    """Return every strictly increasing tuple of length ``size`` built from ``elems``.

    Candidates are grown one element at a time: a tuple is extended with an
    element only when that element is greater than the tuple's last item, so
    for a sorted, duplicate-free ``elems`` the result is the usual list of
    ``size``-combinations in lexicographic order.

    Args:
        elems: Sequence of mutually comparable items.
        size: Length of the tuples to produce.
        base: Optional list of partial tuples to grow from. When omitted (or
            empty) the search starts from the 1-tuples of ``elems``.

    Returns:
        A list of tuples of length ``size``. An empty list when ``elems`` is
        empty, when ``size`` exceeds ``len(elems)``, or when no candidate can
        reach the requested length (including ``size < 1``).
    """
    if len(elems) == 0 or size > len(elems):
        return []

    # ``None`` replaces the former mutable ``[]`` default; an explicitly passed
    # empty list still means "start from the single elements".
    current = base if base else [(elem,) for elem in elems]

    # Grow all candidates level by level (iteratively, so the depth is not
    # limited by Python's recursion limit).
    while current:
        if len(current[0]) == size:
            return current
        current = [
            prefix + (elem,)
            for prefix in current
            for elem in elems
            if elem > prefix[-1]
        ]

    # Candidates ran out before reaching the requested length.
    return []


def frequent_combinations(elems, size, combinations, base=None):
    """Return the increasing ``size``-tuples of ``elems`` that pass Apriori pruning.

    A candidate is kept only if it can still be frequent given the already
    known frequent item sets of length ``size - 1``:

    * a candidate of length ``size - 1`` must itself be in ``combinations``;
    * a candidate of length ``size`` must have *all* of its
      ``(size - 1)``-element subsets in ``combinations``.

    Shorter candidates are not checked. The previous implementation guarded
    the check with ``len(base_comb) - 1 < size``, which is always true while
    candidates are being built, so nothing was ever pruned.

    Args:
        elems: Sequence of mutually comparable items (expected sorted).
        size: Length of the tuples to produce.
        combinations: Collection of frequent item sets of length ``size - 1``,
            each an increasing tuple. Pass a ``set``/``frozenset`` to avoid
            rebuilding the lookup on every call. Note that this parameter
            shadows any module-level function named ``combinations`` inside
            this body.
        base: Optional list of partial tuples to grow from. When omitted (or
            empty) the search starts from the 1-tuples of ``elems``.

    Returns:
        A list of tuples of length ``size`` that survived pruning; an empty
        list when ``elems`` is empty, ``size`` exceeds ``len(elems)``, or no
        candidate survives.
    """
    if len(elems) == 0 or size > len(elems):
        return []

    # Membership is tested for every candidate, so use a hashed container.
    if isinstance(combinations, (set, frozenset, dict)):
        known = combinations
    else:
        known = {tuple(itemset) for itemset in combinations}

    def may_be_frequent(candidate):
        length = len(candidate)
        if length == size - 1:
            return candidate in known
        if length == size:
            # Drop one position at a time to enumerate every (size-1)-subset.
            return all(
                candidate[:i] + candidate[i + 1:] in known
                for i in range(length)
            )
        return True

    current = base if base else [(elem,) for elem in elems]

    while current:
        if len(current[0]) == size:
            return current
        current = [
            prefix + (elem,)
            for prefix in current
            for elem in elems
            if elem > prefix[-1]
            if may_be_frequent(prefix + (elem,))
        ]

    return []

In [4]:
# Build the Spark configuration and obtain a context, reusing an already-running one if it exists.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point used to read the CSV; getOrCreate() returns the existing session when there is one.
spark = SparkSession.builder.appName(APP_NAME).master(MASTER).getOrCreate()

23/03/18 16:21:29 WARN Utils: Your hostname, pedro-duarte resolves to a loopback address: 127.0.1.1; using 192.168.0.103 instead (on interface wlp2s0)
23/03/18 16:21:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/18 16:21:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/18 16:21:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/03/18 16:21:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [5]:
# Load the input CSV: the first line supplies the column names and column types are inferred from the data.
ds = spark.read.csv(INPUT_FILE, header=True, inferSchema=True)
# Last expression of the cell, so the inferred schema is displayed as the cell output.
ds.schema

                                                                                

StructType([StructField('START', TimestampType(), True), StructField('STOP', TimestampType(), True), StructField('PATIENT', StringType(), True), StructField('ENCOUNTER', StringType(), True), StructField('CODE', LongType(), True), StructField('DESCRIPTION', StringType(), True)])

In [10]:
# One entry per patient: (patient id, ascending list of that patient's distinct
# condition codes). Codes are merged as sets so repeated diagnoses count once.
patient_conditions = (
    ds.rdd
    .map(lambda row: (row[PATIENT_COLUMN], {row[CODE_COLUMN]}))
    .reduceByKey(lambda left, right: left | right)
    .mapValues(lambda codes: sorted(codes))
    .collect()
)

print(patient_conditions[:5])

Exception in thread "serve RDD 42" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

[('28a3cdb7-1db1-4148-8280-8a4e5b4f99e0', [19169002, 72892002, 156073000, 284551006]), ('3826037f-19e0-4c7b-98e5-4e9578472f67', [24079001, 55822004, 65966004, 162864005]), ('e32e0069-2d3f-4b7b-b420-3269c94723ad', [16114001, 162864005, 195662009]), ('887ad9bb-bd72-44cf-8e5e-8aff7fbdeed4', [40275004, 44465007, 72892002, 195662009, 444814009]), ('8e763f75-614b-4ef7-aa86-ce459dd3142e', [10509002, 70704007, 128613002, 195662009, 703151001])]


In [22]:

# Support of every single condition code (number of patients having it),
# restricted to the codes that reach SUPPORT_THRESHOLD.
base_results = (
    sc.parallelize(patient_conditions)
    .flatMap(lambda record: ((code, 1) for code in record[1]))
    .reduceByKey(add)
    .filter(lambda code_count: code_count[1] >= SUPPORT_THRESHOLD)
    .cache()
)

# Frequent codes as a list, then the same RDD as a {code: support} dict.
base_elements = base_results.keys().collect()
base_results = base_results.collectAsMap()

len(base_elements) # 131

23/03/18 16:54:17 WARN TaskSetManager: Stage 47 contains a task of very large size (21666 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

131

In [12]:
# The membership test below runs once per code of every patient, so look the
# codes up in a hashed container instead of scanning the base_elements list.
frequent_codes = frozenset(base_elements)

# Each patient's code list reduced to the frequent codes (order is preserved).
filtered_patient_conditions = (
    sc.parallelize(patient_conditions)
    .map(lambda v: [c for c in v[1] if c in frequent_codes])
    .cache()
)

In [18]:
# Support of every pair of frequent codes that co-occur in a patient,
# restricted to the pairs that reach SUPPORT_THRESHOLD.
pairs_results = (
    filtered_patient_conditions
    .flatMap(lambda codes: ((pair, 1) for pair in combinations(codes, 2)))
    .reduceByKey(add)
    .filter(lambda pair_count: pair_count[1] >= SUPPORT_THRESHOLD)
    .cache()
)

# Frequent pairs as a list, then the same RDD as a {pair: support} dict.
frequent_pairs = pairs_results.keys().collect()
pairs_results = pairs_results.collectAsMap()

len(frequent_pairs) # 2940

23/03/18 16:36:17 WARN TaskSetManager: Stage 31 contains a task of very large size (21666 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

2940

In [21]:
# frequent_combinations only uses the frequent pairs for membership tests, so
# hand it a hashed container built once here rather than the collected list.
frequent_pairs_lookup = frozenset(frequent_pairs)

# Support of every candidate triple, restricted to those reaching SUPPORT_THRESHOLD.
triples_results = (
    filtered_patient_conditions
    .flatMap(lambda v: [(c, 1) for c in frequent_combinations(v, 3, frequent_pairs_lookup)])
    .reduceByKey(add)
    .filter(lambda x: x[1] >= SUPPORT_THRESHOLD)
    .cache()
)

# Frequent triples as a list, then the same RDD as a {triple: support} dict.
frequent_triples = triples_results.map(lambda v: v[0]).collect()
triples_results = triples_results.collectAsMap()

len(frequent_triples) # 13395

23/03/18 16:40:23 WARN TaskSetManager: Stage 43 contains a task of very large size (21666 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

13395

# Exercise 1.2

In [41]:
# Input Constants
# Rules are kept only when their standardised lift is strictly greater than this value.
STD_LIFT_THRESHOLD = .2

In [37]:
# Total number of patients (baskets). Taken once at module level so that
# create_line refers to a plain integer: referencing the patient_conditions
# list inside the function would make the whole list part of the closure that
# is serialized for Spark tasks.
PATIENT_COUNT = len(patient_conditions)


def create_line(v, get_combination_support, union_support_results):
    """Compute the metrics of the association rule ``v[:-1] -> v[-1]``.

    Args:
        v: Tuple of codes; every item but the last forms the antecedent and
            the last item is the consequent.
        get_combination_support: Callable that receives the antecedent tuple
            ``v[:-1]`` and returns its support count.
        union_support_results: Mapping from the sorted tuple of all codes in
            ``v`` to the support count of the whole item set.

    Returns:
        ``(v, std_lift, lift, confidence, interest)``.
    """
    n = PATIENT_COUNT

    combination_support = get_combination_support(v[:-1])
    element_support = base_results[v[-1]]
    # Support dictionaries are keyed by ascending tuples, whatever the rule order.
    union_support = union_support_results[tuple(sorted(v))]

    combination_probability = combination_support / n
    elem_probability = element_support / n

    confidence = union_support / combination_support
    interest = confidence - elem_probability
    lift = confidence / elem_probability

    # z is the smallest value the lift can take for these marginals and
    # 1 / max(...) the largest; std_lift rescales the lift between the two.
    z = max(combination_probability + elem_probability - 1, 1 / n) / (combination_probability * elem_probability)
    std_lift = (lift - z) / (1 / max(combination_probability, elem_probability) - z)

    return (v, std_lift, lift, confidence, interest)


def create_pair_line(v):
    """Rule metrics for a pair: the antecedent is a single code looked up in base_results."""
    return create_line(v, lambda c: base_results[c[0]], pairs_results)


def create_triple_line(v):
    """Rule metrics for a triple: the antecedent is a pair looked up in pairs_results."""
    return create_line(v, lambda c: pairs_results[c], triples_results)

In [42]:
# Every frequent pair yields two rules (a -> b and b -> a); keep those whose
# standardised lift exceeds the threshold, ordered by ascending standardised lift.
pairs_rules = (
    sc.parallelize(frequent_pairs)
    .flatMap(lambda pair: (create_pair_line(pair), create_pair_line(pair[::-1])))
    .filter(lambda rule: rule[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda rule: rule[1])
    .collect()
)

len(pairs_rules)

                                                                                

2418

In [43]:
# Every frequent triple (a, b, c) yields three rules, one per choice of
# consequent: (a, b) -> c, (b, c) -> a and (a, c) -> b. Keep those whose
# standardised lift exceeds the threshold, ordered by ascending standardised lift.
triples_rules = (
    sc.parallelize(frequent_triples)
    .flatMap(lambda triple: [
        create_triple_line(ordering)
        for ordering in (
            triple,
            triple[1:] + triple[:1],
            triple[:1] + triple[2:] + triple[1:2],
        )
    ])
    .filter(lambda rule: rule[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda rule: rule[1])
    .collect()
)

len(triples_rules)

                                                                                

23247