# MDLE - Exercise 1.1
### Frequent itemsets and association rules - Similar items
##### Authors: Pedro Duarte 97673, Pedro Monteiro 97484

Import necessary modules

In [6]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from operator import add

Declare constants

In [7]:
# --- Spark application settings ---
APP_NAME = "assignment1"   # name passed to the Spark configuration / session builder
MASTER = "local[*]"        # master URL passed to the Spark configuration / session builder

# --- Column names of the input CSV that the pipeline reads ---
PATIENT_COLUMN = 'PATIENT'
CODE_COLUMN = 'CODE'

CONDITIONS_COLUMN = 'CONDITIONS'

# --- Input file and minimum support ---
INPUT_FILE = "conditions.csv"
SUPPORT_THRESHOLD = 1000   # an itemset is kept when its count is >= this value

Define combinations and frequent_combinations functions 

Function: combinations(elems, size, base=[])
- generates all combinations of `size` elements from a given list `elems` of integers, each returned as a tuple whose elements are in ascending order

Function: frequent_combinations(elems, size, combinations, base=[])
- generates candidate combinations of size `size` from the list `elems`; `combinations` holds the frequent combinations of size `size - 1` found in the previous pass and is meant to prune candidates that contain an infrequent sub-combination
- `base` parameter can be used to provide a starting list of combinations to build upon
- if `base` is not provided, the function will use all single-element combinations from `elems`

In [8]:
def combinations(elems, size, base=[]):
    """Build every strictly increasing tuple of length `size` from `elems`.

    Parameters:
        elems: sequence of mutually comparable items to combine.
        size:  length of the tuples to produce.
        base:  optional list of tuples to extend; when empty, the
               one-element tuples of `elems` are used as the starting point.

    Returns:
        A list of tuples of length `size`, or an empty list when `elems` is
        empty, `size` exceeds `len(elems)`, or no tuple can be extended far
        enough. If `base` already holds tuples of length `size`, it is
        returned unchanged.
    """
    if not len(elems) or len(elems) < size:
        return []

    current = base if len(base) else [(item,) for item in elems]

    # Grow every tuple by one element per pass until the requested length is reached.
    while len(current[0]) != size:
        grown = []
        for prefix in current:
            for item in elems:
                # Only append items larger than the last one, so tuples stay strictly increasing.
                if item > prefix[-1]:
                    grown.append(prefix + (item,))

        if not grown:
            return []
        current = grown

    return current


def frequent_combinations(elems, size, combinations, base=[]):
    """Generate the candidate combinations of length `size` that survive pruning.

    Candidates are strictly increasing tuples built from `elems`. A candidate
    of the final length `size` is kept only when every one of its
    sub-combinations of length `size - 1` is present in `combinations`
    (a combination can only be frequent if all of its sub-combinations are).
    Intermediate tuples shorter than `size` are not checked, because
    `combinations` only describes the `size - 1` level.

    Parameters:
        elems:        sequence of mutually comparable, hashable items.
        size:         length of the tuples to produce.
        combinations: collection of the frequent tuples of length `size - 1`.
                      A set, frozenset or dict is used as-is; any other
                      iterable is copied into a frozenset on every call, so
                      passing a set avoids repeated conversions.
        base:         optional list of tuples to extend; when empty, the
                      one-element tuples of `elems` are used.

    Returns:
        A list of tuples of length `size`, or an empty list when `elems` is
        empty, `size` exceeds `len(elems)`, or no candidate survives. If
        `base` already holds tuples of length `size`, it is returned unchanged.
    """
    if len(elems) == 0 or size > len(elems): return []
    if len(base) == 0: base = [(elem,) for elem in elems]
    if len(base[0]) == size: return base

    # Hash-based membership instead of scanning a list for every candidate.
    if isinstance(combinations, (set, frozenset, dict)):
        known_frequent = combinations
    else:
        known_frequent = frozenset(combinations)

    while len(base[0]) != size:
        extended = []
        for base_comb in base:
            for elem in elems:
                # Keep tuples strictly increasing so each combination is generated once.
                if not elem > base_comb[-1]:
                    continue

                candidate = base_comb + (elem,)

                # Prune only at the final level: drop the candidate as soon as one of
                # its (size - 1)-element sub-combinations is not known to be frequent.
                if len(candidate) == size and not all(
                    candidate[:skip] + candidate[skip + 1:] in known_frequent
                    for skip in range(size)
                ):
                    continue

                extended.append(candidate)

        if len(extended) == 0: return []
        base = extended

    return base

Configuration and Initialization of Spark

- Parameters:
    - `APP_NAME` (string): the name of the Spark application
    - `MASTER` (string): the URL of the Spark master node
<br>
- Returns:
    - `sc` (SparkContext): the Spark context for the given application and master
    - `spark` (SparkSession): the Spark session for the given application and master

In [9]:
# Build the configuration from the application name and master URL constants,
# then obtain (or reuse) the Spark context for it.
conf = (
    SparkConf()
    .setAppName(APP_NAME)
    .setMaster(MASTER)
)
sc = SparkContext.getOrCreate(conf=conf)

# Obtain (or reuse) the Spark session built with the same name and master.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(MASTER)
    .getOrCreate()
)

Reading and Parsing Data from CSV File

- Parameters:
    - `INPUT_FILE` (string): the path to the input CSV file
<br>
- Returns:
    - `ds` (DataFrame): the parsed data as a Spark DataFrame

In [10]:
# Read the input CSV: the first row provides the column names and the
# column types are inferred from the data.
ds = spark.read.csv(
    INPUT_FILE,
    header=True,
    inferSchema=True,
)
ds.schema

                                                                                

StructType([StructField('START', TimestampType(), True), StructField('STOP', TimestampType(), True), StructField('PATIENT', StringType(), True), StructField('ENCOUNTER', StringType(), True), StructField('CODE', LongType(), True), StructField('DESCRIPTION', StringType(), True)])

Extracting and Aggregating Patient Conditions

1. Maps each row to a patient ID and a set containing their conditions.
2. Reduces the data by patient ID, combining the sets of condition codes for each patient into a single set.
3. Sorts the condition codes for each patient in ascending order.
4. Collects the resulting data into a list of tuples, where each tuple contains a patient ID and the sorted list of that patient's distinct condition codes.


In [11]:
patient_conditions = (
    ds.rdd
    # (patient id, {code}) for every row
    .map(lambda row: (row[PATIENT_COLUMN], {row[CODE_COLUMN]}))
    # merge the code sets that belong to the same patient
    .reduceByKey(lambda left, right: left.union(right))
    # turn each set into an ascending list
    .mapValues(sorted)
    .collect()
)

print(patient_conditions[:5])

                                                                                

[('3826037f-19e0-4c7b-98e5-4e9578472f67', [24079001, 55822004, 65966004, 162864005]), ('8e763f75-614b-4ef7-aa86-ce459dd3142e', [10509002, 70704007, 128613002, 195662009, 703151001]), ('b7ee9259-1f13-412a-830c-a53684b82cc3', [15777000, 40055000, 49436004, 59621000, 65275009, 68496003, 92691004, 126906006, 162573006, 162864005, 235919008, 254632001, 271737000, 370143000, 403190006, 444814009, 67811000119102]), ('2593819d-f0ff-470b-95da-656e8340255c', [10509002, 19169002, 35999006, 72892002, 195662009, 198992004, 232353008, 398254007, 444814009]), ('0800eff6-6e91-4014-8349-52023f4975b7', [10509002, 15777000, 19169002, 59621000, 64859006, 68496003, 162864005, 444814009])]


Generating Frequent Itemsets
1. Flattens each patient's list of conditions into single-item itemsets, emitting one `(condition, 1)` pair per condition.
2. Reduces the data by itemset, counting the number of occurrences of each itemset.
3. Filters itemsets that do not meet the minimum support threshold.
4. Caches the resulting RDD for faster access in subsequent iterations.
5. Collects the frequent itemsets into a Python dictionary.
6. Computes the number of frequent itemsets generated.

In [12]:

base_results = (
    sc.parallelize(patient_conditions)
    # one (code, 1) pair per condition of each patient
    .flatMap(lambda patient: [(code, 1) for code in patient[1]])
    # total occurrences per code
    .reduceByKey(add)
    # keep only the codes that reach the support threshold
    .filter(lambda code_count: code_count[1] >= SUPPORT_THRESHOLD)
    .cache()  # reused by the two actions below
)

# Frequent codes as a list, then their counts as a {code: count} dictionary.
base_elements = base_results.map(lambda code_count: code_count[0]).collect()
base_results = base_results.collectAsMap()

len(base_elements) # 131

23/03/19 12:20:42 WARN TaskSetManager: Stage 4 contains a task of very large size (7229 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

131

In [13]:
# Frozen set of the frequent single codes: membership is tested once per code of
# every patient, so use hash lookups instead of scanning the `base_elements` list.
frequent_elements = frozenset(base_elements)

# Keep, for each patient, only the codes that are frequent on their own
# (the original ascending order of each list is preserved).
filtered_patient_conditions = (
    sc.parallelize(patient_conditions)
    .map(lambda patient: [code for code in patient[1] if code in frequent_elements])
    .cache()  # reused by the pair and triple counting cells
)

In [14]:
pairs_results = (
    filtered_patient_conditions
    # one (pair, 1) entry for every pair of frequent codes of a patient
    .flatMap(lambda codes: [(pair, 1) for pair in combinations(codes, 2)])
    # total occurrences per pair
    .reduceByKey(add)
    # keep only the pairs that reach the support threshold
    .filter(lambda pair_count: pair_count[1] >= SUPPORT_THRESHOLD)
    .cache()  # reused by the two actions below
)

# Frequent pairs as a list, then their counts as a {pair: count} dictionary.
frequent_pairs = pairs_results.map(lambda pair_count: pair_count[0]).collect()
pairs_results = pairs_results.collectAsMap()

len(frequent_pairs) # 2940

23/03/19 12:20:46 WARN TaskSetManager: Stage 8 contains a task of very large size (7229 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

2940

In [15]:
triples_results = (
    filtered_patient_conditions
    # one (triple, 1) entry for every triple that frequent_combinations yields for a patient
    .flatMap(lambda codes: [(triple, 1) for triple in frequent_combinations(codes, 3, frequent_pairs)])
    # total occurrences per triple
    .reduceByKey(add)
    # keep only the triples that reach the support threshold
    .filter(lambda triple_count: triple_count[1] >= SUPPORT_THRESHOLD)
    .cache()  # reused by the two actions below
)

# Frequent triples as a list, then their counts as a {triple: count} dictionary.
frequent_triples = triples_results.map(lambda triple_count: triple_count[0]).collect()
triples_results = triples_results.collectAsMap()

len(frequent_triples) # 13395

23/03/19 12:20:54 WARN TaskSetManager: Stage 12 contains a task of very large size (7229 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

13395

# Exercise 1.2

In [16]:
# Rules are kept only when their standardised lift is strictly above this value
STD_LIFT_THRESHOLD = 0.2

In [17]:
def create_line(v, get_combination_support, union_support_results):
    """Compute the metrics of the rule `v[:-1] -> v[-1]`.

    Parameters:
        v: tuple whose last element is the rule's consequent and whose
           remaining elements form the antecedent.
        get_combination_support: callable that receives the antecedent tuple
           and returns its support count.
        union_support_results: mapping from the sorted tuple of all elements
           of `v` to the support count of the whole itemset.

    Returns:
        The tuple `(v, std_lift, lift, confidence, interest)`.

    Reads the globals `patient_conditions` (its length is the number of
    baskets) and `base_results` (support count of the consequent).
    """
    total = len(patient_conditions)

    antecedent_support = get_combination_support(v[:-1])
    consequent_support = base_results[v[-1]]
    itemset_support = union_support_results[tuple(sorted(v))]

    antecedent_probability = antecedent_support / total
    consequent_probability = consequent_support / total

    confidence = itemset_support / antecedent_support
    interest = confidence - consequent_probability
    lift = confidence / consequent_probability

    # Standardise the lift between the two bounds computed below.
    lower_bound = (
        max(antecedent_probability + consequent_probability - 1, 1 / total)
        / (antecedent_probability * consequent_probability)
    )
    upper_bound = 1 / max(antecedent_probability, consequent_probability)
    std_lift = (lift - lower_bound) / (upper_bound - lower_bound)

    return (v, std_lift, lift, confidence, interest)


def create_pair_line(v):
    """Rule metrics for a pair: the antecedent support is the single item's count in `base_results`."""
    return create_line(v, lambda c: base_results[c[0]], pairs_results)


def create_triple_line(v):
    """Rule metrics for a triple: the antecedent support is the pair's count in `pairs_results`."""
    return create_line(v, lambda c: pairs_results[c], triples_results)

In [18]:
pairs_rules = (
    sc.parallelize(frequent_pairs)
    # each frequent pair yields two rules: first -> second and second -> first
    .flatMap(lambda pair: [create_pair_line(pair), create_pair_line(pair[::-1])])
    # position 1 of a rule line holds the standardised lift
    .filter(lambda rule: rule[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda rule: rule[1])
    .collect()
)

len(pairs_rules)

                                                                                

2418

In [19]:
triples_rules = (
    sc.parallelize(frequent_triples)
    # each frequent triple yields three rules, one per choice of consequent (last position)
    .flatMap(lambda triple: [
        create_triple_line(triple),
        create_triple_line(triple[1:] + triple[:1]),
        create_triple_line(triple[:1] + triple[2:] + triple[1:2]),
    ])
    # position 1 of a rule line holds the standardised lift
    .filter(lambda rule: rule[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda rule: rule[1])
    .collect()
)

len(triples_rules)

                                                                                

23247