# MDLE - Exercise 1.1
### Frequent itemsets and association rules - Similar items
##### Authors: Pedro Duarte 97673, Pedro Monteiro 97484

Import necessary modules

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from operator import add

Declare constants

In [None]:
# Spark Constants
APP_NAME = 'assignment1'  # application name given to SparkConf and the SparkSession builder
MASTER = 'local[*]'  # Spark master URL (local mode)

# Column Constants
PATIENT_COLUMN = "PATIENT"  # column used as the basket key (one basket per patient)
CODE_COLUMN = "CODE"  # column holding the condition code (the items)

CONDITIONS_COLUMN = "CONDITIONS"  # NOTE(review): not referenced by any visible cell

# Input Constants
INPUT_FILE = 'conditions.csv.gz'  # input CSV path handed to spark.read.csv
SUPPORT_THRESHOLD = 1000  # minimum count (inclusive, `>=`) for an itemset to be kept as frequent

Define combinations and frequent_combinations functions 

Function: combinations(elems, size, base=[])
- generate combinations of elements from a given list `elems` of integers

Function: frequent_combinations(elems, size, combinations, base=[])
- generates all frequent combinations of size `size` from the list `elems` that appear in the list of previously generated combinations `combinations`
- `base` parameter can be used to provide a starting list of combinations to build upon
- if `base` is not provided, the function will use all single-element combinations from `elems`

In [None]:
# create our combinations function
def combinations(elems, size, base=None):
    """Build every strictly increasing tuple of length `size` from `elems`.

    A tuple is grown one element at a time: a partial tuple is extended only
    with elements that compare greater than its last element. For a sorted
    list of distinct values this yields each `size`-element combination once.

    Parameters:
        elems: sequence of mutually comparable items.
        size: length of the tuples to produce.
        base: optional list of partial tuples (all of the same length) to
            grow from. When omitted or empty, the single-element tuples of
            `elems` are used.

    Returns:
        A list of tuples of length `size`; an empty list when `elems` is
        empty, when `size` exceeds `len(elems)`, or when no partial tuple can
        be extended up to `size`.
    """
    if len(elems) == 0 or size > len(elems):
        return []

    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty list is still treated as "no base given".
    if not base:
        base = [(elem,) for elem in elems]

    # Each pass lengthens every partial tuple by one element. The loop ends
    # when the requested length is reached or nothing can be extended.
    while base and len(base[0]) != size:
        base = [
            base_comb + (elem,)
            for base_comb in base
            for elem in elems
            if elem > base_comb[-1]
        ]

    return base


def frequent_combinations(elems, size, combinations, base=None):
    """Build candidate tuples of length `size`, pruned by known smaller itemsets.

    Tuples are grown exactly like in a plain combination generator (each
    partial tuple is extended only with elements greater than its last one).
    On the final growth step a candidate is kept only if every sub-tuple
    obtained by dropping one element is present in `combinations`
    (Apriori-style pruning: a frequent itemset cannot contain an infrequent
    subset).

    The previous guard `len(base_comb) - 1 < size` was true for every partial
    tuple, so the lookup in `combinations` was never evaluated and nothing
    was pruned.

    Parameters:
        elems: sequence of mutually comparable, hashable items; expected to be
            in ascending order so sub-tuples match the keys in `combinations`.
        size: length of the tuples to produce.
        combinations: collection of tuples of length `size - 1` regarded as
            frequent. A list/iterable is copied into a set; a set, frozenset
            or dict is used as is.
        base: optional list of partial tuples (all of the same length) to
            grow from. When omitted or empty, the single-element tuples of
            `elems` are used.

    Returns:
        A list of tuples of length `size`; an empty list when `elems` is
        empty, when `size` exceeds `len(elems)`, or when no candidate
        survives.
    """
    if len(elems) == 0 or size > len(elems):
        return []

    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty list is still treated as "no base given".
    if not base:
        base = [(elem,) for elem in elems]

    known = None
    while base and len(base[0]) != size:
        # Only the last growth step produces tuples of the requested length,
        # and only those can be checked against the (size - 1)-tuples.
        is_final_step = len(base[0]) + 1 == size
        if is_final_step and known is None:
            # Hash-based container so each membership test is O(1).
            if isinstance(combinations, (set, frozenset, dict)):
                known = combinations
            else:
                known = set(combinations)

        grown = []
        for base_comb in base:
            for elem in elems:
                if elem > base_comb[-1]:
                    candidate = base_comb + (elem,)
                    if is_final_step and not all(
                        candidate[:i] + candidate[i + 1:] in known
                        for i in range(size)
                    ):
                        continue
                    grown.append(candidate)
        base = grown

    return base

Configuration and Initialization of Spark

- Parameters:
    - `APP_NAME` (string): the name of the Spark application
    - `MASTER` (string): the URL of the Spark master node
<br></br>
- Returns:
    - `sc` (SparkContext): the Spark context for the given application and master
    - `spark` (SparkSession): the Spark session for the given application and master

In [None]:
# Configure the application name and master once, then obtain the context
# (an already running one is reused by getOrCreate).
conf = SparkConf()
conf.setAppName(APP_NAME)
conf.setMaster(MASTER)
sc = SparkContext.getOrCreate(conf=conf)

# Session built with the same application name and master.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(MASTER)
    .getOrCreate()
)

Reading and Parsing Data from CSV File

- Parameters:
    - `INPUT_FILE` (string): the path to the input CSV file
<br></br>
- Returns:
    - `ds` (DataFrame): the parsed data as a Spark DataFrame

In [None]:
# Read the input CSV: the first row provides the column names and Spark
# infers the column types.
ds = spark.read.csv(path=INPUT_FILE, inferSchema=True, header=True)
# Last expression of the cell: shows the inferred schema.
ds.schema

Extracting and Aggregating Patient Conditions

1. Maps each row to a patient ID and a set containing their conditions.
2. Reduces the data by patient ID, combining the sets of condition codes for each patient into a single set.
3. Sorts the condition codes for each patient in ascending order.
4. Collects the resulting data into a list of tuples, where each tuple contains a patient ID and the sorted list of that patient's distinct condition codes.


In [None]:
# One (patient, sorted distinct codes) entry per patient, collected to the driver.
patient_conditions = (
    ds.rdd
    # start with a one-element set per row so duplicates vanish when merging
    .map(lambda row: (row[PATIENT_COLUMN], {row[CODE_COLUMN]}))
    # merge the code sets belonging to the same patient
    .reduceByKey(lambda left, right: left | right)
    # turn each set into an ascending list
    .mapValues(sorted)
    .collect()
)

print(patient_conditions[:5])

Generating Frequent Itemsets
1. Converts the patient condition into a set of itemsets by flattening the list of conditions for each patient.
2. Reduces the data by itemset, counting the number of occurrences of each itemset.
3. Filters itemsets that do not meet the minimum support threshold.
4. Caches the resulting RDD for faster access in subsequent iterations.
5. Collects the frequent itemsets into a Python dictionary.
6. Computes the number of frequent itemsets generated.

In [None]:

# Count in how many patients each condition code occurs and keep the codes
# that reach the support threshold.
base_results = (
    sc.parallelize(patient_conditions)
    .flatMap(lambda entry: [(code, 1) for code in entry[1]])
    .reduceByKey(add)
    .filter(lambda counted: counted[1] >= SUPPORT_THRESHOLD)
    .cache()  # the RDD is consumed by the two actions below
)

# Frequent codes as a list, then the same RDD as a {code: count} dict.
# `base_results` is rebound from the RDD to that dict here.
base_elements = base_results.map(lambda counted: counted[0]).collect()
base_results = base_results.collectAsMap()

len(base_elements) # 131

Create patient conditions and remove elements that are not in the base_elements list

`sc.parallelize()` method is used to create the RDD from the patient_conditions list. <br>
`map()` method is used to apply the filtering transformation to each patient condition. <br>
`cache()` method is used to cache the resulting filtered list in memory.

In [None]:
# Set of the frequent codes: the filter below tests every code of every
# patient, so hash lookups replace a linear scan of the `base_elements` list.
frequent_codes = set(base_elements)

# Keep, per patient, only the frequent codes (order of the sorted list is
# preserved); the patient id is dropped because only the baskets are needed.
filtered_patient_conditions = sc.parallelize(patient_conditions) \
  .map(lambda v: [c for c in v[1] if c in frequent_codes]) \
  .cache()

Compute frequent pairs of elements from the `filtered_patient_conditions` RDD

`flatMap()` is used to generate all pairs of elements in each patient condition using `combinations()` function

`reduceByKey()` is used to aggregate the count of each pair across all patients.

`filter()` is used to remove pairs that do not meet the minimum support threshold

In [None]:
# Count every pair of frequent codes per patient and keep the pairs that
# reach the support threshold.
pairs_results = (
    filtered_patient_conditions
    .flatMap(lambda codes: [(pair, 1) for pair in combinations(codes, 2)])
    .reduceByKey(add)
    .filter(lambda counted: counted[1] >= SUPPORT_THRESHOLD)
    .cache()  # reused by the three actions below
)

# Ten most frequent pairs (sorted by descending count via the negated key).
top_ten_pairs = pairs_results.sortBy(lambda counted: -counted[1]).toDF().head(10)
# Frequent pairs as a list, then the same RDD as a {pair: count} dict.
# `pairs_results` is rebound from the RDD to that dict here.
frequent_pairs = pairs_results.map(lambda counted: counted[0]).collect()
pairs_results = pairs_results.collectAsMap()

#len(frequent_pairs) ==> 2940

Compute frequent triples of elements from the `filtered_patient_conditions`

`flatMap()` is used to generate all triples of elements in each patient condition using `frequent_combinations()` function
- `frequent_combinations()` is given the list of previously computed frequent pairs so that triples containing a non-frequent pair can be discarded.

`reduceByKey()` used to aggregate the counts of each triple 

`filter()` is used to remove triples that do not meet the minimum support threshold

In [None]:
# Hash-based copy of the frequent pairs: any membership test that
# frequent_combinations makes against it is then O(1) instead of a list scan.
frequent_pairs_lookup = set(frequent_pairs)

# Count every candidate triple per patient and keep the triples that reach
# the support threshold.
triples_results = filtered_patient_conditions \
  .flatMap(lambda v: [(c, 1) for c in frequent_combinations(v, 3, frequent_pairs_lookup)]) \
  .reduceByKey(add) \
  .filter(lambda x: x[1] >= SUPPORT_THRESHOLD) \
  .cache()

# Ten most frequent triples (sorted by descending count via the negated key).
top_ten_triplets = triples_results.sortBy(lambda x: -x[1]).toDF().head(10)
# Frequent triples as a list, then the same RDD as a {triple: count} dict.
# `triples_results` is rebound from the RDD to that dict here.
frequent_triples = triples_results.map(lambda v: v[0]).collect()
triples_results = triples_results.collectAsMap()

# len(frequent_triples) ==> 13395

Show top 10 pair results 

In [None]:
# Bare expression: in a notebook cell this displays the rows computed earlier.
top_ten_pairs

Show top 10 triplets results 

In [None]:
# Bare expression: in a notebook cell this displays the rows computed earlier.
top_ten_triplets

# Exercise 1.2

Declare constants

In [None]:
# Input Constants
STD_LIFT_THRESHOLD = .2  # rules are kept only when their standardised lift is strictly greater than this

Analyze patient data and determine the relationship between various patient conditions <br>
<br>
function `create_line` that takes three arguments:
- v: items list representing a combination of patient conditions
- get_combination_support: a function that takes in a combination of patient conditions and returns the support of that combination
- union_support_results: a dictionary containing the support of all possible combinations of patient conditions

Returns:
- tuple containing the combination of patient conditions (variable v), the standard lift, lift, confidence, and interest.

In [None]:
def create_line(v, get_combination_support, union_support_results):
    """Compute the rule metrics for the rule `v[:-1] -> v[-1]`.

    Reads two notebook-level names: `patient_conditions` (its length is the
    number of baskets) and `base_results` (support of single elements).

    Parameters:
        v: tuple of items; all but the last form the left-hand side, the last
            item is the right-hand side.
        get_combination_support: callable returning the support of `v[:-1]`.
        union_support_results: mapping from the sorted tuple of all items in
            `v` to the support of that itemset.

    Returns:
        `(v, std_lift, lift, confidence, interest)`.

    Raises:
        KeyError: if a required support is missing from the lookups.
        ZeroDivisionError: if a support/probability used as divisor is zero,
            or if the two bounds used to standardise the lift coincide
            (e.g. both probabilities equal 1).
    """
    total = len(patient_conditions)

    # Supports: left-hand side, right-hand side element, and all items together.
    lhs_support = get_combination_support(v[:-1])
    rhs_support = base_results[v[-1]]
    joint_support = union_support_results[tuple(sorted(v))]

    lhs_probability = lhs_support/total
    rhs_probability = rhs_support/total

    confidence = joint_support/lhs_support
    interest = confidence - rhs_probability
    lift = confidence/rhs_probability

    # The lift is rescaled between these two bounds to obtain the standardised lift.
    lower_bound = max(lhs_probability+rhs_probability-1, 1/total)/(lhs_probability*rhs_probability)
    upper_bound = 1/max(lhs_probability, rhs_probability)
    std_lift = (lift - lower_bound)/(upper_bound - lower_bound)

    return (v, std_lift, lift, confidence, interest)

# create pair and triple rules
def create_pair_line(v):
    """Metrics for a two-item rule; the left-hand side support is the count of its single element."""
    return create_line(v, lambda c: base_results[c[0]], pairs_results)


def create_triple_line(v):
    """Metrics for a three-item rule; the left-hand side support is the count of its pair."""
    return create_line(v, lambda c: pairs_results[c], triples_results)

Create pair rules
- Take a list of frequent pairs 
- Create new pairs of items by reversing the order of the pairs
- Filter pairs that do not meet a certain criterion
- Sort the remaining pairs

`sc.parallelize(frequent_pairs)` create a Spark RDD from the `frequent_pairs` list <br>
`flatMap` used to transform the frequent_pairs list RDD into an RDD of pairs of items  <br>
`filter` keeps only the rules whose standardised lift (the second element of each result tuple) is greater than `STD_LIFT_THRESHOLD` <br>
`sortBy` sorts the remaining rules in ascending order of standardised lift <br>
`collect` return the results as a list of tuples<br>

In [None]:
# For every frequent pair (a, b) evaluate both rules a -> b and b -> a, keep
# those above the standardised-lift threshold, sorted by that value (ascending).
pairs_rules = (
    sc.parallelize(frequent_pairs)
    .flatMap(lambda pair: [create_pair_line(rule) for rule in (pair, pair[::-1])])
    .filter(lambda line: line[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda line: line[1])
    .collect()
)

Write results to `pair_rules.txt` file

In [None]:
with open('pair_rules.txt', 'w') as out:
    # Header line followed by one formatted line per rule.
    out.write(
        '({})->{:<30}{:<30}{:<30}{:<30}{:<30}\n'.format(
            "X", "Y", "Standardised Lift", "Lift", "Confidence", "Interest"
        )
    )
    out.writelines(
        '({:<})->{:<20}{:<30}{:<30}{:<30}{:<30}\n'.format(
            rule[0], rule[1], std_lift, lift, confidence, interest
        )
        for rule, std_lift, lift, confidence, interest in pairs_rules
    )

Create triplet rules

- Takes a list of frequent triples
- Create new triples by shifting the elements and combining them in various ways
- Filter triples that do not meet a certain criterion
- Sort triples

`sc.parallelize(frequent_triples):` create an RDD from the frequent_triples list <br>
`flatMap` transform the RDD created from the `frequent_triples` list into an RDD of triples of items<br>
`filter` keeps only the rules whose standardised lift (the second element of each result tuple) is greater than `STD_LIFT_THRESHOLD` <br>
`sortBy` sorts the remaining rules in ascending order of standardised lift <br>
`collect` return data as a list of tuples

In [None]:
# For every frequent triple (a, b, c) evaluate the three rules with a
# two-item left-hand side: (a, b) -> c, (b, c) -> a and (a, c) -> b.
# Keep those above the standardised-lift threshold, sorted by that value (ascending).
triples_rules = (
    sc.parallelize(frequent_triples)
    .flatMap(lambda triple: [
        create_triple_line(rule)
        for rule in (
            triple,
            triple[1:] + triple[:1],
            triple[:1] + triple[2:] + triple[1:2],
        )
    ])
    .filter(lambda line: line[1] > STD_LIFT_THRESHOLD)
    .sortBy(lambda line: line[1])
    .collect()
)

Write results to `triplet_rules.txt` file

In [None]:
with open('triplet_rules.txt', 'w') as out:
    # Header line followed by one formatted line per rule.
    out.write(
        '({},{})->{:<30}{:<30}{:<30}{:<30}{:<30}\n'.format(
            "X", "Y", "Z", "Standardised Lift", "Lift", "Confidence", "Interest"
        )
    )
    out.writelines(
        '({},{})->{:<20}{:<30}{:<30}{:<30}{:<30}\n'.format(
            rule[0], rule[1], rule[2], std_lift, lift, confidence, interest
        )
        for rule, std_lift, lift, confidence, interest in triples_rules
    )