# Data Reading and Preprocessing


In [40]:
import pandas as pd
import numpy as np
from itertools import combinations

## Specify the input file here

File needs to be a **csv** of the following format:

````
item1, item2, item3, ... so on
 , t, ...
t, t, t,...
t, t, ...
... so on...
````


In [41]:
# Load the binarized transaction table from a CSV located by a relative path.
# low_memory=False asks pandas to parse the file in one pass instead of in chunks.
df = pd.read_csv("transactions_binarized.csv", low_memory=False)

In [42]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,asparagus,almonds,antioxydant juice,asparagus.1,avocado,babies food,bacon,barbecue sauce,black tea,blueberries,...,turkey,vegetables mix,water spray,white wine,whole weat flour,whole wheat pasta,whole wheat rice,yams,yogurt cake,zucchini
0,,t,t,,t,,,,,,...,,t,,,t,,,t,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,t,,,,,,...,t,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,t,,,


Indexing each item from the header of the data file.


In [43]:
# Give every column header a 1-based integer id, following header order.
item_list = list(df.columns)
item_dict = {name: position for position, name in enumerate(item_list, start=1)}

item_dict

{' asparagus': 1,
 'almonds': 2,
 'antioxydant juice': 3,
 'asparagus': 4,
 'avocado': 5,
 'babies food': 6,
 'bacon': 7,
 'barbecue sauce': 8,
 'black tea': 9,
 'blueberries': 10,
 'body spray': 11,
 'bramble': 12,
 'brownies': 13,
 'bug spray': 14,
 'burger sauce': 15,
 'burgers': 16,
 'butter': 17,
 'cake': 18,
 'candy bars': 19,
 'carrots': 20,
 'cauliflower': 21,
 'cereals': 22,
 'champagne': 23,
 'chicken': 24,
 'chili': 25,
 'chocolate': 26,
 'chocolate bread': 27,
 'chutney': 28,
 'cider': 29,
 'clothes accessories': 30,
 'cookies': 31,
 'cooking oil': 32,
 'corn': 33,
 'cottage cheese': 34,
 'cream': 35,
 'dessert wine': 36,
 'eggplant': 37,
 'eggs': 38,
 'energy bar': 39,
 'energy drink': 40,
 'escalope': 41,
 'extra dark chocolate': 42,
 'flax seed': 43,
 'french fries': 44,
 'french wine': 45,
 'fresh bread': 46,
 'fresh tuna': 47,
 'fromage blanc': 48,
 'frozen smoothie': 49,
 'frozen vegetables': 50,
 'gluten free bar': 51,
 'grated cheese': 52,
 'green beans': 53,
 'gree

Extracting the transactions from the data.


In [44]:
# Turn each row into the set of item ids whose cell holds the marker "t".
transactions = [
    {item_dict[name] for name in item_dict if row[name] == "t"}
    for _, row in df.iterrows()
]

transactions

[{2,
  3,
  5,
  34,
  40,
  49,
  54,
  55,
  61,
  66,
  73,
  82,
  92,
  93,
  98,
  103,
  107,
  112,
  115,
  118},
 {16, 38, 70},
 {28},
 {5, 111},
 {39, 55, 72, 73, 117},
 {66},
 {44, 116},
 {64, 96, 100},
 {50, 55, 101},
 {44},
 {38, 87},
 {31},
 {16, 32, 38, 73, 111},
 {23, 31, 101},
 {73, 93},
 {73},
 {24, 26, 32, 61, 66, 81, 98},
 {38, 111},
 {9, 24, 38, 42, 47, 73, 93, 101, 109, 111},
 {44, 61, 70, 72, 89},
 {26, 38, 85, 86, 90, 97, 98},
 {91, 102},
 {11, 55, 58, 73, 83, 101},
 {5, 16, 52, 61, 85, 98, 110, 114},
 {38},
 {5, 46, 72, 84, 100, 101},
 {9, 39, 41, 49, 56, 72, 73, 93, 101},
 {102},
 {24, 26, 38, 44, 73},
 {50, 73, 101, 118},
 {60, 64, 67, 108},
 {5, 26, 38, 73},
 {44, 104, 111},
 {26, 50, 105},
 {31},
 {41, 88, 93, 101},
 {39, 44},
 {56, 73, 90},
 {18, 34, 73},
 {23, 55, 88},
 {101},
 {41, 47, 49, 50, 61, 73, 101, 117},
 {101},
 {62, 70, 100, 102},
 {41},
 {5, 11, 13, 44, 55, 62, 83, 100},
 {22, 24, 30, 73},
 {14, 73},
 {5, 76},
 {9, 16, 55},
 {13, 26, 55, 101,

---

## Utility Functions


**get_support** function evaluates the support value for a set given all the transactions.


In [45]:
def get_support(transactions, item_set):
    """Return the fraction of transactions that contain every item of ``item_set``.

    Args:
        transactions: Sized collection of transactions; each transaction is an
            iterable of items (sets in this notebook).
        item_set: Set (or frozenset) of items to look for.

    Returns:
        float: Number of transactions that are supersets of ``item_set``
        divided by the total number of transactions, or ``0.0`` when
        ``transactions`` is empty.
    """
    total = len(transactions)
    if total == 0:
        # An empty dataset supports nothing; this also avoids a ZeroDivisionError.
        return 0.0

    match_count = sum(
        1 for transaction in transactions if item_set.issubset(transaction)
    )
    return match_count / total

---

**self_join** generates the candidates for the current level from the frequent itemsets of the previous level. It takes the union of every pair of those itemsets and keeps a union only if its size equals the current level; larger unions are skipped.


In [46]:
def self_join(frequent_item_sets_per_level, level):
    """Build the candidate itemsets of size ``level`` from the previous level.

    Every unordered pair of itemsets from ``level - 1`` is unioned; a union is
    kept when it has exactly ``level`` elements and has not been produced
    already.

    Args:
        frequent_item_sets_per_level: Mapping from level to a list of
            ``(itemset, support)`` pairs.
        level: Size of the itemsets to generate.

    Returns:
        list: Distinct candidate itemsets, in first-seen order.
    """
    last_level_items = frequent_item_sets_per_level[level - 1]
    current_level_candidates = list()

    # Hashable mirror of the candidates collected so far, so the duplicate
    # check is a constant-time lookup instead of a scan of the whole list.
    seen = set()

    # combinations() yields the pairs (i, j) with i < j in the same order as
    # two nested index loops would.
    for pair_i, pair_j in combinations(last_level_items, 2):
        union_set = pair_i[0].union(pair_j[0])
        if len(union_set) != level:
            continue

        key = frozenset(union_set)
        if key not in seen:
            seen.add(key)
            current_level_candidates.append(union_set)

    return current_level_candidates

---

**pruning** prunes the candidate sets produced by the self-join step. For each candidate itemset, it generates every subset obtained by dropping a single element and checks whether that subset was present in the previous level. If any such subset is missing from the previous level, the candidate cannot be frequent and is pruned.


In [47]:
def get_single_drop_subsets(item_set):
    """Return every subset of ``item_set`` that is missing exactly one element.

    The input set is left untouched; each result is an independent copy.
    """

    def without(member):
        # Work on a copy so the caller's set is never mutated.
        reduced = item_set.copy()
        reduced.remove(member)
        return reduced

    return [without(member) for member in item_set]


def is_valid_set(item_set, prev_level_sets):
    """Tell whether all one-element-smaller subsets of ``item_set`` are known.

    Args:
        item_set: Candidate itemset to check.
        prev_level_sets: Collection of the itemsets kept at the previous level.

    Returns:
        bool: ``True`` only if every subset obtained by dropping one element
        is contained in ``prev_level_sets``.
    """
    return all(
        subset in prev_level_sets
        for subset in get_single_drop_subsets(item_set)
    )


def pruning(frequent_item_sets_per_level, level, candidate_set):
    """Drop candidates that have a subset missing from the previous level.

    Args:
        frequent_item_sets_per_level: Mapping from level to a list of
            ``(itemset, support)`` pairs.
        level: Level (itemset size) the candidates belong to.
        candidate_set: List of candidate itemsets for ``level``.

    Returns:
        list: The candidates for which ``is_valid_set`` returns ``True``, in
        their original order.
    """
    if len(candidate_set) == 0:
        return list()

    # Store the previous level as frozensets inside a set so each membership
    # test is a hash lookup rather than a scan over a list. Looking up a
    # mutable set in a set of frozensets works: set membership falls back to
    # an equivalent temporary frozenset.
    prev_level_sets = {
        frozenset(item_set)
        for item_set, _ in frequent_item_sets_per_level[level - 1]
    }

    return [
        item_set
        for item_set in candidate_set
        if is_valid_set(item_set, prev_level_sets)
    ]

---

## Apriori Algorithm


In [48]:
from collections import defaultdict



def apriori(min_support):
    """Mine the frequent itemsets level by level.

    Reads the notebook-level ``item_list`` (for the number of items) and
    ``transactions``. Progress is printed as the level numbers on one line.

    Args:
        min_support: Minimum support an itemset needs to be kept.

    Returns:
        defaultdict: Maps each level to a list of ``(itemset, support)`` pairs.
    """
    frequent_by_level = defaultdict(list)
    item_count = len(item_list)

    # Level 1: every single item id is a candidate.
    print("level : 1", end=" ")
    for item_id in range(1, item_count + 1):
        singleton = {item_id}
        singleton_support = get_support(transactions, singleton)
        if singleton_support >= min_support:
            frequent_by_level[1].append((singleton, singleton_support))

    # Higher levels: join, prune, then count the survivors.
    for level in range(2, item_count + 1):
        print(level, end=" ")

        survivors = pruning(
            frequent_by_level, level, self_join(frequent_by_level, level)
        )
        if not survivors:
            break

        for candidate in survivors:
            candidate_support = get_support(transactions, candidate)
            if candidate_support >= min_support:
                frequent_by_level[level].append((candidate, candidate_support))

    return frequent_by_level

### Specify the **minimum support** value here


In [49]:
# Threshold passed to apriori(): itemsets whose support is below it are dropped.
min_support = 0.005
# Result maps each level to a list of (itemset, support) pairs.
frequent_item_sets_per_level = apriori(min_support)

level : 1 2 

3 4 5 

Debug print statements to check the number of frequent sets calculated for each level.


In [50]:
# One line per level: how many frequent itemsets were kept.
for level, level_item_sets in frequent_item_sets_per_level.items():
    print(len(level_item_sets))

101
451
173
0


Debug statement to check the frequent sets calculated.


In [51]:
# One line per level: the (itemset, support) pairs themselves.
for level, level_item_sets in frequent_item_sets_per_level.items():
    print(level_item_sets)

[({2}, 0.020397280362618318), ({3}, 0.008932142381015865), ({5}, 0.03332888948140248), ({7}, 0.008665511265164644), ({8}, 0.010798560191974404), ({9}, 0.014264764698040262), ({10}, 0.009198773496867084), ({11}, 0.011465137981602452), ({13}, 0.03372883615517931), ({14}, 0.008665511265164644), ({15}, 0.005865884548726837), ({16}, 0.0871883748833489), ({17}, 0.030129316091187842), ({18}, 0.08105585921877083), ({19}, 0.009732035728569524), ({20}, 0.01533128916144514), ({22}, 0.025729902679642713), ({23}, 0.04679376083188908), ({24}, 0.05999200106652446), ({25}, 0.0061325156645780565), ({26}, 0.1638448206905746), ({29}, 0.010531929076123183), ({30}, 0.008398880149313425), ({31}, 0.08038928142914278), ({32}, 0.0510598586855086), ({34}, 0.03186241834422077), ({37}, 0.013198240234635382), ({38}, 0.17970937208372217), ({39}, 0.027063058258898813), ({40}, 0.026663111585121985), ({41}, 0.0793227569657379), ({42}, 0.011998400213304892), ({43}, 0.009065457938941474), ({44}, 0.1709105452606319), ({4

---

## Generating Association Rules

Prepare input for calculating association rules: Create a dictionary of each frequent itemset against its support value.


In [52]:
# Reverse lookup (item id -> item name), built once so translating an id is a
# dictionary access instead of a linear search through the values.
index_to_item = {index: name for name, index in item_dict.items()}

# Map every frequent itemset, expressed as a frozenset of item names, to its
# support. This cell uses its own names only, so the global `item_list` that
# apriori() reads is not overwritten and apriori() can be re-run afterwards.
item_support_dict = {
    frozenset(index_to_item[index] for index in item_set): support
    for level_item_sets in frequent_item_sets_per_level.values()
    for item_set, support in level_item_sets
}

Debug statement to check the values in the dictionary created.


In [53]:
item_support_dict

{frozenset({'almonds'}): 0.020397280362618318,
 frozenset({'antioxydant juice'}): 0.008932142381015865,
 frozenset({'avocado'}): 0.03332888948140248,
 frozenset({'bacon'}): 0.008665511265164644,
 frozenset({'barbecue sauce'}): 0.010798560191974404,
 frozenset({'black tea'}): 0.014264764698040262,
 frozenset({'blueberries'}): 0.009198773496867084,
 frozenset({'body spray'}): 0.011465137981602452,
 frozenset({'brownies'}): 0.03372883615517931,
 frozenset({'bug spray'}): 0.008665511265164644,
 frozenset({'burger sauce'}): 0.005865884548726837,
 frozenset({'burgers'}): 0.0871883748833489,
 frozenset({'butter'}): 0.030129316091187842,
 frozenset({'cake'}): 0.08105585921877083,
 frozenset({'candy bars'}): 0.009732035728569524,
 frozenset({'carrots'}): 0.01533128916144514,
 frozenset({'cereals'}): 0.025729902679642713,
 frozenset({'champagne'}): 0.04679376083188908,
 frozenset({'chicken'}): 0.05999200106652446,
 frozenset({'chili'}): 0.0061325156645780565,
 frozenset({'chocolate'}): 0.1638448

### Utility Function

**find_subset** finds all the subsets of the given itemset.


In [54]:
def find_subset(item, item_length):
    """List the combinations of ``item`` of size 1 up to ``item_length``.

    Args:
        item: Iterable of elements to combine.
        item_length: Largest combination size to generate.

    Returns:
        list: Tuples, grouped by increasing size, in the order produced by
        ``itertools.combinations``.
    """
    return [
        combo
        for size in range(1, item_length + 1)
        for combo in combinations(item, size)
    ]

**association_rules** generates the association rules in accordance with the given _minimum confidence_ value and the provided dictionary of itemsets against their support values. For itemsets of more than one element, it first finds all their subsets. For every subset A, it calculates the set B = itemset - A. If B is not empty, the confidence of the rule _A->B_ is calculated as support(A ∪ B) / support(A). If this value is at least the _minimum confidence_ value, the rule _A->B_ is added to the list.


In [55]:
def association_rules(min_confidence, support_dict):
    """Derive the rules ``A -> B`` that reach the requested confidence.

    Args:
        min_confidence: Lowest confidence a rule may have to be reported.
        support_dict: Mapping from itemset (frozenset) to its support; it must
            also contain every antecedent that is looked up.

    Returns:
        list: ``(A, B, confidence)`` tuples where ``A`` and ``B`` are disjoint,
        non-empty, and together form one key of ``support_dict``.
    """
    rules = list()

    for itemset in support_dict:
        size = len(itemset)
        if size <= 1:
            # A single item cannot be split into two non-empty sides.
            continue

        for candidate in find_subset(itemset, size):
            consequent = itemset.difference(candidate)
            if not consequent:
                # The candidate is the whole itemset; nothing is left for B.
                continue

            antecedent = frozenset(candidate)
            # antecedent | consequent rebuilds the full itemset.
            confidence = (
                support_dict[antecedent | consequent] / support_dict[antecedent]
            )
            if confidence >= min_confidence:
                rules.append((antecedent, consequent, confidence))

    return rules

### Specify Minimum confidence value here


In [56]:
# NOTE(review): this rebinds the name `association_rules` from the function to
# the list it returns, so running this cell a second time without re-running
# the defining cell would try to call a list. TODO: consider a distinct name.
association_rules = association_rules(
    min_confidence=0.5, support_dict=item_support_dict
)

---

### Printing the output in the required format


In [57]:
# Report the rule count, then one "antecedent -> consequent" line per rule.
print("Number of rules: ", len(association_rules), "\n")

for antecedent, consequent, confidence in association_rules:
    print('{0} -> {1} <confidence: {2}>'.format(
        set(antecedent), set(consequent), confidence))

Number of rules:  20 

{'chicken', 'chocolate'} -> {'mineral water'} <confidence: 0.5181818181818182>
{'olive oil', 'chocolate'} -> {'mineral water'} <confidence: 0.5040650406504065>
{'soup', 'chocolate'} -> {'mineral water'} <confidence: 0.5526315789473685>
{'cooking oil', 'eggs'} -> {'mineral water'} <confidence: 0.5454545454545454>
{'ground beef', 'eggs'} -> {'mineral water'} <confidence: 0.5066666666666667>
{'frozen vegetables', 'ground beef'} -> {'mineral water'} <confidence: 0.5433070866141732>
{'frozen vegetables', 'ground beef'} -> {'spaghetti'} <confidence: 0.5118110236220472>
{'frozen vegetables', 'olive oil'} -> {'mineral water'} <confidence: 0.5764705882352941>
{'frozen vegetables', 'soup'} -> {'mineral water'} <confidence: 0.6333333333333333>
{'frozen vegetables', 'olive oil'} -> {'spaghetti'} <confidence: 0.5058823529411764>
{'ground beef', 'milk'} -> {'mineral water'} <confidence: 0.503030303030303>
{'ground beef', 'pancakes'} -> {'mineral water'} <confidence: 0.51376146