In [190]:
# Lab #5 - Association Rules Mining
# Generating rules - only one item on the right side

from collections import Counter
from pprint import pprint
import itertools
import copy
import pandas as pd

In [191]:
from collections import Counter

def frequentItems(transactions, support):
    """Find the frequent 1-itemsets of a transaction database.

    Args:
        transactions: Sized collection of transactions, each an iterable of
            hashable items.
        support: Minimum relative support (fraction of transactions) an item
            must reach to be reported as frequent.

    Returns:
        A ``(frequent, counter)`` tuple. ``frequent`` is the set of
        single-item frozensets whose relative support is at least
        ``support``; ``counter`` maps every single-item frozenset that was
        seen to the number of transactions containing it.
    """
    counter = Counter()
    for trans in transactions:
        # Support counts transactions, not occurrences: collapse repeated
        # items so a duplicate inside one transaction is counted only once.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        counter.update(frozenset([t]) for t in dict.fromkeys(trans))
    return set(item for item in counter if counter[item] / len(transactions) >= support), counter

def generateCandidates(L, k):
    """Build candidate itemsets of size ``k`` by joining itemsets from ``L``.

    Args:
        L: Collection of itemsets (frozensets) to combine pairwise.
        k: Required number of items in each produced candidate.

    Returns:
        The set of unions of two different itemsets from ``L`` that contain
        exactly ``k`` items.
    """
    # The union is symmetric, so visiting each unordered pair once yields
    # the same candidates as comparing every itemset against every other.
    return {
        first | second
        for first, second in itertools.combinations(L, 2)
        if first != second and len(first | second) == k
    }

def filterCandidates(transactions, itemsets, support):
    """Count candidate itemsets and keep those that are frequent enough.

    Args:
        transactions: Sized collection of transactions (iterables of items).
        itemsets: Candidate itemsets (frozensets) to look for.
        support: Minimum relative support (fraction of transactions).

    Returns:
        A ``(frequent, counter)`` tuple. ``frequent`` is the set of
        candidates whose relative support is at least ``support``;
        ``counter`` maps every candidate found in at least one transaction
        to the number of transactions containing it.
    """
    occurrences = Counter()
    for transaction in transactions:
        for candidate in itemsets:
            # A candidate is counted once per transaction that contains it.
            if candidate.issubset(transaction):
                occurrences[candidate] += 1

    frequent = {
        candidate
        for candidate in occurrences
        if occurrences[candidate] / len(transactions) >= support
    }
    return frequent, occurrences

def apriori(transactions, support):
    """Mine frequent itemsets level by level (Apriori).

    Args:
        transactions: Sized collection of transactions (iterables of items).
        support: Minimum relative support (fraction of transactions).

    Returns:
        A ``(itemsets, supports)`` tuple. ``itemsets`` is a list of every
        frequent itemset found, from single items upwards. ``supports`` maps
        each counted itemset to its relative support; it also contains
        counted itemsets that did not reach the threshold.
    """
    frequent_itemsets = []
    occurrence_totals = Counter()

    # Level 1 holds the frequent single items; every later level is built
    # from the survivors of the level before it.
    size = 1
    level, level_counts = frequentItems(transactions, support)
    while True:
        frequent_itemsets.extend(level)
        occurrence_totals += level_counts
        if not level:
            # Nothing survived this level, so no larger itemset can be built.
            break
        size += 1
        level, level_counts = filterCandidates(
            transactions, generateCandidates(level, size), support
        )

    relative_supports = {
        itemset: count / len(transactions)
        for itemset, count in occurrence_totals.items()
    }
    return frequent_itemsets, relative_supports

In [192]:
def generate_posibility(itemset):
    """Enumerate the splits of ``itemset`` into a rule with one consequent.

    Args:
        itemset: Iterable of items.

    Returns:
        A list with one ``(antecedent, consequent)`` pair per item, in the
        iteration order of ``itemset``. ``consequent`` is a one-element list
        holding the chosen item and ``antecedent`` is a list of all the
        other items.
    """
    items = list(itemset)
    # Slicing builds each antecedent directly; the items themselves are not
    # copied, which avoids a deep copy of the whole list for every item.
    return [
        (items[:position] + items[position + 1:], [item])
        for position, item in enumerate(items)
    ]

In [193]:
def generate_rules(itemsets, supports, min_confidence, metric):
    """Derive association rules with a single item on the right-hand side.

    Args:
        itemsets: Iterable of frozensets; itemsets with fewer than two items
            are skipped.
        supports: Mapping from frozenset to relative support. It must contain
            every itemset, every single item of it, and every itemset with
            one item removed.
        min_confidence: Minimum value of the selected metric for a rule to
            be kept.
        metric: ``"confidence"``, ``"lift"`` or ``"conviction"``.

    Returns:
        A list of ``(left_side, right_side, score, support)`` tuples, where
        both sides are lists of items and ``score`` and ``support`` are
        rounded to three decimals. The list is sorted by support, then by
        score, both descending.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Fail fast on a bad metric name instead of only when a rule is scored.
    if metric not in ("confidence", "lift", "conviction"):
        raise ValueError("Metric must be confidence or lift or conviction.")

    generated_rules = []

    for itemset in itemsets:
        if len(itemset) < 2:
            continue

        items = list(itemset)
        rule_support = supports[itemset]

        # Each item in turn becomes the consequent; the rest is the antecedent.
        for position, item in enumerate(items):
            left_side = items[:position] + items[position + 1:]
            right_side = [item]
            left_support = supports[frozenset(left_side)]
            right_support = supports[frozenset(right_side)]

            if metric == "confidence":
                score = rule_support / left_support
            elif metric == "lift":
                score = rule_support / (left_support * right_support)
            else:
                # Conviction = (1 - supp(rhs)) / (1 - confidence). A rule
                # that never fails (confidence 1) has infinite conviction;
                # scoring it 0 would discard the strongest rules.
                failure_rate = 1 - rule_support / left_support
                if failure_rate != 0:
                    score = (1 - right_support) / failure_rate
                else:
                    score = float("inf")

            if score >= min_confidence:
                generated_rules.append(
                    (left_side, right_side, round(score, 3), round(rule_support, 3))
                )

    return sorted(generated_rules, key=lambda rule: (rule[3], rule[2]), reverse=True)

In [194]:
def load_csv_to_list_of_lists(filename):
    """Load a CSV file as transactions made of ``"column=value"`` items.

    Args:
        filename: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        One list per data row, each holding one ``"column=value"`` string per
        column, in column order.
    """
    dataset = pd.read_csv(filename)
    column_names = [str(col) for col in dataset.columns]
    # itertuples reads each value from its own column, so it keeps the
    # column's dtype. iterrows builds one Series per row and upcasts
    # integers to float when a row mixes int and float columns, which turned
    # "a=1" into "a=1.0".
    return [
        [name + "=" + str(value) for name, value in zip(column_names, row)]
        for row in dataset.itertuples(index=False, name=None)
    ]

In [195]:
def print_topn_rules(rules, n, metric_label="CONFIDENCE"):
    """Print the first ``n`` rules, one per line.

    Args:
        rules: Sequence of ``(left_side, right_side, score, support)`` rules.
        n: Number of leading rules to print.
        metric_label: Label printed in front of the score. It defaults to
            ``"CONFIDENCE"``; pass e.g. ``"LIFT"`` when the scores come from
            another metric so the output is not mislabelled.
    """
    for rule in rules[:n]:
        left_side, right_side, score, support = rule[:4]
        print("{} -> {} - SUPPORT: {} - {}: {}".format(
            left_side, right_side, support, metric_label, score))

In [196]:
# Small example input: five transactions, each a list of item names.
dataset = [
    ['bread', 'milk'],
    ['bread', 'diaper', 'beer', 'egg'],
    ['milk', 'diaper', 'beer', 'cola'],
    ['bread', 'milk', 'diaper', 'beer'],
    ['bread', 'milk', 'diaper', 'cola'],
]

In [200]:
# Number of top-ranked rules printed for each metric.
n_print = 20
# Mine the example transactions with a minimum relative support of 0.3.
result, resultc = apriori(dataset, 0.3)

# One report per metric, with a blank line between consecutive reports.
for heading, metric_name in (
    ("CONFIDENCE:", "confidence"),
    ("LIFT:", "lift"),
    ("CONVICTION:", "conviction"),
):
    if metric_name != "confidence":
        print()
    print(heading)
    print_topn_rules(generate_rules(result, resultc, 0.5, metric=metric_name), n_print)

CONFIDENCE:
['beer'] -> ['diaper'] - SUPPORT: 0.6 - CONFIDENCE: 1.0
['bread'] -> ['diaper'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['diaper'] -> ['bread'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['bread'] -> ['milk'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['milk'] -> ['bread'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['diaper'] -> ['beer'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['milk'] -> ['diaper'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['diaper'] -> ['milk'] - SUPPORT: 0.6 - CONFIDENCE: 0.75
['cola'] -> ['diaper'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['cola'] -> ['milk'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['milk', 'beer'] -> ['diaper'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['beer', 'bread'] -> ['diaper'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['milk', 'cola'] -> ['diaper'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['diaper', 'cola'] -> ['milk'] - SUPPORT: 0.4 - CONFIDENCE: 1.0
['beer'] -> ['milk'] - SUPPORT: 0.4 - CONFIDENCE: 0.667
['beer'] -> ['bread'] - SUPPORT: 0.4 - CONFIDENCE: 0.667
['diaper', 'beer'] -> ['milk'] - SUPPORT: 0.4 - CONFI

In [201]:
# Mine the bank data set with a minimum relative support of 0.3.
bank_transactions = load_csv_to_list_of_lists("./bank-data.csv")
result, resultc = apriori(bank_transactions, 0.3)

# One report per metric, with a blank line between consecutive reports.
for heading, metric_name in (
    ("CONFIDENCE:", "confidence"),
    ("LIFT:", "lift"),
    ("CONVICTION:", "conviction"),
):
    if metric_name != "confidence":
        print()
    print(heading)
    print_topn_rules(generate_rules(result, resultc, 0.5, metric=metric_name), n_print)

CONFIDENCE:
['save_act=YES'] -> ['current_act=YES'] - SUPPORT: 0.532 - CONFIDENCE: 0.771
['current_act=YES'] -> ['save_act=YES'] - SUPPORT: 0.532 - CONFIDENCE: 0.701
['mortgage=NO'] -> ['current_act=YES'] - SUPPORT: 0.502 - CONFIDENCE: 0.77
['current_act=YES'] -> ['mortgage=NO'] - SUPPORT: 0.502 - CONFIDENCE: 0.662
['married=YES'] -> ['current_act=YES'] - SUPPORT: 0.488 - CONFIDENCE: 0.74
['current_act=YES'] -> ['married=YES'] - SUPPORT: 0.488 - CONFIDENCE: 0.644
['married=YES'] -> ['save_act=YES'] - SUPPORT: 0.462 - CONFIDENCE: 0.699
['save_act=YES'] -> ['married=YES'] - SUPPORT: 0.462 - CONFIDENCE: 0.669
['mortgage=NO'] -> ['save_act=YES'] - SUPPORT: 0.45 - CONFIDENCE: 0.691
['save_act=YES'] -> ['mortgage=NO'] - SUPPORT: 0.45 - CONFIDENCE: 0.652
['mortgage=NO'] -> ['married=YES'] - SUPPORT: 0.435 - CONFIDENCE: 0.668
['married=YES'] -> ['mortgage=NO'] - SUPPORT: 0.435 - CONFIDENCE: 0.659
['pep=NO'] -> ['current_act=YES'] - SUPPORT: 0.407 - CONFIDENCE: 0.748
['current_act=YES'] -> ['pe

In [204]:
# NOTE(review): get_dataset_uci is not defined in any cell visible here, and
# the execution counts jump from 201 to 204 - confirm the defining cell still
# exists so this runs on a fresh kernel.
zoo_transactions = get_dataset_uci("./zoo.csv")
# Mine the zoo data set with a minimum relative support of 0.3.
result, resultc = apriori(zoo_transactions, 0.3)

# One report per metric, with a blank line between consecutive reports.
for heading, metric_name in (
    ("CONFIDENCE:", "confidence"),
    ("LIFT:", "lift"),
    ("CONVICTION:", "conviction"),
):
    if metric_name != "confidence":
        print()
    print(heading)
    print_topn_rules(generate_rules(result, resultc, 0.5, metric=metric_name), n_print)

CONFIDENCE:
['domestic=False'] -> ['venomous=False'] - SUPPORT: 0.802 - CONFIDENCE: 0.92
['venomous=False'] -> ['domestic=False'] - SUPPORT: 0.802 - CONFIDENCE: 0.871
['backbone=True'] -> ['venomous=False'] - SUPPORT: 0.782 - CONFIDENCE: 0.952
['venomous=False'] -> ['backbone=True'] - SUPPORT: 0.782 - CONFIDENCE: 0.849
['fins=False'] -> ['venomous=False'] - SUPPORT: 0.762 - CONFIDENCE: 0.917
['venomous=False'] -> ['fins=False'] - SUPPORT: 0.762 - CONFIDENCE: 0.828
['breathes=True'] -> ['fins=False'] - SUPPORT: 0.752 - CONFIDENCE: 0.95
['fins=False'] -> ['breathes=True'] - SUPPORT: 0.752 - CONFIDENCE: 0.905
['breathes=True'] -> ['venomous=False'] - SUPPORT: 0.743 - CONFIDENCE: 0.938
['venomous=False'] -> ['breathes=True'] - SUPPORT: 0.743 - CONFIDENCE: 0.806
['tail=True'] -> ['backbone=True'] - SUPPORT: 0.733 - CONFIDENCE: 0.987
['backbone=True'] -> ['tail=True'] - SUPPORT: 0.733 - CONFIDENCE: 0.892
['airborne=False'] -> ['feathers=False'] - SUPPORT: 0.723 - CONFIDENCE: 0.948
['feathers