
---
---
<strong style="font-size:42pt;">Eclat</strong> 

---
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [41]:
with open('../q3/browsing.txt', 'r') as file:
    data = file.readlines()
    
baskets = []
for line in data:
    baskets.append([product for product in line.split()])

In [76]:
class Eclat:
    def __init__(self, min_support=100):
        self.min_support = min_support
        self.itemsets = []

    def fit(self, transactions):
        self.transactions = transactions
        self.min_support_count = self.min_support
        self.items = self._get_items()
        self._eclat([], list(self.items.keys()))

    def _get_items(self):
        items = {}
        for i, transaction in enumerate(self.transactions):
            for item in transaction:
                if item not in items:
                    items[item] = set()
                items[item].add(i)
        return items

    def _eclat(self, prefix, items):
        while items:
            item = items.pop(0)
            new_prefix = prefix + [item]
            item_support = self.items[item]

            if len(item_support) >= self.min_support_count:
                self.itemsets.append((new_prefix, len(item_support)))
                suffix_items = []
                for i in range(len(items)):
                    new_item_support = item_support & self.items[items[i]]
                    if len(new_item_support) >= self.min_support_count:
                        suffix_items.append(items[i])
                        self.items[items[i]] = new_item_support
                self._eclat(new_prefix, suffix_items)

    def get_itemsets(self):
        return self.itemsets

In [75]:
def get_full_support(itemset, transactions):
    count = 0
    for transaction in transactions:
        if set(itemset).issubset(set(transaction)):
            count += 1
    return count

In [78]:
eclat = Eclat(min_support=100)
eclat.fit(baskets)
frequent_itemsets = eclat.get_itemsets()

frequent_itemsets_dict = {tuple(sorted(itemset)): support for itemset, support in frequent_itemsets}


In [79]:
print("Frequent itemsets:")
for itemset, support in frequent_itemsets:
    print(f"Itemset: {itemset}, Support: {support}")

Frequent itemsets:
Itemset: ['FRO11987'], Support: 104
Itemset: ['ELE17451'], Support: 3875
Itemset: ['ELE17451', 'SNA90258'], Support: 113
Itemset: ['ELE17451', 'GRO99222'], Support: 148
Itemset: ['ELE17451', 'ELE26917'], Support: 314
Itemset: ['ELE17451', 'ELE26917', 'DAI62779'], Support: 160
Itemset: ['ELE17451', 'SNA30755'], Support: 111
Itemset: ['ELE17451', 'GRO73461'], Support: 580
Itemset: ['ELE17451', 'GRO73461', 'GRO30386'], Support: 103
Itemset: ['ELE17451', 'GRO73461', 'FRO40251'], Support: 159
Itemset: ['ELE17451', 'GRO73461', 'DAI75645'], Support: 121
Itemset: ['ELE17451', 'DAI22896'], Support: 193
Itemset: ['ELE17451', 'SNA99873'], Support: 270
Itemset: ['ELE17451', 'GRO56989'], Support: 129
Itemset: ['ELE17451', 'FRO78087'], Support: 218
Itemset: ['ELE17451', 'ELE59935'], Support: 181
Itemset: ['ELE17451', 'DAI22177'], Support: 203
Itemset: ['ELE17451', 'ELE66810'], Support: 154
Itemset: ['ELE17451', 'GRO94758'], Support: 227
Itemset: ['ELE17451', 'SNA55952'], Support: 

In [80]:
frequent_doubletons = []
frequent_triplets = []
for itemset, support in frequent_itemsets_dict.items():
    if len(itemset) == 2:
        frequent_doubletons.append((itemset, support))
    elif len(itemset) == 3:
        frequent_triplets.append((itemset, support))

In [81]:
print(frequent_doubletons)

[(('ELE17451', 'SNA90258'), 113), (('ELE17451', 'GRO99222'), 148), (('ELE17451', 'ELE26917'), 314), (('ELE17451', 'SNA30755'), 111), (('ELE17451', 'GRO73461'), 580), (('DAI22896', 'ELE17451'), 193), (('ELE17451', 'SNA99873'), 270), (('ELE17451', 'GRO56989'), 129), (('ELE17451', 'FRO78087'), 218), (('ELE17451', 'ELE59935'), 181), (('DAI22177', 'ELE17451'), 203), (('ELE17451', 'ELE66810'), 154), (('ELE17451', 'GRO94758'), 227), (('ELE17451', 'SNA55952'), 123), (('DAI48891', 'ELE17451'), 121), (('ELE11111', 'ELE17451'), 121), (('ELE17451', 'FRO92261'), 127), (('ELE17451', 'FRO32293'), 219), (('DAI95741', 'ELE17451'), 102), (('ELE17451', 'GRO30386'), 103), (('ELE17451', 'FRO16142'), 152), (('DAI35347', 'ELE17451'), 158), (('ELE17451', 'SNA93860'), 182), (('ELE17451', 'SNA72163'), 272), (('DAI55911', 'ELE17451'), 127), (('ELE17451', 'FRO31317'), 359), (('ELE17451', 'SNA59903'), 127), (('ELE17451', 'GRO15017'), 171), (('ELE17451', 'FRO98184'), 112), (('ELE17451', 'GRO59710'), 408), (('DAI639

In [82]:
ass_double_rules = {}
for itemset, support in frequent_doubletons:
    item1, item2 = itemset
    
    ass_double_rules[(item1, item2)] = support / get_full_support([item1], baskets)
    ass_double_rules[(item2, item1)] = support / get_full_support([item2], baskets)

sorted_rules = sorted(ass_double_rules.items(), key=lambda x: x[1], reverse=True)
top_doubleton_rules = [rule for rule in sorted_rules if rule[1] > 0.5]

print(f"Top rules with confidence above 0.5:\n {top_doubleton_rules}")
print(f"\nNumber of top rules: {len(top_doubleton_rules)}")

Top rules with confidence above 0.5:
 [(('DAI43868', 'SNA82528'), 0.972972972972973), (('GRO89004', 'ELE25077'), 0.698051948051948), (('SNA44451', 'DAI18527'), 0.5828571428571429), (('FRO17734', 'ELE28189'), 0.5815602836879432), (('DAI46755', 'FRO81176'), 0.5803921568627451), (('SNA30859', 'GRO24246'), 0.53125)]

Number of top rules: 6


In [83]:
ass_triplet_rules = {}
for itemset, support in frequent_triplets:
    item1, item2, item3 = itemset
    ass_triplet_rules[(item1, item2, item3)] = support / get_full_support([item1, item2], baskets)
    ass_triplet_rules[(item1, item3, item2)] = support / get_full_support([item1, item3], baskets)
    ass_triplet_rules[(item2, item3, item1)] = support / get_full_support([item2, item3], baskets)

sorted_triplet_rules = sorted(ass_triplet_rules.items(), key=lambda x: x[1], reverse=True)
top_triplet_rules = [rule for rule in sorted_triplet_rules if rule[1]>0.5]
print(f"Top rules with confidence above 0.5:\n {top_triplet_rules}")
print(f"\n Number of top rules: {len(top_triplet_rules)}")

Top rules with confidence above 0.5:
 [(('ELE17451', 'SNA18336', 'ELE92920'), 0.8539325842696629), (('ELE17451', 'GRO85051', 'SNA80324'), 0.728110599078341), (('DAI42083', 'ELE17451', 'DAI92600'), 0.6324324324324324), (('DAI85309', 'SNA18336', 'ELE17451'), 0.6127167630057804), (('DAI85309', 'SNA18336', 'ELE92920'), 0.6127167630057804), (('ELE17451', 'ELE92920', 'SNA18336'), 0.59375), (('DAI92600', 'ELE17451', 'DAI42083'), 0.5735294117647058), (('DAI85309', 'ELE92920', 'ELE17451'), 0.527363184079602), (('DAI85309', 'ELE92920', 'SNA18336'), 0.527363184079602), (('ELE17451', 'ELE26917', 'DAI62779'), 0.5095541401273885), (('ELE92920', 'SNA18336', 'ELE17451'), 0.5010989010989011)]

 Number of top rules: 11
