In [None]:
from google.colab import files

# Ask the user for the dataset through Colab's upload widget. The returned
# mapping goes from each uploaded filename to that file's raw bytes.
uploaded = files.upload()

file_path = 'browsing.txt'
if not uploaded:
    raise ValueError("No file was uploaded; expected the browsing dataset.")

# The upload may arrive under a different key than 'browsing.txt' (for
# example when the file was renamed, or when Colab de-duplicates a repeated
# upload), so fall back to the first uploaded file instead of raising KeyError.
source_name = file_path if file_path in uploaded else next(iter(uploaded))
with open(file_path, 'wb') as file:
    file.write(uploaded[source_name])

Saving browsing.txt to browsing.txt


In [None]:
from itertools import combinations

def load_data(file_path):
    """Read a transaction file into a list of item sets.

    Every line of the file is treated as one transaction made of
    whitespace-separated item tokens. Each transaction is stored as a
    frozenset, so a token repeated on a line is kept once, and a blank line
    produces an empty frozenset.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one frozenset of item strings per line, in file order.
    """
    with open(file_path, 'r') as handle:
        return [frozenset(row.strip().split()) for row in handle]

def find_candidates(freq_sets, k):
    """Build the size-``k`` candidate itemsets from a list of itemsets.

    Every unordered pair of itemsets in ``freq_sets`` is merged, and the
    merged set is kept only when it contains exactly ``k`` items.

    Args:
        freq_sets: List of frozensets to combine pairwise.
        k: Required number of items in each candidate.

    Returns:
        A set of frozensets, each the union of two input itemsets and of
        size ``k``. Empty when fewer than two itemsets are given.
    """
    merged = (first.union(second) for first, second in combinations(freq_sets, 2))
    return {union for union in merged if len(union) == k}

def apriori_algorithm(trans, min_support):
    """Collect all frequent itemsets with level-wise passes over the data.

    The first pass starts from one singleton itemset per distinct item.
    Each pass counts how many transactions contain every candidate, keeps
    the candidates whose count is at least ``min_support``, and asks
    ``find_candidates`` for the next, one-item-larger candidates built from
    the survivors. The loop ends when a pass has no candidates.

    The number of surviving itemsets is printed for the first two passes.

    Args:
        trans: Collection of transactions (sets of items); it is iterated
            once per pass, so it must be re-iterable.
        min_support: Minimum number of containing transactions (an absolute
            count) for an itemset to be kept.

    Returns:
        A list of frozensets holding the frequent itemsets of every size,
        appended pass by pass. Order within one pass follows set iteration
        order.
    """
    # Pass-1 candidates: one singleton per distinct item in the data.
    candidates = set(frozenset([item]) for basket in trans for item in basket)

    frequent = []
    size = 1
    while candidates:
        # Tally the transactions that contain each candidate.
        tally = {}
        for basket in trans:
            for candidate in candidates:
                if not candidate.issubset(basket):
                    continue
                tally[candidate] = tally.get(candidate, 0) + 1

        survivors = {candidate for candidate, hits in tally.items() if hits >= min_support}
        frequent.extend(survivors)

        if size == 1:
            print(f"1st pass frequent items: {len(survivors)}")
        elif size == 2:
            print(f"2nd pass frequent items: {len(survivors)}")

        # Grow the survivors by one item to form the next pass's candidates.
        size += 1
        candidates = find_candidates(list(survivors), size)

    return frequent

def find_top_pairs(freq_sets, trans, top_n):
    """Return the ``top_n`` two-item itemsets with the highest support.

    Args:
        freq_sets: Iterable of frozensets of any size; only those with
            exactly two items are considered.
        trans: Iterable of transactions (sets of items), scanned once to
            count how many contain each pair.
        top_n: Maximum number of pairs to return.

    Returns:
        A list of at most ``top_n`` frozensets ordered by descending
        support. Pairs with equal support are ordered by their items in
        ascending order, so the result does not depend on the order of
        ``freq_sets``. Items within a pair must be mutually comparable.
    """
    pairs = [itemset for itemset in freq_sets if len(itemset) == 2]
    pair_counts = {pair: 0 for pair in pairs}

    for tran in trans:
        for pair in pairs:
            if pair.issubset(tran):
                pair_counts[pair] += 1

    # "<" on frozensets is a subset test, not a total order, so a frozenset
    # cannot serve as a sort tie-breaker. Use the sorted item tuple instead.
    sorted_pairs = sorted(
        pairs,
        key=lambda pair: (-pair_counts[pair], tuple(sorted(pair))),
    )
    return sorted_pairs[:top_n]

def find_top_triples(freq_sets, trans, top_n):
    """Return the ``top_n`` three-item itemsets with the highest support.

    Args:
        freq_sets: Iterable of frozensets of any size; only those with
            exactly three items are considered.
        trans: Iterable of transactions (sets of items), scanned once to
            count how many contain each triple.
        top_n: Maximum number of triples to return.

    Returns:
        A list of at most ``top_n`` frozensets ordered by descending
        support. Triples with equal support are ordered by their items in
        ascending order, so the result does not depend on the order of
        ``freq_sets``. Items within a triple must be mutually comparable.
    """
    triples = [itemset for itemset in freq_sets if len(itemset) == 3]
    triple_counts = {triple: 0 for triple in triples}

    for tran in trans:
        for triple in triples:
            if triple.issubset(tran):
                triple_counts[triple] += 1

    # "<" on frozensets is a subset test, not a total order, so a frozenset
    # cannot serve as a sort tie-breaker. Use the sorted item tuple instead.
    sorted_triples = sorted(
        triples,
        key=lambda triple: (-triple_counts[triple], tuple(sorted(triple))),
    )
    return sorted_triples[:top_n]

def find_top_rules_pairs(freq_sets, trans, top_n):
    """Return the ``top_n`` single-item rules with the highest confidence.

    For every two-item itemset ``{x, y}`` in ``freq_sets`` both rules
    ``x -> y`` and ``y -> x`` are scored with ``find_confidence_score``.

    Args:
        freq_sets: Iterable of frozensets of any size; only those with
            exactly two items produce rules.
        trans: Collection of transactions passed to
            ``find_confidence_score`` for every rule.
        top_n: Maximum number of rules to return.

    Returns:
        A list of at most ``top_n`` tuples
        ``(antecedent_set, consequent_set, confidence)`` ordered by
        descending confidence. Rules with equal confidence are ordered by
        ascending antecedent, then ascending consequent, so the result does
        not depend on the order of ``freq_sets``.
    """
    rules = []
    for itemset in freq_sets:
        if len(itemset) != 2:
            continue
        item1, item2 = sorted(itemset)
        for lhs, rhs in ((item1, item2), (item2, item1)):
            confidence = find_confidence_score({lhs}, {rhs}, trans)
            rules.append(({lhs}, {rhs}, confidence))

    # Sets compare by subset relation rather than a total order, so they
    # cannot break ties in a sort key. Compare their sorted items instead.
    sorted_rules = sorted(
        rules,
        key=lambda rule: (-rule[2], sorted(rule[0]), sorted(rule[1])),
    )
    return sorted_rules[:top_n]

def find_top_rules_triples(freq_sets, trans, top_n):
    """Return the ``top_n`` two-item-antecedent rules by confidence.

    For every three-item itemset ``{x, y, z}`` in ``freq_sets`` the rules
    ``(x, y) -> z``, ``(x, z) -> y`` and ``(y, z) -> x`` are scored with
    ``find_confidence_score``.

    Args:
        freq_sets: Iterable of frozensets of any size; only those with
            exactly three items produce rules.
        trans: Collection of transactions passed to
            ``find_confidence_score`` for every rule.
        top_n: Maximum number of rules to return.

    Returns:
        A list of at most ``top_n`` tuples
        ``((antecedent_set, consequent_set), confidence)`` ordered by
        descending confidence. Rules with equal confidence are ordered by
        ascending antecedent items, then ascending consequent, so the
        result does not depend on the order of ``freq_sets``.
    """
    rules = []
    for itemset in freq_sets:
        if len(itemset) != 3:
            continue
        item1, item2, item3 = sorted(itemset)
        # Each item takes a turn as the consequent of the other two.
        splits = (
            ({item1, item2}, {item3}),
            ({item1, item3}, {item2}),
            ({item2, item3}, {item1}),
        )
        for antecedent, consequent in splits:
            confidence = find_confidence_score(antecedent, consequent, trans)
            rules.append(((antecedent, consequent), confidence))

    # Sets compare by subset relation rather than a total order, so they
    # cannot break ties in a sort key. Compare their sorted items instead.
    sorted_rules = sorted(
        rules,
        key=lambda rule: (-rule[1], sorted(rule[0][0]), sorted(rule[0][1])),
    )
    return sorted_rules[:top_n]

def find_confidence_score(antecedent, consequent, trans):
    """Compute the confidence of the rule ``antecedent -> consequent``.

    Confidence is the number of transactions containing both the antecedent
    and the consequent divided by the number containing the antecedent.

    Args:
        antecedent: Set of items on the left-hand side of the rule.
        consequent: Set of items on the right-hand side of the rule.
        trans: Iterable of transactions (collections of items).

    Returns:
        The confidence as a float in ``[0, 1]``. Returns ``0.0`` when no
        transaction contains the antecedent (including empty ``trans``),
        instead of raising ``ZeroDivisionError``.
    """
    antecedent_support = 0
    itemset_support = 0
    for tran in trans:
        if antecedent.issubset(tran):
            antecedent_support += 1
            # The consequent only counts inside transactions that already
            # contain the antecedent.
            if consequent.issubset(tran):
                itemset_support += 1

    if antecedent_support == 0:
        return 0.0
    return itemset_support / antecedent_support

file_path = 'browsing.txt'
trans = load_data(file_path)

# Minimum number of transactions (absolute count) an itemset must appear in.
min_support = 100

freq_sets = apriori_algorithm(trans, min_support)

# Top 5 pairs with the highest support.
top_pairs = find_top_pairs(freq_sets, trans, top_n=5)
print("\nTop 5 pairs by support with frequency values")
for pair in top_pairs:
    # find_top_pairs returns only the itemsets, so recount support for display.
    pair_count = sum(1 for tran in trans if pair.issubset(tran))
    print(f"{pair} support = {pair_count}")

# Top 5 single-item rules with the highest confidence.
top_rules_pairs = find_top_rules_pairs(freq_sets, trans, top_n=5)
print("\nTop 5 rules by confidence with confidence scores:")
for antecedent, consequent, confidence in top_rules_pairs:
    # Sort each side so the printed item order is the same on every run.
    print(f"{tuple(sorted(antecedent))} -----> {tuple(sorted(consequent))} confidence = {confidence}")

# Top 5 triples with the highest support.
top_triples = find_top_triples(freq_sets, trans, top_n=5)
print("\nTop 5 triples by support with frequency values:")
for triple in top_triples:
    # find_top_triples returns only the itemsets, so recount support for display.
    triple_count = sum(1 for tran in trans if triple.issubset(tran))
    print(f"{triple} support = {triple_count}")

# Top 5 rules derived from triples with the highest confidence.
top_rules_triples = find_top_rules_triples(freq_sets, trans, top_n=5)
print("\nTop 5 rules for triples by confidence with confidence scores:")
for rule, confidence in top_rules_triples:
    antecedent, consequent = rule
    # Sort each side so the printed item order is the same on every run.
    print(f"{tuple(sorted(antecedent))} -----> {tuple(sorted(consequent))} confidence = {confidence}")

1st pass frequent items: 647
2nd pass frequent items: 1334

Top 5 pairs by support with frequency values
frozenset({'DAI62779', 'ELE17451'}) support = 1592
frozenset({'FRO40251', 'SNA80324'}) support = 1412
frozenset({'FRO40251', 'DAI75645'}) support = 1254
frozenset({'GRO85051', 'FRO40251'}) support = 1213
frozenset({'GRO73461', 'DAI62779'}) support = 1139

Top 5 rules by confidence with confidence scores:
('DAI93865',) -----> ('FRO40251',) confidence = 1.0
('GRO85051',) -----> ('FRO40251',) confidence = 0.999176276771005
('GRO38636',) -----> ('FRO40251',) confidence = 0.9906542056074766
('ELE12951',) -----> ('FRO40251',) confidence = 0.9905660377358491
('DAI88079',) -----> ('FRO40251',) confidence = 0.9867256637168141

Top 5 rules by support with frequency values:
frozenset({'FRO40251', 'SNA80324', 'DAI75645'}) support = 550
frozenset({'FRO40251', 'DAI62779', 'SNA80324'}) support = 476
frozenset({'GRO85051', 'FRO40251', 'SNA80324'}) support = 471
frozenset({'SNA18336', 'ELE92920', 'D