<a href="https://colab.research.google.com/github/nurin07/Data-Warehousing-and-Data-Mining/blob/main/Lab_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

file_path = '/content/sports.txt'
transactions = []

# Read the basket file. The first line is skipped, and the first
# comma-separated field of every remaining line is discarded.
with open(file_path, 'r') as file:
    next(file, None)  # default avoids StopIteration on an empty file
    for line in file:
        parts = line.strip().split(',')
        # Trim each item and drop empty fields so that " juice" and "juice"
        # are the same item and a trailing comma does not create a '' item.
        # This matches the parsing used by the other cells of this notebook.
        transactions.append([item.strip() for item in parts[1:] if item.strip()])

# One-hot encode the transactions into an item-indicator DataFrame.
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.15, use_colnames=True)

# Single source of truth for the confidence threshold: it is used both for
# filtering the rules and in the "no rules" message below.
min_confidence = 0.7
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

print("Frequent Itemsets:\n", frequent_itemsets)
if not rules.empty:
    print("\nAssociation Rules:\n", rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
else:
    print("\nNo association rules found with confidence ≥", min_confidence)


Frequent Itemsets:
     support        itemsets
0  0.352941  (cricket ball)
1  0.392157   (cricket bat)
2  0.431373      (football)
3  0.352941        (gloves)
4  0.254902     (ice cream)
5  0.411765         (juice)
6  0.274510  (water bottle)

No association rules found with confidence ≥ 0.7


In [None]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

file_path = '/content/sports.txt'
transactions = []

# Skip the first line, then turn every remaining line into a list of its
# trimmed, non-empty fields, ignoring the first comma-separated field.
with open(file_path, 'r') as file:
    next(file)
    transactions.extend(
        [field.strip() for field in row.strip().split(',')[1:] if field.strip()]
        for row in file
    )

# Thresholds and transaction count shared by the functions defined below.
min_support = 0.15
min_confidence = 0.7
total_tx = len(transactions)

def get_support(itemset, transactions):
    """Return the fraction of ``transactions`` containing every item of ``itemset``.

    Args:
        itemset: A ``set``/``frozenset`` of items.
        transactions: A sequence of transactions, each an iterable of items.

    Returns:
        The support as a float; ``0.0`` when ``transactions`` is empty.
    """
    # Divide by the length of the list actually passed in rather than by the
    # notebook-global ``total_tx``, so the result matches the argument.
    n_tx = len(transactions)
    if n_tx == 0:
        return 0.0
    # ``issubset`` accepts any iterable, so no per-transaction set copy is needed.
    count = sum(1 for tx in transactions if itemset.issubset(tx))
    return count / n_tx

def apriori(transactions, min_support):
    """Find all frequent itemsets with a level-wise (Apriori) search.

    NOTE: this definition rebinds the name ``apriori`` that the first cell
    imports from mlxtend; re-run that cell's import before using mlxtend's
    version again.

    Args:
        transactions: A sequence of transactions, each an iterable of hashable
            items.
        min_support: Minimum fraction of transactions (0..1) an itemset must
            appear in to be kept.

    Returns:
        A dict mapping each frequent itemset (``frozenset``) to the number of
        transactions that contain it. Empty when ``transactions`` is empty.
    """
    # Work from the argument itself, not from the notebook-global ``total_tx``.
    n_tx = len(transactions)
    if n_tx == 0:
        return {}

    # Convert once; every counting pass below reuses these sets.
    tx_sets = [set(tx) for tx in transactions]

    # Level 1: count each item at most once per transaction. Counting raw
    # occurrences would inflate the support of an item repeated in a basket.
    item_counts = defaultdict(int)
    for tx_set in tx_sets:
        for item in tx_set:
            item_counts[frozenset([item])] += 1

    all_frequent = {
        itemset: count
        for itemset, count in item_counts.items()
        if count / n_tx >= min_support
    }
    current_freq = list(all_frequent)
    k = 2

    while current_freq:
        frequent_prev = set(current_freq)

        # Join step: unite pairs of frequent (k-1)-itemsets into k-itemsets.
        # Prune step: keep a candidate only if all its (k-1)-subsets are
        # frequent; any other candidate cannot reach min_support.
        candidates = set()
        for i in range(len(current_freq)):
            for j in range(i + 1, len(current_freq)):
                union = current_freq[i] | current_freq[j]
                if len(union) == k and all(union - {item} in frequent_prev for item in union):
                    candidates.add(union)

        candidate_counts = defaultdict(int)
        for tx_set in tx_sets:
            for candidate in candidates:
                if candidate.issubset(tx_set):
                    candidate_counts[candidate] += 1

        current_freq = [
            itemset
            for itemset, count in candidate_counts.items()
            if count / n_tx >= min_support
        ]
        all_frequent.update({itemset: candidate_counts[itemset] for itemset in current_freq})
        k += 1

    return all_frequent

def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Derive association rules from frequent itemsets.

    Every frequent itemset with at least two items is split into each
    non-empty antecedent / consequent pair. A rule is kept when its confidence
    is at least ``min_confidence``.

    Args:
        frequent_itemsets: Dict mapping ``frozenset`` itemsets to the number
            of transactions containing them.
        transactions: The sequence of transactions (iterables of items) the
            counts refer to.
        min_confidence: Minimum confidence (0..1) for a rule to be reported.

    Returns:
        A list of dicts with keys ``antecedents``, ``consequents``,
        ``support``, ``confidence`` and ``lift``; the three numbers are
        rounded to two decimals.
    """
    # Use the length of the argument rather than the notebook-global
    # ``total_tx`` so the function is correct for any list passed in.
    n_tx = len(transactions)
    if n_tx == 0:
        return []

    # Build the transaction sets once instead of on every support lookup.
    tx_sets = [set(tx) for tx in transactions]

    def _support(items):
        # Fraction of transactions that contain all of ``items``.
        return sum(1 for tx_set in tx_sets if items <= tx_set) / n_tx

    rules = []
    for itemset, count in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        support_itemset = count / n_tx
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent
                support_ante = _support(antecedent)
                support_cons = _support(consequent)
                # Only possible if the counts do not match ``transactions``;
                # skip instead of dividing by zero.
                if support_ante == 0 or support_cons == 0:
                    continue
                confidence = support_itemset / support_ante
                lift = confidence / support_cons
                if confidence >= min_confidence:
                    rules.append({
                        'antecedents': set(antecedent),
                        'consequents': set(consequent),
                        'support': round(support_itemset, 2),
                        'confidence': round(confidence, 2),
                        'lift': round(lift, 2)
                    })
    return rules

# Mine the itemsets, then derive the rules from them.
frequent_itemsets_raw = apriori(transactions, min_support)
rules = generate_rules(frequent_itemsets_raw, transactions, min_confidence)

# One row per frequent itemset, with its count converted to relative support.
itemset_rows = []
for item, count in frequent_itemsets_raw.items():
    itemset_rows.append({'itemsets': set(item), 'support': round(count / total_tx, 2)})
frequent_itemsets_df = pd.DataFrame(itemset_rows)

rules_df = pd.DataFrame(rules)

print("Frequent Itemsets:\n", frequent_itemsets_df)

if rules_df.empty:
    print("\nNo association rules found with confidence ≥", min_confidence)
else:
    rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    print("\nAssociation Rules:\n", rules_df[rule_columns])


Frequent Itemsets:
          itemsets  support
0      {football}     0.43
1  {cricket ball}     0.35
2        {gloves}     0.35
3   {cricket bat}     0.39
4         {juice}     0.41
5  {water bottle}     0.27
6     {ice cream}     0.25

No association rules found with confidence ≥ 0.7


In [None]:
import pandas as pd
from itertools import combinations
from collections import defaultdict

file_path = 'space.txt'
transactions = []

# The first line is skipped. For every other line the first comma-separated
# field is dropped and the remaining non-blank fields are kept, trimmed.
with open(file_path, 'r') as file:
    next(file)
    for raw_line in file:
        fields = raw_line.strip().split(',')[1:]
        basket = [field.strip() for field in fields if field.strip()]
        transactions.append(basket)

# Thresholds and transaction count shared by the functions defined below.
min_support = 0.15
min_confidence = 0.7
total_tx = len(transactions)

def get_support(itemset, transactions):
    """Compute the support of ``itemset`` within ``transactions``.

    Args:
        itemset: A ``set``/``frozenset`` of items.
        transactions: A sequence of transactions, each an iterable of items.

    Returns:
        The share of transactions that contain all items of ``itemset``, as a
        float; ``0.0`` for an empty ``transactions`` sequence.
    """
    # The denominator comes from the argument, not from the notebook-global
    # ``total_tx``, so the result stays correct for any list passed in.
    n_tx = len(transactions)
    if n_tx == 0:
        return 0.0
    matches = sum(1 for tx in transactions if itemset.issubset(tx))
    return matches / n_tx

def apriori(transactions, min_support):
    """Level-wise (Apriori) search for frequent itemsets.

    Args:
        transactions: A sequence of transactions, each an iterable of hashable
            items.
        min_support: Minimum fraction of transactions (0..1) that must contain
            an itemset for it to be reported.

    Returns:
        A dict mapping every frequent itemset (``frozenset``) to the number of
        transactions containing it; ``{}`` when there are no transactions.
    """
    # Use the size of the argument, not the notebook-global ``total_tx``.
    n_tx = len(transactions)
    if n_tx == 0:
        return {}

    # Deduplicate items within each transaction once, up front.
    tx_sets = [set(tx) for tx in transactions]

    # Single items: one count per transaction, even if the item is repeated
    # in that transaction. This matches how larger itemsets are counted below.
    item_counts = defaultdict(int)
    for tx_set in tx_sets:
        for item in tx_set:
            item_counts[frozenset([item])] += 1

    all_frequent = {
        itemset: count
        for itemset, count in item_counts.items()
        if count / n_tx >= min_support
    }
    current_freq = list(all_frequent)
    k = 2

    while current_freq:
        frequent_prev = set(current_freq)

        # Join frequent (k-1)-itemsets into k-item candidates, discarding any
        # candidate that has an infrequent (k-1)-subset.
        candidates = set()
        for i in range(len(current_freq)):
            for j in range(i + 1, len(current_freq)):
                union = current_freq[i] | current_freq[j]
                if len(union) == k and all(union - {item} in frequent_prev for item in union):
                    candidates.add(union)

        # Count in how many transactions each surviving candidate occurs.
        candidate_counts = defaultdict(int)
        for tx_set in tx_sets:
            for candidate in candidates:
                if candidate.issubset(tx_set):
                    candidate_counts[candidate] += 1

        current_freq = [
            itemset
            for itemset, count in candidate_counts.items()
            if count / n_tx >= min_support
        ]
        all_frequent.update({itemset: candidate_counts[itemset] for itemset in current_freq})
        k += 1

    return all_frequent

def generate_rules(frequent_itemsets, transactions, min_confidence):
    """Build association rules from a table of frequent itemsets.

    For each frequent itemset of size two or more, every split into a
    non-empty antecedent and the complementary consequent is evaluated. Rules
    whose confidence reaches ``min_confidence`` are returned.

    Args:
        frequent_itemsets: Dict mapping ``frozenset`` itemsets to the number
            of transactions that contain them.
        transactions: The sequence of transactions (iterables of items) the
            counts refer to.
        min_confidence: Minimum confidence (0..1) a rule needs to be kept.

    Returns:
        A list of dicts with keys ``antecedents``, ``consequents``,
        ``support``, ``confidence`` and ``lift`` (numbers rounded to two
        decimals).
    """
    # The denominator comes from the argument instead of the notebook-global
    # ``total_tx``.
    n_tx = len(transactions)
    if n_tx == 0:
        return []

    # Hoisted: the transaction sets are reused by every support lookup below.
    tx_sets = [set(tx) for tx in transactions]

    def _support(items):
        # Share of transactions that contain all of ``items``.
        return sum(1 for tx_set in tx_sets if items <= tx_set) / n_tx

    rules = []
    for itemset, count in frequent_itemsets.items():
        if len(itemset) < 2:
            continue
        support_itemset = count / n_tx
        for size in range(1, len(itemset)):
            for antecedent in combinations(itemset, size):
                antecedent = frozenset(antecedent)
                consequent = itemset - antecedent
                support_ante = _support(antecedent)
                support_cons = _support(consequent)
                # Guard against counts that are inconsistent with
                # ``transactions`` rather than raising ZeroDivisionError.
                if support_ante == 0 or support_cons == 0:
                    continue
                confidence = support_itemset / support_ante
                lift = confidence / support_cons
                if confidence >= min_confidence:
                    rules.append({
                        'antecedents': set(antecedent),
                        'consequents': set(consequent),
                        'support': round(support_itemset, 2),
                        'confidence': round(confidence, 2),
                        'lift': round(lift, 2)
                    })
    return rules

# Frequent itemsets first, then the rules derived from them.
frequent_itemsets_raw = apriori(transactions, min_support)
rules = generate_rules(frequent_itemsets_raw, transactions, min_confidence)

# Tabulate each itemset with its relative support, rounded to two decimals.
frequent_itemsets_df = pd.DataFrame(
    [
        dict(itemsets=set(item), support=round(count / total_tx, 2))
        for item, count in frequent_itemsets_raw.items()
    ]
)

rules_df = pd.DataFrame(rules)

print("Frequent Itemsets:\n", frequent_itemsets_df)

if rules_df.empty:
    print("\nNo association rules found with confidence ≥", min_confidence)
else:
    shown_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    print("\nAssociation Rules:\n", rules_df[shown_columns])


Frequent Itemsets:
                      itemsets  support
0               {Robotic Arm}     0.33
1              {Food Packets}     0.39
2              {Sleeping Bag}     0.31
3                 {Treadmill}     0.27
4                {Space Suit}     0.31
5                {3D Printer}     0.27
6  {Carbon Dioxide Scrubbers}     0.24

No association rules found with confidence ≥ 0.7


In [None]:
import pandas as pd
from collections import defaultdict, Counter

file_path = 'sports.txt'
transactions = []

# Skip the first line; for each remaining line drop the first comma-separated
# field and keep the other fields that are non-empty after trimming.
with open(file_path, 'r') as file:
    next(file)
    for raw_line in file:
        fields = raw_line.strip().split(',')[1:]
        transactions.append([field.strip() for field in fields if field.strip()])

total_tx = len(transactions)
min_support = 0.15
min_conf = 0.7

# Occurrence count of every item across all transactions.
item_counts = Counter(item for tx in transactions for item in tx)

# Keep only the items that reach the support threshold.
item_counts = {
    item: count
    for item, count in item_counts.items()
    if count / total_tx >= min_support
}

# Most frequent first, ties broken alphabetically: sort by name, then
# stable-sort by descending count.
items_ordered = sorted(sorted(item_counts), key=item_counts.get, reverse=True)

def reorder_transaction(tx):
    """Return the items of ``tx`` that appear in ``items_ordered``, in that order.

    Items of ``tx`` that are not in the notebook-level ``items_ordered`` list
    are dropped, and each retained item appears once.
    """
    kept = []
    for candidate in items_ordered:
        if candidate in tx:
            kept.append(candidate)
    return kept

ordered_tx = list(map(reorder_transaction, transactions))

class FPNode:
    """A single node of an FP-tree.

    Attributes are set in ``__init__``:
        item: The item stored at this node.
        count: Counter starting at 1.
        parent: The parent node passed in by the creator.
        children: Dict of child nodes, initially empty.
        link: Reference to another node, initially ``None``.
    """

    def __init__(self, item, parent):
        self.item, self.parent = item, parent
        self.count = 1
        self.children = dict()
        self.link = None

class FPTree:
    """Prefix tree of transactions with per-item node links.

    Attributes:
        root: Root ``FPNode`` whose item is ``None``.
        header_table: Maps each item to the first node created for it; the
            remaining nodes of that item are reachable through ``node.link``.
    """

    def __init__(self, transactions):
        """Build the tree by inserting every transaction in order."""
        self.root = FPNode(None, None)
        self.header_table = {}
        # Last node of each item's link chain. This lets a new node be
        # appended directly instead of walking the whole chain every time.
        self._link_tails = {}
        for tx in transactions:
            self.insert(tx)

    def insert(self, tx):
        """Insert one transaction (a sequence of items) as a path from the root.

        Existing path nodes get their ``count`` incremented. A new node is
        created where the path diverges and appended to its item's link chain.
        """
        node = self.root
        for item in tx:
            child = node.children.get(item)
            if child is None:
                child = FPNode(item, node)
                node.children[item] = child
                if item in self.header_table:
                    self._link_tails[item].link = child
                else:
                    self.header_table[item] = child
                self._link_tails[item] = child
            else:
                child.count += 1
            node = child

def mine_tree(tree, suffix, freq_items):
    """Recursively collect frequent itemsets from an FP-tree.

    Reads the notebook-level ``item_counts``, ``total_tx`` and ``min_support``.

    Args:
        tree: The ``FPTree`` to mine.
        suffix: List of items already fixed by outer recursion levels.
        freq_items: Dict updated in place; maps ``frozenset(itemset)`` to its
            count summed over the tree's node links.
    """
    # Process header items in ascending order of their global count.
    items = sorted(tree.header_table.items(), key=lambda x: item_counts[x[0]])
    for item, node in items:
        new_suffix = suffix + [item]
        support = 0
        conditional_patterns = []

        # Follow the node-link chain of this item.
        while node is not None:
            support += node.count
            path = []
            parent = node.parent
            # Climb to the root, which is the node whose item is None.
            # Compare with None explicitly so that a falsy item (0 or '')
            # is not mistaken for the root and does not cut the path short.
            while parent is not None and parent.item is not None:
                path.append(parent.item)
                parent = parent.parent
            if path:
                # ``path`` was collected bottom-up; store it root-first.
                conditional_patterns.append((path[::-1], node.count))
            node = node.link

        if support / total_tx >= min_support:
            freq_items[frozenset(new_suffix)] = support

            # Expand each prefix path ``count`` times and build the
            # conditional tree for ``new_suffix`` from the result.
            cond_tx = []
            for path, count in conditional_patterns:
                cond_tx.extend([path] * count)
            if cond_tx:
                cond_tree = FPTree(cond_tx)
                mine_tree(cond_tree, new_suffix, freq_items)

# Build the tree from the reordered transactions and mine it.
fp_tree = FPTree(ordered_tx)
frequent_itemsets = {}
mine_tree(fp_tree, [], frequent_itemsets)

# One row per mined itemset with its relative support (two decimals).
itemset_rows = []
for item, count in frequent_itemsets.items():
    itemset_rows.append(dict(itemsets=set(item), support=round(count / total_tx, 2)))
frequent_itemsets_df = pd.DataFrame(itemset_rows)

def get_support(itemset):
    """Return the share of the notebook-level ``transactions`` containing ``itemset``.

    The number of matching transactions is divided by the notebook-level
    ``total_tx``.
    """
    hits = 0
    for tx in transactions:
        if itemset.issubset(set(tx)):
            hits += 1
    return hits / total_tx

# Imported once before the loops; the original re-executed this import on
# every pass of the antecedent-size loop.
from itertools import combinations

rules = []
for itemset in frequent_itemsets:
    # A rule needs at least one item on each side.
    if len(itemset) < 2:
        continue
    support = frequent_itemsets[itemset] / total_tx
    for i in range(1, len(itemset)):
        for antecedent in combinations(itemset, i):
            antecedent = frozenset(antecedent)
            consequent = itemset - antecedent
            support_ante = get_support(antecedent)
            support_cons = get_support(consequent)
            confidence = support / support_ante
            lift = confidence / support_cons
            if confidence >= min_conf:
                rules.append({
                    'antecedents': set(antecedent),
                    'consequents': set(consequent),
                    'support': round(support, 2),
                    'confidence': round(confidence, 2),
                    'lift': round(lift, 2)
                })

rules_df = pd.DataFrame(rules)

print("Frequent Itemsets:\n", frequent_itemsets_df)
if not rules_df.empty:
    print("\nAssociation Rules:\n", rules_df[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
else:
    print("\nNo rules meet the minimum confidence threshold.")


Frequent Itemsets:
          itemsets  support
0     {ice cream}     0.25
1  {water bottle}     0.27
2  {cricket ball}     0.35
3        {gloves}     0.35
4   {cricket bat}     0.39
5         {juice}     0.41
6      {football}     0.43

No rules meet the minimum confidence threshold.


In [None]:
import pandas as pd
from collections import defaultdict, Counter

file_path = 'space.txt'
transactions = []

# The first line is skipped. Each later line contributes one transaction: its
# comma-separated fields after the first, trimmed, with empty ones removed.
with open(file_path, 'r') as file:
    next(file)
    transactions.extend(
        [field.strip() for field in row.strip().split(',')[1:] if field.strip()]
        for row in file
    )

total_tx = len(transactions)
min_support = 0.15
min_conf = 0.7

# Tally item occurrences transaction by transaction.
item_counts = Counter()
for tx in transactions:
    item_counts.update(tx)

# Discard the items that fall below the support threshold.
item_counts = dict(
    (item, count)
    for item, count in item_counts.items()
    if count / total_tx >= min_support
)

# Descending count, alphabetical within equal counts.
items_ordered = sorted(sorted(item_counts), key=item_counts.get, reverse=True)

def reorder_transaction(tx):
    """Filter and sort ``tx`` according to the notebook-level ``items_ordered``.

    The result lists, in ``items_ordered`` order, exactly those entries of
    ``items_ordered`` that occur in ``tx``.
    """
    selected = []
    for ranked_item in items_ordered:
        if ranked_item in tx:
            selected.append(ranked_item)
    return selected

ordered_tx = list(map(reorder_transaction, transactions))

class FPNode:
    """Node of an FP-tree holding an item, a counter and its tree links.

    ``__init__`` stores ``item`` and ``parent`` as given, starts ``count`` at
    1, creates an empty ``children`` dict and sets ``link`` to ``None``.
    """

    def __init__(self, item, parent):
        self.parent = parent
        self.item = item
        self.link = None
        self.children = dict()
        self.count = 1

class FPTree:
    """FP-tree: a prefix tree of transactions plus a chain of nodes per item.

    Attributes:
        root: Root ``FPNode`` (its item is ``None``).
        header_table: Maps an item to the first node created for it; later
            nodes of the same item follow through ``node.link``.
    """

    def __init__(self, transactions):
        """Create an empty tree and insert each transaction in turn."""
        self.root = FPNode(None, None)
        self.header_table = {}
        # Tail of every item's link chain, so a new node is appended in
        # constant time rather than by walking the chain from its head.
        self._link_tails = {}
        for tx in transactions:
            self.insert(tx)

    def insert(self, tx):
        """Add one transaction (a sequence of items) as a root-to-node path.

        Nodes already on the path have ``count`` incremented. Missing nodes
        are created and appended to the link chain of their item.
        """
        node = self.root
        for item in tx:
            child = node.children.get(item)
            if child is None:
                child = FPNode(item, node)
                node.children[item] = child
                if item in self.header_table:
                    self._link_tails[item].link = child
                else:
                    self.header_table[item] = child
                self._link_tails[item] = child
            else:
                child.count += 1
            node = child

def mine_tree(tree, suffix, freq_items):
    """Mine frequent itemsets from ``tree`` recursively.

    Uses the notebook-level ``item_counts``, ``total_tx`` and ``min_support``.

    Args:
        tree: The ``FPTree`` to mine.
        suffix: List of items chosen by the enclosing recursion levels.
        freq_items: Output dict, updated in place, mapping
            ``frozenset(itemset)`` to the count accumulated over the item's
            node links.
    """
    # Visit header items from the lowest to the highest global count.
    items = sorted(tree.header_table.items(), key=lambda x: item_counts[x[0]])
    for item, node in items:
        new_suffix = suffix + [item]
        support = 0
        conditional_patterns = []

        # Walk every node of this item through its link chain.
        while node is not None:
            support += node.count
            path = []
            parent = node.parent
            # The root is the node with item None. Test for None explicitly:
            # a truthiness test would also stop at a falsy item such as 0 or
            # '' and cut the prefix path short.
            while parent is not None and parent.item is not None:
                path.append(parent.item)
                parent = parent.parent
            if path:
                # Reverse the bottom-up path so it reads from the root down.
                conditional_patterns.append((path[::-1], node.count))
            node = node.link

        if support / total_tx >= min_support:
            freq_items[frozenset(new_suffix)] = support

            # Repeat each prefix path ``count`` times, then recurse on the
            # conditional tree built from those paths.
            cond_tx = []
            for path, count in conditional_patterns:
                cond_tx.extend([path] * count)
            if cond_tx:
                cond_tree = FPTree(cond_tx)
                mine_tree(cond_tree, new_suffix, freq_items)

# Construct the tree from the reordered transactions, then mine it.
fp_tree = FPTree(ordered_tx)
frequent_itemsets = {}
mine_tree(fp_tree, [], frequent_itemsets)

# Tabulate the mined itemsets with their relative support (two decimals).
itemset_rows = [
    dict(itemsets=set(item), support=round(count / total_tx, 2))
    for item, count in frequent_itemsets.items()
]
frequent_itemsets_df = pd.DataFrame(itemset_rows)

def get_support(itemset):
    """Support of ``itemset`` over the notebook-level ``transactions``.

    Counts the transactions that contain every item of ``itemset`` and divides
    that count by the notebook-level ``total_tx``.
    """
    matching = [tx for tx in transactions if itemset.issubset(set(tx))]
    return len(matching) / total_tx

# Imported once up front instead of inside the loop, where the original
# re-executed it for every antecedent size.
from itertools import combinations

rules = []
for itemset in frequent_itemsets:
    # Single-item sets cannot be split into antecedent and consequent.
    if len(itemset) < 2:
        continue
    support = frequent_itemsets[itemset] / total_tx
    for i in range(1, len(itemset)):
        for antecedent in combinations(itemset, i):
            antecedent = frozenset(antecedent)
            consequent = itemset - antecedent
            support_ante = get_support(antecedent)
            support_cons = get_support(consequent)
            confidence = support / support_ante
            lift = confidence / support_cons
            if confidence >= min_conf:
                rules.append({
                    'antecedents': set(antecedent),
                    'consequents': set(consequent),
                    'support': round(support, 2),
                    'confidence': round(confidence, 2),
                    'lift': round(lift, 2)
                })

rules_df = pd.DataFrame(rules)

print("Frequent Itemsets:\n", frequent_itemsets_df)
if not rules_df.empty:
    print("\nAssociation Rules:\n", rules_df[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
else:
    print("\nNo rules meet the minimum confidence threshold.")


Frequent Itemsets:
                      itemsets  support
0  {Carbon Dioxide Scrubbers}     0.24
1                 {Treadmill}     0.27
2                {3D Printer}     0.27
3              {Sleeping Bag}     0.31
4                {Space Suit}     0.31
5               {Robotic Arm}     0.33
6              {Food Packets}     0.39

No rules meet the minimum confidence threshold.
