In [26]:
import pandas as pd
from itertools import combinations
from collections import defaultdict
import pandas as pd
import time
import matplotlib.pyplot as plt


def to_dataframe(transactions):
    """
    Build a one-hot (0/1) matrix from a collection of transactions.

    :param transactions: Iterable of transactions; each transaction is an
        iterable of hashable, mutually sortable items.
    :return: DataFrame with one row per transaction (in input order) and one
        column per distinct item (sorted); a cell is 1 when the item occurs in
        the transaction and 0 otherwise.
    """
    # Convert each transaction to a set once so membership tests are O(1).
    baskets = [set(transaction) for transaction in transactions]
    items = sorted(set().union(*baskets))
    # Build all rows first and create the frame in a single step: growing a
    # DataFrame one row at a time with .loc re-allocates on every insert,
    # which is quadratic in the number of transactions.
    rows = [[1 if item in basket else 0 for item in items] for basket in baskets]
    return pd.DataFrame(rows, columns=items)
    

def load_sample_dataset():
    """
    Return a small built-in market-basket dataset for demonstration purposes.

    Each list within the main list represents a transaction with several items.
    A new list is built on every call, so callers may mutate the result freely.

    :return: List of transactions, each a list of item-name strings.
    """
    # NOTE: the sample is hard-coded and needs no file access. Reading
    # './data/transactions.dat' here only to discard the result made the
    # function fail whenever that file was missing.
    return [
        ['Bread', 'Milk'],
        ['Bread', 'Diapers', 'Beer', 'Eggs'],
        ['Milk', 'Diapers', 'Beer', 'Cola'],
        ['Bread', 'Milk', 'Diapers', 'Beer'],
        ['Bread', 'Milk', 'Diapers', 'Cola'],
    ]

def apriori(transactions, min_support):
    """
    Implements the A-Priori algorithm for finding frequent itemsets.

    :param transactions: Either a list of transactions (each transaction is an
        iterable of hashable, mutually sortable items) or a one-hot
        ``pandas.DataFrame`` with one row per transaction and one column per
        item, where a truthy cell means the item is in the transaction.
    :param min_support: Minimum support threshold, as an absolute number of
        transactions that must contain the itemset.
    :return: Tuple ``(L, timings)``. ``L`` maps each level ``k`` to a set of
        ``(itemset, support)`` pairs, where ``itemset`` is a sorted tuple of
        ``k`` items; the last level(s) are empty sets marking where the search
        stopped. ``timings`` is a list of ``(k, seconds)`` pairs giving the
        time elapsed since the start of the call when level ``k`` finished.
    """
    L = dict()  # k -> set of (frequent k-itemset, support) pairs
    timings = []  # (k, seconds elapsed since the start) per finished level

    # Normalise the input into a list of frozensets so that support counting
    # does not depend on any variable outside this function.
    if isinstance(transactions, pd.DataFrame):
        columns = list(transactions.columns)
        baskets = [
            frozenset(col for col, flag in zip(columns, row) if flag)
            for row in transactions.itertuples(index=False, name=None)
        ]
        # Every column is a level-1 candidate, even one that never occurs.
        all_items = set(columns)
    else:
        baskets = [frozenset(transaction) for transaction in transactions]
        all_items = set().union(*baskets)

    def get_new_itemsets(k):
        """
        Generate the candidate itemsets of size ``k``.

        :param k: Size of the itemsets to generate
        :return: Set of candidate itemsets, each a sorted tuple of ``k`` items
        """
        if k == 1:
            # (item,) is a 1-tuple; tuple(item) would split a string item
            # into its individual characters.
            return {(item,) for item in all_items}

        previous = {itemset for itemset, _ in L[k - 1]}
        frequent_items = [itemset[0] for itemset, _ in L[1]]
        next_itemsets = set()
        # We augment the previous frequent (k-1)-itemsets with the frequent
        # single items.
        for itemset in previous:
            for item in frequent_items:
                if item in itemset:
                    continue
                candidate = tuple(sorted(itemset + (item,)))
                # A-Priori pruning: a k-itemset can only be frequent if all
                # of its (k-1)-subsets are frequent.
                if all(subset in previous
                       for subset in combinations(candidate, k - 1)):
                    next_itemsets.add(candidate)
        return next_itemsets

    def calculate_support(itemsets):
        """
        Count how many transactions contain each itemset.

        :param itemsets: Iterable of itemsets (tuples of items)
        :return: Dictionary mapping each itemset to its support count
        """
        supports = {}
        for itemset in itemsets:
            required = frozenset(itemset)
            supports[itemset] = sum(1 for basket in baskets if required <= basket)
        return supports

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()

    # Main part of the algorithm
    k = 1  # Start with individual items, then increase k for pairs, triples, etc.
    L[k] = set()
    next_itemsets = get_new_itemsets(k)

    # Loop through each level (single items, pairs, triples, etc.)
    while next_itemsets:
        itemset_support = calculate_support(next_itemsets)
        # Keep the itemsets with support greater or equal to min_support
        L[k] = {
            (itemset, count)
            for itemset, count in itemset_support.items()
            if count >= min_support
        }
        timings.append((k, time.perf_counter() - start_time))

        k += 1
        L[k] = set()
        next_itemsets = get_new_itemsets(k)

    return L, timings

#sample_transactions = load_sample_dataset()
#min_support = 2  #  threshold
#frequent_itemsets, timings = apriori(sample_transactions, min_support)



In [27]:
# Encode the sample transactions as a one-hot matrix (rows = transactions,
# columns = items) and keep it in the notebook-level name `df`.
df = to_dataframe(transactions=load_sample_dataset())
# Save the matrix to disk without the row index.
df.to_csv(path_or_buf='data/df.csv', index=False)

KeyboardInterrupt: 

In [None]:
#timings_df = pd.DataFrame(timings, columns=['k', 'Time (seconds)'])


#print(timings_df.to_string(index=False))
# Plotting
#timings_df.plot(x='k', y='Time (seconds)', kind='line', marker='o', title='Apriori Algorithm Performance by k')
#plt.xlabel('k (Size of Itemsets)')
#plt.ylabel('Time (seconds)')
#plt.grid(True)
#plt.show()