In [1]:
import numpy as np
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
import time

## Pre-processing CSV File

In [2]:
# Location of the raw transactions file, kept in one place so it is easy to
# change when the notebook is run on another machine.
CSV_PATH = "D:\\393- assignment\\grocery_transactions.csv"

# Build `transactions`: one list of item names per CSV row.
# newline='' leaves newline handling to the csv module, as its docs recommend.
with open(CSV_PATH, newline='') as csvfile:

    # Each row is read as a dict keyed by the header names
    data = csv.DictReader(csvfile)

    # Every column after the first one (the transaction ID) holds an item.
    # dict.fromkeys() collapses duplicate header names the same way the row dict does.
    item_columns = list(dict.fromkeys(data.fieldnames or []))[1:]

    transactions = []
    for row in data:
        cells = [row[column] for column in item_columns]

        # Cells beyond the header width are collected by DictReader as a list
        # under the key None; treat each of them as an ordinary item.
        cells.extend(row.get(None) or [])

        # Drop blank cells ('') and the None placeholders DictReader inserts
        # when a row is shorter than the header.
        transactions.append([item for item in cells if item])

# One-hot encode the transactions: each row is a transaction, each column an
# item, and a cell is True when that item appears in that transaction.
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)

# Wrap the boolean matrix in a DataFrame labelled with the item names learned during fitting
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,apples,bananas,beef,bread,carrots,cheese,chicken,eggs,lettuce,milk,onions,pasta,pork,potatoes,rice,tomatoes
0,True,True,True,True,False,True,False,False,True,False,False,False,False,False,True,False
1,False,True,True,False,False,True,False,False,True,False,False,True,False,True,False,True
2,True,False,True,False,False,False,True,True,False,True,False,False,True,False,False,True
3,False,False,True,True,True,False,False,False,False,True,True,True,False,False,False,True
4,True,True,True,False,False,False,False,True,True,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,False,False,True,False,False,False,False,True,False,False,True,True,False,True,True,True
21996,True,False,False,True,True,True,False,False,True,False,True,True,False,False,False,False
21997,True,False,True,True,False,False,False,False,True,True,False,False,True,False,True,False
21998,False,False,True,True,False,False,True,False,False,True,False,True,False,True,False,True


## FP-Growth Algorithm implementation

In [3]:
#Importing Libraries
from mlxtend.frequent_patterns import fpgrowth

# Mine the itemsets whose support is at least 0.1 and time how long it takes.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is a wall clock that can be coarse on some
# platforms and may jump if the system clock is adjusted.
start_time = time.perf_counter()
frequent_itemsets_2 = fpgrowth(df, min_support=0.1, use_colnames=True)
elapsed_seconds = time.perf_counter() - start_time

print("--- Total runtime of mlxtend FP-Growth implementation is: %s seconds ---" % elapsed_seconds)
frequent_itemsets_2

--- Total runtime of mlxtend FP-Growth implementation is: 0.624267578125 seconds ---


Unnamed: 0,support,itemsets
0,0.442864,(lettuce)
1,0.441500,(apples)
2,0.440182,(bananas)
3,0.438227,(beef)
4,0.437727,(bread)
...,...,...
131,0.170182,"(rice, onions)"
132,0.172409,"(lettuce, onions)"
133,0.175773,"(onions, apples)"
134,0.173773,"(onions, pork)"


## Generate association rules

In [7]:
from mlxtend.frequent_patterns import association_rules

# Time rule generation together with the column selection and sort below.
# perf_counter() is used because it is monotonic and high-resolution, which
# matters for a step this short; time.time() can be too coarse to measure it.
start_time = time.perf_counter()
result = association_rules(frequent_itemsets_2, metric='confidence', min_threshold=0.4)
# Keep only the columns of interest: antecedents, consequents, confidence and support.
result = result[['antecedents', 'consequents', 'confidence', 'support']]
# Sort the result by confidence in descending order
result = result.sort_values(by='confidence', ascending=False)
print("--- Total runtime of mlxtend association rules generator is: %s seconds ---" % (time.perf_counter() - start_time))
result

--- Total runtime of mlxtend association rules generator is: 0.015705108642578125 seconds ---


Unnamed: 0,antecedents,consequents,confidence,support
24,(cheese),(lettuce),0.414505,0.180818
15,(beef),(milk),0.412302,0.180682
1,(bananas),(lettuce),0.411710,0.181227
94,(chicken),(potatoes),0.411460,0.176909
106,(carrots),(apples),0.409979,0.179273
...,...,...,...,...
59,(tomatoes),(cheese),0.400063,0.173136
60,(tomatoes),(bananas),0.400063,0.173136
93,(chicken),(beef),0.400042,0.172000
7,(bananas),(milk),0.400041,0.176091


In [8]:
# Narrow the rules table to antecedents, consequents and support, then order
# it so the rules with the highest support come first.
# Note: this rebinds `result`, so the confidence column is no longer available afterwards.
result = (
    result
    .loc[:, ['antecedents', 'consequents', 'support']]
    .sort_values(by='support', ascending=False)
)
result

Unnamed: 0,antecedents,consequents,support
1,(bananas),(lettuce),0.181227
2,(lettuce),(bananas),0.181227
24,(cheese),(lettuce),0.180818
25,(lettuce),(cheese),0.180818
16,(milk),(beef),0.180682
...,...,...,...
97,(chicken),(cheese),0.173636
65,(tomatoes),(carrots),0.173636
59,(tomatoes),(cheese),0.173136
60,(tomatoes),(bananas),0.173136
