In [None]:
import csv
from tqdm import tqdm
import pandas as pd
from mlxtend import frequent_patterns as fp

In [2]:
# Read the retail CSV into a list of per-line dicts, leaving out every row
# whose invoice number starts with "C".
dataset = []

# newline="" hands line-ending handling to the csv module, which is what the
# csv documentation requires for quoted fields that contain line breaks.
with open("../datasets/online_retail.csv", newline="") as f:
    print("READING DATABASE")
    reader = csv.reader(f)
    # Skip the header through the reader (not the raw file) so a quoted header
    # is consumed as one record; the default avoids StopIteration on an empty file.
    next(reader, None)
    for row in tqdm(reader):
        # A blank line parses to [] and a row with an empty invoice number
        # cannot be assigned to any transaction; indexing row[0][0] on either
        # would raise IndexError, so skip them explicitly.
        if not row or not row[0]:
            continue
        # "C"-prefixed invoices are excluded (presumably cancellations — TODO confirm).
        if row[0].startswith("C"):
            continue
        dataset.append({
            "InvoiceNo": row[0],
            "StockCode": row[1],
            "Description": row[2],
            "Quantity": int(row[3]),
            "InvoiceDate": row[4],
            "UnitPrice": float(row[5]),
            "CustomerID": row[6],
            "Country": row[7]
        })
# Sort by invoice number (string comparison) so that all lines of one invoice
# are adjacent in the list.
dataset.sort(key=lambda record: record["InvoiceNo"])
print("DONE")

READING DATABASE


541909it [00:01, 349905.70it/s]


DONE


In [3]:
print("AGGREGATING DATASET")
# item_sets maps each invoice number to the set of distinct descriptions that
# appear on it; items collects every description seen across all invoices.
item_sets = {}
items = set()
for record in tqdm(dataset):
    # setdefault creates the basket the first time an invoice is seen, so the
    # grouping is correct whether or not `dataset` is sorted and does not rely
    # on a sentinel "previous invoice" value.
    item_sets.setdefault(record["InvoiceNo"], set()).add(record["Description"])
    # NOTE(review): an empty Description string is added as an item like any
    # other — confirm whether blank descriptions should be excluded.
    items.add(record["Description"])
print("DONE")

# Sorted rather than list(items): set iteration order for strings can differ
# between kernel sessions, which would shuffle the matrix columns from run to run.
items = sorted(items)

AGGREGATING DATASET


100%|██████████████████████████████| 532621/532621 [00:00<00:00, 1058726.53it/s]

DONE





In [4]:
print("GENERATING MLXTEND MATRIX (which actually is a sparse matrix)")
# Build one row per invoice with a 0/1 entry per item, in the order of `items`:
# 1 when the invoice's basket contains the item, 0 otherwise.
# The loop variable is deliberately not called `set`: rebinding the builtin at
# notebook scope makes every later `set()` call in the kernel raise TypeError
# (for example when the aggregation cell is re-run).
pa_matrix = [
    [1 if item in basket else 0 for item in items]
    for basket in tqdm(item_sets.values())
]
print("DONE")

GENERATING MLXTEND MATRIX (which actually is a sparse matrix)


100%|███████████████████████████████████| 22064/22064 [00:11<00:00, 1903.75it/s]

DONE





In [5]:
# Wrap the 0/1 rows in a DataFrame whose columns are labelled with the item
# names, which is the input format handed to the mlxtend miners below.
print("GENERATING PANDAS DATAFRAME")
df = pd.DataFrame(pa_matrix, columns=items)
print("DONE")

GENERATING PANDAS DATAFRAME
DONE


In [12]:
print("EXTRACTING ASSOCIATION RULES")
# Step 1: mine frequent itemsets with FP-growth (min_support=0.02);
# use_colnames=True reports itemsets by column label instead of column index.
fi = fp.fpgrowth(df, min_support=0.02, use_colnames=True)
# len(fi) is the number of frequent itemsets, not rules — rules are only
# derived in the next step.
print("Number of frequent itemsets", len(fi))
# Step 2: derive rules from the itemsets, keeping those whose confidence
# reaches the 0.85 threshold.
rules = fp.association_rules(fi, metric='confidence', min_threshold=0.85)
print("DONE: found ", len(rules), " association rules")

EXTRACTING ASSOCIATION RULES
Number of associaton rules 303
DONE: found  2  association rules


In [7]:
# Dump the plain-text rendering of the rules table to association_rules.txt
# in the current working directory, overwriting any previous file.
with open("association_rules.txt", "w") as out_file:
    rules_text = rules.to_string()
    out_file.write(rules_text)

In [14]:
# Mine frequent itemsets with the apriori algorithm (min_support=0.1), then
# derive the rules whose confidence reaches 0.85 and print the full table.
fi_apriori = fp.apriori(df, use_colnames=True, min_support=0.1)
print("With apriori algorithm: ", len(fi_apriori))
rules_apriori = fp.association_rules(
    fi_apriori,
    min_threshold=0.85,
    metric='confidence',
)
print(rules_apriori.to_string())

With apriori algorithm:  1
Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction]
Index: []
