In [49]:
import csv
from collections import defaultdict
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

In [33]:
rows = []

with open('online_retail.csv') as retail:
    for row in csv.reader(retail):

        if len(row) != 8:
            continue
        rows.append(row)

In [34]:
labels = [el.lower() for el in rows[0]]
# print(rows[0], '\n', rows[1])
del rows[0]

In [35]:
print(len(rows))

for i, row in enumerate(rows):
    if row[0].startswith('C'):
        del rows[i]

print(len(rows))

541909
536442


In [36]:
invoice_aggregation = defaultdict(lambda : set())

for invoice in rows:
    invoice_aggregation[invoice[0]].add(invoice[2])

In [38]:
all_items = set()

for el in invoice_aggregation.values():
    all_items.update(el)

all_items = sorted(list(all_items))

In [43]:
presence_matrix = [[int(item in invoice) for item in all_items] for invoice in invoice_aggregation.values()]

In [44]:
# another more efficient version
def get_presence_matrix(invoices, all_items):
    item_pos_dict = { k: v for v, k in enumerate(all_items) }
    presence_matrix = []
    for invoice in invoices.values():
        row = [0] * len(all_items)
        for item in invoice:
            row[item_pos_dict[item]] = 1
        presence_matrix.append(row)
    return presence_matrix

presence_matrix_v2 = get_presence_matrix(invoice_aggregation, all_items)

In [47]:
len(presence_matrix_v2[0])

4214

In [48]:
df = pd.DataFrame(data=presence_matrix_v2, columns=all_items)

In [50]:

for minsup in [0.5, 0.1, 0.05, 0.02, 0.01]:
    freq_items = fpgrowth(df, minsup)
    print(f'{minsup} => {len(freq_items)}')

# Notice how there is no frequent itemset that shows up in 50% of the transactions.
# Instead, there is an itemset that shows up in 10% of the transactions. Let's see what it is.

0.5 => 0
0.1 => 0
0.05 => 12
0.02 => 239
0.01 => 1173


In [53]:
fpgrowth(df, 0.09)

Unnamed: 0,support,itemsets
0,0.093885,(3910)


In [54]:
all_items[3910]

'WHITE HANGING HEART T-LIGHT HOLDER'

In [55]:
# Let's make sure that it really appears in at least 10% of the transactions by counting
# the number of transactions containing it.
# We can access df.values to extract a the Numpy matrix behind the DataFrame we just built.

100 * df.values[:, 3910].sum() / len(df)

9.388547128524834