In [61]:
import csv
from collections import defaultdict
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from mlxtend.frequent_patterns import apriori
from timeit import timeit

In [33]:
rows = []

with open('online_retail.csv') as retail:
    for row in csv.reader(retail):

        if len(row) != 8:
            continue
        rows.append(row)

In [34]:
labels = [el.lower() for el in rows[0]]
# print(rows[0], '\n', rows[1])
del rows[0]

In [35]:
print(len(rows))

for i, row in enumerate(rows):
    if row[0].startswith('C'):
        del rows[i]

print(len(rows))

541909
536442


In [36]:
invoice_aggregation = defaultdict(lambda : set())

for invoice in rows:
    invoice_aggregation[invoice[0]].add(invoice[2])

In [38]:
all_items = set()

for el in invoice_aggregation.values():
    all_items.update(el)

all_items = sorted(list(all_items))

In [43]:
presence_matrix = [[int(item in invoice) for item in all_items] for invoice in invoice_aggregation.values()]

In [44]:
# another more efficient version
def get_presence_matrix(invoices, all_items):
    item_pos_dict = { k: v for v, k in enumerate(all_items) }
    presence_matrix = []
    for invoice in invoices.values():
        row = [0] * len(all_items)
        for item in invoice:
            row[item_pos_dict[item]] = 1
        presence_matrix.append(row)
    return presence_matrix

presence_matrix_v2 = get_presence_matrix(invoice_aggregation, all_items)

In [47]:
len(presence_matrix_v2[0])

4214

In [48]:
df = pd.DataFrame(data=presence_matrix_v2, columns=all_items)

In [50]:

for minsup in [0.5, 0.1, 0.05, 0.02, 0.01]:
    freq_items = fpgrowth(df, minsup)
    print(f'{minsup} => {len(freq_items)}')

# Notice how there is no frequent itemset that shows up in 50% of the transactions.
# Instead, there is an itemset that shows up in 10% of the transactions. Let's see what it is.

0.5 => 0
0.1 => 0
0.05 => 12
0.02 => 239
0.01 => 1173


In [53]:
fpgrowth(df, 0.09)

Unnamed: 0,support,itemsets
0,0.093885,(3910)


In [54]:
all_items[3910]

'WHITE HANGING HEART T-LIGHT HOLDER'

In [55]:
# Let's make sure that it really appears in at least 10% of the transactions by counting
# the number of transactions containing it.
# We can access df.values to extract a the Numpy matrix behind the DataFrame we just built.

100 * df.values[:, 3910].sum() / len(df)

9.388547128524834

In [56]:
freq_items = fpgrowth(df, 0.02)
freq_items[freq_items["itemsets"].map(len) > 1]

Unnamed: 0,support,itemsets
198,0.026465,"(162, 166)"
199,0.022253,"(3976, 3972)"
200,0.020437,"(2842, 3910)"
201,0.023946,"(2051, 1863)"
202,0.034185,"(1861, 1863)"
203,0.021345,"(1876, 1861)"
204,0.029933,"(1876, 1863)"
205,0.024194,"(1851, 1863)"
206,0.021386,"(1862, 1863)"
207,0.020107,"(3552, 2866)"


In [58]:
# Now that we have extracted a list of frequent itemsets, we can compute some association rules. The next exercise will focus on automatically extracting all meaningful association rules. In this exercise, instead, we will extract association rules manually for a single itemset, such as (2056, 2051).
# We can extract the two rules: 2056 => 2051 and 2051 => 2056. Let's compute the confidence of these two rules.

M = df.values # matrix from the df dataframe
support_2056 = len(M[M[:, 2056] == 1])/len(M)
support_2051 = len(M[M[:, 2051] == 1])/len(M)
support_both = len(M[(M[:, 2056] == 1) & (M[:, 2051] == 1)])/len(M)
print(f"Confidence 2056 => 2051: {support_both / support_2056}")
print(f"Confidence 2051 => 2056: {support_both / support_2051}")

Confidence 2056 => 2051: 0.510989010989011
Confidence 2051 => 2056: 0.3522727272727273


In [60]:
# The association_rules() function takes as first argument the output of fpgrowth()
# (i.e. a DataFrame with frequent itemsets and their support). The second argument is the matric
# that we want to use for filtering association rules. As recommended by the exercise, we are going
# to use the confidence. As threshold value, as recommended, we will be using 0.85. As a side exercise,
# you can tweak these parameters and observe how the results vary.

fi = fpgrowth(df, 0.01)
association_rules(fi, 'confidence', 0.85)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(3552, 725)",(2866),0.01573,0.043062,0.013459,0.855643,19.870115,0.012782,6.628972
1,"(3552, 725, 726)",(2866),0.011189,0.043062,0.010115,0.904059,20.994453,0.009633,9.97424
2,"(3552, 3987, 726)",(2866),0.01251,0.043062,0.010734,0.858086,19.926842,0.010196,6.743076
3,"(1874, 1876, 1861)",(1863),0.012758,0.087114,0.011065,0.867314,9.956024,0.009953,6.88004
4,"(3297, 3343)",(3344),0.012221,0.021758,0.010941,0.89527,41.146758,0.010675,9.340633
5,"(3987, 725, 726)",(2866),0.012056,0.043062,0.010363,0.859589,19.961751,0.009844,6.815267
6,"(3566, 1863)",(1095),0.011643,0.029231,0.01028,0.882979,30.207101,0.00994,8.295664
7,"(2661, 2910)",(1604),0.016226,0.042566,0.014161,0.872774,20.503829,0.013471,7.525428
8,"(3008, 2661, 2910)",(1604),0.013666,0.042566,0.012427,0.909366,21.363475,0.011846,10.563684
9,"(1604, 2661, 2910)",(3008),0.014161,0.04492,0.012427,0.877551,19.535996,0.011791,7.799822


In [67]:

print("FP-growth", timeit(lambda: fpgrowth(df, 0.02), number=1), "seconds")
print("Apriori", timeit(lambda: apriori(df, 0.02), number=1), "seconds")

FP-growth 5.937266699998872 seconds
Apriori 10.613411700000142 seconds
