### Sample program for Association Analysis (Market Basket Analysis) using FP-Growth  

#### Import libraries  

In [2]:
import pandas as pd
import numpy as np
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [3]:
# Input file: a header-less CSV that is loaded below with header=None.
# NOTE(review): path is relative to the notebook's working directory - confirm the file is shipped alongside.
csv_in = 'groceries-col.csv'

#### Read CSV file  

In [4]:
# Load the raw table. header=None because the file has no header row, so the
# columns are simply numbered 0..N-1.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=None)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
df.info()
display(df.head())

(9835, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9835 entries, 0 to 9834
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       9835 non-null   object
 1   1       7676 non-null   object
 2   2       6033 non-null   object
 3   3       4734 non-null   object
 4   4       3729 non-null   object
 5   5       2874 non-null   object
 6   6       2229 non-null   object
 7   7       1684 non-null   object
 8   8       1246 non-null   object
 9   9       896 non-null    object
 10  10      650 non-null    object
 11  11      468 non-null    object
 12  12      351 non-null    object
 13  13      273 non-null    object
 14  14      196 non-null    object
 15  15      141 non-null    object
 16  16      95 non-null     object
 17  17      66 non-null     object
 18  18      52 non-null     object
 19  19      38 non-null     object
 20  20      29 non-null     object
 21  21      18 non-null     object
 22  22      14 no

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,,,,,,,...,,,,,,,,,,
1,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
2,whole milk,,,,,,,,,,...,,,,,,,,,,
3,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
4,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,


In [5]:
# Swap every missing cell (NaN) for the string placeholder '00nan' so that the
# whole table holds strings only and can be sorted / counted uniformly.
df = df.replace(to_replace=np.nan, value='00nan')
display(df.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,citrus fruit,semi-finished bread,margarine,ready soups,00nan,00nan,00nan,00nan,00nan,00nan,...,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan
1,tropical fruit,yogurt,coffee,00nan,00nan,00nan,00nan,00nan,00nan,00nan,...,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan
2,whole milk,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,...,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan
3,pip fruit,yogurt,cream cheese,meat spreads,00nan,00nan,00nan,00nan,00nan,00nan,...,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan
4,other vegetables,whole milk,condensed milk,long life bakery product,00nan,00nan,00nan,00nan,00nan,00nan,...,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan,00nan


In [6]:
# Collapse the 2-D table into one long Series of cell values and count how often
# each distinct value occurs. Every cell is counted, so a placeholder used for
# empty slots shows up in the ranking as well.
all_cells = df.values.flatten()
ser_items = pd.Series(all_cells)
top_items = ser_items.value_counts()  # most frequent value first
print(top_items.head(5))

00nan               271353
whole milk            2513
other vegetables      1903
rolls/buns            1809
soda                  1715
dtype: int64


In [7]:
# Build the item <-> integer-id vocabulary.
# id2item: list, position = id, value = item name (sorted so ids are reproducible).
# item2id: dict, item name -> id (inverse of id2item).
id2item = sorted(set(df.values.flatten()))  # sort to fix order of items
print(len(id2item))  # debug
print(id2item[:5])  # debug

# The invoice-building step keeps only ids > 0, i.e. it assumes the '00nan'
# placeholder received id 0. That holds only while '00nan' sorts before every
# real item name; fail loudly instead of silently dropping a real item.
assert id2item[:1] == ['00nan'], (
    "placeholder '00nan' must sort first so that it is encoded as id 0"
)

item2id = {item: idx for idx, item in enumerate(id2item)}

170
['00nan', 'Instant food products', 'UHT-milk', 'abrasive cleaner', 'artif. sweetener']


In [8]:
# Translate every cell from item name to its integer id.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use whichever element-wise method this pandas version provides so the cell
# runs without a FutureWarning on new versions and still works on old ones.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df_id = elementwise(lambda x: item2id[x])  # KeyError here means an unknown item
display(df_id.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,32,134,90,120,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,159,168,36,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,167,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,111,168,41,93,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,104,167,37,87,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Turn each row into one invoice: the list of item ids it contains, in column
# order. Ids <= 0 are skipped (id 0 is what the empty-slot placeholder is
# encoded as).
# Working on the plain value array avoids creating one pandas Series per row
# and does not depend on the frame having a default 0..N-1 index.
invoices = [
    [item_id for item_id in row if item_id > 0]
    for row in df_id.values.tolist()
]
print(len(invoices))

9835


In [10]:
# Mine frequent itemsets from the invoices with FP-Growth (timed with %time).
# 15 is the support threshold handed to pyfpgrowth.
# NOTE(review): per the pyfpgrowth docs this is an absolute occurrence count, not a fraction - confirm.
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 15)


Wall time: 1.71 s


In [11]:
# Derive association rules from the frequent patterns (timed with %time).
# 0.9 is the confidence threshold: every rule in the output below has confidence >= 0.9.
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.9)


Wall time: 81.4 ms


In [12]:
# Raw rule dictionary: {LHS id tuple: (RHS id tuple, confidence)}.
print(rules)


{(84, 121): ((13,), 0.9047619047619048), (55, 125, 163): ((167,), 1.0), (41, 104, 153): ((167,), 0.9375), (51, 159, 163): ((167,), 0.9), (66, 159, 163): ((104,), 0.9047619047619048), (125, 132, 159, 168): ((167,), 0.9375)}


In [13]:
# Flatten the rule dict {LHS: (RHS, confidence)} into one table row per rule.
results = [[lhs, rhs, conf] for lhs, (rhs, conf) in rules.items()]
# Name the columns in the constructor: assigning df_res.columns afterwards
# raises a length-mismatch ValueError when no rule passed the threshold
# (an empty list yields a frame with zero columns).
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [14]:
# Show the rules ordered from highest to lowest confidence (df_res itself is not reordered).
display(df_res.sort_values(by='Conf', ascending=False))


Unnamed: 0,LHS,RHS,Conf
1,"(55, 125, 163)","(167,)",1.0
2,"(41, 104, 153)","(167,)",0.9375
5,"(125, 132, 159, 168)","(167,)",0.9375
0,"(84, 121)","(13,)",0.904762
4,"(66, 159, 163)","(104,)",0.904762
3,"(51, 159, 163)","(167,)",0.9


In [15]:
# Look up the item names behind the ids of the rule (55, 125, 163) -> (167,).
for item_id in (55, 125, 163, 167):
    print(id2item[item_id])

flour
root vegetables
whipped/sour cream
whole milk


In [16]:
# Lift(LHS -> RHS) = Conf(LHS -> RHS) / Support(RHS), where Support(RHS) is the
# fraction of all invoices that contain every item of RHS.
n_all = len(invoices)

# Build each invoice's item set once instead of once per (rule, invoice) pair.
invoice_sets = [set(items) for items in invoices]

rhs_counts = {}  # RHS tuple -> number of invoices containing all of its items
lift = []
# Iterate over the column values directly so the cell does not rely on df_res
# having a default 0..N-1 index.
for rhs, conf in zip(df_res['RHS'], df_res['Conf']):
    if rhs not in rhs_counts:
        # Several rules share the same RHS; count its support only once.
        rhs_set = set(rhs)
        rhs_counts[rhs] = sum(1 for basket in invoice_sets if basket >= rhs_set)
    n_rhs = rhs_counts[rhs]
    lift.append(conf / (n_rhs / n_all))

df_res['Lift'] = lift

In [17]:
# Show the rules, now including the Lift column, ordered from highest to lowest confidence.
display(df_res.sort_values(by='Conf', ascending=False))


Unnamed: 0,LHS,RHS,Conf,Lift
1,"(55, 125, 163)","(167,)",1.0,3.913649
2,"(41, 104, 153)","(167,)",0.9375,3.669046
5,"(125, 132, 159, 168)","(167,)",0.9375,3.669046
0,"(84, 121)","(13,)",0.904762,11.235269
4,"(66, 159, 163)","(104,)",0.904762,4.67595
3,"(51, 159, 163)","(167,)",0.9,3.522284
