#### Import libraries  

In [1]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [2]:
csv_in = 'groceries-col.csv'

#### Read CSV file  

In [9]:
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
df = df.astype(str)
print(df.shape)
print(df.info())
display(df.head())

(9834, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9834 entries, 0 to 9833
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   citrus fruit         9834 non-null   object
 1   semi-finished bread  9834 non-null   object
 2   margarine            9834 non-null   object
 3   ready soups          9834 non-null   object
 4   Unnamed: 4           9834 non-null   object
 5   Unnamed: 5           9834 non-null   object
 6   Unnamed: 6           9834 non-null   object
 7   Unnamed: 7           9834 non-null   object
 8   Unnamed: 8           9834 non-null   object
 9   Unnamed: 9           9834 non-null   object
 10  Unnamed: 10          9834 non-null   object
 11  Unnamed: 11          9834 non-null   object
 12  Unnamed: 12          9834 non-null   object
 13  Unnamed: 13          9834 non-null   object
 14  Unnamed: 14          9834 non-null   object
 15  Unnamed: 15          9834 non-null   object


Unnamed: 0,citrus fruit,semi-finished bread,margarine,ready soups,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,tropical fruit,yogurt,coffee,,,,,,,,...,,,,,,,,,,
1,whole milk,,,,,,,,,,...,,,,,,,,,,
2,pip fruit,yogurt,cream cheese,meat spreads,,,,,,,...,,,,,,,,,,
3,other vegetables,whole milk,condensed milk,long life bakery product,,,,,,,...,,,,,,,,,,
4,whole milk,butter,yogurt,rice,abrasive cleaner,,,,,,...,,,,,,,,,,


In [11]:
id2item = sorted(list(set(df.values.flatten())))

In [13]:
item2id = {}
for i in range(len(id2item)):
    item2id[id2item[i]] = i

In [16]:
#df['StockCode_ID'] = df['StockCode'].map(lambda x: sc2id[x])
df_id = df.applymap(lambda x: item2id[x])
display(df_id.head())

Unnamed: 0,citrus fruit,semi-finished bread,margarine,ready soups,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,159,168,35,95,95,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95
1,167,95,95,95,95,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95
2,111,168,40,92,95,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95
3,104,167,36,86,95,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95
4,167,16,168,122,2,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95


#### Grouping transaction data by InvoiceNo  

In [10]:
invoices = []
for r in df.groupby('InvoiceNo'):
    #print(len(r))  # debug
    #print(r[0])  # debug
    #print(r[1])  # debug
    #break  # debug
    s1 = set(r[1]['StockCode_ID'])
    invoices.append(list(s1))
print(len(invoices))

2757


#### Market Basket Analysis by FP-Growth  

In [11]:
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 40)

Wall time: 475 ms


In [12]:
#print(patterns)

In [13]:
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.8)

Wall time: 1.01 ms


In [14]:
print(rules)

{(1473,): ((1475,), 0.803921568627451), (1332,): ((1331,), 0.8243243243243243), (1474, 1478): ((1475,), 0.8035714285714286), (1475, 1478): ((1474,), 0.8490566037735849), (138, 1156): ((137,), 0.8333333333333334), (140, 1156): ((137,), 0.8448275862068966), (1154, 1156): ((137,), 0.828125)}


In [15]:
results = []
for x in rules:
    ret = [x, rules[x][0], rules[x][1]]
    results.append(ret)
df_res = pd.DataFrame(results)
df_res.columns = ['LHS', 'RHS', 'Conf']

In [16]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
3,"(1475, 1478)","(1474,)",0.849057
5,"(140, 1156)","(137,)",0.844828
4,"(138, 1156)","(137,)",0.833333
6,"(1154, 1156)","(137,)",0.828125
1,"(1332,)","(1331,)",0.824324
0,"(1473,)","(1475,)",0.803922
2,"(1474, 1478)","(1475,)",0.803571


#### Get original StockCode  

In [17]:
print(id2sc[1474])
print(id2sc[1475])
print(id2sc[1478])

22726
22727
22730


#### Calculation of Lift  

In [18]:
n_all = len(invoices)
lift = []
for i in range(df_res.shape[0]):
    rhs = df_res.at[i, 'RHS']
    conf = df_res.at[i, 'Conf']
    n_rhs = 0
    for items in invoices:
        if set(items) >= set(rhs):
            n_rhs += 1
    lift1 = conf / (n_rhs / n_all)
    lift.append(lift1)
    
df_res['Lift'] = lift

In [19]:
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
3,"(1475, 1478)","(1474,)",0.849057,18.287883
5,"(140, 1156)","(137,)",0.844828,11.883621
4,"(138, 1156)","(137,)",0.833333,11.721939
6,"(1154, 1156)","(137,)",0.828125,11.648677
1,"(1332,)","(1331,)",0.824324,28.057558
0,"(1473,)","(1475,)",0.803922,16.540386
2,"(1474, 1478)","(1475,)",0.803571,16.533182
