In [43]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

In [44]:
# Input file for the whole notebook: a CSV with one (Invoice, Item) pair per
# row. The path is relative to the directory the notebook is run from.
csv_in = "dm-end1-4.csv"

In [45]:
# Load the raw invoice/item pairs. The file is a plain comma-separated table
# whose first row is the header, which is exactly what read_csv assumes by
# default, so no extra parsing arguments are needed.
df = pd.read_csv(csv_in)

# Quick sanity check of what was loaded: size, column dtypes/null counts, and
# the first few rows.
print(df.shape)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit a stray "None" line after the report.
df.info()
display(df.head())

(1000, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Invoice  1000 non-null   object
 1   Item     1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
None


Unnamed: 0,Invoice,Item
0,T1008,112
1,T1011,147
2,T1022,195
3,T1086,140
4,T1033,178


In [46]:
# How many distinct transactions (invoices) and distinct item codes the data
# set contains.
print(f"Unique number of invoices: {df['Invoice'].nunique()}")
print(f"Unique number of item: {df['Item'].nunique()}")

Unique number of invoices: 99
Unique number of item: 103


In [47]:
# Build a two-way mapping between raw item codes and dense integer ids.
#   id2sc: sorted list of the distinct item codes; the list position is the id.
#   sc2id: inverse lookup, item code -> id.
id2sc = sorted(set(df['Item']))
sc2id = {code: idx for idx, code in enumerate(id2sc)}

In [48]:
# Attach the dense id of every row's item code as a new column. A direct
# dictionary lookup is used so that an unknown code fails loudly (KeyError)
# instead of silently producing a missing value.
df['Item_ID'] = [sc2id[code] for code in df['Item']]
display(df.head())

Unnamed: 0,Invoice,Item,Item_ID
0,T1008,112,11
1,T1011,147,46
2,T1022,195,94
3,T1086,140,39
4,T1033,178,77


In [49]:
# Turn the long (Invoice, Item_ID) table into one basket per invoice: a list of
# the distinct item ids that appear on that invoice. Going through a set
# removes items listed more than once on the same invoice.
invoices = [
    list(set(invoice_rows['Item_ID']))
    for _, invoice_rows in df.groupby('Invoice')
]
print(len(invoices))

99


In [50]:
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 5)


Wall time: 3.99 ms


In [51]:
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.3)

Wall time: 0 ns


In [52]:
# Dump the raw rules dict: each key is an antecedent tuple and each value is a
# (consequent tuple, confidence) pair.
print(rules)

{(30,): ((57,), 0.45454545454545453), (57,): ((30,), 0.5555555555555556), (29,): ((85,), 0.3333333333333333), (65,): ((29,), 0.5), (0,): ((24,), 0.45454545454545453), (24,): ((0,), 0.45454545454545453), (26,): ((50,), 0.4166666666666667), (50,): ((26,), 0.38461538461538464), (72,): ((81,), 0.38461538461538464), (85,): ((29,), 0.35714285714285715)}


In [53]:
# Flatten the rules dict {antecedent: (consequent, confidence)} into one table
# row per rule.
results = [[lhs, rhs, conf] for lhs, (rhs, conf) in rules.items()]

# The column names are passed to the constructor instead of being assigned to
# `df_res.columns` afterwards: when no rule passes the threshold `results` is
# empty, the frame would have zero columns, and assigning three names to it
# raises ValueError. This way an empty rule set yields an empty, correctly
# labelled table.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [54]:
# Show the rules ranked from most to least confident.
display(
    df_res.sort_values('Conf', ascending=False)
)


Unnamed: 0,LHS,RHS,Conf
1,"(57,)","(30,)",0.555556
3,"(65,)","(29,)",0.5
0,"(30,)","(57,)",0.454545
4,"(0,)","(24,)",0.454545
5,"(24,)","(0,)",0.454545
6,"(26,)","(50,)",0.416667
7,"(50,)","(26,)",0.384615
8,"(72,)","(81,)",0.384615
9,"(85,)","(29,)",0.357143
2,"(29,)","(85,)",0.333333


In [56]:
# Lift of a rule LHS -> RHS is its confidence divided by the support of the
# RHS, i.e. by the share of all baskets that contain every RHS item.
n_all = len(invoices)

# Convert each basket to a set once so the subset test below does not rebuild
# it for every rule.
baskets = [set(items) for items in invoices]

lift = []
for rhs, conf in zip(df_res['RHS'], df_res['Conf'].to_numpy()):
    rhs_items = set(rhs)
    # Number of baskets that contain the whole consequent.
    n_rhs = sum(1 for basket in baskets if basket >= rhs_items)
    lift.append(conf / (n_rhs / n_all))

df_res['Lift'] = lift

In [57]:
# Show the rules again, still ranked by confidence, now with the Lift column.
display(
    df_res.sort_values('Conf', ascending=False)
)


Unnamed: 0,LHS,RHS,Conf,Lift
1,"(57,)","(30,)",0.555556,5.0
3,"(65,)","(29,)",0.5,3.3
0,"(30,)","(57,)",0.454545,5.0
4,"(0,)","(24,)",0.454545,4.090909
5,"(24,)","(0,)",0.454545,4.090909
6,"(26,)","(50,)",0.416667,3.173077
7,"(50,)","(26,)",0.384615,3.173077
8,"(72,)","(81,)",0.384615,3.173077
9,"(85,)","(29,)",0.357143,2.357143
2,"(29,)","(85,)",0.333333,2.357143
