# Pattern Mining using mlxtend and prefixspan (preferred version)

1. mlxtend for frequent pattern and association mining
2. prefixspan for sequential frequent pattern

### 1. Define dataset

In [1]:
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

### 2. Import common libraries

In [2]:
import pandas as pd

### 3. mlxtend for frequent pattern and association mining

In [3]:
#!pip install mlxtend --upgrade

In [4]:
# transform dataset for fpgrowth algorithm from mlxtend

from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df_te = pd.DataFrame(te_ary, columns=te.columns_)
df_te

Unnamed: 0,Apple,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Unicorn,Yogurt
0,False,False,False,True,False,True,True,True,True,False,True
1,False,False,True,True,False,True,False,True,True,False,True
2,True,False,False,True,False,True,True,False,False,False,False
3,False,True,False,False,False,True,True,False,False,True,True
4,False,True,False,True,True,True,False,False,True,False,False


In [5]:
# frequent patter mining using fpgrowth from mlxtend (fpgrowth is much faster compared with apriori)

from mlxtend.frequent_patterns import fpgrowth

df_fpgrowth = fpgrowth(df_te, min_support=0.6)
df_fpgrowth

Unnamed: 0,support,itemsets
0,1.0,(5)
1,0.8,(3)
2,0.6,(10)
3,0.6,(8)
4,0.6,(6)
5,0.8,"(3, 5)"
6,0.6,"(10, 5)"
7,0.6,"(8, 3)"
8,0.6,"(8, 5)"
9,0.6,"(8, 3, 5)"


In [6]:
# user friendly output

def transactio_decode(df, df_te):
    supports = [df.loc[idx, 'support'] for idx in df.index]
    patterns = []
    for idx in df_fpgrowth.index:
        itemset = df_fpgrowth.loc[idx, 'itemsets']
        patterns.append([df_te.columns[item] for item in itemset])

    df_td = pd.DataFrame({'support':supports, 'itemsets':patterns})
    return df_td
    
df_fp = transactio_decode(df_fpgrowth, df_te) # use this line for user friendly output only
df_fp

Unnamed: 0,support,itemsets
0,1.0,[Kidney Beans]
1,0.8,[Eggs]
2,0.6,[Yogurt]
3,0.6,[Onion]
4,0.6,[Milk]
5,0.8,"[Eggs, Kidney Beans]"
6,0.6,"[Yogurt, Kidney Beans]"
7,0.6,"[Onion, Eggs]"
8,0.6,"[Onion, Kidney Beans]"
9,0.6,"[Onion, Eggs, Kidney Beans]"


In [7]:
# Association mining using association_rules from mlxtend

from mlxtend.frequent_patterns import association_rules
# df_ar = association_rules(df_fpgrowth, metric = "confidence", min_threshold = 0.60) # use this for fast processing
df_ar = association_rules(df_fp, metric = "confidence", min_threshold = 0.60) # use this for user friendly output
df_ar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Eggs),(Kidney Beans),0.8,1.0,0.8,1.0,1.0,0.0,inf,0.0
1,(Kidney Beans),(Eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0,0.0
2,(Kidney Beans),(Yogurt),1.0,0.6,0.6,0.6,1.0,0.0,1.0,0.0
3,(Yogurt),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
4,(Onion),(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
5,(Eggs),(Onion),0.8,0.6,0.6,0.75,1.25,0.12,1.6,1.0
6,(Onion),(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0
7,(Kidney Beans),(Onion),1.0,0.6,0.6,0.6,1.0,0.0,1.0,0.0
8,"(Onion, Kidney Beans)",(Eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf,0.5
9,"(Onion, Eggs)",(Kidney Beans),0.6,1.0,0.6,1.0,1.0,0.0,inf,0.0


### 4. PrefixSpan for mining frequent Sequential patterns

PrefixSpan discovers all frequent sequential patterns occurring in a sequence database

In [8]:
# !pip install -U prefixspan

In [9]:
from prefixspan import PrefixSpan

ps = PrefixSpan(dataset)

In [10]:
ps.frequent(2) # get sequential patterns with minimum support 2

[(3, ['Milk']),
 (3, ['Milk', 'Kidney Beans']),
 (2, ['Milk', 'Kidney Beans', 'Eggs']),
 (2, ['Milk', 'Kidney Beans', 'Yogurt']),
 (2, ['Milk', 'Eggs']),
 (2, ['Milk', 'Yogurt']),
 (3, ['Onion']),
 (2, ['Onion', 'Nutmeg']),
 (2, ['Onion', 'Nutmeg', 'Kidney Beans']),
 (2, ['Onion', 'Nutmeg', 'Kidney Beans', 'Eggs']),
 (2, ['Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt']),
 (2, ['Onion', 'Nutmeg', 'Kidney Beans', 'Yogurt']),
 (2, ['Onion', 'Nutmeg', 'Eggs']),
 (2, ['Onion', 'Nutmeg', 'Eggs', 'Yogurt']),
 (2, ['Onion', 'Nutmeg', 'Yogurt']),
 (3, ['Onion', 'Kidney Beans']),
 (3, ['Onion', 'Kidney Beans', 'Eggs']),
 (2, ['Onion', 'Kidney Beans', 'Eggs', 'Yogurt']),
 (2, ['Onion', 'Kidney Beans', 'Yogurt']),
 (3, ['Onion', 'Eggs']),
 (2, ['Onion', 'Eggs', 'Yogurt']),
 (2, ['Onion', 'Yogurt']),
 (2, ['Nutmeg']),
 (2, ['Nutmeg', 'Kidney Beans']),
 (2, ['Nutmeg', 'Kidney Beans', 'Eggs']),
 (2, ['Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt']),
 (2, ['Nutmeg', 'Kidney Beans', 'Yogurt']),
 (2, [

In [11]:
# Put the output in a dataframe
supports = [element[0] for element in ps.frequent(2)]
patterns = [frozenset(element[1]) for element in ps.frequent(2)]

df_ps = pd.DataFrame({'support':supports, 'itemsets':patterns})
df_ps

Unnamed: 0,support,itemsets
0,3,(Milk)
1,3,"(Kidney Beans, Milk)"
2,2,"(Kidney Beans, Eggs, Milk)"
3,2,"(Kidney Beans, Milk, Yogurt)"
4,2,"(Eggs, Milk)"
5,2,"(Milk, Yogurt)"
6,3,(Onion)
7,2,"(Onion, Nutmeg)"
8,2,"(Onion, Kidney Beans, Nutmeg)"
9,2,"(Eggs, Onion, Kidney Beans, Nutmeg)"


In [12]:
# apply association mining on frequent sequential patterns discovered by prefixspan

df_sar = association_rules(df_ps, metric = "confidence", min_threshold = 0.60)
df_sar

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Kidney Beans),(Milk),5.0,3.0,3.0,0.600000,0.200000,-12.0,-5.0,0.000000
1,(Milk),(Kidney Beans),3.0,5.0,3.0,1.000000,0.200000,-12.0,inf,-2.000000
2,"(Kidney Beans, Milk)",(Eggs),3.0,4.0,2.0,0.666667,0.166667,-10.0,-9.0,-1.666667
3,"(Eggs, Milk)",(Kidney Beans),2.0,5.0,2.0,1.000000,0.200000,-8.0,inf,-1.333333
4,(Milk),"(Kidney Beans, Eggs)",3.0,4.0,2.0,0.666667,0.166667,-10.0,-9.0,-1.666667
...,...,...,...,...,...,...,...,...,...,...
160,(Yogurt),"(Kidney Beans, Eggs)",3.0,4.0,2.0,0.666667,0.166667,-10.0,-9.0,-1.666667
161,(Kidney Beans),(Yogurt),5.0,3.0,3.0,0.600000,0.200000,-12.0,-5.0,0.000000
162,(Yogurt),(Kidney Beans),3.0,5.0,3.0,1.000000,0.200000,-12.0,inf,-2.000000
163,(Yogurt),(Eggs),3.0,4.0,2.0,0.666667,0.166667,-10.0,-9.0,-1.666667
