# Frequent Pattern Mining
CSE4063 - Data Mining Project-2

In [1]:
# import packages
import tracemalloc
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import time
import linecache
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

Empty items are removed later

In [2]:
with open('../input/freqmining/cse4063-spring2020-project-2-dataset-fpm.csv', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)
data[:10]

[['Lassi', 'Coffee Powder', 'Butter', 'Yougurt', 'Ghee', 'Cheese', ''],
 ['Ghee', 'Coffee Powder', ''],
 ['Lassi', 'Tea Powder', 'Butter', 'Cheese', ''],
 ['Cheese', 'Tea Powder', 'Panner', 'Coffee Powder', 'Butter', 'Bread', ''],
 ['Cheese', 'Yougurt', 'Coffee Powder', 'Sugar', 'Butter', 'Sweet', ''],
 ['Sugar', 'Tea Powder', 'Ghee', 'Sweet', 'Panner', 'Milk', ''],
 ['Sweet', 'Coffee Powder', ''],
 ['Butter', 'Ghee', 'Panner', ''],
 ['Sweet', 'Tea Powder', 'Butter', 'Yougurt', 'Sugar', 'Cheese', ''],
 ['Panner', 'Ghee', '']]

 Displays memory usage for given function name

In [3]:
def calculate_mem_use(snapshot, key_type='lineno', func=""):
    snapshot = snapshot.filter_traces((
        tracemalloc.Filter(False, "<frozen importlib._bootstrap>"),
        tracemalloc.Filter(False, "<unknown>"),
    ))
    top_stats = snapshot.statistics(key_type)
    total = 0
    for index, stat in enumerate(top_stats, 1):
        frame = stat.traceback[0]
        if func in frame.filename or "data" in frame.filename:
            total = total + stat.size
    return total

# **Prepare data**

In [4]:
te = TransactionEncoder()
te_ary = te.fit(data).transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)
del df[""]  # remove empty item
df

Unnamed: 0,Bread,Butter,Cheese,Coffee Powder,Ghee,Lassi,Milk,Panner,Sugar,Sweet,Tea Powder,Yougurt
0,False,True,True,True,True,True,False,False,False,False,False,True
1,False,False,False,True,True,False,False,False,False,False,False,False
2,False,True,True,False,False,True,False,False,False,False,True,False
3,True,True,True,True,False,False,False,True,False,False,True,False
4,False,True,True,True,False,False,False,False,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
12521,True,False,True,False,False,False,True,True,True,False,False,False
12522,True,False,True,True,False,False,False,False,True,False,False,False
12523,True,False,True,False,False,False,True,False,False,False,False,True
12524,True,False,True,False,True,False,False,False,True,False,False,True


# **Apriori**

In [5]:
start = time.time()
tracemalloc.start()
df_apriori = apriori(df, min_support=0.2, use_colnames=True)    
snapshot = tracemalloc.take_snapshot()
done = time.time()
elapsed = done - start
mem_use = calculate_mem_use(snapshot, func="apriori")
print("Mem. use : %1.f B" %(mem_use))
print("Elapsed Time : ",elapsed," sn")
df_apriori

Mem. use : 17678 B
Elapsed Time :  0.022913694381713867  sn


Unnamed: 0,support,itemsets
0,0.437809,(Bread)
1,0.43757,(Butter)
2,0.437171,(Cheese)
3,0.439805,(Coffee Powder)
4,0.439885,(Ghee)
5,0.433658,(Lassi)
6,0.441162,(Milk)
7,0.434616,(Panner)
8,0.43765,(Sugar)
9,0.43773,(Sweet)


# **FP-Growth**

In [6]:
start = time.time()
df_fpgrowth = fpgrowth(df, min_support=0.2, use_colnames=True)
snapshot = tracemalloc.take_snapshot()
done = time.time()
elapsed = done - start
mem_use = calculate_mem_use(snapshot, func="fpgrowth")
print("Mem. use : %1.f B" %(mem_use))
print("Elapsed Time : ",elapsed," sn")
df_fpgrowth

Mem. use : 2608 B
Elapsed Time :  0.46457576751708984  sn


Unnamed: 0,support,itemsets
0,0.439885,(Ghee)
1,0.439805,(Coffee Powder)
2,0.439326,(Yougurt)
3,0.43757,(Butter)
4,0.437171,(Cheese)
5,0.433658,(Lassi)
6,0.429746,(Tea Powder)
7,0.437809,(Bread)
8,0.434616,(Panner)
9,0.43773,(Sweet)


# **ECLAT**

When we use association_rules method with the metric = "support" it uses ECLAT algorithm.  

see https://github.com/Nikronic/Machine-Learning-Models/blob/master/Part%205%20-%20Association%20Rule%20Learning/Section%2017%20-%20Eclat/eclat.py

In [7]:
start = time.time()
df_eclat = association_rules(df_apriori, metric='support',min_threshold= 0.2)
snapshot = tracemalloc.take_snapshot()
done = time.time()
elapsed = done - start
mem_use = calculate_mem_use(snapshot, func="association_rules")
print("Mem. use : %1.f B" %(mem_use))
print("Elapsed Time : ",elapsed," sn")
df_eclat

Mem. use : 33464 B
Elapsed Time :  0.010829687118530273  sn


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Cheese),(Bread),0.437171,0.437809,0.20198,0.462016,1.055291,0.010582,1.044995
1,(Bread),(Cheese),0.437809,0.437171,0.20198,0.461342,1.055291,0.010582,1.044873
2,(Coffee Powder),(Bread),0.439805,0.437809,0.20182,0.458885,1.04814,0.009269,1.038949
3,(Bread),(Coffee Powder),0.437809,0.439805,0.20182,0.460977,1.04814,0.009269,1.039279
4,(Lassi),(Bread),0.433658,0.437809,0.200064,0.46134,1.053747,0.010204,1.043684
5,(Bread),(Lassi),0.437809,0.433658,0.200064,0.456966,1.053747,0.010204,1.042921
6,(Milk),(Bread),0.441162,0.437809,0.200942,0.455483,1.040369,0.007797,1.032458
7,(Bread),(Milk),0.437809,0.441162,0.200942,0.458972,1.040369,0.007797,1.032917
8,(Panner),(Bread),0.434616,0.437809,0.203577,0.468406,1.069885,0.013298,1.057556
9,(Bread),(Panner),0.437809,0.434616,0.203577,0.464989,1.069885,0.013298,1.056771
