# Compare PyApriori with other Python packages

Packages:
- mlxtend.frequent_patterns.apriori
- PyFIM
- apyori
- efficient_apriori
- pyapriori (this package)

Data:
- accidents
- connect
- chess
- mushroom

Source: Frequent Itemset Mining Implementations Repository (http://fimi.uantwerpen.be/data/) 

## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import csv
from mlxtend.preprocessing import TransactionEncoder

Load datasets

In [2]:
# Toy example: four shopping baskets over three items.
dataset = [['chleba', 'mliko'], ['chleba', 'rohliky'], ['rohliky', 'mliko'], ['mliko', 'rohliky', 'chleba']]

# Gather every distinct item; the iteration order of a set is arbitrary,
# so the order of `allf` (and of the rows of `data`) is arbitrary too.
all_features = set()
for basket in dataset:
    all_features = all_features | set(basket)
allf = list(all_features)

# Vertical layout: for each item, the set of indices of the baskets
# that contain it.
data = []
for item in allf:
    print(item)
    data.append({pos for pos, basket in enumerate(dataset) if item in basket})
data = np.array(data)

data

mliko
rohliky
chleba


array([{0, 2, 3}, {1, 2, 3}, {0, 1, 3}], dtype=object)

In [3]:
def load_dataset(datafile):
    """Read a transaction file and build the inputs used by the benchmarks.

    Each line of ``datafile`` is one transaction whose items are separated
    by whitespace.

    Args:
        datafile: Path of the transaction file to read.

    Returns:
        A tuple ``(data, dataset, transactions_count)`` where

        * ``data`` is a NumPy array holding, for every distinct item, the
          set of indices of the transactions that contain it (the layout
          passed to ``PyApriori.fit``). Items appear in the order in which
          they are first seen in the file.
        * ``dataset`` is the list of transactions, each a list of item
          strings (the layout passed to the other packages).
        * ``transactions_count`` is the number of lines read.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(datafile) as handle:
        dataset = [line.strip().split() for line in handle]

    # Build the item -> transaction-index sets in a single pass over the
    # data, rather than rescanning every transaction once per item.
    tidsets = {}
    for index, transaction in enumerate(dataset):
        for item in transaction:
            tidsets.setdefault(item, set()).add(index)

    data = np.array(list(tidsets.values()))
    transactions_count = len(dataset)
    return data, dataset, transactions_count

## Modeling

In [4]:
import time
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
import fim
from apyori import apriori as apyori_apriori
from pyapriori import PyApriori
from interruptingcow import timeout
from efficient_apriori import apriori as efapriori

Each model fit is aborted if it runs for longer than 30 seconds.

In [5]:
def _time_model(header, label, fit):
    """Run one benchmark candidate under the 30 s limit and report its timing.

    Args:
        header: Line printed before the run starts.
        label: Name used in the closing ``<label> Done in <t>s`` line.
        fit: Zero-argument callable that runs the model and returns a sized
            collection of results.

    Returns:
        ``(elapsed_seconds, result_count)``, or ``(None, None)`` when the
        run raised an exception or exceeded the time limit.
    """
    print("")
    print(header)
    try:
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments during the run.
        start = time.perf_counter()
        with timeout(30, exception=RuntimeError):
            result = fit()
        elapsed = time.perf_counter() - start
        count = len(result)
    except Exception as inst:
        # Benchmark boundary: a failing or timed-out package must not stop
        # the remaining packages from being measured.
        print(inst)
        print(inst.args)
        elapsed = None
        count = None
    print(label + ' Done in ' + str(elapsed) + 's')
    return elapsed, count


def run_models(minsup, data, dataset, transactions_count):
    """Time each frequent-itemset package on the same dataset.

    Every package is run with a 30 second limit; a package that fails or
    times out is reported with ``None`` for both its time and its count.

    Args:
        minsup: Minimum support as a percentage. ``fim`` receives it as
            given, ``apyori`` and ``efficient_apriori`` receive
            ``minsup / 100``, and ``PyApriori`` receives the rounded
            absolute transaction count.
        data: Per-item transaction-index sets passed to ``PyApriori.fit``.
        dataset: List of transactions passed to the other packages.
        transactions_count: Number of transactions in ``dataset``.

    Returns:
        ``(fim_t, fim_count, apyori_t, apyori_count, pyapriori_t,
        pyapriori_count)``. The ``efficient_apriori`` timing is printed but
        not returned, so that existing callers keep working.
    """
    def fit_pyapriori():
        # NOTE(review): meaning of the second constructor argument (1) is
        # not visible here - confirm against the PyApriori API.
        py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
        _, result = py_apriori.fit(data)
        return result

    pyapriori_t, pyapriori_count = _time_model(
        "pyapriori", "pyapriori", fit_pyapriori)

    fim_t, fim_count = _time_model(
        "fim", "fim",
        lambda: fim.apriori(dataset, supp=minsup))

    apyori_t, apyori_count = _time_model(
        "apyori", "apyori",
        lambda: list(apyori_apriori(dataset, min_support=(minsup/100))))

    # Fix: the closing line now reports this package's own elapsed time;
    # previously it printed the apyori timing by mistake.
    _time_model(
        "effecient apriori", "efapriori",
        lambda: list(efapriori(dataset, min_support=(minsup/100))[0]))

    return fim_t, fim_count, apyori_t, apyori_count, pyapriori_t, pyapriori_count

## Run test

Parameters

In [6]:
# Minimum-support thresholds to benchmark, in percent of transactions.
# Further values that are currently disabled: 50, 60, 70.
supports = [20]
# Transaction files to benchmark.
# Currently disabled: 'data/accidents.dat', 'data/chess.dat'.
datafiles = ['data/mushroom.dat']

Test

In [7]:
# Optional CuPy allocator setup, currently disabled:
#import cupy as cp
#cp.cuda.set_allocator(cp.cuda.MemoryPool(cp.cuda.malloc_managed).malloc)
#cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc)

# Benchmark every (support, data file) combination and collect the timings
# and itemset counts returned by run_models, keyed by that combination.
test_result = {}
for minsup in supports:
    for datafile in datafiles:
        run_key = (minsup, datafile)
        print(run_key)
        data, dataset, transactions_count = load_dataset(datafile)
        (fim_t, fim_count,
         apyori_t, apyori_count,
         pyapriori_t, pyapriori_count) = run_models(minsup, data, dataset, transactions_count)
        test_result[run_key] = dict(
            fim_t=fim_t,
            fim_count=fim_count,
            apyori_t=apyori_t,
            apyori_count=apyori_count,
            pyapriori_t=pyapriori_t,
            pyapriori_count=pyapriori_count,
        )

(20, 'data/mushroom.dat')

pyapriori
time
0.003607034683227539
0.19685578346252441
5.645897150039673
1.020545482635498
pyapriori Done in 6.8745951652526855s

fim
fim Done in 0.08940815925598145s

apyori

()
apyori Done in Nones

effecient apriori

()
efapriori Done in Nones


Results

In [8]:
# Show the collected timings and itemset counts, keyed by (minsup, datafile).
test_result

{(20, 'data/mushroom.dat'): {'fim_t': 0.08940815925598145,
  'fim_count': 53582,
  'apyori_t': None,
  'apyori_count': None,
  'pyapriori_t': 6.8745951652526855,
  'pyapriori_count': 53583}}