# Compare PyApriori with other Python packages

Packages:
- mlxtend.frequent_patterns.apriori
- PyFIM
- apyori
- pyapriori (this package)

Data:
- accidents
- connect
- chess
- mushroom

Source: Frequent Itemset Mining Implementations Repository (http://fimi.uantwerpen.be/data/) 

## Data Preparation

In [1]:
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
from scipy.sparse import csc_matrix
from cupyx.scipy.sparse import csr_matrix as csr_matrix_cupy
import numpy as np
import cupy as cp

Load datasets

In [2]:
def load_dataset(datafile):
    dataset = [i.strip().split() for i in open(datafile).readlines()]
    # Number of transactions
    transactions_count = len(dataset)
    # One Hot Encoding
    te = TransactionEncoder()
    te_dummies = te.fit(dataset).transform(dataset)
    # Dataframe with One Hot Encoding
    df = pd.DataFrame(te_dummies, columns=te.columns_)
    # Numpy Array
    numpy_array = np.array(te_dummies)
    # CuPy Array
    #cupy_array = cp.array(numpy_array )
    cupy_array = cp.array(np.array([]))
    # CSC Matrix with One Hot Encoding
    dataset_csc = csc_matrix(te_dummies)
    # Cupy CSC Matrix with One Hot Encoding
    dataset_csr_cupy = csr_matrix_cupy(dataset_csc)
    return df, dataset, dataset_csc, dataset_csr_cupy, transactions_count, numpy_array, cupy_array

## Modeling

In [3]:
import time
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
import fim
from apyori import apriori as apyori_apriori
from pyapriori import PyApriori
from interruptingcow import timeout

The model fit is killed in 30s.

In [4]:
def run_models(minsup, df, dataset, dataset_csc, dataset_csr_cupy, transactions_count, numpy_array, cupy_array):
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            result = mlxtend_apriori(df, min_support=(minsup/100), use_colnames=True)
        end = time.time()
        mlxtend_t = end - start
        mlxtend_count = result.shape[0]
    except Exception as inst:
        print(inst)
        mlxtend_t = None
        mlxtend_count = None
    print('mlxtend Done in ' + str(mlxtend_t) + 's')
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            result = fim.apriori(dataset, supp=minsup)
        end = time.time()
        fim_t = end - start
        fim_count = len(result)  
    except Exception as inst:
        print(inst)
        fim_t = None
        fim_count = None 
    print('fim Done in ' + str(fim_t) + 's') 
    print('result')
    print(result[:10])
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            result = list(apyori_apriori(dataset, min_support=(minsup/100)))
        end = time.time()
        apyori_t = end - start
        apyori_count = len(result)
    except Exception as inst:
        print(inst)
        apyori_t = None
        apyori_count = None
    print('apyori Done in ' + str(apyori_t) + 's')      
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
            _, result = py_apriori.fit(dataset_csc)
        end = time.time()
        pyapriori_t = end - start
        pyapriori_count = len(result)  
    except Exception as inst:
        print(inst)
        pyapriori_t = None
        pyapriori_count = None 
    print('pyapriori Done in ' + str(pyapriori_t) + 's')
    print('result')
    print(result[:10])
    

    start = time.time()
    with timeout(30, exception=RuntimeError):
        py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
        _, result = py_apriori.fit(dataset_csr_cupy)
    end = time.time()
    pyapriori_cupy_t = end - start
    pyapriori_cupy_count = len(result) 
    print('pyapriori_cupy Done in ' + str(pyapriori_cupy_t) + 's') 
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
            _, result = py_apriori.fit(dataset_csr_cupy)
        end = time.time()
        pyapriori_cupy_t = end - start
        pyapriori_cupy_count = len(result) 
    except Exception as inst:
        print(inst)
        pyapriori_cupy_t = None
        pyapriori_cupy_count = None 
    print('pyapriori_cupy Done in ' + str(pyapriori_cupy_t) + 's') 
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
            _, result = py_apriori.fit(numpy_array)
        end = time.time()
        numpy_t = end - start
        numpy_count = len(result)  
    except Exception as inst:
        print(inst)
        numpy_t = None
        numpy_count = None 
    print('numpy Done in ' + str(numpy_t) + 's')  
    
    try:
        start = time.time()
        with timeout(30, exception=RuntimeError):
            py_apriori = PyApriori(round(transactions_count * (minsup/100)), 1)
            _, result = py_apriori.fit(cupy_array)
        end = time.time()
        cupy_t = end - start
        cupy_count = len(result)  
    except Exception as inst:
        print(inst)
        cupy_t = None
        cupy_count = None 
    print('cupy Done in ' + str(cupy_t) + 's') 
    
    return mlxtend_t, mlxtend_count, fim_t, fim_count, apyori_t, apyori_count, pyapriori_t, pyapriori_count, pyapriori_cupy_t, pyapriori_cupy_count, numpy_t, numpy_count, cupy_t, cupy_count

## Run test

Parameters

In [5]:
supports = [50, 60, 70]
datafiles = ['data/accidents.dat', 'data/chess.dat', 'data/mushroom.dat']

2.39

Test

In [6]:
test_result = {}
for minsup in supports:
    for datafile in datafiles:
        print((minsup, datafile))
        df, dataset, dataset_csc, dataset_csr_cupy, transactions_count, numpy_array, cupy_array = load_dataset(datafile)
        mlxtend_t, mlxtend_count, fim_t, fim_count, apyori_t, apyori_count, pyapriori_t, pyapriori_count, pyapriori_cupy_t, pyapriori_cupy_count, numpy_t, numpy_count, cupy_t, cupy_count = run_models(minsup, df, dataset, dataset_csc, dataset_csr_cupy, transactions_count, numpy_array, cupy_array)
        test_result[(minsup, datafile)] = {'mlxtend_t': mlxtend_t, 'mlxtend_count':mlxtend_count, 'fim_t':fim_t, 'fim_count':fim_count, 'apyori_t':apyori_t, 'apyori_count':apyori_count, 'pyapriori_t':pyapriori_t, 'pyapriori_count':pyapriori_count, 'pyapriori_cupy_t':pyapriori_cupy_t, 'pyapriori_cupy_count':pyapriori_cupy_count, 'numpy_t':numpy_t, 'numpy_count':numpy_count, 'cupy_t':cupy_t, 'cupy_count':cupy_count}

(50, 'data/accidents.dat')

mlxtend Done in Nones
fim Done in 1.6964342594146729s
result
[(('23',), 173309), (('23', '18'), 172849), (('23', '18', '12'), 172759), (('23', '18', '12', '17'), 172745), (('23', '18', '17'), 172835), (('23', '12'), 173172), (('23', '12', '17'), 173158), (('23', '17'), 173295), (('30',), 183641), (('30', '21'), 170258)]

apyori Done in Nones
axis 0 is out of bounds for array of dimension 0
pyapriori Done in Nones
result
[(('23',), 173309), (('23', '18'), 172849), (('23', '18', '12'), 172759), (('23', '18', '12', '17'), 172745), (('23', '18', '17'), 172835), (('23', '12'), 173172), (('23', '12', '17'), 173158), (('23', '17'), 173295), (('30',), 183641), (('30', '21'), 170258)]


ValueError: could not interpret dimensions

Results

In [None]:
test_result