# Compare PyApriori with other Python packages

Packages:
- mlxtend.frequent_patterns.apriori
- PyFIM
- apyori
- pyapriori (this package)

Data:
- accidents
- connect
- chess
- mushroom

Source: Frequent Itemset Mining Implementations Repository (http://fimi.uantwerpen.be/data/) 

## Data Preparation

In [1]:
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
from scipy.sparse import csc_matrix
from cupyx.scipy.sparse import csr_matrix as csr_matrix_cupy

Load datasets

In [2]:
def load_dataset(datafile):
    """Read a transaction file and build every input format the benchmark needs.

    Each line of ``datafile`` is one transaction; its items are separated by
    whitespace.

    Parameters
    ----------
    datafile : str
        Path of the transaction file to read.

    Returns
    -------
    tuple
        ``(df, dataset, dataset_csc, dataset_csr_cupy, transactions_count)``:
        the one-hot encoded DataFrame, the raw list of transactions, the
        encoding as a SciPy CSC matrix, the encoding as a CuPy CSR matrix,
        and the number of transactions.
    """
    # The context manager closes the file handle once the lines are parsed
    with open(datafile) as handle:
        dataset = [line.strip().split() for line in handle]
    # Number of transactions
    transactions_count = len(dataset)
    # One Hot Encoding
    te = TransactionEncoder()
    te_dummies = te.fit(dataset).transform(dataset)
    # Dataframe with One Hot Encoding
    df = pd.DataFrame(te_dummies, columns=te.columns_)
    # CSC Matrix with One Hot Encoding
    dataset_csc = csc_matrix(te_dummies)
    # CuPy CSR Matrix with One Hot Encoding (built from the SciPy CSC matrix)
    dataset_csr_cupy = csr_matrix_cupy(dataset_csc)
    return df, dataset, dataset_csc, dataset_csr_cupy, transactions_count

## Modeling

In [3]:
import time
from mlxtend.frequent_patterns import apriori as mlxtend_apriori
import fim
from apyori import apriori as apyori_apriori
from pyapriori import PyApriori

In [4]:
def _timed_run(label, mine, count=len):
    """Time one mining call and count its results, tolerating failures.

    Parameters
    ----------
    label : str
        Name shown in the failure message.
    mine : callable
        Zero-argument callable that runs the miner and returns its result.
    count : callable, optional
        Maps the miner's result to a number of results (``len`` by default).

    Returns
    -------
    tuple
        ``(elapsed_seconds, result_count)``, or ``(None, None)`` if the
        call or the counting raised an exception.
    """
    try:
        # perf_counter is monotonic, unlike the wall clock read by time.time
        start = time.perf_counter()
        result = mine()
        elapsed = time.perf_counter() - start
        # Counting happens after the clock is stopped, so it is not timed
        return elapsed, count(result)
    except Exception as exc:
        # Best effort: a failing backend must not abort the whole benchmark,
        # but the reason is reported. KeyboardInterrupt is not an Exception,
        # so the user can still stop a long run.
        print(f'{label} failed: {type(exc).__name__}: {exc}')
        return None, None


def run_models(minsup, df, dataset, dataset_csc, dataset_csr_cupy, transactions_count):
    """Run every Apriori implementation on one dataset and time each of them.

    Parameters
    ----------
    minsup : int or float
        Minimum support in percent. It is passed as-is to PyFIM, divided by
        100 for mlxtend and apyori, and turned into a rounded transaction
        count for PyApriori.
    df : pandas.DataFrame
        One-hot encoded transactions (input for mlxtend).
    dataset : list
        Raw transactions (input for PyFIM and apyori).
    dataset_csc
        Sparse encoding passed to PyApriori.
    dataset_csr_cupy
        CuPy sparse encoding passed to PyApriori.
    transactions_count : int
        Number of transactions in the dataset.

    Returns
    -------
    tuple
        ``(mlxtend_t, mlxtend_count, fim_t, fim_count, apyori_t,
        apyori_count, pyapriori_t, pyapriori_count, pyapriori_cupy_t,
        pyapriori_cupy_count)``. Each ``*_t`` is the elapsed time in seconds
        and each ``*_count`` the number of results returned. Both are
        ``None`` for a backend that raised.
    """
    def fit_pyapriori(matrix):
        # Building the model stays inside the timed region
        model = PyApriori(round(transactions_count * (minsup / 100)), 1)
        _, itemsets = model.fit(matrix)
        return itemsets

    mlxtend_t, mlxtend_count = _timed_run(
        'mlxtend',
        lambda: mlxtend_apriori(df, min_support=(minsup / 100), use_colnames=True),
        count=lambda frame: frame.shape[0],
    )
    fim_t, fim_count = _timed_run(
        'PyFIM',
        lambda: fim.apriori(dataset, supp=minsup),
    )
    apyori_t, apyori_count = _timed_run(
        'apyori',
        # apyori is consumed through list(), so that is part of the timing
        lambda: list(apyori_apriori(dataset, min_support=(minsup / 100))),
    )
    pyapriori_t, pyapriori_count = _timed_run(
        'pyapriori',
        lambda: fit_pyapriori(dataset_csc),
    )
    pyapriori_cupy_t, pyapriori_cupy_count = _timed_run(
        'pyapriori (cupy)',
        lambda: fit_pyapriori(dataset_csr_cupy),
    )

    return (
        mlxtend_t, mlxtend_count,
        fim_t, fim_count,
        apyori_t, apyori_count,
        pyapriori_t, pyapriori_count,
        pyapriori_cupy_t, pyapriori_cupy_count,
    )

## Run test

Parameters

In [5]:
# Minimum support thresholds in percent: 40, 60 and 80
supports = list(range(40, 100, 20))
# Transaction files to benchmark
datafiles = [
    'data/accidents.dat',
    'data/connect.dat',
    'data/chess.dat',
    'data/mushroom.dat',
]

Test

In [None]:
# Names of the values returned by run_models, in the order it returns them
metric_names = (
    'mlxtend_t', 'mlxtend_count',
    'fim_t', 'fim_count',
    'apyori_t', 'apyori_count',
    'pyapriori_t', 'pyapriori_count',
    'pyapriori_cupy_t', 'pyapriori_cupy_count',
)

test_result = {}
# The datafile loop is the outer one so that each dataset is read and encoded
# once and then reused for every support threshold.
for datafile in datafiles:
    df, dataset, dataset_csc, dataset_csr_cupy, transactions_count = load_dataset(datafile)
    for minsup in supports:
        print((minsup, datafile))
        metrics = run_models(minsup, df, dataset, dataset_csc, dataset_csr_cupy, transactions_count)
        # Results stay keyed by (minsup, datafile)
        test_result[(minsup, datafile)] = dict(zip(metric_names, metrics))

(40, 'data/accidents.dat')


Results

In [None]:
# Display the collected timings and result counts, keyed by (minsup, datafile)
test_result