In [1]:
import fim
from scipy.spatial import distance

### These correspond to the items in the database and have to be hardcoded

In [None]:
# Hard-coded item labels; get_negbord compares len(ITEMS) with the number of
# distinct items actually read from the dataset file.
ITEMS = ('item1', 'item2', 'item3', 'item4', 'item5')

### Necessary functions

In [2]:
def subset_of(list01, list02):
    ''' Tell whether every element of list01 also occurs in list02.

        Membership is tested with `in`, so list02 may be any container.
        An empty list01 is a subset of anything (returns True). '''
    return all(element in list02 for element in list01)

In [3]:
def create_database_dict(transactions):
    ''' Create a dictionary that stores transaction-support pairs.

        key: the transaction's items as a sorted tuple, so that item order
             inside a transaction does not matter
        value: [og_support, 0]
        Values are 2-item lists for storing og support and recreated support;
        the second slot is initialised to 0 and is meant to be filled in later.

        og_support is the fraction of transactions equal to the key.
        Occurrences are counted as integers and divided by the total once,
        instead of adding 1/len(transactions) repeatedly, so the supports do
        not accumulate floating-point rounding error (e.g. 3 out of 10 is
        exactly 0.3 rather than 0.30000000000000004).

        An empty `transactions` yields an empty dictionary. '''
    total = len(transactions)
    occurrences = {}
    for transaction in transactions:
        key = tuple(sorted(transaction))
        occurrences[key] = occurrences.get(key, 0) + 1
    # Keys keep first-seen order, as dict insertion order is preserved.
    return {key: [count / total, 0] for key, count in occurrences.items()}

In [4]:
def recreate_database(itemset_collection):
    ''' Recreate the database supports with a heuristic.

        Each entry of itemset_collection is read as (itemset, support).
        The entries are copied into [sorted_items, support] lists and ordered
        by itemset length, longest first (ties keep their input order).
        Then, from the second entry on, the support of every earlier entry
        that contains the current itemset is subtracted from the current
        support. Earlier supports are used in their already adjusted form.
        A support that drops below zero is reset to 0.

        Returns the list of [sorted_items, adjusted_support] entries; the
        input collection itself is not modified. '''
    entries = [[sorted(list(entry[0])), entry[1]] for entry in itemset_collection]
    entries.sort(key=lambda entry: len(entry[0]), reverse=True)
    for position in range(1, len(entries)):
        current = entries[position]
        # Only the entries placed before the current one can contain it.
        for earlier in entries[:position]:
            if subset_of(current[0], earlier[0]):
                current[1] -= earlier[1]
        if current[1] < 0:
            current[1] = 0
    return entries


In [5]:
def get_negbord(support):
    ''' Calculate the negative border of a transactional database.

        Reads the notebook-level names `item_list` (the transactions),
        `items` (distinct items found in the file) and `ITEMS` (hard-coded
        item labels), so those cells must have been run first.

        All itemsets are mined with fim.apriori at supp=0 and report='a';
        entries whose reported value is below `support` are kept as the
        infrequent ones. Of these, only the entries of minimum itemset length
        are returned, and only when len(ITEMS) equals len(items) or that
        minimum length is 1. In every other case an empty list is returned. '''
    all_itemsets = fim.apriori(item_list, supp=0, target='s' , report='a')
    infrequent = [entry for entry in all_itemsets if entry[1] < support]
    if not infrequent:
        return []
    shortest = min(len(entry[0]) for entry in infrequent)
    # Same test as "all items present or shortest == 1", written as a guard clause.
    if len(ITEMS) != len(items) and shortest != 1:
        return []
    return [entry for entry in infrequent if len(entry[0]) == shortest]

### Import the database transactions from file

In [6]:
with open('./datasets/5_items/dataset_100.csv', 'r') as dataset_file:
    items = []      # distinct items, in the order they are first seen
    item_list = []  # one list of items per line of the file
    for raw_line in dataset_file:
        # Trim surrounding whitespace, drop the last remaining character,
        # remove double quotes and split on commas.
        # NOTE(review): presumably the dropped character is a trailing
        # delimiter - confirm against the file, otherwise the last item of
        # each line loses a character.
        transaction = raw_line.strip()[:-1].replace('"', '').split(',')
        item_list.append(transaction)
        for entry in transaction:
            if entry not in items:
                items.append(entry)

### Create collections of itemsets

In [9]:
# Single threshold used by every mining call in this cell (was repeated as a literal).
# NOTE(review): get_negbord compares its argument against the values reported
# with report='a', while `supp` below is - per the PyFIM docs, TODO confirm - a
# percentage. Passing the same number to both only agrees when the dataset has
# exactly 100 transactions.
MIN_SUPPORT = 30

# Itemset collections mined with three different `target` settings.
freq = fim.apriori(item_list, supp=MIN_SUPPORT, target='s', report='s')
closed = fim.apriori(item_list, supp=MIN_SUPPORT, target='c', report='s')
maximal = fim.apriori(item_list, supp=MIN_SUPPORT, target='m', report='s')

# Each get_negbord call runs a full fim.apriori at supp=0, so compute the
# border once and reuse it for both extended collections.
negative_border = get_negbord(MIN_SUPPORT)
freq_nb = freq + negative_border
max_nb = maximal + negative_border

### Create the dictionary that will store trans-support pairs and recreate the database

In [8]:
# Supports of the original transactions: sorted item tuple -> [og_support, 0].
database_dict = create_database_dict(item_list)
# Heuristic reconstruction from the frequent itemsets: [[sorted_items, support], ...].
recr_db = recreate_database(freq)

### Update the trans-support pairs dictionary

In [15]:
# Store each recreated support in slot 1 of the matching dictionary entry.
# NOTE(review): 100 is hard-coded; presumably it is the number of transactions
# in dataset_100.csv - confirm it matches the unit of the supports in recr_db.
for itemset, recr_support in recr_db:
    key = tuple(itemset)
    # Itemsets that never occur as a whole transaction start with og support 0.
    supports = database_dict.setdefault(key, [0, 0])
    supports[1] = recr_support/100

### Calculate the Jensen-Shannon distance between the original (og) and recreated (recr) databases

In [18]:
# Both vectors are read from the same dictionary, so position i refers to the same itemset in each.
og_supports = [supports[0] for supports in database_dict.values()]
recr_supports = [supports[1] for supports in database_dict.values()]
distance.jensenshannon(og_supports, recr_supports)