In [1]:
import fim
from copy import deepcopy
from scipy.spatial import distance

In [2]:
def subset_of(set01, set02):
    ''' Check if set01 is subset of set02'''
    for item in set01:
        if item not in set02:
            return False
    return True

In [3]:
def create_trans_dict(collection):
    '''Create a transaction dictionary from a collection 
        of transactions. 
        Key: length of transactions
        Value: dictionary of transactions:support counts'''
    trans_dict = {}
    for item in collection:
        sorted_item = tuple(sorted(item[0]))
        n = len(sorted_item)
        if trans_dict.get(n):
            if trans_dict[n].get(sorted_item):
                trans_dict[n][sorted_item] += item[1]                
            else:
                trans_dict[n][sorted_item] = item[1]
        else:
            trans_dict[n] = {sorted_item : item[1]} 
    return trans_dict

In [4]:
def trans_support_dict(transactions):
    ''' Create a dictionary that stores transactions-supports pairs.
        key: transaction
        value: [og_support, 0]'''
    item_dict = {}
    for item in transactions:
        if item_dict.get(tuple(sorted(item))):
            item_dict[tuple(sorted(item))][0] += 1/len(transactions)
        else:
            item_dict[tuple(sorted(item))] = [1/len(transactions), 0]
    return item_dict   

In [5]:
def update_supcount(dict1, dict2):
    '''Update supcount of itemsets in dict1 based
        on itemsets-supcounts in dict2'''
    for key1 in dict1.keys():
        for key2 in dict2.keys():
            if subset_of(key1, key2):
                dict1[key1] -= dict2[key2]            

In [6]:
def get_union(my_list):
    '''Get the union of list items'''
    a = set(my_list[0])
    for i in range(1, len(my_list)):
        a = a.union(set(my_list[i]))
    return tuple(a) 

In [7]:
def update_cardinalities(a_dict):
    if len(a_dict) == 1:
        return a_dict
    temp_dict = deepcopy(a_dict)
    max_length = max(temp_dict.keys())
    for i in range(max_length-1, 0, -1):
        for j in range(i+1, max_length+1):
            update_supcount(temp_dict[i], temp_dict[j])
    return temp_dict

In [8]:
def get_superset_dict(a_dict):
    max_length = max(a_dict.keys())
    for i in range(max_length-1, 0, -1):
        for itemset1, sup1 in a_dict[i].items():
            if sup1 < 0 :
                superset_list = []
                for j in range(i+1, max_length+1):
                    for itemset2, sup2 in a_dict[j].items():
                        if sup2 > 0 and subset_of(itemset1, itemset2):
                            superset_list.append(itemset2)
                    if len(superset_list) > 1:
                        return get_union(superset_list), sup1
                    elif len(superset_list) == 1:
                        return superset_list[0], sup1
    return [], 0

In [9]:
def recreate_database(a_dict):
    temp_dict = update_cardinalities(a_dict)
    if temp_dict is a_dict:
        return a_dict
    superset, cardinality = get_superset_dict(temp_dict)
    if not superset:
        return temp_dict    
    sup = 0
    sup_len = len(superset)
    max_length = max(a_dict.keys())
    if max_length < sup_len:
        a_dict[sup_len] = {superset: abs(cardinality)}
    elif max_length > sup_len:
        for i in range(sup_len+1, max_length+1):
            for item, value in temp_dict[i].items():
                if subset_of(superset, item):
                    sup += value
            if sup:
                if superset in a_dict[sup_len]:
                    a_dict[sup_len][superset] += (sup + abs(cardinality))
                else:
                    a_dict[sup_len][superset] = sup + abs(cardinality)
                break
    else:
        if superset in a_dict[sup_len]:
            a_dict[sup_len][superset] += abs(cardinality)
        else:
            a_dict[sup_len][superset] = abs(cardinality)
    return recreate_database(a_dict)

In [10]:
with open(r'datasets\5_items\dataset_05_100_03.csv', 'r') as f:
    items = []
    item_list = []
    for line in f:
        temp_line = line.strip()[:-1].replace('"', '').split(',')
        for item in temp_line:
            if item not in items:
                items.append(item)
        item_list.append(temp_line)

In [11]:
freq = fim.apriori(item_list, supp=0, target='s' , report='a')
maximal = fim.apriori(item_list, supp=30, target='m' , report='a')
closed = fim.apriori(item_list, supp=30, target='c' , report='a')

In [12]:
trans_dict = create_trans_dict(freq)

In [13]:
compare_dict = trans_support_dict(item_list)

In [14]:
recreated_db = recreate_database(trans_dict)

In [15]:
for transactions in recreated_db.values():
    for trans, sup in transactions.items():
        if sup:
            if trans in compare_dict.keys():
                compare_dict[trans][1] = sup/100
            else:
                compare_dict[trans] = [0, sup/100]

In [16]:
distance.jensenshannon([value[0] for value in compare_dict.values()], [value[1] for value in compare_dict.values()])

3.4946320651646176e-09