# Apriori

In [37]:
import pandas as pd
import itertools
from tqdm.notebook import tqdm
import csv

In [2]:
def stage_1(minimum_support:int, c_1:dict) -> dict:
    """
    Keep only the candidates whose count reaches the minimum support.

    minimum_support: smallest count a candidate needs in order to be kept
    c_1: candidates, mapping each candidate to its count
    return: a new dict holding the entries of c_1 with count >= minimum_support
            (c_1 itself is not modified, entry order is preserved)
    """
    return {
        candidate: count
        for candidate, count in c_1.items()
        if count >= minimum_support
    }

In [3]:
def check_subset_frequency(combination:tuple, frequency_itemset:dict, n:int) -> bool:
    """
    Tell whether every part of a combination is a known frequent itemset.

    combination: a tuple. e.g. ('32', '38')
    frequency_itemset: any container supporting `in`,
                       e.g. {'32': 15167, '38': 15596} or a list of keys
    n: size of the parts to look up. For n > 1 the parts are the n-element
       sub-tuples of combination; otherwise they are its individual items.
    return: True if all parts are found in frequency_itemset, else False
    """
    parts = itertools.combinations(combination, n) if n > 1 else combination
    return all(part in frequency_itemset for part in parts)

In [4]:
def sublist(lst1:list, lst2:list) -> bool:
    """
    Report whether every element of lst1 also occurs in lst2.

    lst1: list or set
    lst2: list or set
    Duplicates and ordering are ignored; an empty lst1 is a subset of anything.
    """
    return set(lst1).issubset(lst2)

In [5]:
def stage_n(l2:dict, records:list, minimum_support_count:int, n:int) -> tuple:
    """
    Build the n-itemset candidates and select the frequent ones.

    l2: frequent itemsets of the previous stage mapped to their counts.
        For n == 2 the keys are single items; otherwise the keys are tuples
        of items.
    records: all transactions, each one an iterable of items
    minimum_support_count: smallest count an itemset needs to be frequent
    n: size of the itemsets to generate
    return: (candidates, frequent) -- two dicts mapping each n-item tuple to
            the number of records containing all of its items. `candidates`
            holds every n-combination of the items appearing in l2;
            `frequent` holds those that reach minimum_support_count and
            whose parts are all keys of l2.
    """
    previous = set(l2)  # set gives O(1) membership for the subset check below

    if n == 2:
        # keys are the items themselves
        items = sorted(previous)
    else:
        # keys are tuples: collect the distinct items they are made of
        items = sorted({item for itemset in previous for item in itemset})

    # Convert each transaction to a set once, instead of once per candidate.
    record_sets = [set(record) for record in records]

    candidates = {}
    for candidate in itertools.combinations(items, n):
        wanted = set(candidate)
        candidates[candidate] = sum(1 for record in record_sets if wanted <= record)

    frequent = {}
    for candidate, count in candidates.items():
        if count >= minimum_support_count:
            # Parts to look up in the previous stage: (n-1)-item tuples, or
            # the bare items when the previous stage is keyed by single items.
            if n - 1 > 1:
                parts = itertools.combinations(candidate, n - 1)
            else:
                parts = candidate
            if all(part in previous for part in parts):
                frequent[candidate] = count

    return candidates, frequent

In [6]:
def support_count(itemset, itemlist) -> int:
    """
    Look up the support stored for an itemset.

    itemset: key to look up (must be hashable)
    itemlist: mapping from itemset to its support
    return: the value stored under itemset
    A missing key raises KeyError when itemlist is a dict.
    """
    support = itemlist[itemset]
    return support

## main

In [40]:
# One transaction per row; the items of a transaction are whitespace-separated
# inside the first column.
data = pd.read_csv('input.txt',header=None)

# define minimum support and minimum confidence===============================================
# Both thresholds are parsed as floats so fractional percentages such as 0.5
# are accepted as well as whole numbers.
support_percent = float(input('Enter mininum support (% of total data):'))
minimum_support_count = round(len(data) * 0.01 * support_percent)
# A threshold of 0 would mark itemsets that never occur as frequent, which
# leads to a division by zero when confidences are computed later on.
minimum_support_count = max(1, minimum_support_count)
minimum_confidence = float(input('Enter mininum confidence (%):'))
print('minimum_support:',minimum_support_count)
print('minimum_confidence:',minimum_confidence)
print()

records = [] # save all transactions

c_1 = {} # 1-itemset candidates: item -> number of transactions containing it
for line in data.iloc[:, 0]:
    transactions = line.split()
    records.append(transactions)
    # Count each item once per transaction (keeping first-seen order), so the
    # 1-item support agrees with the set-based counting used for larger
    # itemsets even when a transaction repeats an item.
    for item in dict.fromkeys(transactions):
        c_1[item] = c_1.get(item, 0) + 1
l_1 = stage_1(minimum_support_count, c_1) # frequent 1-itemsets

cand_list = [] # save all k-itemset candidates; index k-1 holds the k-itemsets
freq_list = [] # save all frequent k-itemsets; index k-1 holds the k-itemsets
cand_list.append(c_1)
freq_list.append(l_1)

# calc all possible candidates and frequent k-itemsets================================
n = 1
frequent_itemset = {} # save all frequent itemsets of every size
while True:
    c_n, l_n = stage_n(freq_list[n-1], records, minimum_support_count, n+1)
    cand_list.append(c_n)
    freq_list.append(l_n)
    print('{}-itemsets finished'.format(n+1))
    print('-'*20)
    n += 1
    # stop once a stage yields no frequent itemsets; the empty dict stays as
    # the last entry of freq_list
    if not l_n:
        break
for freq in freq_list:
    frequent_itemset.update(freq)
print()

# output ===============================================================
# Derive the rules "antecedent -> consequent" from every frequent itemset of
# size >= 2, print the ones reaching minimum_confidence and write them to
# output.txt, one rule per line.
with open('output.txt', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')

    # freq_list[n] holds the frequent (n+1)-itemsets, e.g. {('12','3'):5, ...}.
    # Index 0 (single items) cannot be split into a rule, so start at 1.
    for n in range(1, len(freq_list)):
        freq = freq_list[n]
        # antecedents of every size from 1 up to (itemset size - 1)
        for antecedent_size in range(1, n + 1):
            for itemset in freq:
                for antecedent in itertools.combinations(itemset, antecedent_size):
                    consequent = set(itemset) - set(antecedent)
                    # frequent_itemset stores single items under the bare item
                    # and larger itemsets under their tuple
                    if len(antecedent) == 1:
                        antecedent_key = antecedent[0]
                    else:
                        antecedent_key = antecedent
                    # confidence = support(itemset) / support(antecedent), in %
                    confidence = (support_count(itemset, frequent_itemset)/support_count(antecedent_key, frequent_itemset))*100
                    if confidence >= minimum_confidence:
                        rule = "{} -> {} = {:.2f}".format(antecedent, consequent, confidence)
                        print(rule)
                        writer.writerow([rule])

Enter mininum support (% of total data):5
Enter mininum confidence (%):20
minimum_support: 4408
minimum_confidence: 20.0

2-itemsets finished
--------------------
3-itemsets finished
--------------------
4-itemsets finished
--------------------

('32',) -> {'39'} = 55.75
('32',) -> {'48'} = 52.97
('38',) -> {'39'} = 66.33
('39',) -> {'38'} = 20.41
('38',) -> {'48'} = 50.94
('39',) -> {'41'} = 22.52
('41',) -> {'39'} = 76.37
('39',) -> {'48'} = 57.51
('48',) -> {'39'} = 69.16
('41',) -> {'48'} = 60.34
('48',) -> {'41'} = 21.40
('32',) -> {'39', '48'} = 35.62
('38',) -> {'39', '48'} = 39.13
('41',) -> {'39', '48'} = 49.29
('32', '39') -> {'48'} = 63.89
('32', '48') -> {'39'} = 67.24
('38', '39') -> {'48'} = 58.99
('38', '48') -> {'39'} = 76.81
('39', '48') -> {'38'} = 20.94
('39', '41') -> {'48'} = 64.53
('39', '48') -> {'41'} = 25.28
('41', '48') -> {'39'} = 81.68
