In [437]:
import pandas as pd
import numpy as np 

In [438]:
## load dataset
baskets = []
with open("../data/ItemBasketDataset/transactions.dat", encoding='utf-8', errors='replace') as f:
    for line in f.readlines():
        basket = line.split(" ")
        clean_basket = []
        for i, itemStr in enumerate(basket): 
            if (i != len(basket)-1):
                clean_basket.append(int(itemStr))
        baskets.append(set(clean_basket))
        
baskets = baskets[0:10000]

In [439]:
len(baskets)

10000

In [464]:
import itertools

class Apriori: 
    def __init__(this, baskets, N):
        this.baskets = baskets
        this.C = [set()] * (N+1) ## Initialize with empty set at 0 because we will not use C0. 
        this.L = [set()] * (N+1) ## Initialize with empty set at 0 because we will not use L0. 
        this.S = {}
        this.N = N
        
        
    def computeSupport(this, k): 
        """ walks through the buckets and counts each candidate occurence, saves to S[k] """
        print("Computing support of {}".format(k))
        count = {}
        totalLength = len(this.C[k])
        for candidateRepr in this.C[k]:
            candidate = eval(candidateRepr)
            for basket in this.baskets: 
                if (candidate.issubset(basket)): ## If candidate occurs in basket
                    if (candidateRepr not in count):
                        count[candidateRepr] = 1
                    count[candidateRepr] += 1
            if (len(count) % 1000 == 0): 
                print("Printed {} out of {}".format(len(count), totalLength))
        for candidateRepr in count:
            support = count[candidateRepr]/len(this.baskets) ## Support is expressed as the fraction of transaction in which the candidate occurs
            this.S[candidateRepr] = support
        print("Done computing support")
                
    def createCandidates(this, k): 
        """ creates all candidates C[k] of set length k """
        candidates = set()
        
        # generate all possible new itemsets
        allItemsets = set()
        for itemsetRepr in this.L[k-1]: 
            itemset = eval(itemsetRepr)
            for itemRepr in this.L[1]:
                item = eval(itemRepr)
                allItemsets.add(repr(itemset | item)) ## add the combination of each item and itemset
        
        ## for each itemset:
        #for itemsetRepr in allItemsets: 
        #    if this.containsOnlyFrequentSubsets(eval(itemsetRepr)):
        #        candidates.add(itemsetRepr)
        #        if (len(eval(itemsetRepr)) > 3): 
        #            print("Adding candidate " + itemsetRepr)

        return allItemsets # candidates
                
    def filterCandidates(this, k, s): 
        """ filters out the candidates which do not have support in the original baskets """
        frequent = set()
        for candidateRepr in this.C[k]: 
            if (candidateRepr in this.S): 
                if this.S[candidateRepr] > s:
                    frequent.add(candidateRepr)
        
        return frequent
    
    def containsOnlyFrequentSubsets(this, itemset):
        i = 1 #sets of length 1 and 2 is already checked
        # for each subset length i 2 --> k

        # L[i] is empty
        while i < len(itemset):
            # for each subsets of length i (not optimially efficient)
            for subset in this.findSubsets(itemset, i):
                # if the subset is not in L[i]:
                if (repr(set(subset)) not in this.L[i]):
                    # itemset contains a not frequent subset
                    # print("{} not in L[{}]".format(repr(subset), i))
                    # print(this.L[i])
                    return False
                else:
                    pass
                    # print("{} in L[{}]".format(repr(subset), i))
            i += 1
        return True 
        
    def findSubsets(this, S, length):
        return set(itertools.combinations(S, length))


## Pipeline
- Compute item support
- Let candidates C1 be all items
- Construct L1 by filtering out candidates with low support
- for k --> N:
    - Construct new candidates Ck based on Ck-1 and C1
    - Construct Lk by filtering out candidates with low support 

In [465]:
## Hyperparams
s = 0.01 # Support threshold
N = 10 # the largest sought itemset



In [466]:
A = Apriori(baskets, N)
k = 1
## Let candidates C1 be the set of all items
for basket in baskets: 
    for item in basket: 
        A.C[1].add(repr(set([item]))) #Every candidate is considered a set

# Compute support of all candidates in C[1]
A.computeSupport(k = k)

## Construct L1 by filtering out candidates with low support
A.L[1] = A.filterCandidates(k = k, s = s)

Computing support of 1
Done computing support


In [455]:
print("Found {} singleton candidates, out of which {} had support > {}".format(len(A.C[1]), len(A.L[1]), s))

Found 866 singleton candidates, out of which 379 had support > 0.01


In [456]:
while k < N:
    k += 1 # start with k + 1 = 2
    print("k = {}".format(k))
    # Construct new candidates Ck based on Ck-1 and C1
    A.C[k] = A.createCandidates(k) # 
    
    A.computeSupport(k = k)

    # Construct Lk by filtering out candidates with low support
    A.L[k] = A.filterCandidates(k = k, s = s)
    
    print("With k = {}: Found {} candidates out of which {} had support > {}".format(k, len(A.C[k]), len(A.L[k]), s))

k = 2
Computing support of 2
Printed 1000 out of 78247
Printed 2000 out of 78247
Printed 3000 out of 78247
Printed 4000 out of 78247
Printed 5000 out of 78247
Printed 6000 out of 78247
Printed 7000 out of 78247
Printed 8000 out of 78247
Printed 9000 out of 78247
Printed 10000 out of 78247
Printed 11000 out of 78247
Printed 12000 out of 78247
Printed 13000 out of 78247
Printed 14000 out of 78247
Printed 15000 out of 78247
Printed 16000 out of 78247
Printed 17000 out of 78247
Printed 18000 out of 78247
Printed 19000 out of 78247
Printed 20000 out of 78247
Printed 21000 out of 78247
Printed 22000 out of 78247
Printed 23000 out of 78247
Printed 24000 out of 78247
Printed 25000 out of 78247
Printed 26000 out of 78247
Printed 27000 out of 78247
Printed 28000 out of 78247
Printed 29000 out of 78247
Printed 30000 out of 78247
Printed 31000 out of 78247
Printed 32000 out of 78247
Printed 33000 out of 78247
Printed 34000 out of 78247
Printed 34000 out of 78247
Printed 35000 out of 78247
Printed 

Printed 50000 out of 78248
Printed 51000 out of 78248
Printed 52000 out of 78248
Printed 53000 out of 78248
Printed 54000 out of 78248
Printed 55000 out of 78248
Printed 56000 out of 78248
Printed 57000 out of 78248
Printed 58000 out of 78248
Printed 59000 out of 78248
Printed 60000 out of 78248
Printed 61000 out of 78248
Printed 62000 out of 78248
Printed 63000 out of 78248
Printed 64000 out of 78248
Printed 64000 out of 78248
Printed 65000 out of 78248
Printed 66000 out of 78248
Printed 67000 out of 78248
Printed 68000 out of 78248
Done computing support
With k = 5: Found 78248 candidates out of which 392 had support > 0.01
k = 6
Computing support of 6
Printed 1000 out of 78248
Printed 2000 out of 78248
Printed 3000 out of 78248
Printed 4000 out of 78248
Printed 5000 out of 78248
Printed 6000 out of 78248
Printed 7000 out of 78248
Printed 7000 out of 78248
Printed 8000 out of 78248
Printed 9000 out of 78248
Printed 10000 out of 78248
Printed 11000 out of 78248
Printed 12000 out of 78

Printed 27000 out of 78248
Printed 28000 out of 78248
Printed 29000 out of 78248
Printed 30000 out of 78248
Printed 31000 out of 78248
Printed 32000 out of 78248
Printed 33000 out of 78248
Printed 34000 out of 78248
Printed 34000 out of 78248
Printed 34000 out of 78248
Printed 34000 out of 78248
Printed 35000 out of 78248
Printed 36000 out of 78248
Printed 37000 out of 78248
Printed 37000 out of 78248
Printed 38000 out of 78248
Printed 39000 out of 78248
Printed 40000 out of 78248
Printed 41000 out of 78248
Printed 42000 out of 78248
Printed 43000 out of 78248
Printed 43000 out of 78248
Printed 43000 out of 78248
Printed 44000 out of 78248
Printed 45000 out of 78248
Printed 46000 out of 78248
Printed 47000 out of 78248
Printed 48000 out of 78248
Printed 49000 out of 78248
Printed 50000 out of 78248
Printed 51000 out of 78248
Printed 52000 out of 78248
Printed 53000 out of 78248
Printed 54000 out of 78248
Printed 55000 out of 78248
Printed 56000 out of 78248
Printed 57000 out of 78248
P

In [441]:
print(A.L[8])

{'{296}', '{192}', '{682}', '{385}', '{651}', '{738}', '{208}', '{571}', '{326}', '{471}', '{733}', '{58}', '{43}', '{354}', '{914}', '{970}', '{265}', '{45}', '{110}', '{765}', '{54}', '{941}', '{487}', '{259}', '{373}', '{769}', '{368}', '{48}', '{112}', '{583}', '{132}', '{805}', '{895}', '{258}', '{201}', '{598}', '{789}', '{227}', '{758}', '{100}', '{948}', '{740}', '{472}', '{707}', '{540}', '{422}', '{998}', '{163}', '{357}', '{242}', '{516}', '{893}', '{515}', '{826}', '{229}', '{623}', '{125}', '{527}', '{207}', '{663}', '{694}', '{829, 789}', '{922}', '{641}', '{534}', '{825, 39}', '{21}', '{792}', '{504}', '{978}', '{706}', '{181}', '{789, 829}', '{411}', '{749}', '{906}', '{204}', '{952}', '{841}', '{692}', '{804}', '{266}', '{368, 829}', '{874}', '{405}', '{800}', '{834}', '{12}', '{614}', '{171}', '{335}', '{75}', '{577}', '{992}', '{513}', '{227, 390}', '{168}', '{234}', '{887}', '{308}', '{617}', '{197}', '{173}', '{217}', '{812}', '{336}', '{832}', '{71}', '{704, 825}'

In [381]:
for a in A.findSubsets({1, 2, 3}, 2): 
    print(repr(set(a)))

{1, 2}
{1, 3}
{2, 3}


In [463]:
 A.filterCandidates(5, 0.01)

{'{100}',
 '{104}',
 '{108}',
 '{10}',
 '{110}',
 '{111}',
 '{112}',
 '{115}',
 '{116}',
 '{120}',
 '{122}',
 '{125}',
 '{126}',
 '{129}',
 '{12}',
 '{130}',
 '{132}',
 '{140}',
 '{143}',
 '{145}',
 '{147}',
 '{151}',
 '{154}',
 '{158}',
 '{161}',
 '{162}',
 '{163}',
 '{168}',
 '{170}',
 '{171}',
 '{173}',
 '{175}',
 '{177}',
 '{17}',
 '{181}',
 '{183}',
 '{185}',
 '{192}',
 '{193}',
 '{196}',
 '{197}',
 '{198}',
 '{1}',
 '{201}',
 '{204}',
 '{205}',
 '{207}',
 '{208}',
 '{210}',
 '{214}',
 '{217, 283}',
 '{217, 346}',
 '{217}',
 '{21}',
 '{227, 390}',
 '{227}',
 '{229}',
 '{234}',
 '{236}',
 '{239}',
 '{240}',
 '{242}',
 '{258}',
 '{259}',
 '{25}',
 '{265}',
 '{266}',
 '{274}',
 '{275}',
 '{276}',
 '{279}',
 '{27}',
 '{280}',
 '{283}',
 '{285}',
 '{28}',
 '{290}',
 '{294}',
 '{296}',
 '{308}',
 '{309}',
 '{310}',
 '{319}',
 '{31}',
 '{322}',
 '{325}',
 '{326}',
 '{329}',
 '{32}',
 '{332}',
 '{334}',
 '{335}',
 '{336}',
 '{33}',
 '{343}',
 '{346}',
 '{348}',
 '{349}',
 '{350}',
 '{351}

In [472]:
a = A.createCandidates(3)
len(a)

12445153

In [154]:
ss = set([1, 2, 3, 7])
print(len(ss))

4


In [174]:
import itertool
for asd in itertools.combinations(ss, 1):
    print(asd)

(1,)
(2,)
(3,)
(7,)


In [70]:
repr(set([1, 2, 3, 7]))

'{1, 2, 3, 7}'

In [72]:
repr(set([1, 2, 7, 3, 3]))

'{1, 2, 3, 7}'