In [41]:
import collections
import itertools
import numpy as np

# Define the sparse matrix

In [42]:
#columns are the items
#rows are the transactions
A = np.array([[1, 0, 1, 0, 1],
              [1, 0, 1, 1, 0],
              [0, 0, 0, 1, 1],
              [0, 0, 1, 2, 0],
              [2, 0, 0, 0, 0]])
orders = A
min_support = 1 # it was 12

# Mapping the sparse matrix to records

In [67]:
# TODO: map the items of the records to ids and sort each record (1 points)
mapped_records = []
# In the following tasks use the mapped records to compute the frequent itemsets.
for record in orders:
    mapped_record = []
    for index, item in enumerate(record):
        if(item > 0):
            mapped_record.append(index)
    mapped_record.sort()
    mapped_records.append(mapped_record)

In [44]:
for record in mapped_records:
    print(record)

[0, 2, 4]
[0, 2, 3]
[3, 4]
[2, 3]
[0]


# Apriori Algorithm

In [45]:
# TODO: calculate the support of length-1 itemsets using Counter or defaultdict (1 points)
l1_items = collections.Counter()
for record in mapped_records:
    l1_items.update(record)
    
# TODO: filter out the frequent length-1 itemsets with their support (1 point)
frequent_l1_items = {}
for item in l1_items:
    support = l1_items[item]
    if support >= min_support:
        frequent_l1_items[(item,)] = support

# Store all frequent itemsets (keys) with their support (value) in this dictionary.
# Hint: Convert the itemsets to tuples or sets so that you can use them as keys.
# TODO: save the length-1 frequent items and their supports to frequent_itemsets (1 points)
frequent_itemsets = {}

for item in frequent_l1_items:
    frequent_itemsets[item] = frequent_l1_items[item]

In [47]:
# TODO: implement the apriori_gen algorithm based on the lecture slides
def apriori_gen(itemsets):
    # TODO: generate candidates (4 points)
    C_k = set()
    for p in itemsets:
        for q in itemsets:
            if p[-1] < q[-1]:
                C_k.add( p + (q[-1],) )
        
    # TODO: prune the candidates and return them (4 points)
    def all_subsets_in_itemsets(x):
        for subset in itertools.combinations(x, len(x) - 1):
            if subset not in itemsets:
                return False
        return True
    
    return list(filter(all_subsets_in_itemsets, C_k))

In [48]:
# TODO: implement an algorithm to calculate the support of the given itemset (2 points)
# You do not need to implement a Hash Tree for calculating the supports.
def calculate_support(itemset):
    if len(itemset) == 1:
        try:
            return frequent_l1_items[itemset]
        except KeyError:
            return 0
        
    support = 0
    for record in mapped_records:
        itemset_in_record = True
        for item in itemset:
            if item not in record:
                itemset_in_record = False
                break
        if itemset_in_record:
            support += 1
    return support

In [49]:
# TODO: set the initial frequent itemsets which needs to be used in the first iteration (1 point)
# (It will be updated after each iteration.)
frequent_n_itemsets = frequent_l1_items

# TODO: set the correct loop condition until the Apriori algorithm should run (1 point)
while len(frequent_n_itemsets) != 0:
    candidates = apriori_gen(frequent_n_itemsets)
    supports = map(calculate_support, candidates)

    # TODO: filter out the frequent candidates (2 point)
    frequent_candidates = {}
    for candidate, support in zip(candidates, supports):
        if support >= min_support:
            frequent_candidates[candidate] = support

    # TODO: add the frequent candidates to frequent_itemsets (1 point)
    for item in frequent_candidates:
        frequent_itemsets[item] = frequent_candidates[item]
    
    # replace the frequent_n_itemsets for the next iteration
    frequent_n_itemsets = [itemset for itemset in frequent_candidates]

##

In [50]:
for itemset in frequent_itemsets:
    support = frequent_itemsets[itemset]
    print(f"{support} : {itemset}\n")

3 : (0,)

3 : (2,)

2 : (4,)

3 : (3,)

1 : (2, 4)

1 : (0, 4)

1 : (3, 4)

1 : (0, 3)

2 : (2, 3)

2 : (0, 2)

1 : (0, 2, 4)

1 : (0, 2, 3)



# Recommendations

In [65]:
def get_recommendation(itemset, recommendation_amount = 2):
    if hasattr(itemset, '__len__'):
        products = tuple(sorted(itemset))
    else:
        products = (itemset,)
        
    confidences = {}
    max_support = frequent_itemsets[products]
    for itemset in filter(
            lambda x:  len(x) == len(products) + 1 and all([y in x for y in products]), frequent_itemsets
        ):
        #filtering the already added items
        for item in filter(lambda x: x not in products, itemset):
            #calculating the confidence for recommending this item and adding it to the confidence dict
            confidences[item] = frequent_itemsets[itemset] / max_support
            
    #sorting the confidences dict from biggest confidence to lowest
    confidences = sorted(confidences.items(), key=lambda x: x[1], reverse=True)

    #printing the recommendations
    for i in range(min(recommendation_amount, len(confidences))):
        index, confidence = confidences[i]
        print(f"\"{index}\" is recommended with {confidence * 100:.2f}% confidence.")

In [66]:
#Example
itemset = (0,2)
#itemset = 2

recommendation_amount = 2

get_recommendation(itemset, recommendation_amount)

"4" is recommended with 50.00% confidence.
"3" is recommended with 50.00% confidence.
