In [24]:
import numpy as np
dataset_filename = "affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(n_samples, n_features))

This dataset has 100 samples and 5 features


In [25]:
print(X[:5])

[[ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  1.  1.]
 [ 1.  0.  0.  0.  1.]
 [ 1.  0.  0.  1.  1.]
 [ 0.  0.  1.  0.  0.]]


In [26]:
# The names of the features, for your reference.
features = ["bread", "milk", "cheese", "apples", "bananas"]

In our first example, we will compute the Support and Confidence of the rule "If a person buys Apples, they also buy Bananas".

In [27]:
# First, how many rows contain our premise: that a person is buying apples
num_apple_purchases = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        num_apple_purchases += 1
print("{0} people bought Apples".format(num_apple_purchases))

38 people bought Apples


In [28]:
# How many of the cases that a person bought Apples involved the people purchasing Bananas too?
# Record both cases where the rule is valid and is invalid.
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        if sample[4] == 1:
            # This person bought both Apples and Bananas
            rule_valid += 1
        else:
            # This person bought Apples, but not Bananas
            rule_invalid += 1
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

23 cases of the rule being valid were discovered
15 cases of the rule being invalid were discovered


In [29]:
# Now we have all the information needed to compute Support and Confidence
support = rule_valid  # The Support is the number of times the rule is discovered.
confidence = rule_valid / num_apple_purchases
print("{0}".format(num_apple_purchases))
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))
# Confidence can be thought of as a percentage using the following:
print("As a percentage, that is {0:.1f}%.".format(100 * confidence))

38
The support is 23 and the confidence is 0.605.
As a percentage, that is 60.5%.


In [30]:
from collections import defaultdict
# Now compute for all possible rules
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)
print (num_occurences)
for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0: continue
        # Record that the premise was bought in another transaction
        num_occurences[premise] += 1
        print (num_occurences)
        for conclusion in range(n_features):
            if premise == conclusion:  # It makes little sense to measure if X -> X.
                continue
            if sample[conclusion] == 1:
                # This person also bought the conclusion item
                valid_rules[(premise, conclusion)] += 1
            else:
                # This person bought the premise, but not the conclusion
                invalid_rules[(premise, conclusion)] += 1
print(valid_rules)
print(invalid_rules)
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]

defaultdict(<class 'int'>, {})
defaultdict(<class 'int'>, {1: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 1, 4: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 1, 4: 1, 0: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 1, 4: 2, 0: 1})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 1, 4: 2, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 2, 4: 2, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 1, 3: 2, 4: 3, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 2, 3: 2, 4: 3, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 3, 3: 2, 4: 3, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 3, 3: 3, 4: 3, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 3, 3: 3, 4: 4, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 4, 3: 3, 4: 4, 0: 2})
defaultdict(<class 'int'>, {1: 1, 2: 4, 3: 3, 4: 4, 0: 3})
defaultdict(<class 'int'>, {1: 2, 2: 4, 3: 3, 4: 4, 0: 3})
defaultdict(<class 'int'>, {1: 2, 2: 5, 3: 3, 4: 4, 0: 3})
defaultdict(<class 'int'>,

In [31]:
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.447
 - Support: 17

Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.579
 - Support: 22

Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.447
 - Support: 17

Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.605
 - Support: 23

Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.407
 - Support: 22

Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.426
 - Support: 23

Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.541
 - Support: 20

Rule: If a person buys bananas they will also buy bread
 - Confidence: 0.370
 - Support: 20

Rule: If a person buys bread they will also buy apples
 - Confidence: 0.324
 - Support: 12

Rule: If a person buys apples they will also buy bread
 - Confidence: 0.316
 - Support: 12

Rule: If a person buys bread they will also buy milk
 - Confidence: 

In [32]:
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

In [33]:
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.340
 - Support: 18



In [34]:
# Sort by support
from pprint import pprint
pprint(list(support.items()))

[((2, 3), 17),
 ((2, 4), 22),
 ((3, 2), 17),
 ((3, 4), 23),
 ((4, 2), 22),
 ((4, 3), 23),
 ((0, 4), 20),
 ((4, 0), 20),
 ((0, 3), 12),
 ((3, 0), 12),
 ((0, 1), 18),
 ((1, 0), 18),
 ((1, 3), 18),
 ((1, 4), 28),
 ((3, 1), 18),
 ((4, 1), 28),
 ((0, 2), 4),
 ((1, 2), 12),
 ((2, 0), 4),
 ((2, 1), 12)]


In [35]:
from operator import itemgetter
print(sorted_support)
print("")
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)
key = itemgetter(1)
print(key)
print(sorted_support)

[((2, 4), 27), ((4, 2), 27), ((2, 3), 25), ((3, 2), 25), ((3, 4), 21), ((4, 3), 21), ((1, 4), 19), ((4, 1), 19), ((0, 4), 17), ((4, 0), 17), ((0, 1), 14), ((1, 0), 14), ((1, 3), 9), ((3, 1), 9), ((1, 2), 7), ((2, 1), 7), ((0, 3), 5), ((3, 0), 5), ((0, 2), 4), ((2, 0), 4)]

operator.itemgetter(1)
[((1, 4), 28), ((4, 1), 28), ((3, 4), 23), ((4, 3), 23), ((2, 4), 22), ((4, 2), 22), ((0, 4), 20), ((4, 0), 20), ((0, 1), 18), ((1, 0), 18), ((1, 3), 18), ((3, 1), 18), ((2, 3), 17), ((3, 2), 17), ((0, 3), 12), ((3, 0), 12), ((1, 2), 12), ((2, 1), 12), ((0, 2), 4), ((2, 0), 4)]


In [36]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.528
 - Support: 28

Rule #2
Rule: If a person buys bananas they will also buy milk
 - Confidence: 0.519
 - Support: 28

Rule #3
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.605
 - Support: 23

Rule #4
Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.426
 - Support: 23

Rule #5
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.579
 - Support: 22



In [37]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

In [23]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.694
 - Support: 25

Rule #2
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.659
 - Support: 27

Rule #3
Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.630
 - Support: 17

Rule #4
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.610
 - Support: 25

Rule #5
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.583
 - Support: 21

