In [40]:

import pandas as pd
from collections import defaultdict
from itertools import combinations

df = pd.read_csv("Groceries_dataset.csv")
baskets = df.groupby('Member_number')['itemDescription'].apply(set).tolist()
items = sorted(df['itemDescription'].unique())
thresh = 19.28

item_counts = defaultdict(int)
for basket in baskets:
    for item in basket:
        item_counts[item] += 1

frequent_items = {item for item, count in item_counts.items() if count >= thresh}
lstfreqitems = list(frequent_items)


pair_counts = defaultdict(int)
for basket in baskets:
    for pair in combinations(sorted(basket & frequent_items), 2):
        pair_counts[pair] += 1

freq_pairs = {pair for pair, count in pair_counts.items() if count >= thresh}
lstfreqpairs = list(freq_pairs)

triplet_counts = defaultdict(int)
for basket in baskets:
    basket_items = basket & frequent_items
    for triplet in combinations(sorted(basket_items), 3):
        if all((pair in freq_pairs) for pair in combinations(triplet, 2)):
            triplet_counts[triplet] += 1
freq_triplets = {triplet for triplet, count in triplet_counts.items() if count >= thresh}

print()
rules = []
for i in lstfreqitems:
    for j in lstfreqitems:
        if i != j:
            both = pair_counts[tuple(sorted(tuple((str(i), str(j)))))]/len(baskets)
            suppfirst = item_counts[i]/len(baskets)
            conf = both/suppfirst
            lift = both/((suppfirst*(item_counts[j]/len(baskets))))
            if conf >= .003 and lift >1:
                rules.append([i,j,round(conf,3), round(lift,3)])

for i in lstfreqpairs:
    for j in lstfreqitems:
        if j not in i:
            antecedent = triplet_counts[tuple(sorted(tuple(list(i)+[j])))]/len(baskets)
            consequent = pair_counts[tuple(sorted(i))]/len(baskets)
            conf = antecedent/consequent
            lift = antecedent/(consequent*(item_counts[j]/len(baskets)))
            if conf >= .003 and lift >1:
                rules.append([i,j, round(conf,3),round(lift,3)])
for i in range(len(rules)):
    print("Rule: {} implies {} with {} confidence and {} lift".format(rules[i][0], rules[i][1], rules[i][2], rules[i][3]))




Rule: rum implies white bread with 0.094 confidence and 1.056 lift
Rule: rum implies chewing gum with 0.094 confidence and 2.1 lift
Rule: rum implies pip fruit with 0.219 confidence and 1.282 lift
Rule: rum implies turkey with 0.031 confidence and 1.562 lift
Rule: rum implies hard cheese with 0.062 confidence and 1.171 lift
Rule: rum implies waffles with 0.125 confidence and 1.811 lift
Rule: rum implies condensed milk with 0.031 confidence and 1.31 lift
Rule: rum implies canned vegetables with 0.125 confidence and 6.091 lift
Rule: rum implies beverages with 0.125 confidence and 2.013 lift
Rule: rum implies packaged fruit/vegetables with 0.094 confidence and 2.947 lift
Rule: rum implies frozen fish with 0.062 confidence and 2.412 lift
Rule: rum implies ice cream with 0.062 confidence and 1.107 lift
Rule: rum implies baking powder with 0.031 confidence and 1.007 lift
Rule: rum implies dessert with 0.156 confidence and 1.807 lift
Rule: rum implies pasta with 0.062 confidence and 2.065 li

In [41]:
print(frequent_items)


{'rum', 'white bread', 'dog food', 'chewing gum', 'jam', 'pip fruit', 'turkey', 'hard cheese', 'waffles', 'condensed milk', 'canned vegetables', 'beverages', 'packaged fruit/vegetables', 'frozen fish', 'ice cream', 'baking powder', 'dessert', 'pastry', 'pasta', 'flower (seeds)', 'female sanitary products', 'nut snack', 'chocolate marshmallow', 'specialty fat', 'spices', 'candles', 'vinegar', 'butter', 'sweet spreads', 'meat spreads', 'softener', 'yogurt', 'beef', 'red/blush wine', 'other vegetables', 'butter milk', 'cream cheese ', 'tea', 'sausage', 'citrus fruit', 'ketchup', 'soups', 'root vegetables', 'UHT-milk', 'domestic eggs', 'tidbits', 'potato products', 'nuts/prunes', 'brown bread', 'soap', 'misc. beverages', 'ham', 'frankfurter', 'rolls/buns', 'cake bar', 'bottled water', 'canned fish', 'cleaner', 'meat', 'soda', 'newspapers', 'soft cheese', 'oil', 'herbs', 'processed cheese', 'pickled vegetables', 'sauces', 'cereals', 'dental care', 'snack products', 'coffee', 'Instant food p

In [42]:
print(freq_pairs)


{('ice cream', 'sausage'), ('waffles', 'whipped/sour cream'), ('oil', 'other vegetables'), ('UHT-milk', 'shopping bags'), ('bottled water', 'waffles'), ('chicken', 'tropical fruit'), ('other vegetables', 'whipped/sour cream'), ('frozen dessert', 'whole milk'), ('canned beer', 'citrus fruit'), ('bottled water', 'cat food'), ('spread cheese', 'whole milk'), ('UHT-milk', 'bottled water'), ('chicken', 'fruit/vegetable juice'), ('bottled beer', 'sliced cheese'), ('chocolate', 'whole milk'), ('chocolate', 'frankfurter'), ('butter', 'waffles'), ('brown bread', 'dessert'), ('canned beer', 'newspapers'), ('liquor', 'shopping bags'), ('misc. beverages', 'yogurt'), ('pork', 'salty snack'), ('detergent', 'sausage'), ('mayonnaise', 'other vegetables'), ('UHT-milk', 'oil'), ('beef', 'frankfurter'), ('beef', 'whole milk'), ('chicken', 'onions'), ('sausage', 'specialty bar'), ('newspapers', 'sausage'), ('specialty chocolate', 'white bread'), ('frozen potato products', 'root vegetables'), ('liquor (app

In [43]:
print(freq_triplets)


{('chicken', 'frankfurter', 'whole milk'), ('bottled water', 'waffles', 'yogurt'), ('UHT-milk', 'pastry', 'whole milk'), ('other vegetables', 'pastry', 'pip fruit'), ('newspapers', 'pastry', 'yogurt'), ('brown bread', 'domestic eggs', 'pastry'), ('pastry', 'pip fruit', 'pork'), ('other vegetables', 'salty snack', 'shopping bags'), ('UHT-milk', 'shopping bags', 'whole milk'), ('other vegetables', 'red/blush wine', 'whole milk'), ('butter', 'rolls/buns', 'sausage'), ('canned beer', 'other vegetables', 'sausage'), ('canned beer', 'napkins', 'whole milk'), ('beef', 'curd', 'rolls/buns'), ('other vegetables', 'whipped/sour cream', 'whole milk'), ('chicken', 'frozen vegetables', 'other vegetables'), ('bottled beer', 'brown bread', 'tropical fruit'), ('other vegetables', 'root vegetables', 'soft cheese'), ('other vegetables', 'pork', 'soda'), ('fruit/vegetable juice', 'long life bakery product', 'other vegetables'), ('onions', 'tropical fruit', 'whole milk'), ('berries', 'domestic eggs', 'who

The presence of antecedents implies the consequents above with a minimum confidence threshold of .3.

1. Support is just the frequency of an itemset in all of the baskets, so it measures how often something occurs or how probable it is to occur. Confidence is basically how related to items are to each other. It is like the ratio of how many times the items occur together normalized by the number of times the antecedent occurs. Lift is basically how related the antecedents and consequents are in an itemset normalized by their independance. If it is greater than 1 than that means there is a positive correlation while less than 1 means a negative correlation.

2. Support is the basis for most of our calculations to determine confidence, interestingness, and lift. We can make a lot of conclusions on relatedness of items based on how popular they are from this calculation, its like the base point for everything. Lift helps us filter out rules we are confident in that are in pairs based on their frequency. If we have a lot of pairs that reach a minimum support, we only want the ones that we are a certain level of confident about, meaning they occur frequently together compared to seperately. It doesn't really mean anything if a pair reached the support but they only occur one time together compared to the items occuring 20 times seperately, its more of a coincidence compared to confidence that they are correlated to each other. It measures the observed confidence compared to what was expected, to suggest there was a positive or negative correlation if it occured more or less than just random chance or what was expected. Lift requires that we know the confidence about the antecedents and the consequents, it helps determine how confident we are that there is a rule that has meaning. This also helps us filter out frequent pairs if they occure frequently together compared to how frequently the antecedent occurs. It measures the strength of two itemsets. Association rules are created by keeping track of items support, which we can build off of from monotonicity property to keep track of frequent pairs and frequent triplets and so on. Once we have frequent pairs we can try to start to find association rules only with a certain level of confidence though. We can check how often does one occur in the pair in relation to the other, and then how confident are we in this. We only want the ones we are confident in to a certain level because we want them to have a certain strength of association. Lastly, once we find association rules, we can basically say with a certain confidence level or positive/negative correlation that if we are going to put this item or these two items next to each other, we would benefit by putting the item it would imply next to it, because people like to buy them together. We could also use it to say we shouldn't put these together because people don't usually buy these things together and that wouldn't do us any good. We can also put emphasis on the ones that have a high confidence level so we know there is a greater chance they are going to buy those things together. I only outputted the rules that had a lift of greater than one to make sure there was a positive correlation and the observed support was greater than what was expected of them independently to filter some of the rules out since there are so many.

In [79]:
from collections import defaultdict
from itertools import combinations
import itertools
def PCY(transaction, supp_threshold, conf_threshold):
    item_counts = defaultdict(int)
    hash_table = defaultdict(int)
    pairs = defaultdict(int)
    for basket in transaction:
        for item in basket:
            item_counts[item] += 1
        for i in range(len(basket)):
            for j in range(i+1, len(basket)):
                hash_table[hash((basket[i], basket[j]))%10007] += 1
    freqitems = {item: count for item, count in item_counts.items() if count >= 19.28}
    bitmap = {bucket: 1 if count >= 19.38 else 0 for bucket, count in hash_table.items()}

    candidate_pairs = defaultdict(int)

    for basket in transaction:
        for i in range(len(basket)):
          for j in range(i+1, len(basket)):
            pair = (basket[i], basket[j])
            hashed_pair = hash(pair)%10007
            if bitmap[hashed_pair] == 1 and pair[0] in freqitems and pair[1] in freqitems:
                candidate_pairs[pair] += 1

    frequentpairs = {tuple(sorted(pair)):count for pair, count in candidate_pairs.items() if count >= 19.28}

    rulespcy = []
    for i in freqitems:
      for j in freqitems:
        if i != j:
            pair = tuple(sorted(tuple((str(i), str(j)))))
            if pair in frequentpairs:
              both = frequentpairs[tuple(sorted(tuple((str(i), str(j)))))]/len(baskets)
              suppfirst = freqitems[i]/len(baskets)
              conf = both/suppfirst
              lift = both/((suppfirst*(freqitems[j]/len(baskets))))
              if conf >= .003 and lift >1:
                rulespcy.append([i,j,round(conf,3), round(lift,3)])
    return rulespcy, freqitems, frequentpairs


items = df.groupby('Member_number')['itemDescription'].apply(list).reset_index()
items = items['itemDescription'].tolist()
for i in range(len(PCY(items, .0005, .003)[0])):
    print("Rule: {} implies {} with {} confidence and {} lift".format(rules[i][0], rules[i][1], rules[i][2], rules[i][3]))

print("Freqeunt Items")

print(PCY(items, .0005, .003)[1])

print("Frequent Pairs")

print(PCY(items, .0005, .003)[2])




Rule: rum implies white bread with 0.094 confidence and 1.056 lift
Rule: rum implies chewing gum with 0.094 confidence and 2.1 lift
Rule: rum implies pip fruit with 0.219 confidence and 1.282 lift
Rule: rum implies turkey with 0.031 confidence and 1.562 lift
Rule: rum implies hard cheese with 0.062 confidence and 1.171 lift
Rule: rum implies waffles with 0.125 confidence and 1.811 lift
Rule: rum implies condensed milk with 0.031 confidence and 1.31 lift
Rule: rum implies canned vegetables with 0.125 confidence and 6.091 lift
Rule: rum implies beverages with 0.125 confidence and 2.013 lift
Rule: rum implies packaged fruit/vegetables with 0.094 confidence and 2.947 lift
Rule: rum implies frozen fish with 0.062 confidence and 2.412 lift
Rule: rum implies ice cream with 0.062 confidence and 1.107 lift
Rule: rum implies baking powder with 0.031 confidence and 1.007 lift
Rule: rum implies dessert with 0.156 confidence and 1.807 lift
Rule: rum implies pasta with 0.062 confidence and 2.065 lif

To compare the PCY algorithm with the Apriori algorithm, they both found the same amount of frequent items and frequent pairs. The PCY algorithm ran faster because it is less computationally complex, and would run faster for larger sets of data. We also only learned 2 passes for the PCY algorithm in class instead of 3 for the apriori algorithm. I have printed all of the rules I produced as well as the frequent items and frequent pairs. I was not able to produce the same amount of association rules since we only learned two passes in class, so to create association rules with two items in the antecedent you would need a third pass to find all of the triplets to check the support of the antecedent.