# Lab  1 - basket analysis

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

In [1]:
# !pip install more-itertools

## Part 1 - data preparation/preprocessing

In [2]:
# import all needed packages

from more_itertools import powerset

In [21]:
# defining constants

PATH = './basket.csv'
EPSILON = 0.0001

In [22]:
# reading basket data

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

# list of sets containing products from baskets
baskets = read_baskets(PATH)

# list of all available products
products = unique_products(baskets)

## Part 2 - Support, confidence and lift

In [23]:
# computing a data structure (dictionary or graph) with interesting `support` values

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    supports = {}
    len_all_baskets = len(baskets)
    
    def rek(basket : set, baskets : list[set]):
        nonlocal supports
        for product in all_products:
            if product in basket : continue
            current_A = basket | {product}
            containing_baskets = [b for b in baskets if (current_A.issubset(b))]
            s = len(containing_baskets)/len_all_baskets
            if s > epsilon:
                supports[tuple(sorted(current_A))] = s
                rek(current_A,containing_baskets)
    rek(set(),baskets)
    
    return supports
    
supports = get_supports(baskets, products, EPSILON)
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('abrasive cleaner', 'beef'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen meals'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen vegetables'): 0.00013366303548753594,
 ('abrasive cleaner', 'meat'): 0.00013366303548753594,
 ('abrasive cleaner', 'other vegetables'): 0.00020049455323130388,
 ('abrasive cleaner', 'pip fruit'): 0.00013366303548753594,
 ('abrasive cleaner', 'soda'): 0.00013366303548753594,
 ('abrasive cleaner', 'whipped/sour cream'): 0.00013366303548753594,
 ('abrasive cleaner', 'whole milk'): 0.00020049455323130388,
 ('abrasive cleaner', 'yogurt'): 0.00013366303548753594,
 ('artif. sweetener',): 0.0019381140145692708,
 ('artif. sweetener', 'bottled water'): 0.00013366303548753594,
 ('artif. sweetener', 'bottled water', 'soda'): 0.00013366303548753594,
 ('artif. sweetener', 'butter'): 0.00013366303548753594,
 ('artif. sweetener', 'curd'): 0.00013366303548753594,
 ('artif. sweetener', 'domestic eggs'): 0.000200

In [53]:
# defining functions to compute support, confidence and lift

def support(supports, products: set) -> float:
    products = tuple(sorted(products))
    return supports[products] if products in supports else 0 

def confidence(supports, prior_products: set, following_products: set) -> float:
    new_basket = tuple(sorted(prior_products | following_products))
    prior_products = tuple(sorted(prior_products))
    support_A = support(supports, prior_products)
    support_AB = support(supports,new_basket)
    return support_AB/support_A if support_A > 0 else 0 
    
def lift(supports, prior_products: set, following_products: set) -> float:
    new_basket = tuple(sorted(prior_products | following_products))
    prior_products = tuple(sorted(prior_products))
    following_products = tuple(sorted(following_products))
    support_A = support(supports, prior_products)
    support_B = support(supports, following_products)
    support_AB = support(supports, new_basket)
    return support_AB/(support_A*support_B) if support_A > 0 and support_B > 0 else 0 

In [54]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Part 3 - Generating recommendations

In [70]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: set, products: list, supports) -> list[tuple[str, tuple[str], float, float]]:
    result = []
    for subbasket in powerset(basket):
        subbasket = set(subbasket)
        for product in products:
            if product in subbasket:continue
            cnfdnc = confidence(supports, subbasket, {product})
            lft = lift(supports, subbasket, {product})
            if lft > 1:
                result.append((product,subbasket,cnfdnc,lft))
    
    return result

In [71]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'whole milk', 'sausage', 'semi-finished bread', 'yogurt'}


[('artif. sweetener',
  {'whole milk'},
  0.003385526872619552,
  1.7468151239657366),
 ('bathroom cleaner',
  {'whole milk'},
  0.0012695725772323319,
  1.1174479101839636),
 ('brandy', {'whole milk'}, 0.005501481168006772, 2.1662805978127717),
 ('candles', {'whole milk'}, 0.004655099449851883, 1.055367470729299),
 ('canned fruit', {'whole milk'}, 0.0025391451544646637, 1.8092013783930843),
 ('canned vegetables',
  {'whole milk'},
  0.005924672027084216,
  1.0811081407470868),
 ('chocolate marshmallow',
  {'whole milk'},
  0.004231908590774439,
  1.055367470729299),
 ('cookware', {'whole milk'}, 0.001692763436309776, 1.4899305469119517),
 ('detergent', {'whole milk'}, 0.008887008040626322, 1.0308240411774547),
 ('dish cleaner', {'whole milk'}, 0.005501481168006772, 1.1276529139299358),
 ('finished products',
  {'whole milk'},
  0.005501481168006772,
  1.286229104951333),
 ('hair spray', {'whole milk'}, 0.000846381718154888, 1.4071566276390655),
 ('ham', {'whole milk'}, 0.0173508252221

In [72]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'photo/film', 'tropical fruit', 'domestic eggs', 'yogurt', 'white wine', 'root vegetables', 'soda'}


[('bottled beer', {'photo/film'}, 0.05063291139240507, 1.1174340017176356),
 ('chewing gum', {'photo/film'}, 0.05063291139240507, 4.209001406469762),
 ('cream cheese ', {'photo/film'}, 0.025316455696202535, 1.0700851033397698),
 ('curd', {'photo/film'}, 0.03797468354430379, 1.1274110910186859),
 ('dessert', {'photo/film'}, 0.03797468354430379, 1.6096747588482088),
 ('domestic eggs', {'photo/film'}, 0.05063291139240507, 1.3650815372334362),
 ('grapes', {'photo/film'}, 0.03797468354430379, 2.6306258790436003),
 ('liquor (appetizer)',
  {'photo/film'},
  0.025316455696202535,
  5.653882486302664),
 ('napkins', {'photo/film'}, 0.03797468354430379, 1.7166622050556426),
 ('packaged fruit/vegetables',
  {'photo/film'},
  0.025316455696202535,
  2.9827569022226657),
 ('pet care', {'photo/film'}, 0.025316455696202535, 4.456589724497394),
 ('salty snack', {'photo/film'}, 0.025316455696202535, 1.3480787422856888),
 ('soda', {'photo/film'}, 0.10126582278481014, 1.042835861203795),
 ('sugar', {'pho