# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
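 * [opcjonalnie] Utwórz i aktywuj wirtualne środowisko (Linux/macOS; w Windows: `recsyslab1\Scripts\activate`)
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate`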
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from itertools import combinations

from more_itertools import powerset

In [54]:
# define constants

# Path of the CSV file with baskets; passed to read_baskets() when loading the data.
PATH = './basket.csv'
# Minimum support threshold passed to get_supports(); itemsets below it are not recorded.
EPSILON = 0.0001

In [55]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a CSV file, one basket per data row.

    The first line is treated as a header and skipped. Every following
    non-blank line becomes one basket: its comma-separated cells are stripped
    of surrounding whitespace, lower-cased and collected into a set. Empty
    cells (for example padding produced by trailing commas) are dropped, so a
    row consisting only of commas yields an empty basket.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of product names per non-blank data row, in file order.
    """
    baskets = []
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip the header row (no-op for an empty file)
        for line in f:
            if not line.strip():
                continue  # blank or whitespace-only line carries no basket
            cells = (cell.strip().lower() for cell in line.split(','))
            baskets.append({cell for cell in cells if cell})
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in ``baskets``, sorted ascending."""
    return sorted({item for basket in baskets for item in basket})

# Load all baskets from PATH, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [56]:
# obliczamy strukturę danych (np. słownik albo graf) przechowującą wszystkie interesujące wartości `support`

def get_pair_support(supports: dict, baskets: list[tuple[str]], all_products: list[str], epsilon: float, current_set: tuple, index: int) -> None:
    """Recursively record supports of itemsets that extend ``current_set``.

    Each product located after position ``index`` in ``all_products`` is
    appended to ``current_set``. The support of the extended itemset is the
    fraction of baskets containing all of its items. Itemsets with support
    below ``epsilon`` are skipped together with all their further extensions;
    the others are stored in ``supports`` and extended recursively.

    Args:
        supports: Mapping from itemset tuple to support; updated in place.
        baskets: Collection of baskets, each an iterable of product names.
        all_products: Ordered list of products; extensions only use products
            that come after ``index``, so every itemset is generated once.
        epsilon: Minimum support for an itemset to be recorded.
        current_set: Itemset being extended.
        index: Position in ``all_products`` of the last item of ``current_set``.
    """
    total = len(baskets)
    # With no baskets there is nothing to count (and nothing to divide by).
    if total == 0 or index >= len(all_products):
        return
    for position in range(index + 1, len(all_products)):
        candidate = current_set + (all_products[position],)
        # Built once per candidate rather than once per basket.
        wanted = frozenset(candidate)
        hits = sum(1 for basket in baskets if wanted.issubset(basket))
        candidate_support = hits / total
        if candidate_support < epsilon:
            # Any superset is at most as frequent, so this branch is pruned.
            continue
        supports[candidate] = candidate_support
        get_pair_support(supports, baskets, all_products, epsilon, candidate, position)


def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float) -> dict:
    """Compute supports of all itemsets whose support is at least ``epsilon``.

    For every product the fraction of baskets containing it is computed. A
    product meeting the threshold is stored under a one-element tuple key and
    then extended into larger itemsets by ``get_pair_support``. Products below
    the threshold are dropped and none of their supersets are explored.

    The threshold is inclusive (``>= epsilon``) for single products, matching
    the rule ``get_pair_support`` applies to larger itemsets.

    Args:
        baskets: Collection of baskets, each a container of product names.
        all_products: Ordered list of candidate products; itemset keys list
            their items in this order.
        epsilon: Minimum support for an itemset to be recorded.

    Returns:
        Dictionary mapping itemset tuples to their support. Empty when
        ``baskets`` is empty.
    """
    supports = {}
    total = len(baskets)
    if total == 0:
        # No baskets: every support is undefined, so report nothing.
        return supports
    for idx, product in enumerate(all_products):
        single_support = sum(1 for basket in baskets if product in basket) / total
        if single_support >= epsilon:
            supports[(product,)] = single_support
            get_pair_support(supports, baskets, all_products, epsilon, (product,), idx)
    # Every stored value already met the threshold, so no final filtering is needed.
    return supports
    
# Compute the support table for all itemsets meeting the EPSILON threshold.
supports = get_supports(baskets, products, EPSILON)
# Bare expression: displays the table as the cell output.
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('abrasive cleaner', 'beef'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen meals'): 0.00013366303548753594,
 ('abrasive cleaner', 'frozen vegetables'): 0.00013366303548753594,
 ('abrasive cleaner', 'meat'): 0.00013366303548753594,
 ('abrasive cleaner', 'other vegetables'): 0.00020049455323130388,
 ('abrasive cleaner', 'pip fruit'): 0.00013366303548753594,
 ('abrasive cleaner', 'soda'): 0.00013366303548753594,
 ('abrasive cleaner', 'whipped/sour cream'): 0.00013366303548753594,
 ('abrasive cleaner', 'whole milk'): 0.00020049455323130388,
 ('abrasive cleaner', 'yogurt'): 0.00013366303548753594,
 ('artif. sweetener',): 0.0019381140145692708,
 ('artif. sweetener', 'bottled water'): 0.00013366303548753594,
 ('artif. sweetener', 'bottled water', 'soda'): 0.00013366303548753594,
 ('artif. sweetener', 'butter'): 0.00013366303548753594,
 ('artif. sweetener', 'curd'): 0.00013366303548753594,
 ('artif. sweetener', 'domestic eggs'): 0.000200

In [57]:
# definiujemy funkcje obliczające support, confidence i lift

def support(supports: dict, products: tuple[str]) -> float:
    """Look up the support of an itemset in a precomputed table.

    The items are sorted before the lookup, so the order in which
    ``products`` lists them does not matter as long as the table keys are
    sorted tuples. Itemsets absent from the table have support 0.0.
    """
    return supports.get(tuple(sorted(products)), 0.0)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Confidence is the support of both itemsets combined divided by the
    support of ``prior_products``. When ``prior_products`` has zero support
    the result is 0.0.
    """
    prior_support = support(supports, prior_products)
    if prior_support == 0:
        return 0.0
    joint_support = support(supports, prior_products + following_products)
    return joint_support / prior_support
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Lift is the support of both itemsets combined divided by the product of
    their individual supports. When either itemset has zero support the
    result is 0.0.
    """
    prior_support = support(supports, prior_products)
    following_support = support(supports, following_products)
    if prior_support == 0 or following_support == 0:
        return 0.0
    joint_support = support(supports, prior_products + following_products)
    return joint_support / (prior_support * following_support)
    

In [67]:
# Sanity check: support of {rolls/buns, whole milk}, then confidence and lift
# of the rule {rolls/buns, whole milk} -> {yogurt}.
print(support(supports, ('rolls/buns', 'whole milk')))
print(confidence(supports, ('rolls/buns', 'whole milk'), ('yogurt',)))
print(lift(supports, ('rolls/buns', 'whole milk'), ('yogurt',)))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [83]:
# wyznaczamy listę potencjalnych rekomendacji
# rekomendowane artykuły powinny mieć lift > 1 i możliwie wysokie confidence

def by_first_element(t):
    """Sort key returning the element at index 1 of ``t``.

    NOTE(review): despite the name, this selects the second element (index 1),
    not the first. It is used to order ``(product, confidence)`` pairs by
    their confidence value.
    """
    return t[1]

def generate_next_product_candidates(basket: tuple[str], products: list[str], supports, k, max_antecedent_size: int = 5) -> list[tuple[str, float]]:
    """Recommend up to ``k`` products to add to ``basket``.

    Every non-empty subset of the basket with at most ``max_antecedent_size``
    items is treated as a rule antecedent. A product not already in the
    basket becomes a candidate when some rule ``subset -> product`` has
    lift > 1; its score is the highest confidence among such rules.

    Subsets are enumerated size by size with ``itertools.combinations``, so
    subsets larger than the limit are never generated (a full powerset would
    produce 2**len(basket) subsets only to discard most of them).

    Args:
        basket: Products currently in the basket.
        products: All known products.
        supports: Support table mapping sorted itemset tuples to support.
        k: Maximum number of recommendations to return.
        max_antecedent_size: Largest subset of the basket used as a rule
            antecedent. Defaults to 5.

    Returns:
        Up to ``k`` ``(product, confidence)`` pairs, highest confidence first.
    """
    basket_items = list(basket)
    in_basket = set(basket_items)
    # Does not depend on the subset, so it is computed once up front.
    addable = [item for item in products if item not in in_basket]

    candidates = {}
    largest = min(len(basket_items), max_antecedent_size)
    for size in range(1, largest + 1):
        for subbasket in combinations(basket_items, size):
            for product in addable:
                if lift(supports, subbasket, (product,)) <= 1:
                    continue
                conf = confidence(supports, subbasket, (product,))
                if product not in candidates or candidates[product] < conf:
                    candidates[product] = conf

    ranked = sorted(candidates.items(), key=by_first_element, reverse=True)
    return ranked[:k]

In [1]:
# Show one basket and its top-5 recommended next products.
# NOTE(review): the recorded output for this cell is a NameError, presumably
# because it ran in a fresh kernel before `baskets` was defined; run the
# earlier cells first.
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports, 5)

NameError: name 'baskets' is not defined

In [86]:
# Top-5 recommended next products for the basket at index 33.
generate_next_product_candidates(baskets[33], products, supports, 5)

[('whole milk', 0.5),
 ('onions', 0.42857142857142855),
 ('pastry', 0.4),
 ('frozen vegetables', 0.37499999999999994),
 ('bottled beer', 0.3333333333333333)]