# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] utwórz i aktywuj wirtualne środowisko:
 `python3 -m venv ./recsyslab1 && source ./recsyslab1/bin/activate`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety
from more_itertools import powerset

In [3]:
# define the constants

# Path of the CSV file with the baskets; passed to read_baskets below.
PATH = './basket.csv'
# Support threshold passed to get_supports; also the fallback value returned by support().
EPSILON = 0.001
# NOTE(review): in the visible code K is referenced only by the commented-out
# get_supports variant - confirm whether it is still needed.
K = 4

In [4]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Read shopping baskets from a CSV file.

    The first line of the file is treated as a header and skipped. Every
    following non-empty line becomes one basket: the line is split on commas,
    empty fields are dropped and the product names are lower-cased.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of lower-cased product names per non-empty data line, in
        file order.
    """
    # An explicit encoding keeps parsing independent of the platform default.
    with open(path, encoding="utf-8") as f:
        lines = f.read().split('\n')

    return [
        {field.lower() for field in line.split(',') if field}
        for line in lines[1:]  # lines[0] is the header row
        if line
    ]

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product that occurs in *baskets*, sorted ascending."""
    distinct = {item for basket in baskets for item in basket}
    return sorted(distinct)

# Load the baskets from the CSV at PATH, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [28]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_basket_name(basket: tuple[str]):
    """Build the canonical string key for a collection of products.

    Duplicates are dropped and the remaining names are sorted and joined
    with ", ", so the same products always produce the same key regardless
    of their order in *basket*.
    """
    unique_names = list(set(basket))
    unique_names.sort()
    return ', '.join(unique_names)

def calculate_ratio(curr_basket: set[str], baskets: list[tuple[str]]) -> float:
    """Return the fraction of *baskets* that contain every item of *curr_basket*.

    Args:
        curr_basket: The itemset to look for.
        baskets: All baskets to scan.

    Returns:
        Number of baskets that are supersets of *curr_basket* divided by the
        total number of baskets; 0.0 when *baskets* is empty.
    """
    # Guard against division by zero: with no baskets nothing can be supported.
    if not baskets:
        return 0.0

    occurrences = sum(1 for basket in baskets if curr_basket.issubset(basket))
    return occurrences / len(baskets)

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    """Build the support table for every itemset that occurs in some basket.

    Each basket contributes one occurrence to every subset of its distinct
    items (including the empty subset), so after a single pass the counter
    holds, for each itemset, the number of baskets that contain it. This
    avoids rescanning all baskets for every candidate subset.

    Args:
        baskets: All baskets; duplicates inside a basket are ignored.
        all_products: Unused; kept so existing callers keep working.
        epsilon: Only itemsets whose support is strictly greater than this
            value are stored.

    Returns:
        Dict mapping the itemset key produced by get_basket_name to its
        support (fraction of baskets containing the itemset), in order of
        first occurrence.
    """
    total = len(baskets)

    # itemset -> number of baskets containing it
    counts: dict[frozenset, int] = {}
    for basket in baskets:
        for subset in powerset(set(basket)):
            key = frozenset(subset)
            counts[key] = counts.get(key, 0) + 1

    supports = {}
    for itemset, count in counts.items():
        ratio = count / total
        if ratio > epsilon:
            # setdefault: if two itemsets ever map to the same key, the first wins.
            supports.setdefault(get_basket_name(itemset), ratio)

    return supports



# def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
#     product_subsets = []
#     for subset in powerset(all_products):
#         if len(list(subset)) == 0:
#             continue
#         if len(list(subset)) > K:
#             break
#         product_subsets.append(list(subset))

    
#     supports = {}
    
#     for product_subset in product_subsets:
#         contains = 0
#         for basket in baskets:
#             if product_subset.issubset(basket):
#                 contains += 1

#         name = get_basket_name(product_subset)
#         ratio = contains/len(baskets)
#         if ratio > epsilon:
#             supports[name] = contains/len(baskets)

#     return supports
       
# Compute the support table for the loaded baskets, using EPSILON as the threshold.
supports = get_supports(baskets, products, EPSILON)
# Bare expression: shown as the cell output when this runs in a notebook.
supports

{'': 1.0,
 'whole milk': 0.15792287642852368,
 'pastry': 0.0517275947336764,
 'salty snack': 0.018779656485998796,
 'pastry, whole milk': 0.006482657221145492,
 'salty snack, whole milk': 0.0019381140145692708,
 'semi-finished bread': 0.009490075519615051,
 'sausage': 0.06034886052262247,
 'yogurt': 0.08587850030074183,
 'semi-finished bread, whole milk': 0.001670787943594199,
 'sausage, whole milk': 0.008955423377664907,
 'whole milk, yogurt': 0.011160863463209249,
 'sausage, yogurt': 0.005747510525964045,
 'sausage, whole milk, yogurt': 0.0014702933903628951,
 'soda': 0.09710619528169484,
 'pickled vegetables': 0.008955423377664907,
 'misc. beverages': 0.01577223818752924,
 'canned beer': 0.04691572545612511,
 'hygiene articles': 0.013700461137472432,
 'rolls/buns': 0.11000467820624206,
 'rolls/buns, whole milk': 0.013967787208447505,
 'rolls/buns, sausage': 0.005346521419501437,
 'rolls/buns, sausage, whole milk': 0.0011361358016440553,
 'soda, whole milk': 0.011628684087415625,
 'f

In [25]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    """Look up the support of *products* in the *supports* table.

    The lookup key is built with get_basket_name. Itemsets that are not in
    the table yield EPSILON instead.
    """
    return supports.get(get_basket_name(products), EPSILON)

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the confidence of the rule prior_products -> following_products.

    Computed as support(A ∪ B) / support(A). All lookups go through
    support(), so itemsets absent from the table use its fallback value.

    Args:
        supports: Support table keyed by get_basket_name keys.
        prior_products: Antecedent A; any iterable of product names.
        following_products: Consequent B; any iterable of product names.
    """
    # Coerce to sets: tuples and lists (as the annotations allow) have no .union().
    antecedent = set(prior_products)
    combined = antecedent | set(following_products)
    return support(supports, combined) / support(supports, antecedent)
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule prior_products -> following_products.

    Computed as support(A ∪ B) / (support(A) * support(B)). All lookups go
    through support(), so itemsets absent from the table use its fallback
    value.

    Args:
        supports: Support table keyed by get_basket_name keys.
        prior_products: Antecedent A; any iterable of product names.
        following_products: Consequent B; any iterable of product names.
    """
    # Coerce to sets: tuples and lists (as the annotations allow) have no .union().
    antecedent = set(prior_products)
    consequent = set(following_products)
    supp_AuB = support(supports, antecedent | consequent)
    supp_A = support(supports, antecedent)
    supp_B = support(supports, consequent)
    return supp_AuB / (supp_A * supp_B)

In [29]:
# Sanity check: support of {whole milk, rolls/buns}, then confidence and lift
# of the rule {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [None]:
# build the list of potential recommendations
# recommended items should have lift > 1 and the highest possible confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """List recommendation candidates for *basket*.

    Every non-empty subset A of the basket is paired with every product X
    that is not already in the basket, forming the rule A -> {X}. A rule is
    kept when both A and A ∪ {X} are present in *supports* and its lift is
    greater than 1.

    Args:
        basket: Products currently in the basket (duplicates are ignored).
        products: All products that may be recommended.
        supports: Support table keyed by get_basket_name keys.

    Returns:
        List of (item, subbasket, confidence, lift) tuples, where subbasket
        is a sorted tuple of basket items, ordered by confidence descending
        (ties broken by lift descending).
    """
    basket_items = set(basket)
    candidates = []

    # Sorting the items makes the enumeration order (and tie order) deterministic.
    for subbasket in powerset(sorted(basket_items)):
        if not subbasket:
            continue
        antecedent = set(subbasket)
        # An antecedent missing from the table has no measured support, so
        # no rule built on it can be evaluated.
        if get_basket_name(antecedent) not in supports:
            continue

        for item in products:
            if item in basket_items:
                continue
            # Skip rules whose joint itemset is missing from the table:
            # support() would substitute its fallback value and could
            # produce a lift > 1 that is not backed by any measurement.
            if get_basket_name(antecedent | {item}) not in supports:
                continue

            rule_lift = lift(supports, antecedent, {item})
            if rule_lift > 1:
                rule_confidence = confidence(supports, antecedent, {item})
                candidates.append((item, subbasket, rule_confidence, rule_lift))

    candidates.sort(key=lambda candidate: (candidate[2], candidate[3]), reverse=True)
    return candidates

In [None]:
# propose a second, more advanced algorithm, e.g.:
# - if product X appears several times in the candidate list, compute the mean or the product of its confidence values
# - sort the candidates by the product of confidence and lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    """Placeholder for the advanced recommendation generator.

    Not implemented yet: calling it always raises NotImplementedError. The
    annotated result is a list of (item, subbasket, confidence, lift)
    tuples.
    """
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [None]:
# Show sample basket 1 and both recommendation lists for it.
# Results are printed explicitly: a bare expression that is not the last
# statement of a notebook cell (or any bare expression in a script) is
# evaluated and then silently discarded.
print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports))
print(generate_advanced_candidates(baskets[1], products, supports))

In [None]:
# Show sample basket 33 and both recommendation lists for it.
# Results are printed explicitly: a bare expression that is not the last
# statement of a notebook cell (or any bare expression in a script) is
# evaluated and then silently discarded.
print(baskets[33])
print(generate_basic_candidates(baskets[33], products, supports))
print(generate_advanced_candidates(baskets[33], products, supports))