# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [None]:
# importujemy wszystkie potrzebne pakiety

from itertools import chain, combinations

from tqdm import tqdm

In [4]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.001
K = 4

In [5]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [21]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def powerset(iterable, max_len=None):
    s = list(iterable)
    max_len = max_len or len(s)
    return chain.from_iterable(combinations(s, r) for r in range(1, max_len + 1))

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    baskets = [set(b) for b in baskets]
    
    N = len(baskets)
    supports = dict()
    
    for product_set in tqdm(powerset(all_products, K)):
        key = ''.join(sorted(product_set))
        if key in supports: continue 

        counter = 0
        for basket in baskets:
            if set(product_set).issubset(set(basket)):
                counter += 1
        #

        value = counter / N
        supports[key] = value if value > epsilon else epsilon

        if value <= epsilon:
            remaining_items = [p for p in all_products if p not in product_set]
            for r in range(1, K - len(product_set) + 1):
                for extra in combinations(remaining_items, r):
                    key = ''.join(sorted(product_set + extra))
                    supports[key] = epsilon
        #
    #
    return supports 
#
    
supports = get_supports(baskets, products, EPSILON)
supports

1476it [00:51, 28.75it/s]


KeyboardInterrupt: 

In [None]:
from joblib import dump, load
import os 

if os.path.exists("supports.joblib"):
    supports = load("supports.joblib")
else:
    dump(supports, "supports.joblib")

In [None]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    key = ''.join(sorted(products))
    return supports.get(key, EPSILON)
#

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    key_prior_products = ''.join(sorted(prior_products))
    support_prior_products = supports.get(key_prior_products, EPSILON)

    key_combined_products = ''.join(sorted(list(prior_products) + list(following_products)))
    support_combined_products = supports.get(key_combined_products, EPSILON)

    return support_combined_products / support_prior_products
#

def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    key_combined_products = ''.join(sorted(list(prior_products) + list(following_products)))
    support_combined_products = supports.get(key_combined_products, EPSILON)

    key_prior_products = ''.join(sorted(prior_products))
    support_prior_products = supports.get(key_prior_products, EPSILON)       

    key_following_products = ''.join(sorted(following_products))
    support_following_products = supports.get(key_following_products, EPSILON) 

    return support_combined_products / (support_prior_products + support_following_products)
#

In [None]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

## Część 3. - generowanie rekomendacji

In [None]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    all_basket_sets = powerset(basket, K - 1)
    recommended_products = []

    for basket_set in all_basket_sets:
        for product in products:
            confidenceValue = confidence(supports, basket_set, {product})

            liftValue = lift(supports, basket_set, product)
            if liftValue <= 1: continue 

            recommended_products.append({product, basket_set, confidenceValue, liftValue})
        #
    #
    return recommended_products 
#


In [None]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift

def generate_advanced_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    raise NotImplementedError()

In [None]:
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)
generate_advanced_candidates(baskets[1], products, supports)

In [None]:
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)
generate_advanced_candidates(baskets[33], products, supports)