# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools tqdm`

## Część 1. - przygotowanie danych

In [1]:
# importujemy wszystkie potrzebne pakiety

from itertools import chain, combinations

from tqdm import tqdm

In [94]:
# define the constants

# Path of the CSV file the baskets are read from.
PATH = './basket.csv'
# Floor for stored support values; also the fallback used by confidence/lift
# when an itemset has no entry in the supports dictionary.
EPSILON = 0.001
# Largest itemset size for which supports are computed (candidate
# sub-baskets use at most K - 1 products).
K = 4

In [95]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a CSV file.

    The first line of the file is treated as a header and skipped.  Every
    other non-empty line becomes one basket: the line is split on commas,
    empty fields are dropped and the remaining product names are lower-cased.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of lower-cased product names per non-empty data line, in
        file order.
    """
    # An explicit encoding keeps the result independent of the platform locale.
    with open(path, encoding='utf-8') as f:
        raw = f.read()
    return [
        {field.lower() for field in line.split(',') if field}
        for line in raw.split('\n')[1:]
        if line
    ]

def unique_products(baskets: list[set[str]]) -> list[str]:
    """Return every distinct product found in any basket, sorted ascending.

    Args:
        baskets: Baskets, each an iterable of product names.

    Returns:
        A sorted list of product names without duplicates; empty when there
        are no baskets.
    """
    return sorted(set().union(*baskets))

# Load all baskets from the CSV file, then derive the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [96]:
def powerset(items, max_len=None):
    """Return all non-empty subsets of ``items`` with at most ``max_len`` elements.

    Subsets are listed by increasing size; within one size they follow the
    order produced by ``itertools.combinations``.

    Args:
        items: Iterable of elements to combine.
        max_len: Largest subset size to generate.  ``None`` means no limit
            (subsets up to the full size of ``items``).  Zero or a negative
            number yields an empty list.

    Returns:
        A list of sets.
    """
    pool = tuple(items)
    # Only None means "no limit"; an explicit 0 must not expand to the full powerset.
    if max_len is None:
        max_len = len(pool)
    return [set(combo) for r in range(1, max_len + 1) for combo in combinations(pool, r)]

In [97]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float, max_len=None):
    """Compute the support of every product combination up to ``max_len`` items.

    The support of an itemset is the fraction of baskets that contain all of
    its products.  Values that do not exceed ``epsilon`` are stored as exactly
    ``epsilon``, which marks the itemset as too rare.

    Args:
        baskets: Baskets, each a collection of product names.
        all_products: Products to combine into itemsets.
        epsilon: Floor for stored supports and marker for rare itemsets.
        max_len: Largest itemset size to evaluate.  ``None`` (the default)
            uses the module-level constant ``K``.

    Returns:
        A dictionary mapping ``'-'.join(sorted(itemset))`` to its support.
        With an empty ``baskets`` list every support counts as zero, so each
        entry is ``epsilon``.
    """
    if max_len is None:
        max_len = K

    total = len(baskets)
    supports = {}

    # Generate candidates lazily, smallest itemsets first, so the combinations
    # are never all held in memory at once and every prefix itemset is scored
    # before the itemsets that extend it.
    candidates = chain.from_iterable(
        combinations(all_products, size) for size in range(1, max_len + 1)
    )

    for combo in candidates:
        product_set = set(combo)
        sorted_product_set = sorted(product_set)
        full_key = '-'.join(sorted_product_set)

        # If the itemset without its last (sorted) product is already rare,
        # this superset cannot be more frequent, so skip the basket scan.
        partial_key = '-'.join(sorted_product_set[:-1])
        if supports.get(partial_key) == epsilon:
            supports[full_key] = epsilon
            continue

        counter = sum(1 for basket in baskets if product_set.issubset(basket))

        # No baskets means no itemset occurs anywhere: treat support as zero
        # instead of dividing by zero.
        value = counter / total if total else 0.0
        supports[full_key] = value if value > epsilon else epsilon

    return supports
    
# Compute the supports once for the whole dataset; later cells only look values up.
supports = get_supports(baskets, products, EPSILON)
# Bare expression: as the last statement of a notebook cell it displays the dictionary.
supports

{'abrasive cleaner': 0.0014702933903628951,
 'artif. sweetener': 0.0019381140145692708,
 'baby cosmetics': 0.001,
 'bags': 0.001,
 'baking powder': 0.008086613646995923,
 'bathroom cleaner': 0.0011361358016440553,
 'beef': 0.03395041101383412,
 'berries': 0.021787074784468355,
 'beverages': 0.016574216400454454,
 'bottled beer': 0.04531176903027468,
 'bottled water': 0.06068301811134131,
 'brandy': 0.0025395976742631824,
 'brown bread': 0.03762614448974136,
 'butter': 0.03522020985096572,
 'butter milk': 0.017576689166610975,
 'cake bar': 0.006148499632426653,
 'candles': 0.004410880171088686,
 'candy': 0.014368776314910112,
 'canned beer': 0.04691572545612511,
 'canned fish': 0.007685624540533315,
 'canned fruit': 0.001403461872619127,
 'canned vegetables': 0.005480184454988973,
 'cat food': 0.011829178640646929,
 'cereals': 0.002806923745238254,
 'chewing gum': 0.012029673193878232,
 'chicken': 0.027868742899151238,
 'chocolate': 0.02359152576355009,
 'chocolate marshmallow': 0.00400

In [98]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    """Look up the stored support of an itemset.

    Args:
        supports: Mapping from ``'-'``-joined, sorted product names to support.
        products: Products forming the itemset.

    Returns:
        The stored value, or 0 when the itemset has no entry.
    """
    return supports.get('-'.join(sorted(products)), 0)

def confidence(supports, prior_products: set[str], following_products: set[str]) -> float:
    """Confidence of the rule ``prior_products -> following_products``.

    Computed as support(prior | following) / support(prior).  Itemsets missing
    from ``supports`` count as ``EPSILON``.

    Returns:
        The ratio, or 0 when either support is at or below ``EPSILON``.
    """
    def stored_support(items):
        # Keys are the sorted product names joined with '-'.
        return supports.get('-'.join(sorted(items)), EPSILON)

    prior = stored_support(prior_products)
    combined = stored_support(prior_products | following_products)

    if combined <= EPSILON or prior <= EPSILON:
        return 0
    return combined / prior

def lift(supports, prior_products: set[str], following_products: set[str]) -> float:
    """Lift of the rule ``prior_products -> following_products``.

    Computed as support(prior | following) / (support(prior) * support(following)).
    Itemsets missing from ``supports`` count as ``EPSILON``.

    Returns:
        The ratio, or 0 when any of the three supports is at or below
        ``EPSILON``.
    """
    def stored_support(items):
        # Keys are the sorted product names joined with '-'.
        return supports.get('-'.join(sorted(items)), EPSILON)

    combined = stored_support(prior_products | following_products)
    prior = stored_support(prior_products)
    following = stored_support(following_products)

    if combined <= EPSILON or prior <= EPSILON or following <= EPSILON:
        return 0
    return combined / (prior * following)

In [None]:
# Print the three measures for a sample rule: {whole milk, rolls/buns} -> {yogurt}.
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [103]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence

def generate_basic_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    """Recommend products for ``basket``, ordered by descending confidence.

    Every sub-basket of up to ``K - 1`` products is paired with every product
    not yet in the basket.  Pairs whose lift is not above 1 are discarded.

    Returns:
        Tuples ``(item, subbasket, confidence, lift)`` sorted by confidence,
        highest first, keeping only the first entry for each item.
    """
    # Products already in the basket are never recommended.
    new_products = [name for name in products if name not in basket]

    scored = []
    for subbasket in powerset(basket, K - 1):
        for name in new_products:
            lift_value = lift(supports, subbasket, {name})
            if lift_value <= 1:
                continue
            confidence_value = confidence(supports, subbasket, {name})
            scored.append((name, subbasket, confidence_value, lift_value))

    scored.sort(key=lambda entry: entry[2], reverse=True)

    # Keep the highest-confidence entry for each recommended product.
    best_per_product = []
    seen = set()
    for entry in scored:
        if entry[0] in seen:
            continue
        seen.add(entry[0])
        best_per_product.append(entry)
    return best_per_product

In [104]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie confidence i lift

def generate_advanced_candidates(basket: set[str], products: list[str], supports) -> list[tuple[str, set[str], float, float]]:
    """Re-rank the basic candidates by the product of confidence and lift.

    Returns:
        The tuples ``(item, subbasket, confidence, lift)`` produced by
        ``generate_basic_candidates``, sorted by ``confidence * lift``,
        highest first.
    """
    ranked = generate_basic_candidates(basket, products, supports)
    ranked.sort(key=lambda entry: entry[2] * entry[3], reverse=True)
    return ranked

In [105]:
# Show one basket from the dataset and the top three recommendations of each algorithm.
print(baskets[1])
print(generate_basic_candidates(baskets[1], products, supports)[:3])
print(generate_advanced_candidates(baskets[1], products, supports)[:3])

{'yogurt', 'semi-finished bread', 'whole milk', 'sausage'}
[('rolls/buns', {'sausage', 'whole milk'}, 0.12686567164179105, 1.1532752398396837), ('soda', {'sausage', 'whole milk'}, 0.11940298507462688, 1.2296124333596987), ('bottled beer', {'sausage'}, 0.05537098560354374, 1.2220000849348451)]
[('soda', {'sausage', 'whole milk'}, 0.11940298507462688, 1.2296124333596987), ('rolls/buns', {'sausage', 'whole milk'}, 0.12686567164179105, 1.1532752398396837), ('curd', {'sausage'}, 0.04872646733111849, 1.4466153386419167)]


In [106]:
# Same comparison for another basket from the dataset.
print(baskets[33])
print(generate_basic_candidates(baskets[33], products, supports)[:3])
print(generate_advanced_candidates(baskets[33], products, supports)[:3])

{'white wine', 'root vegetables', 'domestic eggs', 'soda', 'tropical fruit', 'photo/film', 'yogurt'}
[('sausage', {'yogurt'}, 0.0669260700389105, 1.1089864739670185), ('citrus fruit', {'yogurt'}, 0.053696498054474705, 1.0106423904265471), ('shopping bags', {'root vegetables'}, 0.04803073967339097, 1.0093875810856026)]
[('sausage', {'yogurt'}, 0.0669260700389105, 1.1089864739670185), ('citrus fruit', {'yogurt'}, 0.053696498054474705, 1.0106423904265471), ('shopping bags', {'root vegetables'}, 0.04803073967339097, 1.0093875810856026)]


In [107]:
# Same comparison for a hand-written two-product basket.
basket = {'brown bread', 'butter'}
print(generate_basic_candidates(basket, products, supports)[:3])
print(generate_advanced_candidates(basket, products, supports)[:3])

[('canned beer', {'brown bread'}, 0.06394316163410303, 1.3629366489046775), ('citrus fruit', {'butter'}, 0.05502846299810246, 1.0357118136359837), ('pastry', {'brown bread'}, 0.05328596802841919, 1.0301265369628376)]
[('canned beer', {'brown bread'}, 0.06394316163410303, 1.3629366489046775), ('citrus fruit', {'butter'}, 0.05502846299810246, 1.0357118136359837), ('pastry', {'brown bread'}, 0.05328596802841919, 1.0301265369628376)]
