# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [228]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset, unique_everseen


In [229]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.0000001 #10^-7
# EPSILON = 0.001


In [230]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

In [231]:
baskets

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'},
 {'butter', 'whole milk'},
 {'frozen vegetables', 'other vegetables'},
 {'sugar', 'tropical fruit'},
 {'butter milk', 'specialty chocolate'},
 {'dental care', 'frozen meals'},
 {'rolls/buns'},
 {'detergent', 'root vegetables'},
 {'rolls/buns', 'sausage'},
 {'cling film/bags', 'dish cleaner'},
 {'canned beer', 'frozen fish'},
 {'pip fruit', 'tropical fruit', 'whole milk'},
 {'pastry', 'root vegetables', 'whole milk'},
 {'chocolate', 'red/blush wine', 'rolls/buns'},
 {'other vegetables', 'shopping bags'},
 {'chocolate', 'packaged fruit/vegetables', 'rolls/buns', 'whole milk'},
 {'hygiene articles', 'other vegetables'},
 

## Część 2. - obliczanie wskaźników

In [232]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`

def get_valid_baskets(baskets: list[tuple[str]], current_product:str):
    valid_baskets = []
    for basket in baskets:
        if current_product in basket:
            valid_baskets.append(basket)
    return valid_baskets


def calc_subset(supports: dict, baskets: list[tuple[str]], baskets_number: int, products: list[str], epsilon: float, ind: int, subset: str = ""):
    current_product = products[ind]
    valid_baskets = get_valid_baskets(baskets, current_product)
    support = len(valid_baskets)/baskets_number
    if support > epsilon:
        new_subset = ''.join([subset, current_product])
        supports[new_subset] = support
        for i in range(ind+1, len(products)):
            calc_subset(supports, valid_baskets, baskets_number, products,
                        epsilon, i, new_subset)

    

def get_supports(baskets: list[tuple[str]], products: list[str], epsilon: float):
    supports = {}
    products_sorted = sorted(products)
    baskets_number = len(baskets)
    for i in range(len(products_sorted)):
        calc_subset(supports, baskets, baskets_number, products_sorted,
                    epsilon, i)
    return supports


supports = get_supports(baskets, products, EPSILON)
supports

{'abrasive cleaner': 0.0014702933903628951,
 'abrasive cleanerbeef': 0.00013366303548753594,
 'abrasive cleanerbeeffrozen vegetables': 6.683151774376797e-05,
 'abrasive cleanerbeeffrozen vegetablesuht-milk': 6.683151774376797e-05,
 'abrasive cleanerbeefpasta': 6.683151774376797e-05,
 'abrasive cleanerbeefpastapork': 6.683151774376797e-05,
 'abrasive cleanerbeefpastaporksalty snack': 6.683151774376797e-05,
 'abrasive cleanerbeefpastaporksalty snackshopping bags': 6.683151774376797e-05,
 'abrasive cleanerbeefpastaporkshopping bags': 6.683151774376797e-05,
 'abrasive cleanerbeefpastasalty snack': 6.683151774376797e-05,
 'abrasive cleanerbeefpastasalty snackshopping bags': 6.683151774376797e-05,
 'abrasive cleanerbeefpastashopping bags': 6.683151774376797e-05,
 'abrasive cleanerbeefpork': 6.683151774376797e-05,
 'abrasive cleanerbeefporksalty snack': 6.683151774376797e-05,
 'abrasive cleanerbeefporksalty snackshopping bags': 6.683151774376797e-05,
 'abrasive cleanerbeefporkshopping bags': 

In [248]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: tuple[str]) -> float:
    products_sorted=sorted(products)
    index = ''.join(products_sorted)
    if index in supports:
        return supports[index]
    return 0
    

def confidence(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    products = prior_products.union(following_products)
    supp_A = support(supports, prior_products)
    if supp_A != 0:
        return support(supports, products)/supp_A
    return 0
    
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    products = prior_products.union(following_products)
    supp_A = support(supports, prior_products)
    supp_B = support(supports, following_products)
    if supp_A != 0 and supp_B != 0:
        return support(supports, products)/(supp_A*supp_B)
    return 0


In [249]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Część 3. - generowanie rekomendacji

In [270]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence
def generate_next_product_candidates(basket: tuple[str], products: list[str], supports) -> list[tuple[str, tuple[str], float, float]]:
    # return [(item, subbasket, confidence, lift)]
    candidates=[]
    subsets = powerset(basket)
    for subset in subsets:
        if len(subset)>0:
            for product in products:
                if product not in basket:
                    lift_v = lift(supports, set(subset), {product})
                    confidence_v = confidence(supports,set(subset),{product})
                    if lift_v > 1:
                        candidates.append(
                            (product, set(subset), confidence_v, lift_v))
    return sorted(candidates, key=lambda x: x[2], reverse=True)



In [271]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)


{'semi-finished bread', 'whole milk', 'sausage', 'yogurt'}


[('other vegetables',
  {'other vegetables', 'sausage', 'semi-finished bread', 'yogurt'},
  0.5,
  4.09496442255063),
 ('other vegetables',
  {'other vegetables',
   'sausage',
   'semi-finished bread',
   'whole milk',
   'yogurt'},
  0.5,
  4.09496442255063),
 ('other vegetables',
  {'other vegetables', 'semi-finished bread', 'whole milk', 'yogurt'},
  0.33333333333333337,
  2.72997628170042),
 ('beverages',
  {'beverages', 'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  15.08366935483871),
 ('bottled beer',
  {'bottled beer', 'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  5.517330383480826),
 ('chocolate',
  {'chocolate', 'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  10.597025495750708),
 ('napkins',
  {'napkins', 'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  11.301359516616314),
 ('other vegetables',
  {'other vegetables', 'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  2.047482211275315),
 ('rum',
  {'rum', 'sausage', 'se

In [268]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'photo/film', 'soda', 'tropical fruit', 'yogurt', 'root vegetables', 'domestic eggs', 'white wine'}


[('domestic eggs',
  {'domestic eggs', 'photo/film', 'soda', 'tropical fruit'},
  1.0,
  26.96036036036036),
 ('root vegetables',
  {'photo/film', 'root vegetables', 'soda', 'tropical fruit'},
  1.0,
  14.37367915465898),
 ('white wine',
  {'photo/film', 'soda', 'tropical fruit', 'white wine'},
  1.0,
  85.50285714285714),
 ('yogurt',
  {'photo/film', 'soda', 'tropical fruit', 'yogurt'},
  1.0,
  11.644357976653696),
 ('domestic eggs',
  {'domestic eggs', 'photo/film', 'root vegetables', 'soda'},
  1.0,
  26.96036036036036),
 ('tropical fruit',
  {'photo/film', 'root vegetables', 'soda', 'tropical fruit'},
  1.0,
  14.756410256410257),
 ('white wine',
  {'photo/film', 'root vegetables', 'soda', 'white wine'},
  1.0,
  85.50285714285714),
 ('yogurt',
  {'photo/film', 'root vegetables', 'soda', 'yogurt'},
  1.0,
  11.644357976653696),
 ('domestic eggs',
  {'domestic eggs', 'photo/film', 'soda', 'white wine'},
  1.0,
  26.96036036036036),
 ('root vegetables',
  {'photo/film', 'root vegeta