# Lab  1 - basket analysis

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

In [1]:
# !pip install more-itertools

## Part 1 - data preparation/preprocessing

In [2]:
# import all needed packages

from more_itertools import powerset

In [3]:
# defining constants

PATH = './basket.csv'
EPSILON = 0.00005

In [4]:
# reading basket data

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

# list of sets containing products from baskets
baskets = read_baskets(PATH)

# list of all available products
products = unique_products(baskets)

## Part 2 - Support, confidence and lift

In [5]:
# computing a data structure (dictionary or graph) with interesting `support` values

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    supports = {}
    len_all_baskets = len(baskets)
    
    def rek(basket : set, baskets : list[set]):
        nonlocal supports
        for product in all_products:
            if product in basket : continue
            current_A = basket | {product}
            containing_baskets = [b for b in baskets if (current_A.issubset(b))]
            s = len(containing_baskets)/len_all_baskets
            if s > epsilon:
                supports[tuple(sorted(current_A))] = s
                rek(current_A,containing_baskets)
    rek(set(),baskets)
    
    return supports
    
supports = get_supports(baskets, products, EPSILON)
supports

{('abrasive cleaner',): 0.0014702933903628951,
 ('abrasive cleaner', 'beef'): 0.00013366303548753594,
 ('abrasive cleaner', 'beef', 'frozen vegetables'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'frozen vegetables',
  'uht-milk'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'pork'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'salty snack'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'salty snack',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'pork',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'salty snack'): 6.683151774376797e-05,
 ('abrasive cleaner',
  'beef',
  'pasta',
  'salty snack',
  'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'pasta', 'shopping bags'): 6.683151774376797e-05,
 ('abrasive cleaner', 'beef', 'p

In [6]:
# defining functions to compute support, confidence and lift

def support(supports, products: set) -> float:
    products = tuple(sorted(products))
    return supports[products] if products in supports else 0 

def confidence(supports, prior_products: set, following_products: set) -> float:
    new_basket = tuple(sorted(prior_products | following_products))
    prior_products = tuple(sorted(prior_products))
    support_A = support(supports, prior_products)
    support_AB = support(supports,new_basket)
    return support_AB/support_A if support_A > 0 else 0 
    
def lift(supports, prior_products: set, following_products: set) -> float:
    new_basket = tuple(sorted(prior_products | following_products))
    prior_products = tuple(sorted(prior_products))
    following_products = tuple(sorted(following_products))
    support_A = support(supports, prior_products)
    support_B = support(supports, following_products)
    support_AB = support(supports, new_basket)
    return support_AB/(support_A*support_B) if support_A > 0 and support_B > 0 else 0 

In [7]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Part 3 - Generating recommendations

In [8]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket: set, products: list, supports) -> list[tuple[str, tuple[str], float, float]]:
    result = []
    for subbasket in powerset(basket):
        subbasket = set(subbasket)
        for product in products:
            if product in subbasket:continue
            cnfdnc = confidence(supports, subbasket, {product})
            lft = lift(supports, subbasket, {product})
            if lft > 1:
                result.append((product,subbasket,cnfdnc,lft))
    
    return sorted(result,key= lambda x: x[2],reverse = True)

In [9]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'whole milk', 'yogurt', 'semi-finished bread', 'sausage'}


[('whole milk',
  {'sausage', 'semi-finished bread', 'yogurt'},
  1.0,
  6.332204824375794),
 ('sausage',
  {'semi-finished bread', 'whole milk', 'yogurt'},
  0.6666666666666667,
  11.046880767811002),
 ('yogurt',
  {'sausage', 'semi-finished bread', 'whole milk'},
  0.5,
  5.822178988326848),
 ('other vegetables',
  {'sausage', 'semi-finished bread', 'yogurt'},
  0.5,
  4.09496442255063),
 ('other vegetables',
  {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
  0.5,
  4.09496442255063),
 ('whole milk',
  {'sausage', 'semi-finished bread'},
  0.4444444444444445,
  2.814313255278131),
 ('other vegetables',
  {'semi-finished bread', 'whole milk', 'yogurt'},
  0.33333333333333337,
  2.72997628170042),
 ('whole milk', {'sausage', 'yogurt'}, 0.2558139534883721, 1.6198663504217146),
 ('beverages',
  {'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  15.08366935483871),
 ('bottled beer',
  {'sausage', 'semi-finished bread', 'whole milk'},
  0.25,
  5.517330383480826),
 ('

In [10]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'photo/film', 'white wine', 'root vegetables', 'yogurt', 'tropical fruit', 'soda', 'domestic eggs'}


[('domestic eggs',
  {'photo/film', 'root vegetables', 'white wine'},
  1.0,
  26.96036036036036),
 ('soda',
  {'photo/film', 'root vegetables', 'white wine'},
  1.0,
  10.298004129387474),
 ('tropical fruit',
  {'photo/film', 'root vegetables', 'white wine'},
  1.0,
  14.756410256410257),
 ('yogurt',
  {'photo/film', 'root vegetables', 'white wine'},
  1.0,
  11.644357976653696),
 ('domestic eggs',
  {'photo/film', 'white wine', 'yogurt'},
  1.0,
  26.96036036036036),
 ('root vegetables',
  {'photo/film', 'white wine', 'yogurt'},
  1.0,
  14.37367915465898),
 ('soda', {'photo/film', 'white wine', 'yogurt'}, 1.0, 10.298004129387474),
 ('tropical fruit',
  {'photo/film', 'white wine', 'yogurt'},
  1.0,
  14.756410256410257),
 ('domestic eggs',
  {'photo/film', 'tropical fruit', 'white wine'},
  1.0,
  26.96036036036036),
 ('root vegetables',
  {'photo/film', 'tropical fruit', 'white wine'},
  1.0,
  14.37367915465898),
 ('soda',
  {'photo/film', 'tropical fruit', 'white wine'},
  1.0,
 