# Lab 1 - basket analysis

## Lab preparation

 * Download and extract: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
 * [optional] Create a Python virtual environment
 `python3 -m venv ./recsyslab1`
 * Install the required libraries:
 `pip install more-itertools`

In [1]:
# !pip install more-itertools

## Part 1 - data preparation/preprocessing

In [2]:
# import all needed packages

from more_itertools import powerset

In [3]:
# defining constants

# path of the extracted basket CSV file that read_baskets() will open
PATH = './basket.csv'
# support threshold passed to get_supports(): only itemsets whose support is
# strictly greater than this value are stored and extended further
EPSILON = 0.0001

In [4]:
# reading basket data

def read_baskets(path: str) -> list[set[str]]:
    """Read the basket CSV at ``path`` into a list of product sets.

    The first line of the file is treated as a header and skipped. Every
    following non-blank line becomes one basket: the line is split on
    commas, each cell is stripped of surrounding whitespace and lower-cased,
    and empty cells are dropped.

    Args:
        path: location of the CSV file.

    Returns:
        One set of product names per data row, in file order. An empty file
        (or a file holding only the header) yields an empty list.
    """
    baskets = []
    # explicit encoding so the result does not depend on the platform default
    with open(path, encoding='utf-8') as f:
        next(f, None)  # skip the header row; harmless when the file is empty
        for line in f:
            if not line.strip():
                continue  # blank line, not a transaction
            cells = (cell.strip() for cell in line.split(','))
            baskets.append({cell.lower() for cell in cells if cell})
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product that occurs in ``baskets``, sorted ascending.

    Each basket is iterated once; duplicates across (and within) baskets are
    collapsed. An empty ``baskets`` list gives an empty result.
    """
    distinct = {product for basket in baskets for product in basket}
    ordered = list(distinct)
    ordered.sort()
    return ordered

# list of sets containing products from baskets (one set per data row of the CSV)
baskets = read_baskets(PATH)

# list of all available products (distinct names, sorted)
products = unique_products(baskets)

## Part 2 - Support, confidence and lift

In [5]:
# computing a data structure (dictionary or graph) with interesting `support` values

def get_supports(baskets: list[set[str]], all_products: list[str], epsilon: float) -> dict[tuple[str, ...], float]:
    """Compute the support of every itemset whose support exceeds ``epsilon``.

    The support of an itemset is the fraction of ``baskets`` that contain all
    of its products. Itemsets are grown one product at a time; an itemset is
    extended further only when its own support is strictly greater than
    ``epsilon``, because adding products can never increase the number of
    baskets that contain the itemset.

    Each itemset is generated exactly once: a product may only be added when
    it appears later in ``all_products`` than the product added before it.
    This yields the same table as trying every insertion order, without
    recomputing each k-itemset k! times.

    Args:
        baskets: the transactions; each basket is a collection of products.
        all_products: the candidate products used to build itemsets.
        epsilon: strict lower bound on the support of stored itemsets.

    Returns:
        A dict mapping ``tuple(sorted(itemset))`` to its support. The dict is
        empty when ``baskets`` is empty.
    """
    supports: dict[tuple[str, ...], float] = {}
    len_all_baskets = len(baskets)
    if len_all_baskets == 0:
        # no transactions: nothing has support, and the ratio below would
        # otherwise divide by zero
        return supports

    candidates = list(all_products)  # indexable copy, also accepts any iterable

    def grow(itemset: set, first_candidate: int, containing: list) -> None:
        """Extend ``itemset`` with candidates from index ``first_candidate`` on.

        ``containing`` holds exactly the baskets that include every product
        of ``itemset``.
        """
        for index in range(first_candidate, len(candidates)):
            product = candidates[index]
            if product in itemset:
                continue  # only possible when all_products has duplicates
            # every basket in `containing` already includes `itemset`, so
            # checking the newly added product alone is sufficient
            narrowed = [b for b in containing if product in b]
            s = len(narrowed) / len_all_baskets
            if s > epsilon:
                extended = itemset | {product}
                supports[tuple(sorted(extended))] = s
                grow(extended, index + 1, narrowed)

    grow(set(), 0, baskets)

    return supports
    
# build the support table once for the whole data set; the lookups in the
# following cells read from this dictionary
supports = get_supports(baskets, products, EPSILON)
# supports

In [6]:
# defining functions to compute support, confidence and lift

def support(supports, products: set) -> float:
    """Look up the stored support of the itemset ``products``.

    The itemset is normalised to a sorted tuple, which is the key format of
    the ``supports`` table. Itemsets missing from the table yield 0.
    """
    key = tuple(sorted(products))
    return supports.get(key, 0)

def confidence(supports, prior_products: set, following_products: set) -> float:
    """Confidence of the rule ``prior_products -> following_products``.

    Computed from the ``supports`` table as
    support(prior union following) / support(prior). Returns 0 when the
    support of ``prior_products`` is not greater than zero (for example when
    the itemset is absent from the table).
    """
    antecedent_key = tuple(sorted(prior_products))
    joint_key = tuple(sorted(prior_products | following_products))

    antecedent_support = support(supports, antecedent_key)
    joint_support = support(supports, joint_key)

    if not antecedent_support > 0:
        return 0
    return joint_support / antecedent_support
    
def lift(supports, prior_products: set, following_products: set) -> float:
    """Lift of the rule ``prior_products -> following_products``.

    Computed from the ``supports`` table as
    support(prior union following) / (support(prior) * support(following)).
    Returns 0 unless both individual supports are greater than zero.
    """
    joint_key = tuple(sorted(prior_products | following_products))
    antecedent_key = tuple(sorted(prior_products))
    consequent_key = tuple(sorted(following_products))

    antecedent_support = support(supports, antecedent_key)
    consequent_support = support(supports, consequent_key)
    joint_support = support(supports, joint_key)

    if antecedent_support > 0 and consequent_support > 0:
        return joint_support / (antecedent_support * consequent_support)
    return 0

In [7]:
# example: support of {whole milk, rolls/buns}, then confidence and lift of
# the rule {whole milk, rolls/buns} -> {yogurt}
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448512


## Part 3 - Generating recommendations

In [11]:
# list of potential recommendations
# products with lift > 1 and high confidence (sorted by confidence)

def generate_next_product_candidates(basket: set, products: list, supports) -> list[tuple[str, set[str], float, float]]:
    """List products worth recommending as the next addition to ``basket``.

    Every subset of ``basket`` is tried as the left-hand side of a rule
    ``subset -> {product}`` for every product that is not yet in the basket.
    A rule is kept only when its lift is greater than 1.

    Args:
        basket: products already chosen.
        products: all products that may be recommended.
        supports: support table used by ``confidence`` and ``lift``.

    Returns:
        Tuples ``(product, subset, confidence, lift)`` sorted by confidence,
        highest first. Rules with equal confidence keep their enumeration
        order because the sort is stable.
    """
    # a product already in the basket is never recommended; filter once here
    # instead of re-checking it for every subset
    candidates = [product for product in products if product not in basket]

    result = []
    for subbasket in powerset(basket):
        subbasket = set(subbasket)
        for product in candidates:
            lft = lift(supports, subbasket, {product})
            if lft > 1:
                # confidence is only needed for rules that pass the lift filter
                cnfdnc = confidence(supports, subbasket, {product})
                result.append((product, subbasket, cnfdnc, lft))

    return sorted(result, key=lambda rule: rule[2], reverse=True)

In [12]:
# show one basket, then the first ten candidates generated for it
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)[:10]

{'sausage', 'semi-finished bread', 'whole milk', 'yogurt'}


[('rolls/buns',
  {'sausage', 'whole milk', 'yogurt'},
  0.2272727272727273,
  2.066027836076439),
 ('pork',
  {'sausage', 'whole milk', 'yogurt'},
  0.18181818181818185,
  4.901883701883703),
 ('soda',
  {'sausage', 'whole milk', 'yogurt'},
  0.18181818181818185,
  1.8723643871613596),
 ('other vegetables',
  {'sausage', 'whole milk', 'yogurt'},
  0.13636363636363635,
  1.1168084788774444),
 ('rolls/buns',
  {'sausage', 'whole milk'},
  0.12686567164179105,
  1.1532752398396837),
 ('domestic eggs',
  {'semi-finished bread', 'whole milk'},
  0.12,
  3.2352432432432434),
 ('rolls/buns',
  {'whole milk', 'yogurt'},
  0.11976047904191618,
  1.0886853267947703),
 ('soda', {'sausage', 'whole milk'}, 0.11940298507462688, 1.2296124333596987),
 ('soda', {'sausage', 'yogurt'}, 0.11627906976744186, 1.1974423406264507),
 ('soda', {'sausage'}, 0.09856035437430785, 1.0149749363405152)]

In [13]:
# same check for a larger basket: print it, then its first ten candidates
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)[:10]

{'yogurt', 'soda', 'photo/film', 'root vegetables', 'tropical fruit', 'domestic eggs', 'white wine'}


[('whole milk', {'photo/film', 'yogurt'}, 0.5, 3.166102412187897),
 ('onions',
  {'root vegetables', 'tropical fruit', 'yogurt'},
  0.42857142857142855,
  21.164073550212162),
 ('pastry',
  {'root vegetables', 'soda', 'tropical fruit'},
  0.4,
  7.732816537467701),
 ('frozen vegetables',
  {'white wine', 'yogurt'},
  0.37499999999999994,
  13.391706443914078),
 ('bottled beer',
  {'tropical fruit', 'white wine'},
  0.3333333333333333,
  7.356440511307767),
 ('whole milk',
  {'root vegetables', 'tropical fruit', 'yogurt'},
  0.28571428571428575,
  1.809201378393084),
 ('whole milk',
  {'tropical fruit', 'white wine'},
  0.22222222222222224,
  1.4071566276390655),
 ('beef', {'soda', 'white wine'}, 0.21428571428571427, 6.311726659167604),
 ('shopping bags',
  {'soda', 'white wine'},
  0.21428571428571427,
  4.503310593900482),
 ('whole milk',
  {'domestic eggs', 'soda'},
  0.1842105263157895,
  1.1664587834376465)]