# Market basket analysis(MBA) for predicting products bought together

In [1]:
import kagglehub
import shutil
import os
import pandas as pd
from faker import Faker
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download latest version
path = kagglehub.dataset_download("psparks/instacart-market-basket-analysis")

# Move it to local project folder
destination = "./data/instacart"
os.makedirs(destination, exist_ok=True)
shutil.move(path, destination)

Downloading from https://www.kaggle.com/api/v1/datasets/download/psparks/instacart-market-basket-analysis?dataset_version_number=1...


100%|████████████████████████████████████████| 197M/197M [00:06<00:00, 32.8MB/s]

Extracting files...





'./data/instacart/1'

In [3]:
#Get the 100 most ordered products id
ordered_products = pd.read_csv('data/instacart/order_products__prior.csv')
top_ids = ordered_products['product_id'].value_counts().head(1000).index
top_ids

Index([24852, 13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845,
       ...
        4945, 13966,  7746, 35199, 45190, 18023, 20378, 43014,  3339, 27020],
      dtype='int64', name='product_id', length=1000)

In [4]:
all_products = pd.read_csv('data/instacart/products.csv')

# select products that are most ordered
instacart_products = all_products[all_products['product_id'].isin(top_ids)]

# 2. Create mock products using instacart product IDs
fake = Faker()
mock_products = []
for idx, row in instacart_products.iterrows():
    mock_products.append({
        "product_id": int(row['product_id']),  # KEEP the real Instacart ID
        "name": fake.catch_phrase(),     # YOUR fake name
        "description": fake.text(), 
        "price": round(random.uniform(5.0, 100.0), 2),
        "category": fake.word(ext_word_list=['Electronics', 'Home', 'Apparel']),
        "image_url": f"https://picsum.photos/seed/{row['product_id']}/400/600"
    })

In [5]:
instacart_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
33,34,Peanut Butter Cereal,121,14
44,45,European Cucumber,83,4
195,196,Soda,77,7
247,248,Dried Sweetened Cranberries,117,19
259,260,Cantaloupe,24,4


In [6]:
products_df = pd.DataFrame(mock_products)

print(products_df.head())

   product_id                                               name  \
0          34                      Inverse grid-enabled function   
1          45  Ameliorated contextually-based process improve...   
2         196             Enterprise-wide discrete knowledgebase   
3         248                     Adaptive optimal system engine   
4         260         Face-to-face fresh-thinking infrastructure   

                                         description  price     category  \
0  Energy billion nearly admit fund ten. Oil stop...  49.18      Apparel   
1  Explain often field probably hour sometimes. V...  72.27  Electronics   
2  Ten technology finally worker reason reveal be...  50.35  Electronics   
3  Position money ago.\nBy model hour ability. Gr...  84.61      Apparel   
4  Fast quite industry candidate without evidence...  46.71  Electronics   

                                image_url  
0   https://picsum.photos/seed/34/400/600  
1   https://picsum.photos/seed/45/400/600  
2 

17515025

In [21]:
#Filter orders to only contain products in mock product_data
order_filter = ordered_products[ordered_products['product_id'].isin(top_ids)]
len(order_filter)

# 1. Get a list of unique order IDs and sample them
unique_orders = order_filter['order_id'].unique()

# Sampling 500k orders
sampled_orders = pd.Series(unique_orders).sample(n=500000, random_state=42)

# 2. Filter the dataframe to only include these orders
subset_df = order_filter[order_filter['order_id'].isin(sampled_orders)]

#Filter orders to only contain orders with more than 1 product
order_counts = subset_df.groupby('order_id').size()
valid_orders = order_counts[order_counts > 1].index

## Filtering is linked to the main df, adding .copy() makes it a df of itself, not linked to the bigger dataframe
order_df = subset_df[subset_df.duplicated('order_id', keep=False)].copy()

In [35]:
#Verifying the Sample

print(order_filter['product_id'].value_counts(normalize=True).head(5))
print(order_df['product_id'].value_counts(normalize=True).head(5))

product_id
24852    0.026981
13176    0.021664
21137    0.015112
21903    0.013812
47209    0.012194
Name: proportion, dtype: float64
product_id
24852    0.027045
13176    0.021542
21137    0.015117
21903    0.013978
47209    0.012266
Name: proportion, dtype: float64


In [10]:
# Create a mapping of {product_id: category}
category_map = products_df.set_index('product_id')['category']

# Use that map to fill the new column in order_df
order_df['category'] = order_df['product_id'].map(category_map)

In [12]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules

In [24]:
## 1 hot encoding
# 1. Convert to a list of lists (one list of products per order)
transactions = subset_df.groupby('order_id')['product_id'].apply(list).values.tolist()

In [25]:
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
basket_sets = pd.DataFrame(te_ary, columns=te.columns_)

In [31]:
frequent_itemsets = fpgrowth(basket_sets, min_support=0.005, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.081832,(21903)
1,0.017496,(46667)
2,0.016644,(24838)
3,0.009996,(33754)
4,0.005640,(17461)
...,...,...
380,0.007752,"(13176, 45007)"
381,0.005506,"(47209, 45007)"
382,0.007036,"(21903, 45007)"
383,0.006680,"(24852, 45007)"


In [32]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(13176),(21903),0.127828,0.081832,0.017336,0.135620,1.657295,1.0,0.006876,1.062227,0.454735,0.090140,0.058582,0.173734
1,(21903),(13176),0.081832,0.127828,0.017336,0.211849,1.657295,1.0,0.006876,1.106605,0.431955,0.090140,0.096335,0.173734
2,(24852),(21903),0.160384,0.081832,0.017428,0.108664,1.327894,1.0,0.004303,1.030103,0.294096,0.077531,0.029224,0.160819
3,(21903),(24852),0.081832,0.160384,0.017428,0.212973,1.327894,1.0,0.004303,1.066820,0.268935,0.077531,0.062634,0.160819
4,(21137),(21903),0.088480,0.081832,0.012714,0.143693,1.755957,1.0,0.005474,1.072242,0.472299,0.080674,0.067375,0.149530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,(45007),(21903),0.035680,0.081832,0.007036,0.197197,2.409782,1.0,0.004116,1.143703,0.606671,0.063688,0.125647,0.141589
194,(24852),(45007),0.160384,0.035680,0.006680,0.041650,1.167322,1.0,0.000957,1.006229,0.170719,0.035272,0.006191,0.114435
195,(45007),(24852),0.035680,0.160384,0.006680,0.187220,1.167322,1.0,0.000957,1.033017,0.148642,0.035272,0.031962,0.114435
196,(21137),(45007),0.088480,0.035680,0.005294,0.059833,1.676926,1.0,0.002137,1.025690,0.442855,0.044538,0.025046,0.104104


In [39]:
# 1. We only want 1 item predicting 1 item for a simple "Bought Together" UI
simple_rules = rules[(rules['antecedents'].apply(len) == 1) & 
                     (rules['consequents'].apply(len) == 1)].copy()

# 2. Extract the ID from the frozenset
simple_rules['product_id'] = simple_rules['antecedents'].apply(lambda x: list(x)[0])
simple_rules['recommendation_id'] = simple_rules['consequents'].apply(lambda x: list(x)[0])

# 3. Select only the columns your API needs
# Lift tells us the strength, and Confidence tells us the probability
final_recommendations = simple_rules[['product_id', 'recommendation_id', 'confidence', 'lift']]

# 4. Sort to keep only the best 3 recommendations per product
final_recommendations = (final_recommendations
                         .sort_values(['product_id', 'lift'], ascending=[True, False])
                         .groupby('product_id')
                         .head(3))

In [40]:
final_recommendations 
len(final_recommendations)

84

In [41]:
final_recommendations 

Unnamed: 0,product_id,recommendation_id,confidence,lift
77,4605,24852,0.287463,1.792339
12,4920,24852,0.300759,1.875246
101,5077,24852,0.254747,1.588354
45,5876,47209,0.240167,3.352889
43,5876,13176,0.252060,1.971865
...,...,...,...,...
65,47766,47626,0.136304,2.666457
23,47766,26209,0.126941,2.635377
10,49683,47766,0.163498,2.753320
6,49683,24852,0.328287,2.046884


In [51]:
instacart_products[instacart_products["product_id"] == 49683]

Unnamed: 0,product_id,product_name,aisle_id,department_id
49682,49683,Cucumber Kirby,83,4


In [54]:
instacart_products[instacart_products["product_id"] == 21903]

Unnamed: 0,product_id,product_name,aisle_id,department_id
21902,21903,Organic Baby Spinach,123,4
