<a href="https://colab.research.google.com/github/muajnstu/Product-Recommendation-system/blob/main/market_basket_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <p style="background-color:#F8F1E8; font-family:'Times New Roman', serif; color:#602F44; font-size:150%; text-align:center; border-radius: 15px 50px;"> 🛒 Market Basket Analysis 🛍️ </p>

In [None]:
import pandas as pd
import statsmodels.stats.api as sms
import statsmodels.stats.api as sms
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.frequent_patterns import hmine
from mlxtend.frequent_patterns import fpgrowth

In [None]:
# Load the orders table from the project's GitHub-hosted CSV (requires network access).
orders = pd.read_csv("https://media.githubusercontent.com/media/muajnstu/Product-Recommendation-system/refs/heads/main/orders.csv/orders.csv")

In [None]:
# Preview the first rows of the orders table.
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [None]:
# Label every order as "<order_dow>-<order_hour_of_day>". Vectorized string
# concatenation yields the same text as formatting each row in a Python loop,
# without iterating over the rows one by one.
orders["day_hour"] = (
    orders["order_dow"].astype(str) + "-" + orders["order_hour_of_day"].astype(str)
)

In [None]:
# Label every order as "<user_id>-<order_dow>". Vectorized string concatenation
# yields the same text as formatting each row in a Python loop, without
# iterating over the rows one by one.
orders["user_day"] = (
    orders["user_id"].astype(str) + "-" + orders["order_dow"].astype(str)
)

In [None]:
# Keep only the orders whose eval_set is "prior".
orders = orders.loc[orders["eval_set"].eq("prior")]

In [None]:
# Load the order/product line items from the project's GitHub-hosted CSV (requires network access).
order_products = pd.read_csv("https://media.githubusercontent.com/media/muajnstu/Product-Recommendation-system/refs/heads/main/order_products__prior.csv")

# Data Cleaning & Preprocessing

In [None]:
# Preview the first rows of the order/product line items.
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [None]:
# Join order-level attributes onto the purchased products. Each side is trimmed
# to the columns that are kept in the result before merging, so the join does
# not materialize columns that would be dropped immediately afterwards.
order_columns = ["order_id", "user_id", "day_hour", "user_day"]
line_item_columns = ["order_id", "product_id"]
df = pd.merge(
    orders[order_columns],
    order_products[line_item_columns],
    how="inner",
    on="order_id",
)[["order_id", "user_id", "product_id", "day_hour", "user_day"]]
df.head()

Unnamed: 0,order_id,user_id,product_id,day_hour,user_day
0,2539329,1,196,2-8,1-2
1,2539329,1,14084,2-8,1-2
2,2539329,1,12427,2-8,1-2
3,2539329,1,26088,2-8,1-2
4,2539329,1,26405,2-8,1-2


In [None]:
# t-based confidence interval for the mean number of rows per product_id.
product_stats = sms.DescrStatsW(df["product_id"].value_counts())
low_conf, up_conf = product_stats.tconfint_mean()
print(
    f"Lower Confidence Interval: {low_conf:.0f}",
    f"Upper Confidence Interval: {up_conf:.0f}",
    sep="\n",
)

Lower Confidence Interval: 611
Upper Confidence Interval: 695


In [None]:
# Count rows per product once and reuse the result for both the comparison and
# the selection; recounting a frame of this size a second time is wasted work.
product_counts = df["product_id"].value_counts()
# Ids of the products that occur more often than the lower confidence bound.
important_products = product_counts[product_counts > low_conf].index
important_products

Index([24852, 13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209, 27845,
       ...
       23495,  9532, 32420, 30278, 18102,  5386,  4933, 24920, 42221, 17070],
      dtype='int64', name='product_id', length=7248)

In [None]:
# Restrict the purchase rows to the products selected as important.
is_important_product = df["product_id"].isin(important_products)
df = df.loc[is_important_product]
df.shape

(28214831, 5)

In [None]:
# t-based confidence interval for the mean number of rows per user_id.
# NOTE: this rebinds low_conf / up_conf, which until here held the bounds
# computed from the per-product counts.
user_stats = sms.DescrStatsW(df["user_id"].value_counts())
low_conf, up_conf = user_stats.tconfint_mean()
print(
    f"Lower Confidence Interval: {low_conf:.0f}",
    f"Upper Confidence Interval: {up_conf:.0f}",
    sep="\n",
)

Lower Confidence Interval: 136
Upper Confidence Interval: 138


In [None]:
# Count rows per user once and reuse the result for both the comparison and
# the selection instead of recounting the whole frame.
user_counts = df["user_id"].value_counts()
# Ids of the users with more rows than the lower confidence bound.
# NOTE(review): no later cell visible in this notebook reads important_baskets;
# the basket matrix is built from every user in df. Confirm whether a user
# filter was intended.
important_baskets = user_counts[user_counts > low_conf].index
important_baskets

Index([201268, 129928, 186704, 182401, 137629, 176478, 164055,  79106,  60694,
        13701,
       ...
        94901,  94914,  23417, 176623,  15631, 187840, 173117, 173129,   7907,
        78709],
      dtype='int64', name='user_id', length=60532)

In [None]:
# Count order rows per (user, product) pair, pivot the products into columns,
# and mark a cell True where the pair exists (pairs that never occur come out
# of the pivot as missing values).
pair_counts = df.groupby(["user_id", "product_id"])["order_id"].count()
basket = pair_counts.unstack().notna()
basket

product_id,1,10,23,25,28,34,37,45,49,54,...,49615,49621,49628,49640,49644,49652,49655,49667,49680,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206205,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
206206,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
206207,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
206208,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False


<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:150%; text-align:left">

<h3 align="left"><font color='#4E5672'>🛠️ Apriori Algorithm </font></h3>

In [None]:
# Mine itemsets whose support over the user/product matrix is at least 0.1,
# keeping product ids (not column positions) in the result.
frequent_itemsets = apriori(
    basket,
    min_support=0.1,
    use_colnames=True,
    verbose=1,
)
frequent_itemsets.sort_values(by="support", ascending=False)

Processing 171 combinations | Sampling itemset size 3


Unnamed: 0,support,itemsets
14,0.358797,(24852)
6,0.308249,(13176)
9,0.285452,(21137)
11,0.267012,(21903)
36,0.225119,(47626)
16,0.217633,(26209)
35,0.210812,(47209)
7,0.209337,(16797)
37,0.207503,(47766)
26,0.180175,(39275)


In [None]:
# Derive association rules from the mined itemsets, keeping rules whose support
# reaches 0.01.
# NOTE(review): presumably non-binding, since the apriori cell already mined
# with min_support=0.1 — confirm the intended metric/threshold.
rules = association_rules(
    frequent_itemsets,
    metric="support",
    min_threshold=0.01,
)
# Displayed weakest lift first.
rules.sort_values("lift", ascending=True)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
19,(21903),(24852),0.267012,0.358797,0.119691,0.448262,1.249347,1.0,0.023888,1.162151,0.272285,0.236489,0.139527,0.390926
18,(24852),(21903),0.358797,0.267012,0.119691,0.33359,1.249347,1.0,0.023888,1.099906,0.311261,0.236489,0.090832,0.390926
10,(21137),(24852),0.285452,0.358797,0.131306,0.459992,1.282038,1.0,0.028886,1.187395,0.307876,0.255985,0.15782,0.412976
11,(24852),(21137),0.358797,0.285452,0.131306,0.365961,1.282038,1.0,0.028886,1.126977,0.343093,0.255985,0.112671,0.412976
27,(24852),(26209),0.358797,0.217633,0.112598,0.313822,1.441975,1.0,0.034512,1.14018,0.478018,0.242757,0.122946,0.415599
26,(26209),(24852),0.217633,0.358797,0.112598,0.517377,1.441975,1.0,0.034512,1.328578,0.391769,0.242757,0.247316,0.415599
6,(24852),(16797),0.358797,0.209337,0.113341,0.315891,1.509003,1.0,0.038231,1.155755,0.52606,0.249213,0.134765,0.428658
7,(16797),(24852),0.209337,0.358797,0.113341,0.541426,1.509003,1.0,0.038231,1.398254,0.426618,0.249213,0.284823,0.428658
29,(24852),(47626),0.358797,0.225119,0.123757,0.344921,1.532173,1.0,0.042985,1.182882,0.541688,0.268943,0.154607,0.44733
28,(47626),(24852),0.225119,0.358797,0.123757,0.549739,1.532173,1.0,0.042985,1.42407,0.448239,0.268943,0.297787,0.44733


<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:150%; text-align:left">

<h3 align="left"><font color='#4E5672'>🛠️ Association Rules Algorithm </font></h3>
    



In [None]:
# Draw one rule reproducibly (fixed random_state) and take the first item of
# its antecedent as the example product.
sampled_rule = rules.sample(n=1, random_state=45)
random_product = sampled_rule["antecedents"].explode().iloc[0]
random_product

21137

In [None]:
# Hand-picked product ids. 26209 is listed as "Limes" in the recommendation
# table further down.
# NOTE(review): 24852 is presumably a banana product — confirm against products.csv.
lime, banana = 26209, 24852


In [None]:
def arl_recommender(rules_df, id, rec=1):
    """Recommend consequent items for a product from association rules.

    Rules are visited from highest to lowest ``lift``. For every rule whose
    antecedent contains ``id``, the rule's consequent items are appended to
    the result in that order, skipping items that are already present.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Rules table with ``antecedents``, ``consequents`` and ``lift``
        columns; the first two hold iterables of items.
    id : object
        Item to look for among the antecedents. The name shadows the ``id``
        builtin but is kept so existing keyword callers continue to work.
    rec : int, default 1
        Number of recommendations to return (applied as a ``[0:rec]`` slice).

    Returns
    -------
    list
        At most ``rec`` distinct consequent items, highest-lift rules first.
        Empty when no antecedent contains ``id``.
    """
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    # Walk both columns in lockstep instead of materializing a row with a
    # positional lookup for every match, and without rebinding a loop variable
    # that an enclosing loop is still using.
    for antecedent, consequent in zip(
        sorted_rules["antecedents"], sorted_rules["consequents"]
    ):
        # Compare with == so matching follows the same rule as before.
        if not any(item == id for item in antecedent):
            continue
        for candidate in consequent:
            if candidate not in recommendation_list:
                recommendation_list.append(candidate)

    return recommendation_list[0:rec]

In [None]:
# Up to five recommendations for the sampled product.
arl_recommender(rules,random_product,5)

[39275, 47209, 21903, 13176, 26209]

In [None]:
# Load the product table (ids and names) from the project's GitHub-hosted CSV
# (requires network access), then preview it. names_of_products reads this
# module-level frame to translate ids into names.
products = pd.read_csv("https://raw.githubusercontent.com/muajnstu/Product-Recommendation-system/refs/heads/main/products.csv")
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [None]:
def names_of_products(rules_df, bought, recommend=5):
    """Return the recommendations for ``bought`` together with product names.

    Recommendations come from ``arl_recommender``; names are read from the
    module-level ``products`` frame. The name of the bought product is
    printed as a side effect.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Association rules passed through to ``arl_recommender``.
    bought : object
        Product id the recommendations are based on.
    recommend : int, default 5
        Number of recommendations requested.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation with ``product_id`` and ``product_name``.

    Raises
    ------
    IndexError
        If ``bought`` or a recommended id has no row in ``products``.
    """
    recommended_ids = arl_recommender(rules_df, bought, recommend)

    def first_name_for(product_id):
        # Name from the first row of `products` carrying this id.
        matches = products.loc[products["product_id"] == product_id, "product_name"]
        return matches.iloc[0]

    bought_name = first_name_for(bought)
    name_of_rec = {pid: first_name_for(pid) for pid in recommended_ids}
    recommend_df = pd.DataFrame(
        name_of_rec.items(), columns=["product_id", "product_name"]
    )
    print(f"Bought: {bought_name}\n")
    return recommend_df

In [None]:
# The same recommendations for the sampled product, shown with product names.
names_of_products(rules,random_product,5)

Bought: Organic Strawberries



Unnamed: 0,product_id,product_name
0,39275,Organic Blueberries
1,47209,Organic Hass Avocado
2,21903,Organic Baby Spinach
3,13176,Bag of Organic Bananas
4,26209,Limes


<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:150%; text-align:left">

<h3 align="left"><font color='#4E5672'>🛠️ H-Mine Algorithm</font></h3>
    



In [None]:
from mlxtend.frequent_patterns import hmine

In [None]:
# Mine frequent itemsets with H-Mine at a 0.05 support threshold. The result
# gets its own name so it does not overwrite `frequent_itemsets`, which the
# association-rules cell reads; with a shared name, re-running that cell after
# this one would silently switch its input.
frequent_itemsets_hmine = hmine(basket, min_support=0.05, use_colnames=True)
frequent_itemsets_hmine.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
269,0.358797,(24852)
62,0.308249,(13176)
145,0.285452,(21137)
205,0.267012,(21903)
432,0.225119,(47626)
...,...,...
58,0.050179,"(13176, 10749)"
413,0.05016,"(45007, 44359)"
97,0.050131,"(13176, 37646)"
17,0.050077,"(4920, 47626)"


<div style="border-radius:10px; border:#4E5672 solid; padding: 15px; background-color: #F8F1E8; font-size:150%; text-align:left">

<h3 align="left"><font color='#4E5672'>🛠️ FP-Growth Algorithm </font></h3>
    

In [None]:
# Mine frequent itemsets with FP-Growth at a 0.05 support threshold. The result
# gets its own name so it does not overwrite `frequent_itemsets`, which the
# association-rules cell reads; with a shared name, re-running that cell after
# this one would silently switch its input.
frequent_itemsets_fpgrowth = fpgrowth(basket, min_support=0.05, use_colnames=True)
frequent_itemsets_fpgrowth.sort_values("support", ascending=False)

Unnamed: 0,support,itemsets
4,0.358797,(24852)
0,0.308249,(13176)
18,0.285452,(21137)
19,0.267012,(21903)
57,0.225119,(47626)
...,...,...
436,0.050179,"(13176, 10749)"
341,0.050160,"(44359, 45007)"
174,0.050131,"(13176, 37646)"
283,0.050077,"(4920, 47626)"


In [None]:
%pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.14.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading statsmodels-0.14.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patsy, statsmodels
Successfully installed patsy-1.0.1 statsmodels-0.14.5


In [None]:
%pip install mlxtend

