In [35]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [150]:
from efficient_apriori import apriori

In [215]:
# Raw groceries purchase log, fetched straight from GitHub.
GROCERIES_CSV_URL = 'https://raw.githubusercontent.com/overtunned/AppliedPredicitiveAnalysis/main/Case%20study%20dataset/Groceries_dataset.csv'

# Each row is a single purchased item (Member_number, Date, itemDescription).
trans = pd.read_csv(GROCERIES_CSV_URL)
trans.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [216]:
# Count the missing entries in every column (all zeros => nothing to impute).
trans.isna().sum()

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [217]:
# Distinct product names found in the purchase log.
all_products = pd.unique(trans['itemDescription'])
print("Total products: {}".format(all_products.size))

Total products: 167


In [218]:
def ditribution_plot(x,y,name=None,xaxis=None,yaxis=None):
    """Show an interactive Plotly bar chart of ``y`` against ``x``.

    Parameters
    ----------
    x : values passed to ``go.Bar`` as the bar positions / labels.
    y : values passed to ``go.Bar`` as the bar heights.
    name : optional chart title.
    xaxis : optional x-axis title.
    yaxis : optional y-axis title.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Collect the three optional labels once so the layout call stays compact.
    labels = {
        "title_text": name,
        "xaxis_title": xaxis,
        "yaxis_title": yaxis,
    }

    fig = go.Figure()
    fig.add_trace(go.Bar(x=x, y=y))
    fig.update_layout(**labels)
    fig.show()

In [219]:
# Ten most frequently purchased products, most popular first.
x = (
    trans['itemDescription']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

ditribution_plot(x=x.index, y=x.values, xaxis="Products", yaxis="Count")

In [220]:
# One-hot encode the purchased item: one 0/1 indicator column per product.
#
# The guard keeps this cell re-runnable: once `itemDescription` has been
# replaced by the indicator columns there is nothing left to encode, and the
# original in-place drop would raise a KeyError on a second execution.
if 'itemDescription' in trans.columns:
    # Pin the dtype so the indicators are 0/1 integers on every pandas version
    # (uint8 was the default before pandas 2.0; newer versions default to bool).
    one_hot = pd.get_dummies(trans['itemDescription'], dtype='uint8')
    # Build a new frame rather than mutating `trans` in place.
    trans = trans.drop(columns='itemDescription').join(one_hot)
trans.head()

Unnamed: 0,Member_number,Date,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,1808,21-07-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2552,05-01-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2300,19-09-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1187,12-12-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3037,01-02-2015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [222]:
# Collapse the item-level rows into one basket per (member, date) pair by
# summing the per-product indicator columns.
#
# `.sum()` is used instead of `.apply(sum)`: passing the Python builtin relied
# on pandas silently substituting its own reduction, which is deprecated in
# recent pandas releases (and is slower than the native group-wise sum).
# The grouping keys are dropped so only the product columns remain, in the
# same order as `all_products`.
records = (
    trans.groupby(["Member_number", "Date"])[list(all_products)]
    .sum()
    .reset_index(drop=True)
)

In [223]:
## Replacing non-zero values with product names
def get_Pnames(x, products=None):
    """Return a copy of basket row ``x`` with positive counts replaced by names.

    Parameters
    ----------
    x : pandas.Series
        One basket row, indexed by product name, holding purchase counts.
    products : iterable of str, optional
        Product labels to process. Defaults to the notebook-level
        ``all_products``.

    Returns
    -------
    pandas.Series
        Object-dtype copy of ``x`` in which every entry greater than zero is
        replaced by its product name; zero entries are left as they are.
    """
    if products is None:
        products = all_products

    # Work on an object-dtype copy: writing strings into an integer Series is
    # deprecated upcasting in recent pandas, and the caller's row stays intact.
    labelled = x.astype(object)
    for product in products:
        value = labelled[product]
        # Entries that already hold a name are skipped so the function can be
        # applied again to labelled rows without comparing str to int.
        if not isinstance(value, str) and value > 0:
            labelled[product] = product
    return labelled

# Label every basket: get_Pnames receives one row (one basket) at a time.
records = records.apply(get_Pnames, axis="columns")
records.head()

Unnamed: 0,tropical fruit,whole milk,pip fruit,other vegetables,rolls/buns,pot plants,citrus fruit,beef,frankfurter,chicken,...,flower (seeds),rice,tea,salad dressing,specialty vegetables,pudding powder,ready soups,make up remover,toilet cleaner,preservation products
0,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,whole milk,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [224]:
## Removing zeros
# Turn each basket row into the list of its non-zero entries and keep only the
# rows that still contain at least one entry.
transactions = []
for row in records.values:
    purchased = row[row != 0].tolist()
    if purchased:
        transactions.append(purchased)
x = transactions  # same list, kept under the name the original cell assigned

In [225]:
# Imported under an explicit alias because a later cell rebinds the bare name
# `apriori` to mlxtend's function, which takes a DataFrame rather than a list
# of transactions; with the alias this cell still works when re-run after it.
from efficient_apriori import apriori as efficient_apriori

# Mine itemsets of up to 4 products and derive rules from them, using the
# support / confidence thresholds below.
itemsets, rules = efficient_apriori(
    transactions,
    min_support=0.0005,
    min_confidence=0.05,
    max_length=4,
)
temsets = itemsets  # misspelled original name, kept so existing references still resolve

In [226]:
# Keep only rules shaped {A, B} -> {C} and list them from lowest to highest lift.
rules_rhs = (
    candidate
    for candidate in rules
    if len(candidate.lhs) == 2 and len(candidate.rhs) == 1
)


def _by_lift(candidate):
    """Sort key: the rule's lift."""
    return candidate.lift


for rule in sorted(rules_rhs, key=_by_lift):
    # Printing a rule shows it together with its confidence, support and lift.
    print(rule)
    print("="*100)

{rolls/buns, yogurt} -> {other vegetables} (conf: 0.077, supp: 0.001, lift: 0.630, conv: 0.951)
{pip fruit, whole milk} -> {other vegetables} (conf: 0.081, supp: 0.001, lift: 0.662, conv: 0.955)
{rolls/buns, root vegetables} -> {whole milk} (conf: 0.105, supp: 0.001, lift: 0.663, conv: 0.941)
{other vegetables, rolls/buns} -> {yogurt} (conf: 0.057, supp: 0.001, lift: 0.663, conv: 0.969)
{other vegetables, yogurt} -> {rolls/buns} (conf: 0.074, supp: 0.001, lift: 0.676, conv: 0.962)
{other vegetables, shopping bags} -> {whole milk} (conf: 0.108, supp: 0.001, lift: 0.685, conv: 0.944)
{other vegetables, pip fruit} -> {whole milk} (conf: 0.108, supp: 0.001, lift: 0.685, conv: 0.944)
{bottled water, whole milk} -> {other vegetables} (conf: 0.084, supp: 0.001, lift: 0.689, conv: 0.959)
{shopping bags, whole milk} -> {other vegetables} (conf: 0.084, supp: 0.001, lift: 0.690, conv: 0.959)
{bottled water, other vegetables} -> {whole milk} (conf: 0.110, supp: 0.001, lift: 0.695, conv: 0.946)
{ro

In [227]:
from mlxtend.preprocessing import TransactionEncoder

In [228]:
# Encode the transaction lists as a boolean matrix: one row per basket,
# one column per product label learned by the encoder.
te = TransactionEncoder()
te.fit(transactions)
te_ary = te.transform(transactions)
df = pd.DataFrame(data=te_ary, columns=te.columns_)
df

Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,bags,baking powder,bathroom cleaner,beef,berries,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14958,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,True,False
14959,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14960,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
14961,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [234]:
# NOTE: this import rebinds the name `apriori`, which an earlier cell imported
# from efficient_apriori; from here on `apriori` refers to mlxtend's version.
from mlxtend.frequent_patterns import apriori

# Frequent itemsets over the boolean basket matrix, reported with product
# names instead of column indices.
frequent_itemsets = apriori(df, use_colnames=True, min_support=0.0005)

In [235]:
# Record how many products each frequent itemset contains.
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.004010,(Instant food products),1
1,0.021386,(UHT-milk),1
2,0.001470,(abrasive cleaner),1
3,0.001938,(artif. sweetener),1
4,0.008087,(baking powder),1
...,...,...,...
1442,0.000601,"(shopping bags, whole milk, soda)",3
1443,0.000535,"(shopping bags, whole milk, yogurt)",3
1444,0.000668,"(whole milk, tropical fruit, soda)",3
1445,0.000936,"(yogurt, whole milk, soda)",3
