In [1]:
from google.cloud import bigquery
from datetime import datetime, timedelta

import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
from datetime import datetime
import datetime
from tqdm import tqdm
from matplotlib.patches import Rectangle
import warnings

warnings.filterwarnings("ignore")

%load_ext google.cloud.bigquery
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
from scipy.special import comb
from itertools import combinations, permutations


<IPython.core.display.Javascript object>

Referência: https://www.computersciencemaster.com.br/como-funciona-o-algoritmo-apriori/

In [3]:
df_mining = pd.DataFrame(
    [
        ["onion", "potato", "burguer", "false", "false"],
        ["false", "potato", "burguer", "milk", "false"],
        ["false", "false", "false", "milk", "beer"],
        ["onion", "potato", "false", "milk", "false"],
        ["onion", "potato", "burguer", "false", "beer"],
        ["onion", "potato", "burguer", "milk", "beer"],
    ]
)

<IPython.core.display.Javascript object>

In [4]:
minimum_support = 0.5
df_values = df_mining.values.astype(str)
n_trans = len(df_mining)
n_minimum_support = minimum_support * n_trans

index, counts = np.unique(df_values, return_counts=True)
df_item = pd.DataFrame(zip(index, counts), columns=["product", "frequency"])
df_item.sort_values(by="frequency", ascending=False, inplace=True)
df_item.reset_index(drop=True, inplace=True)
df_item = df_item.query("product != 'false'")
df_item

Unnamed: 0,product,frequency
1,potato,5
2,burguer,4
3,milk,4
4,onion,4
5,beer,3


<IPython.core.display.Javascript object>

In [5]:
n_minimum_support = minimum_support * n_trans
df_item_frequent = df_item[df_item["frequency"] > n_minimum_support]
df_item_frequent["supp"] = df_item_frequent["frequency"] / n_trans
df_item_frequent

Unnamed: 0,product,frequency,supp
1,potato,5,0.833333
2,burguer,4,0.666667
3,milk,4,0.666667
4,onion,4,0.666667


<IPython.core.display.Javascript object>

In [6]:
itemset_frequency = []
for i in tqdm(np.arange(len(df_item_frequent), 0, -1)):
    comb = list(combinations(df_item_frequent["product"].values, i))
    for w in comb:
        count = 0
        for instancia in df_values:
            if all(elem in instancia for elem in w):
                count = count + 1
        if count >= n_minimum_support:
            itemset_frequency.append({"itemset": w, "frequency": count})
df_itemset_frequency = pd.DataFrame(itemset_frequency)
df_itemset_frequency.sort_values(by="frequency", inplace=True, ascending=False)
df_item_frequent.set_index("product", inplace=True)
df_itemset_frequency["supp"] = df_itemset_frequency["frequency"] / n_trans
df_itemset_frequency

100%|██████████| 4/4 [00:00<00:00, 4513.64it/s]


Unnamed: 0,itemset,frequency,supp
5,"(potato,)",5,0.833333
1,"(potato, burguer)",4,0.666667
3,"(potato, onion)",4,0.666667
6,"(burguer,)",4,0.666667
7,"(milk,)",4,0.666667
8,"(onion,)",4,0.666667
0,"(potato, burguer, onion)",3,0.5
2,"(potato, milk)",3,0.5
4,"(burguer, onion)",3,0.5


<IPython.core.display.Javascript object>

# Support

The support of an item set is the proportion of transactions in the database over the total number of transactions.

In [8]:
df_itemset_frequency["supp"] = df_itemset_frequency["frequency"] / n_trans

<IPython.core.display.Javascript object>

In [9]:
df_itemset_frequency

Unnamed: 0,itemset,frequency,supp
5,"(potato,)",5,0.833333
1,"(potato, burguer)",4,0.666667
3,"(potato, onion)",4,0.666667
6,"(burguer,)",4,0.666667
7,"(milk,)",4,0.666667
8,"(onion,)",4,0.666667
0,"(potato, burguer, onion)",3,0.5
2,"(potato, milk)",3,0.5
4,"(burguer, onion)",3,0.5


<IPython.core.display.Javascript object>

In [10]:
df_item_frequent

Unnamed: 0_level_0,frequency,supp
product,Unnamed: 1_level_1,Unnamed: 2_level_1
potato,5,0.833333
burguer,4,0.666667
milk,4,0.666667
onion,4,0.666667


<IPython.core.display.Javascript object>

# Confidence 

The confidence of a rule is the likelihood of purchasing item B when item A is purchased. 

Example:

Confidence of buying burger given that potato was bought

In [11]:
%%latex
\begin{equation}
Conf({burguer}=>{potato}) = \frac{p({burguer}\cap{potato})}{p({potato})}= \frac{0.66}{0.83} = 79\%
\end{equation}

<IPython.core.display.Latex object>

<IPython.core.display.Javascript object>

That is, 80% of the people who buy potatoes also buy burgers.

# Lift

Lift also calculates the likelihood of purchasing an item relative to another object. However, this measure considers the popularity of both.

Example:

Lift of buying burger given that potato was bought

In [12]:
%%latex
\begin{equation}
Lift({burguer}=>{potato}) = \frac{p({burguer}\cap{potato})}{p({burguer})*p({potato})}= \frac{0.66}{0.66*0.83} = 1.19
\end{equation}

<IPython.core.display.Latex object>

<IPython.core.display.Javascript object>

Based on this value, we check:

If Lift(X=>Y) > 1, then set Y is likely to be bought when X is bought.


If Lift(X=>Y) ≤1, then Y is NOT likely to be bought if X is bought.

In [13]:
prob = []
for item, supp in tqdm(df_itemset_frequency[["itemset", "supp"]].values):
    s = round(supp * 100, 2)
    for conjunto, supp_c in df_itemset_frequency[["itemset", "supp"]].values:

        conf = round((supp_c / supp) * 100, 2)
        lift = round(
            supp_c / np.prod([df_item_frequent.loc[i]["supp"] for i in conjunto]), 2
        )
        conj = set(conjunto) - set(item)
        if set(item).intersection(set(conjunto)) == set(item):
            if len(conj) > 0 and conf > 0:
                prob.append([conj, item, conf, lift])
df = pd.DataFrame(prob, columns=["B", "A", "confidence", "lift"])
df

100%|██████████| 9/9 [00:00<00:00, 311.29it/s]


Unnamed: 0,B,A,confidence,lift
0,{burguer},"(potato,)",80.0,1.2
1,{onion},"(potato,)",80.0,1.2
2,"{onion, burguer}","(potato,)",60.0,1.35
3,{milk},"(potato,)",60.0,0.9
4,{onion},"(potato, burguer)",75.0,1.35
5,{burguer},"(potato, onion)",75.0,1.35
6,{potato},"(burguer,)",100.0,1.2
7,"{onion, potato}","(burguer,)",75.0,1.35
8,{onion},"(burguer,)",75.0,1.12
9,{potato},"(milk,)",75.0,0.9


<IPython.core.display.Javascript object>