In [2]:
import numpy as np
import pandas as pd

In [3]:
transaction_dict = {"t1" : ["i1", "i2", "i3"], 
                    "t2" : ["i2", "i3", "i4"], 
                    "t3" : ["i4", "i5"], 
                    "t4" : ["i1", "i2", "i4"],
                    "t5" : ["i1", "i2", "i3", "i5"], 
                    "t6" : ["i1", "i2", "i3", "i4"]}

In [4]:
transactions = pd.DataFrame(list(transaction_dict.items()), columns=["transactions", "items"])
transactions

Unnamed: 0,transactions,items
0,t1,"[i1, i2, i3]"
1,t2,"[i2, i3, i4]"
2,t3,"[i4, i5]"
3,t4,"[i1, i2, i4]"
4,t5,"[i1, i2, i3, i5]"
5,t6,"[i1, i2, i3, i4]"


In [5]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   transactions  6 non-null      object
 1   items         6 non-null      object
dtypes: object(2)
memory usage: 224.0+ bytes


In [9]:
dataset = transactions["items"].tolist()
dataset

[['i1', 'i2', 'i3'],
 ['i2', 'i3', 'i4'],
 ['i4', 'i5'],
 ['i1', 'i2', 'i4'],
 ['i1', 'i2', 'i3', 'i5'],
 ['i1', 'i2', 'i3', 'i4']]

## ARM with FP Growth

In [19]:
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.preprocessing import TransactionEncoder

In [20]:
te = TransactionEncoder()
te_ary = te.fit(dataset)
transformed_te_ary = te_ary.transform(dataset)
type(transformed_te_ary), transformed_te_ary.shape

(numpy.ndarray, (6, 5))

In [21]:
df = pd.DataFrame(transformed_te_ary, columns=te.columns_)
df.head()

Unnamed: 0,i1,i2,i3,i4,i5
0,True,True,True,False,False
1,False,True,True,True,False
2,False,False,False,True,True
3,True,True,False,True,False
4,True,True,True,False,True


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   i1      6 non-null      bool 
 1   i2      6 non-null      bool 
 2   i3      6 non-null      bool 
 3   i4      6 non-null      bool 
 4   i5      6 non-null      bool 
dtypes: bool(5)
memory usage: 158.0 bytes


In [23]:
frequent_itemsets = fpgrowth(df, min_support=0.5, use_colnames=True)
type(frequent_itemsets)

pandas.core.frame.DataFrame

In [24]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.833333,(i2)
1,0.666667,(i3)
2,0.666667,(i1)
3,0.666667,(i4)
4,0.666667,"(i3, i2)"
5,0.666667,"(i2, i1)"
6,0.5,"(i3, i1)"
7,0.5,"(i3, i2, i1)"
8,0.5,"(i4, i2)"


In [25]:
frequent_itemsets = frequent_itemsets.sort_values(by="support", ascending=False)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.833333,(i2)
1,0.666667,(i3)
2,0.666667,(i1)
3,0.666667,(i4)
4,0.666667,"(i3, i2)"
5,0.666667,"(i2, i1)"
6,0.5,"(i3, i1)"
7,0.5,"(i3, i2, i1)"
8,0.5,"(i4, i2)"


## ARM with apriori

## Support

It tell us about how popular an itemset is, as measured by the proportion of transactions in which an itemset appears. It is measured as follows:

For Movie Recommendation, we calculate it as:
$$
\begin{equation*}
support(M) = \frac{\text{number of user watchlists containing M}}{\text{total number of user watchlists}}
\end{equation*}
$$

wheras for Market Basket Optimization, we calculate it as:
$$
\begin{equation*}
support(I) = \frac{\text{number of transactions containing I}}{\text{total number of transactions}}
\end{equation*}
$$

## Confidence

It tell us about how likely item B is purchased when item A is purchased, expressed as {A -> B}. It is measured as follows:

For Movie Recommendation, we calculate it as:
$$
\begin{equation*}
confidence(M_1\rightarrow{M_2}) = \frac{\text{number of user watchlists containing $M_1$ and $M_2$}}{\text{number of user watchlists containing $M_1$}}
\end{equation*}
$$

wheras for Market Basket Optimization, we calculate it as:
$$
\begin{equation*}
confidence(I_1\rightarrow{I_2}) = \frac{\text{number of transactions containing $I_1$ and $I_2$}}{\text{number of transactions containing $I_1$}}
\end{equation*}
$$

## Lift

It tell us about how likely the item B is purchased when the item A is purchased while controlling for how popular item B is. It is measured as follows:

For Movie Recommendation, we calculate it as:
$$
\begin{equation*}
lift(M_1\rightarrow{M_2}) = \frac{Confidence(M_1\rightarrow{M_2})}{Support(M_2)}
\end{equation*}
$$

wheras for Market Basket Optimization, we calculate it as:
$$
\begin{equation*}
lift(I_1\rightarrow{I_2}) = \frac{Confidence(I_1\rightarrow{I_2})}{Support(I_2)}
\end{equation*}
$$

## Apriori

Apriori algorithm consist of:

1. Step 1: Set a minimum support and confidence.
2. Step 2: Take all the subsets in transactions having higher support than minimum support.
3. Step 3: Take all the rules of these subsets having higher confidence than minimum confidence.
4. Step 4: Sort the rules by decreasing lift.

We will be using `apriori` function from `apyori` package to implement the apriori algorithm. It return all the different association measures (or the rules) such support, confidence and lift.

In [26]:
df

Unnamed: 0,i1,i2,i3,i4,i5
0,True,True,True,False,False
1,False,True,True,True,False
2,False,False,False,True,True
3,True,True,False,True,False
4,True,True,True,False,True
5,True,True,True,True,False


In [37]:
from mlxtend.frequent_patterns import apriori

result = apriori(df, min_support=0.5, use_colnames=True, max_len=3)
result

Unnamed: 0,support,itemsets
0,0.666667,(i1)
1,0.833333,(i2)
2,0.666667,(i3)
3,0.666667,(i4)
4,0.666667,"(i2, i1)"
5,0.5,"(i3, i1)"
6,0.666667,"(i3, i2)"
7,0.5,"(i4, i2)"
8,0.5,"(i3, i2, i1)"


In [38]:
frequent_itemsets = result.sort_values(by="support", ascending=False)
frequent_itemsets

Unnamed: 0,support,itemsets
1,0.833333,(i2)
0,0.666667,(i1)
2,0.666667,(i3)
3,0.666667,(i4)
4,0.666667,"(i2, i1)"
6,0.666667,"(i3, i2)"
5,0.5,"(i3, i1)"
7,0.5,"(i4, i2)"
8,0.5,"(i3, i2, i1)"


### Apriori using apyori

In [10]:
dataset

[['i1', 'i2', 'i3'],
 ['i2', 'i3', 'i4'],
 ['i4', 'i5'],
 ['i1', 'i2', 'i4'],
 ['i1', 'i2', 'i3', 'i5'],
 ['i1', 'i2', 'i3', 'i4']]

In [40]:
from apyori import apriori

In [49]:
rules = apriori(transactions=dataset, min_support = 0.00030, min_confidence=0.0001, min_lift=0.0001, min_length=0, max_length=5)
results = list(rules)

In [50]:
results

[RelationRecord(items=frozenset({'i1'}), support=0.6666666666666666, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'i1'}), confidence=0.6666666666666666, lift=1.0)]),
 RelationRecord(items=frozenset({'i2'}), support=0.8333333333333334, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'i2'}), confidence=0.8333333333333334, lift=1.0)]),
 RelationRecord(items=frozenset({'i3'}), support=0.6666666666666666, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'i3'}), confidence=0.6666666666666666, lift=1.0)]),
 RelationRecord(items=frozenset({'i4'}), support=0.6666666666666666, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'i4'}), confidence=0.6666666666666666, lift=1.0)]),
 RelationRecord(items=frozenset({'i5'}), support=0.3333333333333333, ordered_statistics=[OrderedStatistic(items_base=frozenset(), items_add=frozenset({'i5'}), confidence=0.3333333333333333

In [19]:
def inspect(results):
    '''
    function to put the result in well organised pandas dataframe
    '''
    lhs         = [tuple(result[2][0][0])[0] for result in results]
    rhs         = [tuple(result[2][0][1])[0] for result in results]
    supports    = [result[1] for result in results]
    confidences = [result[2][0][2] for result in results]
    lifts       = [result[2][0][3] for result in results]
    return list(zip(lhs, rhs, supports, confidences, lifts))

resultsinDataFrame = pd.DataFrame(inspect(results), columns = ['Item #1', 'Item #2', 'Support', 'Confidence', 'Lift'])
resultsinDataFrame.head()

IndexError: tuple index out of range