In [4]:
#importing libraries

import os
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import OneHotEncoder

In [5]:
# setting directory
# The working directory defaults to the original project folder, but it can be
# overridden through the BUAN6383_DIR environment variable so the notebook is
# not tied to one machine's absolute path.
os.chdir(os.environ.get('BUAN6383_DIR', r'C:\buan6383'))

In [6]:
# reading data

# Build the relative path with os.path.join so the separator is correct on any
# operating system (a literal backslash is only a path separator on Windows).
data = pd.read_csv(os.path.join('data', 'transactions.csv'))

In [7]:
# One-hot encode the Product column: pd.get_dummies replaces it with one
# indicator column per distinct product value (named Product_<value>).

encoded_columns = ['Product']
data = pd.get_dummies(data=data, columns=encoded_columns)
data.head()

Unnamed: 0,Transaction,Product_Bow,Product_Candy Bar,Product_Deodorant,Product_Greeting Cards,Product_Magazine,Product_Markers,Product_Pain Reliever,Product_Pencils,Product_Pens,Product_Perfume,Product_Photo Processing,Product_Prescription Med,Product_Shampoo,Product_Soap,Product_Toothbrush,Product_Toothpaste,Product_Wrapping Paper
0,12359,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,12362,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,12362,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
3,12365,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,12371,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
# Size of the one-hot encoded frame as (rows, columns)
data.shape

(459258, 18)

In [9]:
# Collapse the item-level rows into one row per transaction: a product column is
# True for a transaction if any of that transaction's rows had the product.
# GroupBy.any() is pandas' built-in reduction; passing Python's builtin `any`
# to .agg() instead runs a Python-level call per group, which is what made
# this cell take a while to run.

data = data.groupby('Transaction').any()
data.shape

(200000, 17)

In [10]:
# Strip the 'Product_' prefix that get_dummies added, leaving bare product names.
# Columns without the prefix are not in the mapping and keep their names.

prefix = 'Product_'
short_names = {col: col[len(prefix):] for col in data.columns if col.startswith(prefix)}
data = data.rename(columns=short_names)

In [11]:
# Preview the per-transaction frame after the column rename
data.head()

Unnamed: 0_level_0,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
12359,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12362,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
12365,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
12371,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12380,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [12]:
# Writing data to a csv file
# index=True writes the Transaction index as the first column of the file
data.to_csv('group05transactions01.csv', index=True)

In [13]:
# Re-check the frame's dimensions as (rows, columns)
data.shape

(200000, 17)

In [14]:
# Copying data to trax
# trax is a separate copy used for the rule mining below; data itself is left as exported
trax = data.copy()

In [15]:
# Show the first 9 transactions of the working copy
trax.head(9)

Unnamed: 0_level_0,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
12359,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12362,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
12365,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
12371,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
12380,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
12383,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
12386,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
12392,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False
12401,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False


In [16]:
# Mine the frequent itemsets with the apriori algorithm.
# Only itemsets present in at least 1% of the transactions are kept, and
# use_colnames=True reports each itemset by its product (column) names.

min_support_threshold = 0.01
fi = apriori(trax, min_support=min_support_threshold, use_colnames=True)
fi

Unnamed: 0,support,itemsets
0,0.054645,(Bow)
1,0.171005,(Candy Bar)
2,0.146885,(Greeting Cards)
3,0.241305,(Magazine)
4,0.0267,(Pain Reliever)
5,0.134925,(Pencils)
6,0.143575,(Pens)
7,0.08996,(Perfume)
8,0.05848,(Photo Processing)
9,0.014505,(Prescription Med)


### 40 itemsets are frequent, and 15 of these are singletons

### We have 15 singletons, which means 2 of the 17 products appear in less than 1% of the transactions

### 1% of the 200,000 transactions is 2,000 transactions

In [17]:
# Derive association rules from the frequent itemsets, keeping only the rules
# whose confidence is at least 10%

min_confidence = 0.1
rules = association_rules(fi, metric="confidence", min_threshold=min_confidence)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
2,(Greeting Cards),(Candy Bar),0.146885,0.171005,0.04366,0.297239,1.738191,0.018542,1.179626,0.49781
3,(Candy Bar),(Greeting Cards),0.171005,0.146885,0.04366,0.255314,1.738191,0.018542,1.145604,0.512294
4,(Candy Bar),(Magazine),0.171005,0.241305,0.040535,0.23704,0.982325,-0.000729,0.99441,-0.021244
5,(Magazine),(Candy Bar),0.241305,0.171005,0.040535,0.167982,0.982325,-0.000729,0.996367,-0.023167
6,(Candy Bar),(Pencils),0.171005,0.134925,0.033015,0.193065,1.430903,0.009942,1.07205,0.36326
7,(Pencils),(Candy Bar),0.134925,0.171005,0.033015,0.244691,1.430903,0.009942,1.097558,0.348109
8,(Toothpaste),(Candy Bar),0.160425,0.171005,0.03978,0.247966,1.450053,0.012347,1.102338,0.369675
9,(Candy Bar),(Toothpaste),0.171005,0.160425,0.03978,0.232625,1.450053,0.012347,1.094087,0.374393


### We find 50 rules with a minimum confidence of 10% 


In [18]:
# Rank the rules from strongest to weakest association: order by lift, and
# break ties on lift by confidence (both descending).

# Highest lift is 3.60


rules2 = rules.sort_values(by=['lift', 'confidence'], ascending=False)
rules2

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
27,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
26,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
31,(Greeting Cards),"(Candy Bar, Magazine)",0.146885,0.040535,0.016665,0.113456,2.798966,0.010711,1.082253,0.753386
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Greeting Cards, Magazine)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
49,"(Toothpaste, Magazine)",(Greeting Cards),0.031665,0.146885,0.011945,0.37723,2.568202,0.007294,1.369873,0.63059
39,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124




The rule {Perfume} --> {Toothbrush} has the highest lift of 3.601

Since the lift is symmetric {Toothbrush} --> {Perfume} has the same lift



A : Antecedent, C : Consequent

Lift is calculated using the following formula: 

Support (A & C) / (Support (A) * Support (C))


Another way to calculate it is: Confidence (Rule) / Support (C)



For {Perfume} --> {Toothbrush} we calculate lift this way: 

0.021820 / (0.089960 * 0.067350) = 3.601370

## Leverage

Leverage is another measure of independence, it is a measure of deviation from independence. 

It is calculated using the following formula:
    
Leverage = Support(A & C) - (Support(A) * Support(C))

A leverage of 0 implies A and C are statistically independent

Greater than 0 implies Positive Correlation

Less than 0 implies Negative Correlation


Calculation: 
    
    0.021820 - (0.067350 * 0.089960) = 0.015761
    
    
This suggests a slight positive relationship between perfume and toothbrush purchases.

## Conviction

Conviction is another measure of dependency, it checks the dependence of consequent on the antecedent. 

A conviction of 1 implies independence. 

Formula: 
    
    (1 - Support(C)) / (1 - Confidence(A --> C)) = Conviction
    
Calculation: 
    
    The conviction for rule 27 :
    (1- .067350)/(1-.242552)=1.231306


    The conviction for rule 26:
    (1- .08996)/(1-.323979)=1.346172


In [19]:
# The five rules with the highest confidence, strongest first
rules.nlargest(5, 'confidence')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Greeting Cards, Magazine)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
39,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
32,"(Greeting Cards, Toothpaste)",(Candy Bar),0.03208,0.171005,0.013175,0.410692,2.401637,0.007689,1.406726,0.60296


Some insights we can gather from these results are:

- The same 4 items appear several times in the top 5 rules, sometimes simply switching up what is considered an antecedent versus a consequent.
- 5/5 of these rules include a Candy Bar. Candy Bars appear in 17% of the transactions. 
- Magazines, Greeting Cards and Toothpaste each appear in 3/5 rules. 
- The most frequently bought items in the store are Magazines (24%), Candy Bars (17%), Toothpaste (16%), and Greeting Cards (15%), in that order. 
- While their frequency could be alarming, we can notice that the confidence of each rule (range: 0.41-0.46) is usually higher than the support of the consequents. So, these rules are likely helping us make correct associations and increase the chance of purchasing the consequents to ~40%. 
- Lift is >1 in all rules – hence, we can assume they all have a positive correlation. 
- Conviction ranges from 1.4 to 1.55. A conviction above 1 means a rule's prediction fails less often than it would under independence: if the antecedents and consequents were actually independent, these rules would be wrong about 1.4-1.55 times as often as they are in the data. 


In [20]:
# The five rules with the highest lift, strongest first.
# A rule and its mirror (A -> C and C -> A) share the same lift, so both can appear.

rules.nlargest(5, 'lift')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
27,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
26,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
0,(Bow),(Toothbrush),0.054645,0.06735,0.01134,0.207521,3.081236,0.00766,1.176877,0.714499
1,(Toothbrush),(Bow),0.06735,0.054645,0.01134,0.168374,3.081236,0.00766,1.136755,0.724232
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879


Some insights we can gather from these results are listed below. Because lift is symmetric, a rule and its mirror (e.g. {Perfume} --> {Toothbrush} and {Toothbrush} --> {Perfume}) are counted only once, so the five distinct rules considered are #27, #0, #30, #42 and #29 from the lift-sorted table above:
- Candy Bars appear in 3/5 rules, twice as the consequent. 
- Toothbrushes appear in 2/5 rules. 
- Magazines appear in 2/5 rules. 
- Candy Bars and Magazines are some of the most frequently sold items at the store. 
- Lift is >1 in all rules – hence, we can assume they all have a positive correlation.  
- Confidence is higher than the consequent support – this means that anytime the antecedents are purchased, the chance of the consequent getting purchased increases. 
- This is the second time that {pencils, toothpaste} --> {candy bar} makes an appearance. 
- The rule {greeting cards, magazines} --> {candy bar} also made an appearance in the last sort.

In [21]:
# Rank the rules by leverage, highest first.
# Ten rows are shown rather than five because mirrored rules (A -> C and
# C -> A) have identical leverage and appear as adjacent pairs here.

rules_sortc = rules.sort_values(by='leverage', ascending=False)
rules_sortc.head(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(Greeting Cards),(Candy Bar),0.146885,0.171005,0.04366,0.297239,1.738191,0.018542,1.179626,0.49781
3,(Candy Bar),(Greeting Cards),0.171005,0.146885,0.04366,0.255314,1.738191,0.018542,1.145604,0.512294
26,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
27,(Perfume),(Toothbrush),0.08996,0.06735,0.02182,0.242552,3.60137,0.015761,1.231306,0.793732
8,(Toothpaste),(Candy Bar),0.160425,0.171005,0.03978,0.247966,1.450053,0.012347,1.102338,0.369675
9,(Candy Bar),(Toothpaste),0.171005,0.160425,0.03978,0.232625,1.450053,0.012347,1.094087,0.374393
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
31,(Greeting Cards),"(Candy Bar, Magazine)",0.146885,0.040535,0.016665,0.113456,2.798966,0.010711,1.082253,0.753386
29,"(Greeting Cards, Magazine)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
6,(Candy Bar),(Pencils),0.171005,0.134925,0.033015,0.193065,1.430903,0.009942,1.07205,0.36326


In [22]:
# Dropping redundant rows
# A rule and its mirror (A -> C and C -> A) have the same leverage, so only the
# first of each pair is kept. Mirrors are detected from the rule contents
# rather than by hard-coded row labels, which raise KeyError when this cell is
# run a second time and point at the wrong rows if the mined rules change.
pair_keys = pd.Series(
    [
        frozenset(pair)  # unordered {antecedents, consequents}: equal for a rule and its mirror
        for pair in zip(rules_sortc['antecedents'], rules_sortc['consequents'])
    ],
    index=rules_sortc.index,
    dtype=object,
)
rules_sortc = rules_sortc[~pair_keys.duplicated()]
rules_sortc.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(Greeting Cards),(Candy Bar),0.146885,0.171005,0.04366,0.297239,1.738191,0.018542,1.179626,0.49781
26,(Toothbrush),(Perfume),0.06735,0.08996,0.02182,0.323979,3.60137,0.015761,1.346172,0.77449
8,(Toothpaste),(Candy Bar),0.160425,0.171005,0.03978,0.247966,1.450053,0.012347,1.102338,0.369675
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
29,"(Greeting Cards, Magazine)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802


Some insights we can gather from these results are:
- Candy Bar appears in 4/5 rules. 
- Greeting Cards appear in 3/5 rules.
- It is now the third time that Rule #29 makes an appearance.
- Rule #26 has a high lift result, and is showing up for the second time in our analysis. 
- Rule #26 {Toothbrush} --> {Perfume} (the mirror of Rule #27, {Perfume} --> {Toothbrush}) is an interesting rule because it includes items we have not encountered in our top association rules yet. 
- Both of these items are less frequently purchased than the other products that consistently show up in our analysis. For instance, toothbrushes are bought about 7% of the time. But when someone has bought perfume, the chance of a toothbrush purchase more than triples, to 24%. 
- While we do not have access to the prices of these items, we must recognize that perfumes are often pricier than toothbrushes (definitely pricier than the top 4 products, like candy bars or magazines). Hence, this rule is extra appealing due to its sale of a higher-end product.
- Through a more practical lens, we noticed that toothbrushes and perfume fall within the realm of “self-care” and “hygiene.” A person who takes care of their dental hygiene enough to get a new toothbrush is likely to also care about other aspects of their hygiene, like their perceived smell. Hence, a rational hypothesis is that the customers who buy perfume and toothbrushes could also be interested in other self-care and hygiene products. 

In [23]:
# The five rules with the highest conviction, strongest first

rules.nlargest(5, 'conviction')

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
42,"(Toothpaste, Pencils)",(Candy Bar),0.02456,0.171005,0.01139,0.463762,2.71198,0.00719,1.545947,0.64716
29,"(Greeting Cards, Magazine)",(Candy Bar),0.036335,0.171005,0.016665,0.458649,2.682078,0.010452,1.531344,0.650802
39,"(Toothpaste, Magazine)",(Candy Bar),0.031665,0.171005,0.01372,0.433286,2.533762,0.008305,1.46281,0.625124
30,"(Candy Bar, Magazine)",(Greeting Cards),0.040535,0.146885,0.016665,0.411126,2.798966,0.010711,1.448723,0.669879
32,"(Greeting Cards, Toothpaste)",(Candy Bar),0.03208,0.171005,0.013175,0.410692,2.401637,0.007689,1.406726,0.60296


Some insights we can gather from these results are:
- We have seen every single one of these rules before in our analysis. In fact, these rules involve only five products (Candy Bar, Magazine, Greeting Cards, Toothpaste and Pencils), all of which are among the most frequently sold products in the store. 
- Most of these rules have Candy Bar as their consequent.
- Confidence is higher than the consequent support for every single one of these rules. Hence, the consequents have a better chance of being purchased when they are bundled with the antecedents than when they are not. 

Overall, we reviewed a little less than 20 rules. Most of these 20 rules included the same 4 products (Magazine, Candy Bar, Toothpaste, Greeting Cards) which are the ones that are most frequently purchased. Hence, it feels redundant to utilize 10+ rules trying to predict the sale of a Candy Bar or a Magazine when it is already likely that they will purchase it. 