First, install the Python package **mlxtend** by typing **pip install mlxtend** in cmd or a terminal.

Visit **http://rasbt.github.io/mlxtend/#examples** for more information

# Data Preparation

In [1]:
import pandas as pd
import matplotlib as plt
%matplotlib inline
import csv

from mlxtend.preprocessing import OnehotTransactions

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
# Load the raw purchase records from the CSV file into a DataFrame.
# The later cells read the `customer_id` and `purchased_item` columns from it.
transactions_csv = "data/transactionformat.csv"
df = pd.read_csv(transactions_csv)
df

Unnamed: 0,customer_id,purchased_item
0,1,apple
1,1,banana
2,1,carrot
3,2,banana
4,3,apple
5,4,apple
6,4,carrot
7,4,diet coke
8,5,banana
9,5,carrot


In [3]:
# Collapse the one-row-per-purchase table into one list of items per customer.
# Recipe taken from:
#   https://stackoverflow.com/questions/22219004/grouping-rows-in-list-in-pandas-groupby

# Select the purchased items, grouped by the customer who bought them ...
items_by_customer = df.groupby(['customer_id'])['purchased_item']

# ... and gather each group's items into a plain Python list.
# The result is a Series indexed by customer_id.
transactions = items_by_customer.apply(list)
transactions

customer_id
1       [apple, banana, carrot]
2                      [banana]
3                       [apple]
4    [apple, carrot, diet coke]
5              [banana, carrot]
6              [banana, carrot]
Name: purchased_item, dtype: object

In [4]:
# Convert the per-customer Series into a plain list of transactions
# (a list of item lists), dropping the customer_id index.
#
# `Series.tolist()` is used directly: the Series is one-dimensional, so the
# previous `.values.T` transpose had no effect and only obscured the intent.
dataset = transactions.tolist()
dataset

[['apple', 'banana', 'carrot'],
 ['banana'],
 ['apple'],
 ['apple', 'carrot', 'diet coke'],
 ['banana', 'carrot'],
 ['banana', 'carrot']]

In [5]:
# One-hot encode the transactions: one row per transaction, one column per item.
# NOTE(review): OnehotTransactions appears to be deprecated in favour of
# TransactionEncoder in later mlxtend releases — confirm against the installed version.
oht = OnehotTransactions()

# Fit on the transactions first, then encode the same transactions.
oht.fit(dataset)
oht_ary = oht.transform(dataset)

# NOTE: this rebinds `df`, which previously held the raw CSV rows; from here on
# `df` is the encoded table whose column labels come from the fitted encoder.
df = pd.DataFrame(data=oht_ary, columns=oht.columns_)
df

Unnamed: 0,apple,banana,carrot,diet coke
0,1,1,1,0
1,0,1,0,0
2,1,0,0,0
3,1,0,1,1
4,0,1,1,0
5,0,1,1,0


In [6]:
# Mine the frequent itemsets from the one-hot encoded table with Apriori.
frequent_itemsets = apriori(
    df,
    min_support=0.1,     # support threshold passed to apriori
    use_colnames=True,   # report itemsets by item name (the column labels)
)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.5,[apple]
1,0.666667,[banana]
2,0.666667,[carrot]
3,0.166667,[diet coke]
4,0.166667,"[apple, banana]"
5,0.333333,"[apple, carrot]"
6,0.166667,"[apple, diet coke]"
7,0.5,"[banana, carrot]"
8,0.166667,"[carrot, diet coke]"
9,0.166667,"[apple, banana, carrot]"


In [7]:
# Derive association rules from the frequent itemsets, thresholding on
# confidence. The result is displayed only, not stored.
association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.1,
)

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(diet coke),(apple),0.166667,1.0,2.0
1,(apple),(diet coke),0.5,0.333333,2.0
2,(carrot),(banana),0.666667,0.75,1.125
3,(banana),(carrot),0.666667,0.75,1.125
4,"(diet coke, carrot)",(apple),0.166667,1.0,2.0
5,"(diet coke, apple)",(carrot),0.166667,1.0,1.5
6,"(carrot, apple)",(diet coke),0.333333,0.5,3.0
7,(diet coke),"(carrot, apple)",0.166667,1.0,3.0
8,(carrot),"(diet coke, apple)",0.666667,0.25,1.5
9,(apple),"(diet coke, carrot)",0.5,0.333333,2.0


In [8]:
# Derive the rules again, this time thresholding on lift instead of
# confidence, and keep the result in `rules`.
rules = association_rules(
    frequent_itemsets,
    metric="lift",
    min_threshold=1.2,
)
rules

Unnamed: 0,antecedants,consequents,support,confidence,lift
0,(diet coke),(apple),0.166667,1.0,2.0
1,(apple),(diet coke),0.5,0.333333,2.0
2,"(diet coke, carrot)",(apple),0.166667,1.0,2.0
3,"(diet coke, apple)",(carrot),0.166667,1.0,1.5
4,"(carrot, apple)",(diet coke),0.333333,0.5,3.0
5,(diet coke),"(carrot, apple)",0.166667,1.0,3.0
6,(carrot),"(diet coke, apple)",0.666667,0.25,1.5
7,(apple),"(diet coke, carrot)",0.5,0.333333,2.0
8,(diet coke),(carrot),0.166667,1.0,1.5
9,(carrot),(diet coke),0.666667,0.25,1.5


In [9]:
# Rebuild `rules` with a confidence threshold of 0.5 (this replaces whatever
# `rules` held before).
rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.5,
)

# Add a helper column holding len() of each entry in the "antecedants" column,
# so the rules can later be filtered by antecedent size.
# NOTE(review): the column name "antecedants" follows the spelling in the output
# shown here; other mlxtend versions may spell it "antecedents" — confirm.
rules["antecedant_len"] = rules["antecedants"].apply(len)
rules

Unnamed: 0,antecedants,consequents,support,confidence,lift,antecedant_len
0,(diet coke),(apple),0.166667,1.0,2.0,1
1,(carrot),(banana),0.666667,0.75,1.125,1
2,(banana),(carrot),0.666667,0.75,1.125,1
3,"(diet coke, carrot)",(apple),0.166667,1.0,2.0,2
4,"(diet coke, apple)",(carrot),0.166667,1.0,1.5,2
5,"(carrot, apple)",(diet coke),0.333333,0.5,3.0,2
6,(diet coke),"(carrot, apple)",0.166667,1.0,3.0,1
7,(diet coke),(carrot),0.166667,1.0,1.5,1
8,(carrot),(apple),0.666667,0.5,1.0,1
9,(apple),(carrot),0.5,0.666667,1.0,1


Pandas DataFrames make it easy to filter the results further. Let's say we are only interested in rules that satisfy the following criteria:

1. at least 2 antecedents
2. a confidence > 0.75
3. a lift score > 1.1

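The antecedent length was already computed in the previous cell (the `antecedant_len` column), so we can now apply all three criteria with pandas' boolean selection syntax: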

In [10]:
# Build one boolean mask per criterion, then keep only the rules that
# satisfy all three at once.
has_multiple_antecedants = rules['antecedant_len'] >= 2
is_high_confidence = rules['confidence'] > 0.75
is_high_lift = rules['lift'] > 1.1

rules[has_multiple_antecedants & is_high_confidence & is_high_lift]

Unnamed: 0,antecedants,consequents,support,confidence,lift,antecedant_len
3,"(diet coke, carrot)",(apple),0.166667,1.0,2.0,2
4,"(diet coke, apple)",(carrot),0.166667,1.0,1.5,2
11,"(apple, banana)",(carrot),0.166667,1.0,1.5,2
