In [1]:
import pandas as pd

In [2]:
import numpy as np
import itertools
import re
import ordered_set
from ordered_set import OrderedSet

In [3]:
class itemset:
    def __init__(self, items, support):
        self.items = items
        self.support = support
        
    def to_string(self):
        return "Items:" + str(self.items) + "\n Support:" + str(self.support)


In [4]:
class Rules:
    def __init__(self, x, y):
        self.x=x
        self.y=y
    
    def support_count(self, l, df):
        ls=[]
        for se in sorted(l):
            f_df = df[se]
            if(isinstance(f_df, pd.Series)):
                ls.append(itemset(OrderedSet([se]), (sum(f_df))/len(df)))
            else:
                ls.append(itemset(se, sum(f_df.all(axis=1))/len(df)))
        return ls
        
    def confidence_x(self, df):
        
        xs=(self.x[0])
        ys=(self.y[0])
        return (self.support_count([xs | ys], df)[0].support) /(self.support_count(self.x, df)[0].support)
    
    def confidence_y(self, df):
        
        xs=(self.x[0])
        ys=(self.y[0])
        return (self.support_count([xs | ys], df)[0].support) /(self.support_count(self.y,df)[0].support)

In [5]:
class Apriori:
    
    def __init__(self, df):
        self.df = df
        
    def support_count(self, l):
        ls=[]
        for se in sorted(l):
            f_df = self.df[se]
            
            if(isinstance(f_df, pd.Series)):
                ls.append(itemset(OrderedSet([se]), sum(f_df)/len(self.df)))
            else:
                ls.append(itemset(se, (sum(f_df.all(axis=1))/len(self.df))))
        return ls
    
    def create_k_set(self,iters, k):
        ls = []
        for i in range(len(iters)):
            for j in range(len(iters) -1) :
                s1 = iters[i].items
                s2 = iters[j+1].items
                if (s1 != s2 and s1[0:k-2] == s2[0:k-2]) and (len(s1) ==  len(s2)):
                    s = s1 | s2
                    s = sorted(s)
                    s = OrderedSet(s)
                    if s not in ls:
                        ls.append(s)
        return ls
    
    def filter_minsup(self, ls, minsup):
        new_ls = []
        stop_algo = True
        for items in ls:
            if(items.support >= minsup):
                new_ls.append(items)
                if items.support != minsup:
                    stop_algo = False
        return new_ls, stop_algo
    
    def create_x(self, k2, i):     
        xs = {}
        idx =0 
        while(i> 1):
            for xa in k2:
                if i > 1:
                    xs[idx] = list(itertools.combinations(xa.items, i))
                    idx+=1
            i-=1
        return xs
    
    def create_y(self, k2, xs):
        ys = {}
        for idx, itemx in xs.items():
            temp_y = []
            all_items= set(list(itertools.chain(*itemx)))
            for yi in range(len(itemx)):
                y = OrderedSet(itemx[yi]).symmetric_difference(all_items)
                temp_y.append(y)
            ys[idx] = temp_y
        return ys
        
    def create_rules(self, k2, k):
        xs = self.create_x(k2, k)
        ys = self.create_y(k2, xs)
        
        rs = []
        for idx, item in  xs.items():
            for i in range(len(item)):
                yi = ys[idx][i]
                xi = item[i]
                rs.append(Rules([OrderedSet(xi)], [(yi)]))
        
        return rs
        
    
    def apriori(self, minssup):
        k_set=list(self.df.columns)
        k=2
        l_k_set = 0
        while True:
            ck_set = self.support_count(k_set) #should return if tk_set never has an item
            t_k_set, stop_algo = self.filter_minsup(ck_set, minssup)
            if stop_algo: break
            k_set = self.create_k_set(t_k_set, k)
            if len(k_set) == 0: break
            k+=1
            l_k_set+=1
        final_k_set = t_k_set if len(t_k_set) > 0 else k_set
        return self.create_rules(final_k_set, l_k_set)

In [6]:
data = {"customer_id": [1,1,2,2,3,3,4,4,5], "transaction_id": [1,2,3,4,5,6,7,8,9], 
        "items": [["Milk", "Bread", "Butter"], ["Bread", "Cheese"], ["Bread", "Jam"],
                  ["Milk", "Bread", "Cheese"], ["Milk", "Jam"], ["Bread", "Jam"], ["Milk", "Jam"], 
                  ["Milk", "Bread", "Jam", "Butter"], ["Milk", "Bread", "Jam"]]}
        

In [7]:
df=pd.DataFrame(data)

In [8]:
df

Unnamed: 0,customer_id,transaction_id,items
0,1,1,"[Milk, Bread, Butter]"
1,1,2,"[Bread, Cheese]"
2,2,3,"[Bread, Jam]"
3,2,4,"[Milk, Bread, Cheese]"
4,3,5,"[Milk, Jam]"
5,3,6,"[Bread, Jam]"
6,4,7,"[Milk, Jam]"
7,4,8,"[Milk, Bread, Jam, Butter]"
8,5,9,"[Milk, Bread, Jam]"


In [9]:
#Index - pass lis
df = df[['transaction_id', 'items']]

In [10]:
df

Unnamed: 0,transaction_id,items
0,1,"[Milk, Bread, Butter]"
1,2,"[Bread, Cheese]"
2,3,"[Bread, Jam]"
3,4,"[Milk, Bread, Cheese]"
4,5,"[Milk, Jam]"
5,6,"[Bread, Jam]"
6,7,"[Milk, Jam]"
7,8,"[Milk, Bread, Jam, Butter]"
8,9,"[Milk, Bread, Jam]"


In [11]:
for idx, row in df.iterrows():
    for item in row['items']:
        df.loc[idx, item] = True

In [13]:
df.fillna(False, inplace=True)

In [14]:
df.drop('items', axis=1, inplace=True)
df.set_index(['transaction_id'], inplace=True)

In [15]:
df.columns

Index(['Milk', 'Bread', 'Butter', 'Cheese', 'Jam'], dtype='object')

In [16]:
df

Unnamed: 0_level_0,Milk,Bread,Butter,Cheese,Jam
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,True,True,True,False,False
2,False,True,False,True,False
3,False,True,False,False,True
4,True,True,False,True,False
5,True,False,False,False,True
6,False,True,False,False,True
7,True,False,False,False,True
8,True,True,True,False,True
9,True,True,False,False,True


In [17]:
apriori_df = Apriori(df)

In [19]:
rules = apriori_df.apriori(2/9)

In [20]:
for r in rules:
    print(str(r.y) + " -> " + str(r.x) + str(r.confidence_y(df)))
print("=======")
for r in rules:
    print(str(r.x) + " -> " + str(r.y) + str(r.confidence_x(df)))

[OrderedSet(['Milk'])] -> [OrderedSet(['Bread', 'Butter'])]0.3333333333333333
[OrderedSet(['Butter'])] -> [OrderedSet(['Bread', 'Milk'])]1.0
[OrderedSet(['Bread'])] -> [OrderedSet(['Butter', 'Milk'])]0.2857142857142857
[OrderedSet(['Milk'])] -> [OrderedSet(['Bread', 'Jam'])]0.3333333333333333
[OrderedSet(['Jam'])] -> [OrderedSet(['Bread', 'Milk'])]0.3333333333333333
[OrderedSet(['Bread'])] -> [OrderedSet(['Jam', 'Milk'])]0.2857142857142857
[OrderedSet(['Bread', 'Butter'])] -> [OrderedSet(['Milk'])]1.0
[OrderedSet(['Bread', 'Milk'])] -> [OrderedSet(['Butter'])]0.5
[OrderedSet(['Butter', 'Milk'])] -> [OrderedSet(['Bread'])]1.0
[OrderedSet(['Bread', 'Jam'])] -> [OrderedSet(['Milk'])]0.5
[OrderedSet(['Bread', 'Milk'])] -> [OrderedSet(['Jam'])]0.5
[OrderedSet(['Jam', 'Milk'])] -> [OrderedSet(['Bread'])]0.5


In [22]:
from mlxtend.frequent_patterns import apriori

In [24]:
shop_frequent_itemsets = apriori(df, min_support=2/9, use_colnames=True)

In [25]:
shop_frequent_itemsets

Unnamed: 0,support,itemsets
0,0.666667,(Milk)
1,0.777778,(Bread)
2,0.222222,(Butter)
3,0.222222,(Cheese)
4,0.666667,(Jam)
5,0.444444,"(Bread, Milk)"
6,0.222222,"(Milk, Butter)"
7,0.444444,"(Milk, Jam)"
8,0.222222,"(Bread, Butter)"
9,0.222222,"(Bread, Cheese)"


In [27]:
from mlxtend.frequent_patterns import association_rules

In [28]:
shop_rules = association_rules(shop_frequent_itemsets, metric="confidence", min_threshold=0.2)

In [29]:
shop_rules[['antecedents', 'consequents', 'confidence']].tail(12)

Unnamed: 0,antecedents,consequents,confidence
12,"(Bread, Milk)",(Butter),0.5
13,"(Bread, Butter)",(Milk),1.0
14,"(Milk, Butter)",(Bread),1.0
15,(Bread),"(Milk, Butter)",0.285714
16,(Milk),"(Bread, Butter)",0.333333
17,(Butter),"(Bread, Milk)",1.0
18,"(Bread, Milk)",(Jam),0.5
19,"(Bread, Jam)",(Milk),0.5
20,"(Milk, Jam)",(Bread),0.5
21,(Bread),"(Milk, Jam)",0.285714


In [30]:
df=pd.read_csv("tutorial3")
df.head(20)


Unnamed: 0,A,Quantity,Transaction,Store,Product
0,30000,2,93194,6,Magazine
1,30001,2,93194,6,Candy Bar
2,30002,2,93194,6,Candy Bar
3,30003,2,93194,6,Candy Bar
4,30004,2,93194,6,Candy Bar
5,30005,2,93197,1,Pencils
6,30006,1,93200,6,Candy Bar
7,30007,1,93200,6,Candy Bar
8,30008,1,93200,6,Candy Bar
9,30009,1,93200,6,Magazine


In [31]:
df2 = df.groupby(["Transaction", "Product"]).size().reset_index(name="Count")
df2.head(20)

Unnamed: 0,Transaction,Product,Count
0,93194,Candy Bar,4
1,93194,Magazine,1
2,93197,Pencils,1
3,93200,Candy Bar,3
4,93200,Magazine,1
5,93206,Greeting Cards,1
6,93206,Magazine,1
7,93206,Pencils,2
8,93212,Toothbrush,1
9,93215,Candy Bar,2


In [32]:
df3 = (df2.groupby(["Transaction", "Product"])['Count']
          .sum().unstack().reset_index().fillna(0)
          .set_index('Transaction'))

df3.head()

Product,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
93194,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93200,0.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93206,0.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [33]:
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

df4 = df3.applymap(encode_units)

df4

Product,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
93194,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
93197,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
93200,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
93206,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
93212,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
133424,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
133427,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
133430,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [34]:
app = Apriori(df4)

In [35]:
app.df

Product,Bow,Candy Bar,Deodorant,Greeting Cards,Magazine,Markers,Pain Reliever,Pencils,Pens,Perfume,Photo Processing,Prescription Med,Shampoo,Soap,Toothbrush,Toothpaste,Wrapping Paper
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
93194,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
93197,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
93200,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
93206,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0
93212,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133406,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
133424,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
133427,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
133430,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [36]:
tx = app.apriori(0.001)

In [37]:
for r in tx:
    print(str(r.y) + " -> " + str(r.x) + str(r.confidence_y(df4)))
print("=======")
for r in tx:
    print(str(r.x) + " -> " + str(r.y) + str(r.confidence_x(df4)))

[OrderedSet(['Toothpaste'])] -> [OrderedSet(['Candy Bar', 'Greeting Cards', 'Magazine', 'Perfume', 'Toothbrush'])]0.0036596523330283625
[OrderedSet(['Toothbrush'])] -> [OrderedSet(['Candy Bar', 'Greeting Cards', 'Magazine', 'Perfume', 'Toothpaste'])]0.00879120879120879
[OrderedSet(['Perfume'])] -> [OrderedSet(['Candy Bar', 'Greeting Cards', 'Magazine', 'Toothbrush', 'Toothpaste'])]0.007194244604316546
[OrderedSet(['Magazine'])] -> [OrderedSet(['Candy Bar', 'Greeting Cards', 'Perfume', 'Toothbrush', 'Toothpaste'])]0.002564102564102564
[OrderedSet(['Greeting Cards'])] -> [OrderedSet(['Candy Bar', 'Magazine', 'Perfume', 'Toothbrush', 'Toothpaste'])]0.003891050583657587
[OrderedSet(['Candy Bar'])] -> [OrderedSet(['Greeting Cards', 'Magazine', 'Perfume', 'Toothbrush', 'Toothpaste'])]0.0033840947546531297
[OrderedSet(['Toothbrush', 'Toothpaste'])] -> [OrderedSet(['Candy Bar', 'Greeting Cards', 'Magazine', 'Perfume'])]0.16666666666666666
[OrderedSet(['Perfume', 'Toothpaste'])] -> [OrderedSet(

In [42]:
pd.set_option('display.max_colwidth', 400)

In [43]:
shop_frequent_itemsets = apriori(df4, min_support=0.001, use_colnames=True)

(shop_frequent_itemsets.head(500).tail(50))

Unnamed: 0,support,itemsets
166,0.001041,"(Magazine, Shampoo, Perfume)"
167,0.004163,"(Magazine, Toothbrush, Perfume)"
168,0.003122,"(Magazine, Toothpaste, Perfume)"
169,0.001041,"(Magazine, Shampoo, Photo Processing)"
170,0.001487,"(Magazine, Toothbrush, Photo Processing)"
171,0.00223,"(Magazine, Toothpaste, Photo Processing)"
172,0.001041,"(Magazine, Wrapping Paper, Photo Processing)"
173,0.001635,"(Magazine, Prescription Med, Wrapping Paper)"
174,0.00223,"(Magazine, Toothbrush, Shampoo)"
175,0.001189,"(Magazine, Toothpaste, Shampoo)"


In [44]:
# Finding the association rules
shop_rules = association_rules(shop_frequent_itemsets, metric="confidence", min_threshold=0.00001)


# Sorting 
shop_rules.sort_values("confidence", ascending = False, inplace = True)

# Previewing the associative rules 


#top 10 combinations of items frequently bought together
shop_rules.head(10000).tail(900).head(700).tail(10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
788,(Toothbrush),"(Magazine, Candy Bar, Greeting Cards)",0.067648,0.017247,0.001041,0.015385,0.892042,-0.000126,0.998109
553,(Magazine),"(Toothpaste, Pens)",0.231936,0.007731,0.00342,0.014744,1.907027,0.001626,1.007117
870,(Magazine),"(Candy Bar, Pencils, Toothpaste)",0.231936,0.011002,0.00342,0.014744,1.340073,0.000868,1.003797
733,(Greeting Cards),"(Magazine, Candy Bar, Pens)",0.15284,0.004758,0.00223,0.014591,3.066938,0.001503,1.009979
887,(Pens),"(Magazine, Candy Bar, Toothpaste)",0.144068,0.013232,0.002081,0.014448,1.09187,0.000175,1.001233
339,(Pens),"(Candy Bar, Pencils)",0.144068,0.035088,0.002081,0.014448,0.411765,-0.002974,0.979058
212,(Perfume),"(Toothbrush, Bow)",0.082664,0.01011,0.001189,0.014388,1.423191,0.000354,1.004341
699,(Perfume),"(Toothbrush, Toothpaste)",0.082664,0.003568,0.001189,0.014388,4.032374,0.000894,1.010978
900,(Perfume),"(Magazine, Candy Bar, Toothbrush)",0.082664,0.002081,0.001189,0.014388,6.912641,0.001017,1.012487
18,(Toothpaste),(Bow),0.162504,0.051591,0.00223,0.013724,0.26601,-0.006154,0.961606
