In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

## Read dataset

In [2]:
sales_receipt_df = pd.read_csv('./datasets/201904 sales reciepts.csv')
products_df = pd.read_csv('./datasets/product.csv')

In [3]:
sales_receipt_df.head(2)

Unnamed: 0,transaction_id,transaction_date,transaction_time,sales_outlet_id,staff_id,customer_id,instore_yn,order,line_item_id,product_id,quantity,line_item_amount,unit_price,promo_item_yn
0,7,2019-04-01,12:04:43,3,12,558,N,1,1,52,1,2.5,2.5,N
1,11,2019-04-01,15:54:39,3,17,781,N,1,1,27,2,7.0,3.5,N


In [4]:
products_df.head(2)

Unnamed: 0,product_id,product_group,product_category,product_type,product,product_description,unit_of_measure,current_wholesale_price,current_retail_price,tax_exempt_yn,promo_yn,new_product_yn
0,1,Whole Bean/Teas,Coffee beans,Organic Beans,Brazilian - Organic,It's like Carnival in a cup. Clean and smooth.,12 oz,14.4,$18.00,Y,N,N
1,2,Whole Bean/Teas,Coffee beans,House blend Beans,Our Old Time Diner Blend,Out packed blend of beans that is reminiscent ...,12 oz,14.4,$18.00,Y,N,N


## Data Wrangling

In [5]:
# filter out the relevent data
sales_receipt_df = sales_receipt_df[['transaction_id','transaction_date','sales_outlet_id','customer_id','product_id','quantity']]
products_df = products_df[['product_id','product_category','product']]

In [6]:
# merging the datasets
df = pd.merge(sales_receipt_df, products_df, how='left', on='product_id')
df.head(2)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product
0,7,2019-04-01,3,558,52,1,Tea,Traditional Blend Chai Rg
1,11,2019-04-01,3,781,27,2,Coffee,Brazilian Lg


### removing the sizes Lg, Sm, Rg and irrelevent items from the dataset and finding the category

In [7]:
# this is the before sizes
print("before :: ",df[df['product'].str.contains('Brazilian')]['product'].unique())

# replacing the sizes value with nothing
df['product'] = df['product'].str.replace(' Lg', '')
df['product'] = df['product'].str.replace(' Sm', '')
df['product'] = df['product'].str.replace(' Rg', '')

# this is after
print("after ::", df[df['product'].str.contains('Brazilian')]['product'].unique())

before ::  ['Brazilian Lg' 'Brazilian Sm' 'Brazilian Rg' 'Brazilian - Organic']
after :: ['Brazilian' 'Brazilian - Organic']


In [8]:
print(sorted(list(df['product'].unique())))

['Almond Croissant', 'Brazilian', 'Brazilian - Organic', 'Cappuccino', 'Carmel syrup', 'Chili Mayan', 'Chocolate Chip Biscotti', 'Chocolate Croissant', 'Chocolate syrup', 'Civet Cat', 'Columbian Medium Roast', 'Cranberry Scone', 'Croissant', 'Dark chocolate', 'Earl Grey', 'English Breakfast', 'Espresso Roast', 'Espresso shot', 'Ethiopia', 'Ginger Biscotti', 'Ginger Scone', 'Guatemalan Sustainably Grown', 'Hazelnut Biscotti', 'Hazelnut syrup', 'I Need My Bean! Diner mug', 'I Need My Bean! Latte cup', 'I Need My Bean! T-shirt', 'Jamacian Coffee River', 'Jamaican Coffee River', 'Jumbo Savory Scone', 'Latte', 'Lemon Grass', 'Morning Sunrise Chai', 'Oatmeal Scone', 'Organic Decaf Blend', 'Our Old Time Diner Blend', 'Ouro Brasileiro shot', 'Peppermint', 'Primo Espresso Roast', 'Scottish Cream Scone ', 'Serenity Green Tea', 'Spicy Eye Opener Chai', 'Sugar Free Vanilla syrup', 'Sustainably Grown Organic', 'Traditional Blend Chai']


In [9]:
# the item which i select from the dataset
products_to_consider = ['Cappuccino','Latte','Espresso shot','Dark chocolate ','Dark chocolate','Sugar Free Vanilla syrup','Chocolate syrup',
                        'Carmel syrup','Hazelnut syrup','Ginger Scone','Chocolate Croissant','Jumbo Savory Scone','Cranberry Scone',
                        'Hazelnut Biscotti','Croissant','Almond Croissant','Oatmeal Scone','Chocolate Chip Biscotti','Ginger Biscotti']

# filtering the dataset
df = df[df['product'].isin(products_to_consider)]
df.head(2)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product
16,108,2019-04-01,3,65,40,1,Coffee,Cappuccino
17,112,2019-04-01,3,90,37,2,Coffee,Espresso shot


In [10]:
# finding the product category with the corrosponding product
df[['product','product_category']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,product,product_category
0,Cappuccino,Coffee
1,Espresso shot,Coffee
2,Latte,Coffee
3,Dark chocolate,Drinking Chocolate
4,Oatmeal Scone,Bakery
5,Jumbo Savory Scone,Bakery
6,Chocolate Chip Biscotti,Bakery
7,Ginger Biscotti,Bakery
8,Chocolate Croissant,Bakery
9,Hazelnut Biscotti,Bakery


## Clean Transnation

Now some trasnactions id goes to multiple customer because here is only one product per row, so 
<br>if they are sitting together and pay as a group then the same id can go to multiple customers
<br> or if a person order multiple items
<br> which can confuse recommandation engine so we concatinate transationid and customer id to make a new transation number.

In [11]:
df['transaction'] = df['transaction_id'].astype('str') + '_' + df['customer_id'].astype('str')
df.head(2)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product,transaction
16,108,2019-04-01,3,65,40,1,Coffee,Cappuccino,108_65
17,112,2019-04-01,3,90,37,2,Coffee,Espresso shot,112_90


### removing the customers which only bought ones and never return back because we don't need them for recommandation engine

In [12]:
num_of_itemes_for_transaction = df['transaction'].value_counts().reset_index()
num_of_itemes_for_transaction.head()

Unnamed: 0,transaction,count
0,209_0,31
1,206_0,30
2,204_0,27
3,208_0,25
4,203_0,24


In [13]:
# filter tensactions with more than 2 items
valid_transactions = num_of_itemes_for_transaction[num_of_itemes_for_transaction['count'] > 1]['transaction'].to_list()
df = df[df['transaction'].isin(valid_transactions)]
df.head(2)

Unnamed: 0,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,product_category,product,transaction
34,199,2019-04-01,3,112,41,2,Coffee,Cappuccino,199_112
35,199,2019-04-01,3,112,79,1,Bakery,Jumbo Savory Scone,199_112


In [14]:
df.shape

(10189, 9)

## Popular Trends

In [19]:
df['product_category'].value_counts()

product_category
Bakery                3800
Coffee                3174
Flavours              2246
Drinking Chocolate     947
Packaged Chocolate      22
Name: count, dtype: int64

In [20]:
df["product"].value_counts()

product
Cappuccino                  1290
Latte                       1256
Dark chocolate               969
Chocolate Croissant          636
Espresso shot                628
Sugar Free Vanilla syrup     605
Chocolate syrup              568
Carmel syrup                 561
Hazelnut syrup               512
Ginger Scone                 417
Jumbo Savory Scone           357
Croissant                    355
Chocolate Chip Biscotti      352
Cranberry Scone              350
Almond Croissant             347
Hazelnut Biscotti            338
Oatmeal Scone                334
Ginger Biscotti              314
Name: count, dtype: int64

## getting the  Popuplar item for  Recommendation engine

In [21]:
product_recommendation = df.groupby(["product","product_category"]).count().reset_index()
product_recommendation.head()

Unnamed: 0,product,product_category,transaction_id,transaction_date,sales_outlet_id,customer_id,product_id,quantity,transaction
0,Almond Croissant,Bakery,347,347,347,347,347,347,347
1,Cappuccino,Coffee,1290,1290,1290,1290,1290,1290,1290
2,Carmel syrup,Flavours,561,561,561,561,561,561,561
3,Chocolate Chip Biscotti,Bakery,352,352,352,352,352,352,352
4,Chocolate Croissant,Bakery,636,636,636,636,636,636,636


In [22]:
# we only store three columns
product_recommendation = product_recommendation[['product','product_category','transaction_id']]
# change the column names
product_recommendation.columns = ['product','product_category','count']
product_recommendation.head()

Unnamed: 0,product,product_category,count
0,Almond Croissant,Bakery,347
1,Cappuccino,Coffee,1290
2,Carmel syrup,Flavours,561
3,Chocolate Chip Biscotti,Bakery,352
4,Chocolate Croissant,Bakery,636


In [23]:
# save the dataframe to a csv file
# product_recommendation.to_csv('./api/recommendation_objects/popularity_recommendation.csv',index=False)

## Apriori Recommendation engine

In [24]:
# count the number of items in each transaction
df.groupby(["transaction","product"])["product"].count().reset_index(name="count").head()

Unnamed: 0,transaction,product,count
0,1000_0,Dark chocolate,1
1,1000_0,Oatmeal Scone,1
2,1001_8306,Cappuccino,1
3,1001_8306,Carmel syrup,1
4,1002_0,Carmel syrup,1


above we can see that 0, 1 index transation is same means same person bought both items at once.<br><br>
so its a long format data we convert it to wide format data where each transation have all product and we use 0 and 1 if it bought together.

In [25]:
train_basket = (df.groupby(["transaction","product"])["product"].count().reset_index(name="count"))

In [26]:
# convert long format to wide format
train_basket = train_basket.pivot_table(index="transaction", columns="product", values="count").fillna(0)
train_basket.head()

product,Almond Croissant,Cappuccino,Carmel syrup,Chocolate Chip Biscotti,Chocolate Croissant,Chocolate syrup,Cranberry Scone,Croissant,Dark chocolate,Espresso shot,Ginger Biscotti,Ginger Scone,Hazelnut Biscotti,Hazelnut syrup,Jumbo Savory Scone,Latte,Oatmeal Scone,Sugar Free Vanilla syrup
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1000_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1001_8306,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1002_0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1004_5383,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1005_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0


In [27]:
def encode_units(x):
    """It encodes the number of items in each transaction to be (False)0 or (True)1 only because it has
    count values greater than 1"""
    x = int(x)
    if x<=0:
        return False
    if x>0:
        return True
    
my_basket_sets = train_basket.map(encode_units)
# this output is what the apriori expects 0 or 1
my_basket_sets.head()
    

product,Almond Croissant,Cappuccino,Carmel syrup,Chocolate Chip Biscotti,Chocolate Croissant,Chocolate syrup,Cranberry Scone,Croissant,Dark chocolate,Espresso shot,Ginger Biscotti,Ginger Scone,Hazelnut Biscotti,Hazelnut syrup,Jumbo Savory Scone,Latte,Oatmeal Scone,Sugar Free Vanilla syrup
transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1000_0,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False
1001_8306,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1002_0,False,False,True,False,False,False,False,False,True,True,False,True,False,False,False,False,False,False
1004_5383,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False
1005_0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False


In [28]:
# min_support = 0.05 means return only those items which have support greater than 5%
frequent_items = apriori(my_basket_sets, min_support=0.05, use_colnames=True)
frequent_items.head()

Unnamed: 0,support,itemsets
0,0.115646,(Almond Croissant)
1,0.388889,(Cappuccino)
2,0.191232,(Carmel syrup)
3,0.112623,(Chocolate Chip Biscotti)
4,0.135676,(Chocolate Croissant)


In [29]:
# association rules
rules_basket = association_rules(frequent_items, metric="lift", min_threshold=1)
rules_basket.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(Almond Croissant),(Cappuccino),0.115646,0.388889,0.053288,0.460784,1.184874,1.0,0.008314,1.133333,0.176432,0.11809,0.117647,0.298905
1,(Cappuccino),(Almond Croissant),0.388889,0.115646,0.053288,0.137026,1.184874,1.0,0.008314,1.024775,0.255319,0.11809,0.024176,0.298905
2,(Dark chocolate),(Almond Croissant),0.277022,0.115646,0.057445,0.207367,1.793115,1.0,0.025409,1.115717,0.611791,0.171364,0.103715,0.35205
3,(Almond Croissant),(Dark chocolate),0.115646,0.277022,0.057445,0.496732,1.793115,1.0,0.025409,1.436567,0.500152,0.171364,0.303896,0.35205
4,(Latte),(Almond Croissant),0.382086,0.115646,0.054422,0.142433,1.231629,1.0,0.010235,1.031236,0.304358,0.122762,0.03029,0.306511


In [30]:
# sort the values by confidence
rules_basket[rules_basket['antecedents'] == {'Cappuccino'}].sort_values('confidence',ascending=False).head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
26,(Cappuccino),(Sugar Free Vanilla syrup),0.388889,0.200302,0.113001,0.290573,1.450674,1.0,0.035105,1.127245,0.508361,0.237302,0.112881,0.427362
12,(Cappuccino),(Chocolate syrup),0.388889,0.188964,0.109599,0.281827,1.491429,1.0,0.036113,1.129304,0.539185,0.23406,0.114499,0.430914
7,(Cappuccino),(Carmel syrup),0.388889,0.191232,0.102797,0.264334,1.38227,1.0,0.028429,1.099369,0.45254,0.21536,0.090387,0.400942
22,(Cappuccino),(Hazelnut syrup),0.388889,0.17158,0.090325,0.232264,1.353682,1.0,0.0236,1.079044,0.427539,0.192122,0.073253,0.379348
21,(Cappuccino),(Ginger Scone),0.388889,0.133409,0.059713,0.153547,1.150951,1.0,0.007832,1.023791,0.214614,0.129085,0.023238,0.30057


## Save in JSON format

In [31]:
# getting all product category
product_category = df[['product','product_category']].drop_duplicates().set_index('product').to_dict()['product_category']
product_category

{'Cappuccino': 'Coffee',
 'Jumbo Savory Scone': 'Bakery',
 'Latte': 'Coffee',
 'Chocolate Chip Biscotti': 'Bakery',
 'Espresso shot': 'Coffee',
 'Hazelnut Biscotti': 'Bakery',
 'Chocolate Croissant': 'Bakery',
 'Dark chocolate': 'Packaged Chocolate',
 'Cranberry Scone': 'Bakery',
 'Croissant': 'Bakery',
 'Almond Croissant': 'Bakery',
 'Ginger Biscotti': 'Bakery',
 'Oatmeal Scone': 'Bakery',
 'Ginger Scone': 'Bakery',
 'Chocolate syrup': 'Flavours',
 'Hazelnut syrup': 'Flavours',
 'Carmel syrup': 'Flavours',
 'Sugar Free Vanilla syrup': 'Flavours'}

In [32]:
recommendations_json = {}

antecedents = rules_basket['antecedents'].unique()
for antecedent in antecedents:
    rec_df = rules_basket[rules_basket['antecedents'] == antecedent].sort_values('confidence',ascending=False)

    key = '_'.join(antecedent)
    recommendations_json[key] = []

    for _,row in rec_df.iterrows():
        # we did this because 
        rec_objs = row['consequents']
        for rec_obj in rec_objs:
            # if the recommendation already exists
            already_exists = False
            for current_rec_object in recommendations_json[key]:
                if rec_obj == current_rec_object['product']:
                    already_exists = True
            if already_exists:
                continue
            
            # if the recommendation does not already exist
            rec_obj = {
                'product': rec_obj,
                'product_category':product_category[rec_obj],
                'confidence': row['confidence']
            }

            recommendations_json[key].append(rec_obj)


In [33]:
import pprint
pprint.pprint(recommendations_json)

{'Almond Croissant': [{'confidence': 0.4967320261437908,
                       'product': 'Dark chocolate',
                       'product_category': 'Packaged Chocolate'},
                      {'confidence': 0.47058823529411764,
                       'product': 'Latte',
                       'product_category': 'Coffee'},
                      {'confidence': 0.46078431372549017,
                       'product': 'Cappuccino',
                       'product_category': 'Coffee'}],
 'Cappuccino': [{'confidence': 0.29057337220602525,
                 'product': 'Sugar Free Vanilla syrup',
                 'product_category': 'Flavours'},
                {'confidence': 0.2818270165208941,
                 'product': 'Chocolate syrup',
                 'product_category': 'Flavours'},
                {'confidence': 0.26433430515063167,
                 'product': 'Carmel syrup',
                 'product_category': 'Flavours'},
                {'confidence': 0.23226433430515062,
     

In [34]:
# save the json
import json
with open("api/recommendation_objects/apriori_recommendation.json","w") as json_file:
    json.dump(recommendations_json, json_file)