# Association Rule Based Recommender System using Armut's Dataset

In [1]:
# Libraries
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Display settings: show every column and row, widen the printed output to
# 500 characters, and render floats with three decimal places.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = 500
pd.options.display.float_format = '{:.3f}'.format

# Task 1: 
Data Preparation.

Step 1: Read the armut_data.csv data.

In [2]:
# Load the raw records once; the untouched frame stays in `df_` so a fresh
# working copy can be made without re-reading the file.
df_ = pd.read_csv('armut_data.csv')

In [3]:
# Work on a copy so the columns added below never modify the raw frame in `df_`.
df = df_.copy()

In [4]:
def check_df(dataframe, head=7, tail=7):
    '''
    Print a quick overview of a dataframe: shape, column info, first and last
    rows, descriptive statistics, missing-value counts and unique-value counts.

    Parameters
    ----------
    dataframe : DataFrame
        The dataframe to summarise.
    head : int, default 7
        Number of leading rows to print.
    tail : int, default 7
        Number of trailing rows to print.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print('####### Shape #######')
    print(dataframe.shape)
    print('####### Info #######')
    # info() writes its report itself and returns None, so wrapping it in
    # print() would add a stray "None" line after the report.
    dataframe.info()
    print('####### Head #######')
    print(dataframe.head(head))
    print('####### Tail #######')
    print(dataframe.tail(tail))
    print('####### Descriptive Statistics #######')
    # Transposed so each described column becomes one row of statistics.
    print(dataframe.describe([0.05, 0.25, 0.50, 0.75, 0.95, 0.99]).T)
    print('####### NA #######')
    print(dataframe.isnull().sum())
    print('####### Number of Unique Values #######')
    print(dataframe.nunique())


# Summarise the working copy before any transformation is applied.
check_df(dataframe=df)

####### Shape #######
(162523, 4)
####### Info #######
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162523 entries, 0 to 162522
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   UserId      162523 non-null  int64 
 1   ServiceId   162523 non-null  int64 
 2   CategoryId  162523 non-null  int64 
 3   CreateDate  162523 non-null  object
dtypes: int64(3), object(1)
memory usage: 5.0+ MB
None
####### Head #######
   UserId  ServiceId  CategoryId           CreateDate
0   25446          4           5  2017-08-06 16:11:00
1   22948         48           5  2017-08-06 16:12:00
2   10618          0           8  2017-08-06 16:13:00
3    7256          9           4  2017-08-06 16:14:00
4   25446         48           5  2017-08-06 16:16:00
5   14354         15           1  2017-08-06 16:27:00
6   14162         21           5  2017-08-06 16:28:00
####### Tail #######
        UserId  ServiceId  CategoryId           CreateDate
1

Step 2: 'ServiceId' represents a different service for each 'CategoryId'.
Combine 'ServiceId' and 'CategoryId' with '_' to create a new variable to represent the services.

In [5]:
# Composite service key "<ServiceId>_<CategoryId>": a ServiceId only identifies
# a service together with its CategoryId.
df['Service'] = (
    df['ServiceId']
    .astype(str)
    .str.cat(df['CategoryId'].astype(str), sep='_')
)
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service
0,25446,4,5,2017-08-06 16:11:00,4_5
1,22948,48,5,2017-08-06 16:12:00,48_5
2,10618,0,8,2017-08-06 16:13:00,0_8
3,7256,9,4,2017-08-06 16:14:00,9_4
4,25446,48,5,2017-08-06 16:16:00,48_5


Step 3: The dataset records the date and time each service was received, but it has no basket definition (invoice, etc.).

In order to apply Association Rule Learning, a basket (invoice, etc.) definition must be created.
Here, the definition of basket is the services that each customer receives monthly. 

For example, the services 9_4 and 46_4 received by the customer with id 7256 in the 8th month of 2017 form one basket;
the services 9_4 and 38_4 received by the same customer in the 10th month of 2017 form another basket. Baskets must be identified with a unique ID.

To do this, first create a new date variable containing only the year and month. Then combine 'UserId' and the newly created date variable with '_' and assign the result to a new variable called BasketID.

In [6]:
# Convert the date variable to a datetime dtype.
# pd.to_datetime is called once on the whole column; handing it to .apply()
# would parse the rows one at a time in Python, which is much slower on a
# column of this size and produces the same values.
df['CreateDate'] = pd.to_datetime(df['CreateDate'])

In [7]:
# Reduce each timestamp to its calendar month and store it as a 'YYYY-MM' string
df['New_Date'] = (
    pd.to_datetime(df['CreateDate'])
    .dt.to_period(freq='M')
    .astype(str)
)
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08


In [8]:
# Basket key "<UserId>_<New_Date>": one basket per user per calendar month,
# stored in the new variable BasketID.
df['BasketID'] = df['UserId'].astype(str).str.cat(df['New_Date'], sep='_')
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date,BasketID
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08,10618_2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08,7256_2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08,25446_2017-08


In [9]:
# Show every record of UserId 7256 to see how its rows fall into monthly baskets
df.loc[df['UserId'].eq(7256)]

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,New_Date,BasketID
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08,7256_2017-08
1268,7256,46,4,2017-08-09 16:15:00,46_4,2017-08,7256_2017-08
9540,7256,46,4,2017-08-29 03:53:00,46_4,2017-08,7256_2017-08
24679,7256,9,4,2017-10-01 04:59:00,9_4,2017-10,7256_2017-10
24680,7256,38,4,2017-10-01 05:01:00,38_4,2017-10,7256_2017-10
28698,7256,9,4,2017-10-11 08:06:00,9_4,2017-10,7256_2017-10
65325,7256,15,1,2017-12-31 04:17:00,15_1,2017-12,7256_2017-12
67093,7256,2,0,2018-01-03 22:06:00,2_0,2018-01,7256_2018-01
70623,7256,38,4,2018-01-11 13:07:00,38_4,2018-01,7256_2018-01
160299,7256,18,4,2018-07-25 00:51:00,18_4,2018-07,7256_2018-07


# Task 2: 
Create Association Rules.

Step 1: Create the basket service pivot table as below. 

|    Service     |  0_8   |  10_9  | 11_11  |  12_7  | 13_11  |  14_7  |  15_1  |  16_8  |  17_5  | 18_4.. |
| :-----------: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: |
|    BasketID    | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; | &nbsp; |
|   0_2017-08   |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |  0..   |
|   0_2017-09   |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |  0..   |
|   0_2018-01   |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |  0..   |
|   0_2018-04   |   0    |   0    |   0    |   0    |   0    |   1    |   0    |   0    |   0    |  0..   |
| 10000_2017-08 |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |   0    |  0..   | 


In [10]:
# Basket x service indicator matrix: 1 if the basket contains the service, else 0.
# unstack(fill_value=0) fills the absent basket/service pairs directly, and the
# vectorised `> 0` comparison replaces the element-wise DataFrame.applymap call,
# which is deprecated in recent pandas releases and slow on a matrix this size.
invoice_product_df = (
    df.groupby(['BasketID', 'Service'])['Service']
    .count()
    .unstack(fill_value=0)
    .gt(0)
    .astype(int)
)
invoice_product_df.head()

Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,19_6,1_4,20_5,21_5,22_0,23_10,24_10,25_0,26_7,27_7,28_4,29_0,2_0,30_2,31_6,32_4,33_4,34_6,35_11,36_1,37_0,38_4,39_10,3_5,40_8,41_3,42_1,43_2,44_0,45_6,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
BasketID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
0_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
0_2017-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
0_2018-01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
0_2018-04,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10000_2017-08,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


Step 2: Create association rules.

In [11]:
# Mine the itemsets that clear the 0.01 support threshold; use_colnames=True
# keeps the service labels (e.g. 15_1) in the itemsets.
frequent_itemsets = apriori(invoice_product_df, min_support=0.01, use_colnames=True)
# Derive rules from those itemsets, filtering on the 'support' metric with the
# same 0.01 threshold.
rules = association_rules(frequent_itemsets, metric='support', min_threshold=0.01)
# Display the rules ordered from highest to lowest support.
rules.sort_values('support', ascending=False)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(15_1),(2_0),0.121,0.13,0.034,0.281,2.154,0.018,1.209,0.61
3,(2_0),(15_1),0.13,0.121,0.034,0.261,2.154,0.018,1.189,0.616
12,(22_0),(2_0),0.048,0.13,0.017,0.349,2.676,0.01,1.335,0.658
13,(2_0),(22_0),0.13,0.048,0.017,0.127,2.676,0.01,1.091,0.72
14,(25_0),(2_0),0.043,0.13,0.013,0.313,2.404,0.008,1.266,0.61
15,(2_0),(25_0),0.13,0.043,0.013,0.103,2.404,0.008,1.067,0.672
0,(13_11),(2_0),0.057,0.13,0.013,0.226,1.738,0.005,1.124,0.45
1,(2_0),(13_11),0.13,0.057,0.013,0.098,1.738,0.005,1.046,0.488
4,(33_4),(15_1),0.027,0.121,0.011,0.411,3.4,0.008,1.493,0.726
5,(15_1),(33_4),0.121,0.027,0.011,0.093,3.4,0.008,1.072,0.803


Step 3: Use the arl_recommender function to recommend a service to a user whose most recent service was 2_0.

In [12]:
def arl_recommender(rules_df, product_id, rec_count=1):
    '''
    Recommend services for a given service based on association rules.

    Rules are scanned in descending order of lift; for every rule whose
    antecedents contain ``product_id``, the first item of its consequents is
    collected.

    Parameters
    ----------
    rules_df : DataFrame
        Rules table with 'antecedents', 'consequents' and 'lift' columns.
    product_id : str
        Service identifier to look for in the antecedents, e.g. '2_0'.
    rec_count : int, default 1
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``rec_count`` recommended services, highest lift first. The
        list is empty when no rule has ``product_id`` among its antecedents.
        The same service may appear more than once if several matching
        rules share it.
    '''
    sorted_rules = rules_df.sort_values('lift', ascending=False)
    recommendation_list = []
    for antecedents, consequents in zip(sorted_rules['antecedents'],
                                        sorted_rules['consequents']):
        if product_id in antecedents:
            recommendation_list.append(list(consequents)[0])
    # Return only after every rule has been scanned; returning from inside the
    # loop would stop after the single highest-lift rule.
    return recommendation_list[0:rec_count]


# Ask for up to four recommendations for service 2_0
arl_recommender(rules, '2_0', rec_count=4)

['22_0', '25_0', '15_1', '13_11']