# ARMUT Association Rule Learning Recommendation System

***

In [213]:
import pandas as pd

### Dataset variables
> **UserId**: Unique Id of a particular user
***
> **ServiceId**: Unique Id assigned to a specific service
***
> **CategoryId**: Unique Id of the category that the provided service falls into.
***
> **CreateDate**: Timestamp of when the service was purchased
***

In [51]:
# Read the raw Armut service-purchase log into a DataFrame and display it.
csv_path = r"C:\Users\ONUR\armut_data.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate
0,25446,4,5,2017-08-06 16:11:00
1,22948,48,5,2017-08-06 16:12:00
2,10618,0,8,2017-08-06 16:13:00
3,7256,9,4,2017-08-06 16:14:00
4,25446,48,5,2017-08-06 16:16:00
...,...,...,...,...
162518,10591,25,0,2018-08-06 14:40:00
162519,10591,2,0,2018-08-06 14:43:00
162520,10591,31,6,2018-08-06 14:47:00
162521,12666,38,4,2018-08-06 16:01:00


In [52]:
# Check the dtype of every column as loaded from the CSV, before any conversion.
df.dtypes

UserId         int64
ServiceId      int64
CategoryId     int64
CreateDate    object
dtype: object

In [53]:
# Parse the CreateDate column into datetime values so that year and month
# can be read from it when the monthly baskets are built.
df["CreateDate"] = pd.to_datetime(df["CreateDate"])

#### Number of unique values in each column
* There are 24826 unique users
* There are 50 unique services
* There are 12 unique categories

In [54]:
# Pair each column name with its number of distinct values.
[(column, df[column].nunique()) for column in df.columns]

[('UserId', 24826),
 ('ServiceId', 50),
 ('CategoryId', 12),
 ('CreateDate', 117510)]

#### As can be seen above and below, there are 50 different services spread unevenly across 12 categories

In [55]:
# Count how many distinct ServiceId values occur under each CategoryId.
(
    df
    .groupby("CategoryId")["ServiceId"]
    .nunique()
)

CategoryId
0     6
1     4
2     2
3     2
4     8
5     7
6     4
7     6
8     3
9     1
10    3
11    4
Name: ServiceId, dtype: int64

***
## *Merging CategoryId and ServiceId columns*
>Since each combination of ServiceId and CategoryId represents a different task, the two must be merged into a single value to tell them apart.

>For example, consider service 4 of Category 4 and service 4 of Category 5: even though they share the same ServiceId, they represent different

>tasks, since a service is assessed within the context of the category it falls into. This new column will be named **"Service"**.

In [56]:
# Combine ServiceId and CategoryId into one "<ServiceId>_<CategoryId>" label.
service_part = df["ServiceId"].astype(str)
category_part = df["CategoryId"].astype(str)
df["Service"] = service_part.str.cat(category_part, sep="_")

In [214]:
# Preview the first two rows.
# NOTE(review): the stored output already lists NewDate and BasketId, which are only
# created in later cells, so this cell was last executed out of order.
df.head(2)

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,NewDate,BasketId
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08


***
## *Creating Baskets (Invoices) for the services provided based on the month they are purchased*

#### 1- Defining a new column with only month and year in it from CreateDate column

In [58]:
# Derive a "YYYY-MM" label (e.g. "2017-08") from CreateDate.
# The vectorised .dt accessor replaces the former per-row Python lambda; it requires
# CreateDate to already be a datetime column (converted with pd.to_datetime earlier).
df["NewDate"] = df["CreateDate"].dt.strftime("%Y-%m")
df.head()

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,NewDate
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08
2,10618,0,8,2017-08-06 16:13:00,0_8,2017-08
3,7256,9,4,2017-08-06 16:14:00,9_4,2017-08
4,25446,48,5,2017-08-06 16:16:00,48_5,2017-08


### 2-Creating an imaginary "BasketId" for the Invoice-Product Matrix for ARL

In [215]:
# A basket is identified by "<UserId>_<NewDate>", i.e. one user in one year-month.
user_part = df["UserId"].astype(str)
df["BasketId"] = user_part.str.cat(df["NewDate"], sep="_")
df.head(2)

Unnamed: 0,UserId,ServiceId,CategoryId,CreateDate,Service,NewDate,BasketId
0,25446,4,5,2017-08-06 16:11:00,4_5,2017-08,25446_2017-08
1,22948,48,5,2017-08-06 16:12:00,48_5,2017-08,22948_2017-08


***

### Invoice-Product Matrix Preparation for Association Rule Learning Algorithms
> A certain type of matrix should be prepared as an input for ARL Algorithms

In [96]:
# Build the basket-by-service boolean matrix used as input for the ARL algorithms.
# Steps: count rows per (BasketId, Service) pair, pivot Service into columns with 0 for
# pairs that never occur, then keep only whether the count is positive.
# `.gt(0)` is the vectorised replacement for the former element-wise
# `applymap(lambda x: True if x > 0 else False)`; `applymap` is deprecated in recent pandas.
Invoice_Product_Matrix = (
    df.groupby(["BasketId", "Service"])["UserId"]
    .count()
    .unstack(fill_value=0)
    .gt(0)
)
# Every row represents a unique user in a given month and year.
# Columns represent the services, and values indicate whether the service in a given column
# has been purchased or not.

# We are NOT interested in how often a user bought a service, only in whether it occurred:
# if a service is bought 3 or 4 times in a month by a user, it is still recorded as a single True.

In [97]:
# Boolean values are used for better computational performance.
# True (1): that customer purchased that service at least once in the given month
# False (0): that customer did not purchase that service in the given month

In [98]:
# Preview the first two baskets (rows) of the boolean basket-by-service matrix.
Invoice_Product_Matrix.head(2)

Service,0_8,10_9,11_11,12_7,13_11,14_7,15_1,16_8,17_5,18_4,...,46_4,47_7,48_5,49_1,4_5,5_11,6_7,7_3,8_5,9_4
BasketId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0_2017-08,False,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,False,False,False,False,False
0_2017-09,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False


In [99]:
# importing required algorithms for Association Rule Learning
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [122]:
# Mine the frequent service sets with Apriori, using a minimum support of 0.01
# and keeping the service labels (column names) in the resulting itemsets.
frequent_services = apriori(
    Invoice_Product_Matrix,
    min_support=0.01,
    use_colnames=True,
)

In [123]:
# Number of frequent service sets found by Apriori (56 in the stored output).
len(frequent_services)

56

In [216]:
# Support = probability that a basket contains every service in the set
# (for a set with a single service this is simply that service's own occurrence probability).

# Example: a support of about 0.019 for itemset (0_8) means that roughly 2 out of every
# 100 baskets contain service 0_8.
frequent_services.head(2)

Unnamed: 0,support,itemsets
0,0.019728,(0_8)
1,0.026523,(11_11)


In [125]:
# Rank the frequent service sets by support, most common first, and show the top 20.
# Example (index 47): about 3 out of every 100 baskets contain services 15_1 and 2_0 together.
(
    frequent_services
    .sort_values("support", ascending=False)
    .head(20)
)

Unnamed: 0,support,itemsets
8,0.238121,(18_4)
19,0.130286,(2_0)
5,0.120963,(15_1)
39,0.067762,(49_1)
28,0.066568,(38_4)
3,0.056627,(13_11)
12,0.047515,(22_0)
9,0.045563,(19_6)
15,0.042895,(25_0)
7,0.041533,(17_5)


In [219]:
# Glossary for the columns of the rules table:
#   antecedents        - the first service set (the "if" side of a rule)
#   consequents        - the second service set (the "then" side of a rule)
#   antecedent support - probability that the first service set occurs on its own
#   consequent support - probability that the second service set occurs on its own
#   support            - probability that the first and second service sets occur together
#   confidence         - probability of the second service set given the first was already bought
#   lift               - how strongly buying the first service set changes the chance of the second;
#                        e.g. a lift of 17 means the second set is 17 times more likely to be bought
# Generate the rules, filtering on the support metric with a threshold of 0.01.
rules = association_rules(
    frequent_services,
    metric="support",
    min_threshold=0.01,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(13_11),(2_0),0.056627,0.130286,0.012819,0.226382,1.737574,0.005442,1.124216
1,(2_0),(13_11),0.130286,0.056627,0.012819,0.098394,1.737574,0.005442,1.046325
2,(15_1),(2_0),0.120963,0.130286,0.033951,0.280673,2.154278,0.018191,1.209066
3,(2_0),(15_1),0.130286,0.120963,0.033951,0.260588,2.154278,0.018191,1.188833
4,(15_1),(33_4),0.120963,0.02731,0.011233,0.092861,3.400299,0.007929,1.072262


In [152]:
# Order the rules from strongest to weakest relationship, using lift as the ranking metric.
# Lift measures how much buying the first service set changes the chance of buying the second:
# a lift of 17 would mean the second set is 17 times more likely once the first is bought.
sorted_rules = rules.sort_values("lift", ascending=False)
sorted_rules.head(3)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
10,(25_0),(22_0),0.042895,0.047515,0.01112,0.259247,5.456141,0.009082,1.285834
11,(22_0),(25_0),0.047515,0.042895,0.01112,0.234043,5.456141,0.009082,1.249553
19,(38_4),(9_4),0.066568,0.041393,0.010067,0.151234,3.653623,0.007312,1.129413


## Creating a function to make a recommendation for a specific service and using that function

In [201]:
# Rule-based recommender built on top of the association rules table.
def arl_recommender(rules_df,service_id,rec_count=1,statistics=False):
    """Recommend services that tend to be bought together with ``service_id``.

    The rules are ranked by lift (highest first). Every rule whose antecedent
    set contains ``service_id`` contributes its consequent services. Each
    service is recommended at most once, backed by the highest-lift rule that
    produced it, so the result never contains duplicates.

    Parameters
    ----------
    rules_df : pandas.DataFrame
        Association rules with the columns ``antecedents`` and ``consequents``
        (collections of service labels) and ``lift``.
    service_id : str
        Label of the service that has already been bought.
    rec_count : int, default 1
        Maximum number of recommendations to return; values <= 0 yield none.
    statistics : bool, default False
        When True, also return the rule row that justifies each recommendation.

    Returns
    -------
    list
        Up to ``rec_count`` recommended service labels, strongest rule first.
    tuple of (list, list)
        Only when ``statistics`` is True: the recommendations plus a parallel
        list holding the rule row (a pandas Series) behind each of them.
    """
    limit = max(rec_count, 0)
    ranked_rules = rules_df.sort_values("lift", ascending=False)

    recommendation_list = []
    stats = []
    already_recommended = set()

    rule_sides = zip(ranked_rules["antecedents"], ranked_rules["consequents"])
    for position, (antecedent, consequent) in enumerate(rule_sides):
        if len(recommendation_list) >= limit:
            break  # enough recommendations collected; later rules have lower lift
        if service_id not in antecedent:
            continue
        # Sort the consequent items so the outcome does not depend on set ordering.
        for candidate in sorted(consequent, key=str):
            if candidate == service_id or candidate in already_recommended:
                continue
            already_recommended.add(candidate)
            recommendation_list.append(candidate)
            stats.append(ranked_rules.iloc[position])
            if len(recommendation_list) >= limit:
                break

    if statistics:
        return recommendation_list, stats
    return recommendation_list

### We want to recommend three services to someone who has bought the service "2_0"
#### *We can recommend services **"22_0"**, **"25_0"**, **"15_1"***

In [211]:
# Top three recommendations for a customer who bought service "2_0" (labels only).
arl_recommender(rules, "2_0", rec_count=3)

['22_0', '25_0', '15_1']

In [212]:
arl_recommender(rules,"2_0",3,statistics=True)
# These three services are recommended because the lift of each underlying rule is above 2:
# whenever service 2_0 is bought, each recommended service is at least twice as likely
# to be bought as it is on its own.

(['22_0', '25_0', '15_1'],
 [antecedents               (2_0)
  consequents              (22_0)
  antecedent support     0.130286
  consequent support    0.0475147
  support               0.0165684
  confidence             0.127169
  lift                    2.67641
  leverage              0.0103779
  conviction              1.09126
  Name: 13, dtype: object,
  antecedents                (2_0)
  consequents               (25_0)
  antecedent support      0.130286
  consequent support     0.0428953
  support                0.0134372
  confidence              0.103136
  lift                     2.40437
  leverage              0.00784857
  conviction               1.06717
  Name: 15, dtype: object,
  antecedents               (2_0)
  consequents              (15_1)
  antecedent support     0.130286
  consequent support     0.120963
  support               0.0339511
  confidence             0.260588
  lift                    2.15428
  leverage              0.0181913
  conviction              