# How to use the R package arules from Python


## Define some helper functions

In [None]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

### activate automatic conversion of pandas dataframes to R dataframes
### (later cells pass pandas objects directly to arules functions)
pandas2ri.activate()

### import the R arules package
### NOTE: this needs arules to be installed in R already; the
### "Install the arules package" cell further down shows how to install it.
arules = importr("arules")

### parameters for apriori and eclat.
def parameters(x):
    """Wrap a Python mapping of settings as an R list.

    The notebook uses the result for the ``parameter`` and ``control``
    arguments of ``arules.apriori``.

    x: mapping of option names to values, passed to ``ro.ListVector``.
    Returns the ``ro.ListVector`` built from ``x``.
    """
    r_list = ro.ListVector(x)
    return r_list

### convert arules associations (rules/itemsets) into a pandas dataframe
def as_df(x):
    """Return an arules object (transactions/itemsets/rules) as a pandas dataframe.

    x: R object accepted by ``arules.DATAFRAME``.
    """
    # Run both the R call and the conversion under an explicit
    # default + pandas converter.
    pandas_rules = ro.default_converter + ro.pandas2ri.converter
    with pandas_rules.context():
        r_frame = arules.DATAFRAME(x)
        return ro.conversion.get_conversion().rpy2py(r_frame)

### convert arules associations (rules/itemsets) into a binary numpy matrix (what can be "items", "lhs", "rhs")
def as_matrix(x, what = "items"):
    """Return part of an arules associations object as a numpy matrix.

    x: arules associations object (rules or itemsets).
    what: name of the R accessor applied to ``x`` before the conversion;
        "items", "lhs" or "rhs". The name is spliced verbatim into R
        source code, so only pass trusted accessor names.

    Returns a numpy array built from R's ``as(<what>(x), "matrix")``.
    """
    # Imported locally so the helper does not depend on another cell having
    # imported numpy before it is called.
    import numpy as np

    r_to_matrix = ro.r('function(x) as(' + what + '(x), "matrix")')
    return np.array(r_to_matrix(x))

### convert arules associations (rules/itemsets) into a dictionary (what can be "items", "lhs", "rhs")
def as_dict(x, what = "items"):
    """Return part of an arules associations object as a dictionary of lists.

    x: arules associations object (rules or itemsets).
    what: name of the R accessor applied to ``x``; "items", "lhs" or "rhs".

    Each entry of R's ``as(<what>(x), "list")`` becomes one dictionary value
    (a Python list), keyed by its position label.
    """
    to_r_list = ro.r('function(x) as(' + what + '(x), "list")')
    r_list = to_r_list(x)
    # Label the entries 0..n-1; the labels are read back from R, which is why
    # the resulting keys are strings such as '0', '1', ...
    r_list.names = list(range(len(r_list)))
    return {key: list(members) for key, members in zip(r_list.names, list(r_list))}

### extract the quality measures from arules associations (rules/itemsets)
def quality(x):
    """Return the ``quality`` slot of an arules associations object.

    x: object exposing a ``slots`` mapping (an rpy2 S4 object such as the
       rules or itemsets mined in this notebook).
    """
    slot_name = "quality"
    return x.slots[slot_name]

## Install the Python modules
We need to set up the R package arules and [rpy2](https://github.com/rpy2/rpy2) to connect to R.

## Using Conda

Create a new conda environment.
To install rpy2 and pandas use:

```
conda install -c conda-forge rpy2
conda install -c conda-forge pandas
```

## Using pip

Activate the virtual environment and use:
```
pip install rpy2 pandas
```


In [None]:
## Install the arules package

In [204]:
# rpy2's own helper answers "is this R package installed?" directly, instead
# of searching every cell of the installed.packages() matrix for the string.
from rpy2.robjects.packages import isinstalled

utils = importr("utils")

# Install arules from R only when it is missing.
if not isinstalled("arules"):
    utils.install_packages("arules")
else:
    print("arules package already installed")

arules package already installed


## Data

The data need to be prepared as a Pandas dataframe. Here we have 10 transactions with three items called A, B and C. True means that a transaction contains the item.

In [149]:
import pandas as pd
import numpy as np

# Each inner list is one transaction; the three columns flag items A, B and C.
df = pd.DataFrame(
    data=[
        [True,  True,  True],
        [True,  False, False],
        [True,  True,  True],
        [True,  False, False],
        [True,  True,  True],
        [True,  False, True],
        [True,  True,  True],
        [False, False, True],
        [False, True,  True],
        [True,  False, True],
    ],
    columns=["A", "B", "C"],
)

df

Unnamed: 0,A,B,C
0,True,True,True
1,True,False,False
2,True,True,True
3,True,False,False
4,True,True,True
5,True,False,True
6,True,True,True
7,False,False,True
8,False,True,True
9,True,False,True


In [151]:
# Turn the boolean dataframe into an arules transactions object.
trans = arules.transactions(df)
print(trans)

# Show the transactions as a dataframe, one row per transaction.
as_df(trans)

transactions in sparse format with
 10 transactions (rows) and
 3 items (columns)



Unnamed: 0,items,transactionID
1,"{A,B,C}",0
2,{A},1
3,"{A,B,C}",2
4,{A},3
5,"{A,B,C}",4
6,"{A,C}",5
7,"{A,B,C}",6
8,{C},7
9,"{B,C}",8
10,"{A,C}",9


In [152]:
# Relative frequency of each item across the transactions.
arules.itemFrequency(trans)

array([0.8, 0.5, 0.8])

## Mine frequent itemsets

In [153]:
# Mine frequent itemsets (instead of rules) with a minimum support of 0.1.
itsets = arules.apriori(
    trans,
    parameter=parameters({"supp": 0.1, "target": "frequent itemsets"}),
)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
         NA    0.1    1 none FALSE            TRUE       5     0.1      1
 maxlen            target  ext
     10 frequent itemsets TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 1 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3 item(s), 10 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [7 set(s)] done [0.00s].
creating S4 object  ... done [0.00s].


In [154]:
# Printing the R object only gives a one-line summary ...
print(itsets)

# ... so convert it to a dataframe to see the itemsets and their measures.
as_df(itsets)

set of 7 itemsets 



Unnamed: 0,items,support,count
1,{B},0.5,5
2,{A},0.8,8
3,{C},0.8,8
4,"{A,B}",0.4,4
5,"{B,C}",0.5,5
6,"{A,C}",0.6,6
7,"{A,B,C}",0.4,4


The frequent itemsets can be accessed as a binary matrix.

In [155]:
# One row per itemset, one column per item; 1 marks that the item is in the set.
as_matrix(itsets)

array([[0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 1, 0],
       [0, 1, 1],
       [1, 0, 1],
       [1, 1, 1]], dtype=int32)

Access itemsets as a dictionary

In [156]:
# Keys are the itemset positions (as strings, starting at '0'); values list the items.
as_dict(itsets)

{'0': ['B'],
 '1': ['A'],
 '2': ['C'],
 '3': ['A', 'B'],
 '4': ['B', 'C'],
 '5': ['A', 'C'],
 '6': ['A', 'B', 'C']}

Accessing the quality measures

In [157]:
# For itemsets the quality measures are support and count.
quality(itsets)

Unnamed: 0,support,count
1,0.5,5
2,0.8,8
3,0.8,8
4,0.4,4
5,0.5,5
6,0.6,6
7,0.4,4


## Mine association rules

In [158]:
# No "target" is given, so apriori mines rules; keep those with
# support >= 0.1 and confidence >= 0.8.
rules = arules.apriori(
    trans,
    parameter=parameters({"supp": 0.1, "conf": 0.8}),
)

Apriori

Parameter specification:
 confidence minval smax arem  aval originalSupport maxtime support minlen
        0.8    0.1    1 none FALSE            TRUE       5     0.1      1
 maxlen target  ext
     10  rules TRUE

Algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

Absolute minimum support count: 1 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[3 item(s), 10 transaction(s)] done [0.00s].
sorting and recoding items ... [3 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [6 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].


In [159]:
# Printing the rules object only reports how many rules it holds.
print(rules)

set of 6 rules 



In [160]:
# Full listing: LHS, RHS and the quality measures as dataframe columns.
as_df(rules)

Unnamed: 0,LHS,RHS,support,confidence,coverage,lift,count
1,{},{A},0.8,0.8,1.0,1.0,8
2,{},{C},0.8,0.8,1.0,1.0,8
3,{B},{A},0.4,0.8,0.5,1.0,4
4,{B},{C},0.5,1.0,0.5,1.25,5
5,"{A,B}",{C},0.4,1.0,0.4,1.25,4
6,"{B,C}",{A},0.4,0.8,0.5,1.0,4


Get the left-hand-side, the right-hand-side and the rule quality.

In [161]:
# Binary matrix of the left-hand sides (rows: rules, columns: items).
as_matrix(rules, what = "lhs")

array([[0, 0, 0],
       [0, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [1, 1, 0],
       [0, 1, 1]], dtype=int32)

In [162]:
# Binary matrix of the right-hand sides (rows: rules, columns: items).
as_matrix(rules, what = "rhs")

array([[1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0]], dtype=int32)

In [163]:
# Quality measures only: support, confidence, coverage, lift and count.
quality(rules)

Unnamed: 0,support,confidence,coverage,lift,count
1,0.8,0.8,1.0,1.0,8
2,0.8,0.8,1.0,1.0,8
3,0.4,0.8,0.5,1.0,4
4,0.5,1.0,0.5,1.25,5
5,0.4,1.0,0.4,1.25,4
6,0.4,0.8,0.5,1.0,4


In [165]:
# Mine the same rules again, this time with the progress log switched off,
# and order them by lift before displaying them.
rules = arules.sort(
    arules.apriori(
        trans,
        parameter=parameters({"supp": 0.1, "conf": 0.8}),
        control=parameters({"verbose": False}),
    ),
    by="lift",
)
as_df(rules)

Unnamed: 0,LHS,RHS,support,confidence,coverage,lift,count
4,{B},{C},0.5,1.0,0.5,1.25,5
5,"{A,B}",{C},0.4,1.0,0.4,1.25,4
1,{},{A},0.8,0.8,1.0,1.0,8
2,{},{C},0.8,0.8,1.0,1.0,8
3,{B},{A},0.4,0.8,0.5,1.0,4
6,"{B,C}",{A},0.4,0.8,0.5,1.0,4


In [None]:
# Number of items in each rule, counting both sides of the rule.
arules.size(rules)

0,1,2,3,4,5
1,1,2,2,3,3


In [None]:
# Fetch R's "[" method for the "associations" class so that it can be called
# like a normal function from Python.
methods = importr('methods')
r_subset_rules = methods.selectMethod("[", "associations")

# Subset by position with an R integer vector (R indices start at 1).
print(r_subset_rules(rules, ro.IntVector([1,2,3])))

set of 3 rules 



In [179]:
# Subset with a boolean mask: keep only the rules whose lift is above 1.
print(r_subset_rules(rules, quality(rules)['lift'] > 1))

set of 2 rules 



In [190]:
# Compute additional interest measures for the rules. The argument is spelled
# out as `transactions` (its full name in arules) instead of relying on R's
# partial matching of the abbreviated `transaction`.
print(arules.interestMeasure(rules, measure = ["chiSquared", "improvement"], transactions = trans))

  chiSquared improvement
1   2.500000         0.2
2   1.666667         0.0
3        NaN         0.8
4        NaN         0.8
5   0.000000         0.0
6   0.000000         0.0



# Appendix: Functions available in arules

Functions in arules can be called directly using `arules.<function name>`. These functions operate on rpy2 R objects such as `rules` and `itsets` created above.

Documentation of these functions can be found here: https://mhahsler.r-universe.dev/arules/doc/manual.html

In [196]:
# Print every name exposed by the arules wrapper, leaving out names that
# start with "_" or "R_".
for f in dir(arules):
    if f.startswith(('_', 'R_')):
        continue
    print(f)

%ain%
%in%
%oin%
%pin%
DATAFRAME
LIST
abbreviate
addAggregate
addComplement
addl_doc
affinity
aggregate
apriori
check_installed
ci_norm
ci_or_exact
ci_prop
ci_prop_binom
ci_prop_wilson
compatible
confint_rules
coverage
create_measures_doc
crossTable
decode
discretize
discretizeDF
dissimilarity
duplicated
eclat
encode
filterAggregate
fim4r
generatingItemsets
hits
info
info<-
inspect
interestMeasure
intersect
intersect_associations
intersect_itemMatrix
is_closed
is_element
is_element_associations
is_element_itemMatrix
is_generator
is_maximal
is_redundant
is_significant
is_subset
is_superset
itemFrequency
itemFrequencyPlot
itemInfo
itemInfo<-
itemIntersect
itemLabels
itemLabels<-
itemSetdiff
itemUnion
items
items<-
itemsetInfo
itemsetInfo<-
itemsets
labels
lhs
lhs<-
match
measuresItemsets
measuresRules
merge
nitems
plot_associations
plot_itemMatrix
predict
quality
quality<-
random_patterns
random_transactions
read_PMML
read_transactions
recode
rhs
rhs<-
ruleInduction
ruleInduction_apriori