In [1]:
# Title:  CBA: Classification Based on Association

# INSTALL AND LOAD PACKAGES ################################

# Install pacman if you don't have it (uncomment next line)
# install.packages("pacman")

# Install and/or load packages with pacman
pacman::p_load(  # Use p_load function from pacman
  arulesCBA,     # Classification Based on Association
  caret,         # Confusion matrix for predictions
  magrittr,      # Pipes
  pacman,        # Load/unload packages
  rio,           # Import/export data
  tidyverse      # So many reasons
)

also installing the dependencies ‘shape’, ‘discretization’, ‘glmnet’





The downloaded binary packages are in
	/var/folders/rh/pfd6hrw52s35skb7d44l36nr0000gn/T//RtmpLHccwp/downloaded_packages



arulesCBA installed



In [2]:
# LOAD AND PREPARE DATA ####################################

# Set random seed for reproducibility in processes like
# splitting the data. You can use any number.
set.seed(1)

In [4]:
# Import pre-processed penguin data
df <- import("../data/penguins.rds") %>%
  print()

[90m# A tibble: 342 x 5[39m
   y      bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   [3m[90m<fct>[39m[23m           [3m[90m<dbl>[39m[23m         [3m[90m<dbl>[39m[23m             [3m[90m<int>[39m[23m       [3m[90m<int>[39m[23m
[90m 1[39m Adelie           39.1          18.7               181        [4m3[24m750
[90m 2[39m Adelie           39.5          17.4               186        [4m3[24m800
[90m 3[39m Adelie           40.3          18                 195        [4m3[24m250
[90m 4[39m Adelie           36.7          19.3               193        [4m3[24m450
[90m 5[39m Adelie           39.3          20.6               190        [4m3[24m650
[90m 6[39m Adelie           38.9          17.8               181        [4m3[24m625
[90m 7[39m Adelie           39.2          19.6               195        [4m4[24m675
[90m 8[39m Adelie           34.1          18.1               193        [4m3[24m475
[90m 9[39m Adelie           42 

In [5]:
# Discretize data using the "Minimum Description Length
# Principle" (MDLP) algorithm and save to `df`; by naming
# the `Species` variable `y` (done earlier), it's easier to
# reuse code
df <- 
  discretizeDF.supervised(  # Function from `arulesCBA`
    y ~ .,                  # Species based on rest
    data = df,              # Data source
    method = "mdlp"         # Algorithm to use
  )

In [6]:
# Check the first few rows of data
df        

y,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
<fct>,<fct>,<fct>,<fct>,<fct>
Adelie,"[-Inf,42.3)","[17.4, Inf]","[-Inf,192)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[-Inf,192)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[192,206)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[192,206)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[-Inf,192)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[-Inf,192)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[192,206)","[4.32e+03,4.82e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[192,206)","[-Inf,4.09e+03)"
Adelie,"[-Inf,42.3)","[17.4, Inf]","[-Inf,192)","[4.09e+03,4.32e+03)"
Adelie,"[-Inf,42.3)","[16.4,17.4)","[-Inf,192)","[-Inf,4.09e+03)"


In [7]:
# Split data into training (trn) and testing (tst) sets
df %<>% mutate(ID = row_number())  # Add row ID
trn <- df %>% sample_frac(.70)     # 70% in trn
tst <- df %>%                      # Start with df
  anti_join(trn, by = "ID") %>%    # Rest in tst
  select(-ID)                      # Remove id from tst
trn %<>% select(-ID)               # Remove id from trn

In [8]:
# MODEL DATA ###############################################

# Create a CBA model using `CBA` from `arulesCBA`
fit <- CBA(   # `fit` is a generic name for models
  y ~ .,      # Species based on all other variables
  data = trn  # Use training data
)

In [9]:
# Basic info on the model in `fit`
fit    

CBA Classifier Object
Class: 
Default Class: NA
Number of rules: 10
Classification method: first  
Description: CBA algorithm (Liu et al., 1998)


In [10]:
# Check the rules
options(digits = 2)  # Reset R session when done
inspect(rules(fit))  # Need a (very) wide Console window

     lhs                               rhs           support confidence coverage lift count size coveredTransactions totalErrors
[1]  {bill_length_mm=[-Inf,42.3),                                                                                               
      bill_depth_mm=[17.4, Inf]}    => {y=Adelie}       0.28       1.00     0.28  2.4    68    3                  68          81
[2]  {bill_depth_mm=[-Inf,15.4)}    => {y=Gentoo}       0.26       1.00     0.26  2.7    63    2                  63          60
[3]  {body_mass_g=[4.82e+03, Inf]}  => {y=Gentoo}       0.25       1.00     0.25  2.7    60    2                  27          33
[4]  {bill_length_mm=[46, Inf],                                                                                                 
      bill_depth_mm=[17.4, Inf]}    => {y=Chinstrap}    0.14       1.00     0.14  5.0    34    3                  34          14
[5]  {bill_length_mm=[46, Inf],                                                                  

In [11]:
# Check accuracy of the model on the training data
confusionMatrix(      # Create a confusion matrix
  reference = trn$y,  # True values
  predict(            # Predicted values
    fit,              # Based on the training model
    newdata = trn     # Use the training data
  )
)

Confusion Matrix and Statistics

           Reference
Prediction  Adelie Chinstrap Gentoo
  Adelie        94         1      0
  Chinstrap      6        47      0
  Gentoo         1         0     90

Overall Statistics
                                        
               Accuracy : 0.967         
                 95% CI : (0.935, 0.985)
    No Information Rate : 0.423         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.948         
                                        
 Mcnemar's Test P-Value : NA            

Statistics by Class:

                     Class: Adelie Class: Chinstrap Class: Gentoo
Sensitivity                  0.931            0.979         1.000
Specificity                  0.993            0.969         0.993
Pos Pred Value               0.989            0.887         0.989
Neg Pred Value               0.951            0.995         1.000
Prevalence                   0.423            0.201         

In [12]:
# TEST MODEL ###############################################

# Check accuracy of the model on the testing data
confusionMatrix(      # Create a confusion matrix
  reference = tst$y,  # True values
  predict(            # Predicted values
    fit,              # Based on the training model
    newdata = tst     # Use the testing data
  )
)

Confusion Matrix and Statistics

           Reference
Prediction  Adelie Chinstrap Gentoo
  Adelie        47         1      0
  Chinstrap      3        19      0
  Gentoo         0         0     33

Overall Statistics
                                        
               Accuracy : 0.961         
                 95% CI : (0.904, 0.989)
    No Information Rate : 0.485         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.938         
                                        
 Mcnemar's Test P-Value : NA            

Statistics by Class:

                     Class: Adelie Class: Chinstrap Class: Gentoo
Sensitivity                  0.940            0.950          1.00
Specificity                  0.981            0.964          1.00
Pos Pred Value               0.979            0.864          1.00
Neg Pred Value               0.945            0.988          1.00
Prevalence                   0.485            0.194         