We run tests on some benchmark problems using standard RIPPERk

In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn import datasets
from sklearn.model_selection import cross_val_score

In [2]:
import uci_dataset as data

In [3]:
import random
random.seed(10)

In [31]:
# The results will depend on the random train-test split, so we average the accuracies over a certain number of repetitions
n_rep = 10

In [5]:
import wittgenstein2 as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Thyroid dataset

In [22]:
df = data.load_thyroid_disease()

In [23]:
df.columns

Index(['sick-euthyroid', 'age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured', 'TT4',
       'T4U_measured', 'T4U', 'FTI_measured', 'FTI', 'TBG_measured', 'TBG'],
      dtype='object')

In [24]:
X = df.loc[:,df.columns != 'sick-euthyroid']

In [26]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df['sick-euthyroid']
y = y.map(lambda x: 1 if x=='sick-euthyroid' else 0)

In [27]:
np.sum(y==1)

293

In [28]:
X

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,sex_F,sex_M,on_thyroxine_f,...,T3_measured_n,T3_measured_y,TT4_measured_n,TT4_measured_y,T4U_measured_n,T4U_measured_y,FTI_measured_n,FTI_measured_y,TBG_measured_n,TBG_measured_y
0,72.0,,1.0,83.0,0.95,87.0,,0,1,1,...,0,1,0,1,0,1,0,1,1,0
1,45.0,1.90,1.0,82.0,0.73,112.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
2,64.0,0.09,1.0,101.0,0.82,123.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3,56.0,0.00,0.8,76.0,0.77,99.0,,0,1,1,...,0,1,0,1,0,1,0,1,1,0
4,78.0,2.60,0.3,87.0,0.95,91.0,,1,0,0,...,0,1,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,40.0,2.10,1.2,76.0,0.90,84.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3159,69.0,2.60,1.8,126.0,1.02,124.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3160,58.0,5.80,1.7,86.0,0.91,95.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3161,29.0,0.80,1.8,99.0,1.01,98.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0


In [36]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9473579443357426

# Audiology (Standardized) Data Set

In [39]:
df = data.load_audiology()

In [40]:
df

Unnamed: 0,age_gt_60,air,airBoneGap,ar_c,ar_u,bone,boneAbnormal,bser,history_buzzing,history_dizziness,...,s_sn_gt_2k,s_sn_gt_4k,speech,static_normal,tymp,viith_nerve_signs,wave_V_delayed,waveform_ItoV_prolonged,indentifier,Class
0,f,mild,f,normal,normal,,t,,f,f,...,f,f,normal,t,a,f,f,f,p1,cochlear_unknown
1,f,moderate,f,normal,normal,,t,,f,f,...,f,f,normal,t,a,f,f,f,p2,cochlear_unknown
2,t,mild,t,,absent,mild,t,,f,f,...,f,f,normal,t,as,f,f,f,p3,mixed_cochlear_age_fixation
3,t,mild,t,,absent,mild,f,,f,f,...,f,f,normal,t,b,f,f,f,p4,mixed_cochlear_age_otitis_media
4,t,mild,f,normal,normal,mild,t,,f,f,...,f,f,good,t,a,f,f,f,p5,cochlear_age
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,t,mild,f,absent,normal,mild,t,,f,f,...,f,f,very_good,t,a,f,f,f,p196,cochlear_age
196,t,mild,f,normal,absent,mild,f,,f,f,...,f,f,very_good,t,c,f,f,f,p197,mixed_cochlear_age_otitis_media
197,f,normal,f,normal,normal,unmeasured,f,degraded,f,f,...,f,f,normal,f,a,f,f,f,p198,possible_brainstem_disorder
198,t,mild,f,normal,normal,unmeasured,f,,f,f,...,f,f,very_good,t,a,f,f,f,p199,cochlear_age


In [41]:
df.Class.unique()

array(['cochlear_unknown', 'mixed_cochlear_age_fixation',
       'mixed_cochlear_age_otitis_media', 'cochlear_age', 'normal_ear',
       'cochlear_poss_noise', 'cochlear_age_and_noise',
       'acoustic_neuroma', 'mixed_cochlear_unk_ser_om',
       'conductive_discontinuity', 'retrocochlear_unknown',
       'conductive_fixation', 'bells_palsy',
       'cochlear_noise_and_heredity', 'mixed_cochlear_unk_fixation',
       'mixed_poss_noise_om', 'otitis_media', 'possible_menieres',
       'possible_brainstem_disorder', 'cochlear_age_plus_poss_menieres',
       'mixed_cochlear_age_s_om', 'mixed_cochlear_unk_discontinuity',
       'mixed_poss_central_om', 'poss_central'], dtype=object)

In [42]:
class_feat = 'Class'
pos_class = 'cochlear_poss_noise'

In [44]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [45]:
# When working with multiclass classification we start off by separating the least represented classes

for cl in df.Class.unique():
    count = (df.Class == cl).sum()
    print(str(cl) + ': ' + str(count))

cochlear_unknown: 48
mixed_cochlear_age_fixation: 1
mixed_cochlear_age_otitis_media: 4
cochlear_age: 46
normal_ear: 20
cochlear_poss_noise: 16
cochlear_age_and_noise: 18
acoustic_neuroma: 1
mixed_cochlear_unk_ser_om: 3
conductive_discontinuity: 2
retrocochlear_unknown: 2
conductive_fixation: 6
bells_palsy: 1
cochlear_noise_and_heredity: 2
mixed_cochlear_unk_fixation: 5
mixed_poss_noise_om: 2
otitis_media: 4
possible_menieres: 8
possible_brainstem_disorder: 4
cochlear_age_plus_poss_menieres: 1
mixed_cochlear_age_s_om: 2
mixed_cochlear_unk_discontinuity: 2
mixed_poss_central_om: 1
poss_central: 1


In [46]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9640000000000001

# Autism Screening Dataset

In [60]:
df = data.load_autism_screening()

In [61]:
df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5.0,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8.0,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6.0,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,,no,no,Egypt,no,2.0,18 and more,,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0,1,0,1,1,0,1,1,1,1,...,f,White-European,no,no,Russia,no,7.0,18 and more,Self,YES
700,1,0,0,0,0,0,0,1,0,1,...,m,Hispanic,no,no,Mexico,no,3.0,18 and more,Parent,NO
701,1,0,1,1,1,0,1,1,0,1,...,f,,no,no,Russia,no,7.0,18 and more,,YES
702,1,0,0,1,1,0,1,0,1,1,...,m,South Asian,no,no,Pakistan,no,6.0,18 and more,Self,NO


In [62]:
(df['Class/ASD'] == 'YES').sum()

189

In [63]:
class_feat = 'Class/ASD'
pos_class = 'YES'

In [64]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [65]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9992897384305836

#  Adult Data Set 

In [66]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')

In [67]:
df

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [68]:
class_feat = ' <=50K'
pos_class = ' <=50K'

In [69]:
(df[' <=50K'] == ' <=50K').sum()

24719

In [70]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [71]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.7208691646191646

#  Arrhythmia Data Set 

In [72]:
df = data.load_arrhythmia()

In [73]:
df

Unnamed: 0,Age,Sex,Height,Weight,QRS_Dur,P-R_Int,Q-T_Int,T_Int,P_Int,QRS,...,V6271,V6272,V6273,V6274,V6275,V6276,V6277,V6278,V6279,diagnosis
0,75,0,190,80,91,193,371,174,121,-16,...,0.0,9.0,-0.9,0.0,0.0,0.9,2.9,23.3,49.4,8
1,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0.0,0.2,2.1,20.4,38.8,6
2,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0.0,0.3,3.4,12.3,49.0,10
3,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0.0,0.4,2.6,34.6,61.6,1
4,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0.0,-0.1,3.9,25.4,62.8,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,53,1,160,70,80,199,382,154,117,-37,...,0.0,4.3,-5.0,0.0,0.0,0.7,0.6,-4.4,-0.5,1
448,37,0,190,85,100,137,361,201,73,86,...,0.0,15.6,-1.6,0.0,0.0,0.4,2.4,38.0,62.4,10
449,36,0,166,68,108,176,365,194,116,-85,...,0.0,16.3,-28.6,0.0,0.0,1.5,1.0,-44.2,-33.2,2
450,32,1,155,55,93,106,386,218,63,54,...,-0.4,12.0,-0.7,0.0,0.0,0.5,2.4,25.0,46.6,1


In [74]:
# When the diagnosis is 16, it means that the patient wasn't classified. We remove these observations

df = df.loc[df.diagnosis != 16]

In [75]:
class_feat = 'diagnosis'
pos_class = 1

In [76]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [77]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.5406976744186047