We run tests on some benchmark problems using standard RIPPERk

In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn import datasets
from sklearn.model_selection import cross_val_score

In [2]:
import uci_dataset as data

In [3]:
import random
random.seed(10)

In [31]:
# The results will depend on the random train-test split, so we average the accuracies over a certain number of repetitions
n_rep = 10

In [5]:
import wittgenstein2 as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
from sklearn.metrics import roc_curve, roc_auc_score
from matplotlib import pyplot as plt

# Thyroid dataset

In [22]:
df = data.load_thyroid_disease()

In [23]:
df.columns

Index(['sick-euthyroid', 'age', 'sex', 'on_thyroxine', 'query_on_thyroxine',
       'on_antithyroid_medication', 'thyroid_surgery', 'query_hypothyroid',
       'query_hyperthyroid', 'pregnant', 'sick', 'tumor', 'lithium', 'goitre',
       'TSH_measured', 'TSH', 'T3_measured', 'T3', 'TT4_measured', 'TT4',
       'T4U_measured', 'T4U', 'FTI_measured', 'FTI', 'TBG_measured', 'TBG'],
      dtype='object')

In [24]:
X = df.loc[:,df.columns != 'sick-euthyroid']

In [26]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df['sick-euthyroid']
y = y.map(lambda x: 1 if x=='sick-euthyroid' else 0)

In [27]:
np.sum(y==1)

293

In [28]:
X

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,TBG,sex_F,sex_M,on_thyroxine_f,...,T3_measured_n,T3_measured_y,TT4_measured_n,TT4_measured_y,T4U_measured_n,T4U_measured_y,FTI_measured_n,FTI_measured_y,TBG_measured_n,TBG_measured_y
0,72.0,,1.0,83.0,0.95,87.0,,0,1,1,...,0,1,0,1,0,1,0,1,1,0
1,45.0,1.90,1.0,82.0,0.73,112.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
2,64.0,0.09,1.0,101.0,0.82,123.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3,56.0,0.00,0.8,76.0,0.77,99.0,,0,1,1,...,0,1,0,1,0,1,0,1,1,0
4,78.0,2.60,0.3,87.0,0.95,91.0,,1,0,0,...,0,1,0,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3158,40.0,2.10,1.2,76.0,0.90,84.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3159,69.0,2.60,1.8,126.0,1.02,124.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3160,58.0,5.80,1.7,86.0,0.91,95.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0
3161,29.0,0.80,1.8,99.0,1.01,98.0,,1,0,1,...,0,1,0,1,0,1,0,1,1,0


In [36]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9473579443357426

# Audiology (Standardized) Data Set

In [39]:
df = data.load_audiology()

In [40]:
df

Unnamed: 0,age_gt_60,air,airBoneGap,ar_c,ar_u,bone,boneAbnormal,bser,history_buzzing,history_dizziness,...,s_sn_gt_2k,s_sn_gt_4k,speech,static_normal,tymp,viith_nerve_signs,wave_V_delayed,waveform_ItoV_prolonged,indentifier,Class
0,f,mild,f,normal,normal,,t,,f,f,...,f,f,normal,t,a,f,f,f,p1,cochlear_unknown
1,f,moderate,f,normal,normal,,t,,f,f,...,f,f,normal,t,a,f,f,f,p2,cochlear_unknown
2,t,mild,t,,absent,mild,t,,f,f,...,f,f,normal,t,as,f,f,f,p3,mixed_cochlear_age_fixation
3,t,mild,t,,absent,mild,f,,f,f,...,f,f,normal,t,b,f,f,f,p4,mixed_cochlear_age_otitis_media
4,t,mild,f,normal,normal,mild,t,,f,f,...,f,f,good,t,a,f,f,f,p5,cochlear_age
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,t,mild,f,absent,normal,mild,t,,f,f,...,f,f,very_good,t,a,f,f,f,p196,cochlear_age
196,t,mild,f,normal,absent,mild,f,,f,f,...,f,f,very_good,t,c,f,f,f,p197,mixed_cochlear_age_otitis_media
197,f,normal,f,normal,normal,unmeasured,f,degraded,f,f,...,f,f,normal,f,a,f,f,f,p198,possible_brainstem_disorder
198,t,mild,f,normal,normal,unmeasured,f,,f,f,...,f,f,very_good,t,a,f,f,f,p199,cochlear_age


In [41]:
df.Class.unique()

array(['cochlear_unknown', 'mixed_cochlear_age_fixation',
       'mixed_cochlear_age_otitis_media', 'cochlear_age', 'normal_ear',
       'cochlear_poss_noise', 'cochlear_age_and_noise',
       'acoustic_neuroma', 'mixed_cochlear_unk_ser_om',
       'conductive_discontinuity', 'retrocochlear_unknown',
       'conductive_fixation', 'bells_palsy',
       'cochlear_noise_and_heredity', 'mixed_cochlear_unk_fixation',
       'mixed_poss_noise_om', 'otitis_media', 'possible_menieres',
       'possible_brainstem_disorder', 'cochlear_age_plus_poss_menieres',
       'mixed_cochlear_age_s_om', 'mixed_cochlear_unk_discontinuity',
       'mixed_poss_central_om', 'poss_central'], dtype=object)

In [42]:
class_feat = 'Class'
pos_class = 'cochlear_poss_noise'

In [44]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [45]:
# When working with multiclass classification we start off by separating the least represented classes

for cl in df.Class.unique():
    count = (df.Class == cl).sum()
    print(str(cl) + ': ' + str(count))

cochlear_unknown: 48
mixed_cochlear_age_fixation: 1
mixed_cochlear_age_otitis_media: 4
cochlear_age: 46
normal_ear: 20
cochlear_poss_noise: 16
cochlear_age_and_noise: 18
acoustic_neuroma: 1
mixed_cochlear_unk_ser_om: 3
conductive_discontinuity: 2
retrocochlear_unknown: 2
conductive_fixation: 6
bells_palsy: 1
cochlear_noise_and_heredity: 2
mixed_cochlear_unk_fixation: 5
mixed_poss_noise_om: 2
otitis_media: 4
possible_menieres: 8
possible_brainstem_disorder: 4
cochlear_age_plus_poss_menieres: 1
mixed_cochlear_age_s_om: 2
mixed_cochlear_unk_discontinuity: 2
mixed_poss_central_om: 1
poss_central: 1


In [46]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9640000000000001

# Autism Screening Dataset

In [60]:
df = data.load_autism_screening()

In [61]:
df

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,United States,no,6.0,18 and more,Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5.0,18 and more,Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8.0,18 and more,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,United States,no,6.0,18 and more,Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,,no,no,Egypt,no,2.0,18 and more,,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,0,1,0,1,1,0,1,1,1,1,...,f,White-European,no,no,Russia,no,7.0,18 and more,Self,YES
700,1,0,0,0,0,0,0,1,0,1,...,m,Hispanic,no,no,Mexico,no,3.0,18 and more,Parent,NO
701,1,0,1,1,1,0,1,1,0,1,...,f,,no,no,Russia,no,7.0,18 and more,,YES
702,1,0,0,1,1,0,1,0,1,1,...,m,South Asian,no,no,Pakistan,no,6.0,18 and more,Self,NO


In [62]:
(df['Class/ASD'] == 'YES').sum()

189

In [63]:
class_feat = 'Class/ASD'
pos_class = 'YES'

In [64]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [65]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

0.9992897384305836

#  Adult Data Set 

In [None]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')

In [None]:
df

In [None]:
class_feat = ' <=50K'
pos_class = ' <=50K'

In [None]:
(df[' <=50K'] == ' <=50K').sum()

In [None]:
# First dummify your categorical features and booleanize your class values to make sklearn happy

X = df.loc[:,df.columns != class_feat]
X = pd.get_dummies(X, columns=X.select_dtypes('object').columns)
y = df[class_feat]
y = y.map(lambda x: 1 if x==pos_class else 0)

In [None]:
acc = []

for i in range(n_rep):
    ripper_clf = lw.RIPPER(k=2)
    scores = cross_val_score(ripper_clf, X, y, cv = 10) 
    acc += [scores]
    
np.mean(acc)

#  Arrhythmia Data Set 

In [None]:
df = data.load_arrhythmia()

In [None]:
df

In [None]:
# When the diagnosis is 16, it means that the patient wasn't classified. We remove these observations

df = df.loc[df.diagnosis != 16]

In [None]:
class_feat = 'diagnosis'
pos_class = 1

In [85]:
acc = []

for i in range(n_rep):
    X_train, X_test = train_test_split(df, test_size = 0.2)
    ripper_clf = lw.RIPPER(k=2)
    ripper_clf.fit(X_train, class_feat = 'diagnosis', pos_class = 1)
    y_test = X_test['diagnosis']
    score = ripper_clf.score(X_test, y_test)
    acc += [score]
    
np.mean(acc)

0.5325581395348837