In [18]:
# Import all the necessary packages

import numpy as np
import pandas as pd
import scipy
import uci_dataset as data
import random
# Seed both generators. Seeding only the stdlib `random` module is not enough:
# scikit-learn's train_test_split draws from NumPy's global RNG when it is
# called without `random_state`, which is how this notebook calls it.
random.seed(10)
np.random.seed(10)

# The results depend on the random train-test split, so accuracies are averaged
# over this many repetitions.
n_rep = 40

import wittgenstein3 as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime

In [19]:
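# `dfs` is the dictionary of datasets used by the experiments: it maps a dataset name
# to its DataFrame and is read as a global by `experiment`.
# It is defined per dataset in the sections below, e.g. dfs = {'Thyroid': df}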

In [101]:
# Reference accuracy of standard RIPPERk, keyed by dataset name.
# `experiment` draws each value as a horizontal baseline on the dataset's plot.

stand_acc = dict(
    Thyroid=0.946,
    audiology=0.9738,
    autism=0.9968,
    adult=0.7099,
    arrhythmia=0.5262,
)

In [102]:
import numpy as np
import time
from Helper import LearningCurvePlot, smooth


def average_over_repetitions(n_repetitions, df, W, class_feat, pos_class, name_test, verbosity = 0):
    """Fit and score RIPPER on repeated random 80/20 splits of ``df``.

    Parameters
    ----------
    n_repetitions : int
        Number of independent train/test splits to evaluate.
    df : pandas.DataFrame
        Full dataset, including the class column ``class_feat``.
    W : float
        Forwarded unchanged as the ``W`` argument of ``lw.RIPPER``.
    class_feat : str
        Name of the column that holds the class labels.
    pos_class
        Label value treated as the positive class.
    name_test : str
        Dataset name; used as the prefix of the results file name.
    verbosity : int, optional
        Forwarded to ``lw.RIPPER``.

    Returns
    -------
    learning_curve : float
        Mean of the per-repetition scores.
    acc : numpy.ndarray
        The ``n_repetitions`` individual scores, in run order.

    Side effects
    ------------
    Writes ``acc`` to ``Results/<name_test>_W<W>_time<dd-mm-YYYY_HH.MM.SS>``
    (creating ``Results/`` if needed) and prints the elapsed wall-clock time.
    """
    import os  # local import: only needed to make sure the output directory exists

    acc = np.empty(n_repetitions) # one score per repetition, filled by index below
    started_at = datetime.now()   # timestamp used in the results file name
    t0 = time.time()              # wall-clock reference for the timing message

    for i in range(n_repetitions):
        X_train, X_test = train_test_split(df, test_size = 0.2)
        ripper_clf = lw.RIPPER(k=2, verbosity = verbosity, W = W)
        ripper_clf.fit(X_train, class_feat = class_feat, pos_class = pos_class)
        # The true labels live in the class column; indexing by `pos_class` only
        # worked when the column and the positive label shared the same name.
        y_test = X_test[class_feat]
        # NOTE(review): X_test still contains the class column, as before;
        # confirm that lw.RIPPER.score ignores it.
        # Store by index: `acc += [score]` added the score to every
        # (uninitialised) slot instead of recording it.
        acc[i] = ripper_clf.score(X_test, y_test)

    # Save results to a text file
    os.makedirs('Results', exist_ok = True)
    np.savetxt('Results/' + name_test + '_W' + str(W) + '_time' + started_at.strftime("%d-%m-%Y_%H.%M.%S"),acc,delimiter = ',')

    print('Running one setting takes {} minutes'.format((time.time()-t0)/60))
    learning_curve = np.mean(acc) # average over repetitions
    return learning_curve, acc

def experiment(n_repetitions, DFs, verbosity = 0, Ws = np.arange(0,1,0.01)):
    """Sweep ``W`` for every dataset listed in ``DFs`` and plot the mean scores.

    For each dataset, a plot is created with the standard-RIPPER baseline from
    ``stand_acc`` as a horizontal line and a single curve holding the mean
    score obtained for each value in ``Ws``. The plot is saved as
    ``<name_test>.png``.

    Parameters
    ----------
    n_repetitions : int
        Number of train/test repetitions per value of ``W``.
    DFs : pandas.DataFrame
        One row per dataset, with columns:
            tests: dataset name; key into the global ``dfs`` and ``stand_acc``
            class: name of the class column of that dataset
            pos:   label of the positive class
    verbosity : int, optional
        Forwarded to ``average_over_repetitions``.
    Ws : iterable of float, optional
        Values of ``W`` to evaluate.

    Notes
    -----
    Reads the module-level names ``dfs`` (dataset name -> DataFrame) and
    ``stand_acc``; both must be defined before calling.
    """
    for name_test in DFs['tests']:
        # Positional .iloc[0]: the matching row carries its original index label,
        # so the label lookup `[0]` only worked for the first row of DFs.
        is_current = DFs['tests'] == name_test
        class_feat = DFs['class'][is_current].iloc[0]
        pos_class = DFs['pos'][is_current].iloc[0]

        Plot1 = LearningCurvePlot(title = name_test)
        Plot1.add_hline(stand_acc[name_test], 'standard' + name_test)

        # Collect one mean score per W and draw them as a single curve; adding a
        # separate "curve" per W handed add_curve a lone scalar each time.
        # NOTE(review): add_curve is only given y-values here, so the x-axis is
        # presumably the position in Ws - confirm against Helper.LearningCurvePlot.
        mean_accs = np.array([
            average_over_repetitions(n_repetitions, dfs[name_test], W,
                                     class_feat, pos_class, name_test, verbosity)[0]
            for W in Ws
        ])
        Plot1.add_curve(mean_accs, label = name_test)

        # Save inside the loop so every dataset gets its own file. The old call
        # ran once after the loop and used the undefined names `labels`/`problem`.
        Plot1.save(name_test + '.png')

Now let's rerun the previous tests while sweeping W over a grid of values in [0, 1) with step 0.01.

# Thyroid dataset

In [103]:
# Load the thyroid disease data through the uci_dataset package (imported as `data`).
# The name `df` is reassigned by every dataset section below.
df = data.load_thyroid_disease()

In [104]:
# Dataset key: must match the entry in `stand_acc` and the key used in `dfs`.
name_test = 'Thyroid'

In [105]:
# Dataset registry that `experiment` reads as a global (dataset name -> DataFrame)
dfs = dict(Thyroid=df)

In [106]:
# One-row experiment table: dataset key, class column, positive label
DFs = pd.DataFrame(
    [{'tests': name_test, 'class': 'sick-euthyroid', 'pos': 'sick-euthyroid'}]
)

In [None]:
# Use the notebook-wide repetition count instead of repeating the literal 40
experiment(n_rep, DFs, Ws = np.arange(0,1,0.01))

# Audiology (Standardized) Data Set

In [None]:
# Load the audiology data through the uci_dataset package (imported as `data`).
# This overwrites the `df` of the previous section.
df = data.load_audiology()

In [None]:
# Dataset key: must match the entry in `stand_acc` and the key used in `dfs`.
name_test = 'audiology'

In [None]:
# Dataset registry that `experiment` reads as a global (dataset name -> DataFrame)
dfs = dict(audiology=df)

In [None]:
# One-row experiment table: dataset key, class column, positive label
DFs = pd.DataFrame(
    [{'tests': name_test, 'class': 'Class', 'pos': 'cochlear_poss_noise'}]
)

In [None]:
# Use the notebook-wide repetition count instead of repeating the literal 40;
# W is swept over experiment's default grid.
experiment(n_rep, DFs)

# Autism Screening Dataset

In [None]:
# Load the autism screening data through the uci_dataset package (imported as `data`).
# This overwrites the `df` of the previous section.
df = data.load_autism_screening()

In [None]:
# Dataset key: must match the entry in `stand_acc` and the key used in `dfs`.
name_test = 'autism'

In [None]:
# Dataset registry that `experiment` reads as a global (dataset name -> DataFrame)
dfs = dict(autism=df)

In [None]:
# One-row experiment table: dataset key, class column, positive label
DFs = pd.DataFrame(
    [{'tests': name_test, 'class': 'Class/ASD', 'pos': 'YES'}]
)

In [None]:
# Use the notebook-wide repetition count instead of repeating the literal 40;
# W is swept over experiment's default grid.
experiment(n_rep, DFs)

# Adult Data Set

In [None]:
# adult.data ships without a header row, so read_csv's default header handling
# consumed the first record as column names. Read it headerless and name the
# columns explicitly. The label column keeps the name ' <=50K' (what the old
# parse produced) because the experiment table of this section refers to the
# class column by that name.
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                 ' <=50K']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
                 header = None, names = adult_columns)

In [None]:
# Dataset key: must match the entry in `stand_acc` and the key used in `dfs`.
name_test = 'adult'

In [None]:
# Dataset registry that `experiment` reads as a global (dataset name -> DataFrame)
dfs = dict(adult=df)

In [None]:
# One-row experiment table: dataset key, class column, positive label
DFs = pd.DataFrame(
    [{'tests': name_test, 'class': ' <=50K', 'pos': ' <=50K'}]
)

In [None]:
# Use the notebook-wide repetition count instead of repeating the literal 40;
# W is swept over experiment's default grid.
experiment(n_rep, DFs)

# Arrhythmia Data Set 

In [None]:
# Load the arrhythmia data through the uci_dataset package (imported as `data`).
# This overwrites the `df` of the previous section.
df = data.load_arrhythmia()

In [None]:
# Dataset key: must match the entry in `stand_acc` and the key used in `dfs`.
name_test = 'arrhythmia'

In [None]:
# Dataset registry that `experiment` reads as a global (dataset name -> DataFrame)
dfs = dict(arrhythmia=df)

In [None]:
# One-row experiment table: dataset key, class column, positive label
DFs = pd.DataFrame(
    [{'tests': name_test, 'class': 'diagnosis', 'pos': 1}]
)

In [None]:
# Use the notebook-wide repetition count instead of repeating the literal 40;
# W is swept over experiment's default grid.
experiment(n_rep, DFs)