In [1]:
# Enable autoreload so that edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import random
from os.path import join

import arff
from snapshottest import TestCase

from src import DEFAULT_DIR
from src.XPLAIN_explainer import XPLAIN_explainer
from src.dataset import Dataset


def load_arff(f) -> Dataset:
    """Parse an ARFF file object and wrap its contents in a `Dataset`.

    `f` is handed to `arff.load`; the 'data' and 'attributes' entries of the
    parsed result are passed, in that order, to the `Dataset` constructor.
    """
    parsed = arff.load(f)
    return Dataset(parsed['data'], parsed['attributes'])


def import_dataset_arff(f, explain_indices, random_explain_dataset: bool):
    """Load one ARFF file and split it into a training and an explain dataset.

    Parameters
    ----------
    f
        File object passed to `load_arff`.
    explain_indices
        Positions of the instances to move from the training dataset to the
        explain dataset. Replaced by a random sample when
        `random_explain_dataset` is true.
    random_explain_dataset
        When true, the explain instances are drawn at random after seeding the
        global `random` generator with 1: 20% of the instances if the dataset
        has fewer than 200 of them, otherwise 100 instances.

    Returns
    -------
    tuple
        `(train_dataset, explain_dataset, explain_indices_as_strings)`.

    Raises
    ------
    ValueError
        If an explain index is not a position of the dataset or appears more
        than once.
    """
    dataset = load_arff(f)
    dataset_len = len(dataset)

    if random_explain_dataset:
        # Fixed seed: the same instances are selected on every run
        random.seed(1)
        # small dataset
        MAX_SAMPLE_COUNT = 100
        if dataset_len < (2 * MAX_SAMPLE_COUNT):
            samples = int(0.2 * dataset_len)
        else:
            samples = MAX_SAMPLE_COUNT

        # Randomly pick some instances to remove from the training dataset and use in the
        # explain dataset
        explain_indices = list(random.sample(range(dataset_len), samples))

    # Set-based removal keeps the split linear in the dataset size instead of
    # paying a full list scan for every explain index.
    remaining = set(range(dataset_len))
    for i in explain_indices:
        try:
            remaining.remove(i)
        except KeyError:
            raise ValueError(
                "explain index {!r} is out of range or duplicated".format(i)) from None

    # Preserve the original ascending order of the training instances
    train_indices = [i for i in range(dataset_len) if i in remaining]

    train_dataset = Dataset.from_indices(train_indices, dataset)
    explain_dataset = Dataset.from_indices(explain_indices, dataset)

    return train_dataset, explain_dataset, [str(i) for i in explain_indices]


def import_datasets_arff(f, f_explain, explain_indices, random_explain_dataset: bool):
    """Load a training ARFF file and a separate explain ARFF file.

    Parameters
    ----------
    f
        File object of the training dataset, passed to `load_arff`.
    f_explain
        File object of the explain dataset, passed to `load_arff`.
    explain_indices
        Indices reported back (as strings) when `random_explain_dataset` is
        false. Replaced by a random sample otherwise.
    random_explain_dataset
        When true, the global `random` generator is seeded with 7 and up to
        300 instances of the explain dataset are sampled; the explain dataset
        is reduced to those instances.

    Returns
    -------
    tuple
        `(train_dataset, explain_dataset, explain_indices_as_strings)`.
    """
    train_dataset = load_arff(f)
    explain_dataset = load_arff(f_explain)

    len_explain_dataset = len(explain_dataset)

    if random_explain_dataset:
        # Fixed seed: the same instances are selected on every run
        random.seed(7)
        MAX_SAMPLE_COUNT = 300
        # random.sample raises ValueError when asked for more elements than the
        # population holds, so never request more than the dataset contains.
        samples = min(MAX_SAMPLE_COUNT, len_explain_dataset)
        explain_indices = list(random.sample(range(len_explain_dataset), samples))
        explain_dataset = Dataset.from_indices(explain_indices, explain_dataset)

    # NOTE(review): when random_explain_dataset is false the explain dataset is
    # returned whole and explain_indices is only echoed back — confirm this is intended.
    return train_dataset, explain_dataset, [str(i) for i in explain_indices]


def get_classifier(classifier_name: str):
    """Build an unfitted scikit-learn estimator selected by name.

    "sklearn_nb" gives a `MultinomialNB`; "sklearn_rf" gives a pipeline made of
    a `OneHotEncoder` followed by a `RandomForestClassifier(random_state=42)`.
    Any other name raises `ValueError`. scikit-learn is imported lazily, only
    for the estimator that was requested.
    """
    if classifier_name not in ("sklearn_nb", "sklearn_rf"):
        raise ValueError("Classifier not available")

    if classifier_name == "sklearn_nb":
        from sklearn.naive_bayes import MultinomialNB

        return MultinomialNB()

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder

    return make_pipeline(OneHotEncoder(), RandomForestClassifier(random_state=42))


def get_explanation(dataset_name: str, classifier_name: str):
    """Fit a classifier on a dataset and explain the first explain instance.

    The adult and compas datasets under `DEFAULT_DIR` come with a companion
    "<name>_explain.arff" file, which is loaded through `import_datasets_arff`;
    every other dataset is split by `import_dataset_arff`. In both cases the
    explain instances are chosen at random by the import helper.

    The classifier returned by `get_classifier(classifier_name)` is fit on the
    training data, and the explainer is asked to explain instance 0 of the
    explain dataset with respect to that instance's own decoded class value.
    """
    datasets_with_explain_file = [
        join(DEFAULT_DIR, "datasets/adult_d.arff"),
        join(DEFAULT_DIR, "datasets/compas-scores-two-years_d.arff"),
    ]
    no_indices = []

    if dataset_name not in datasets_with_explain_file:
        with open(dataset_name) as f:
            train_dataset, explain_dataset, _ = import_dataset_arff(f, no_indices, True)
    else:
        # Drop the trailing ".arff" and point at the companion explain file
        with open(dataset_name) as f, open(dataset_name[:-5] + "_explain.arff") as f_explain:
            train_dataset, explain_dataset, _ = import_datasets_arff(
                f, f_explain, no_indices, True)

    unfitted = get_classifier(classifier_name)
    clf = unfitted.fit(train_dataset.X_numpy(), train_dataset.Y_numpy())
    explainer = XPLAIN_explainer(clf, train_dataset)

    first_decoded = explain_dataset.get_decoded(0)
    class_column = explain_dataset.class_column_name()
    target_class_index = first_decoded[class_column]

    return explainer.explain_instance(explain_dataset[0], target_class_index)

/home/andrea/Documents/Politecnico/tirocinio/src/../


In [3]:
def l3wrapper(instance_ix, train):
    """Return the attribute positions of the L3 level-1 rules covering an instance.

    An `L3Classifier` (min_sup=0.01, min_conf=0.50) is fit on the decoded
    `train` data on every call. A rule is kept when every (attribute, value)
    pair in its body equals the instance's value for that attribute and the
    rule's class equals the last element of the decoded instance.

    Parameters
    ----------
    instance_ix
        Index into `train` of the instance to match rules against.
    train
        Dataset providing `X_decoded()`, `Y_decoded()`, item access and
        `inverse_transform_instance()`.

    Returns
    -------
    list of list of int
        One entry per kept rule: the sorted 1-based positions, within the
        decoded instance's index, of the attributes used by the rule body.
    """
    from l3wrapper.l3wrapper import L3Classifier

    clf = L3Classifier(min_sup=0.01, min_conf=0.50)
    clf.fit(train.X_decoded(),
            train.Y_decoded(),
            column_names=train.X_decoded().columns.to_list(),
            remove_files=False)

    decoded_instance = train.inverse_transform_instance(train[instance_ix])
    # Maps an attribute name to its position in the decoded instance
    attribute_index = decoded_instance.index

    def decode_rule(r):
        """Translate a rule's item ids and class id into attribute names, values and class."""
        r_class = clf._class_dict[r.class_id]
        r_attr_ixs_and_values = sorted([clf._item_id_to_item[i] for i in r.item_ids])
        r_attrs_and_values = [(clf._column_id_to_name[c], v) for c, v in r_attr_ixs_and_values]
        return {'body': r_attrs_and_values, 'class': r_class}

    rules = []

    for r in clf.lvl1_rules_:
        # Decode once per rule; the result is reused for the body test, the
        # class test and the position lookup.
        decoded_rule = decode_rule(r)
        body = decoded_rule['body']

        # Skip the rule as soon as it uses a value different from the instance's
        if any(decoded_instance[a] != v for a, v in body):
            continue

        # Class matching
        if decoded_rule['class'] == decoded_instance.iloc[-1]:
            rules.append(sorted(attribute_index.get_loc(a) + 1 for a, _ in body))

    return rules

In [4]:
# Load the zoo dataset and split it into train/explain parts. Passing True makes
# import_dataset_arff pick the explain instances at random (with a fixed seed),
# so the empty index list is replaced by the sampled indices.
with open(join(DEFAULT_DIR, "datasets/zoo.arff")) as zoo_f:
    zoo_train, zoo_explain, zoo_explain_indices = import_dataset_arff(zoo_f, [], True)

In [5]:
# Rules covering the instance at position 44 of the zoo training split
l3wrapper(44, zoo_train)

[[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15]]

In [6]:
from multiprocessing import Pool
def f(i):
    """Pair instance index `i` with the rules `l3wrapper` finds for it in `zoo_train`."""
    matching_rules = l3wrapper(i, zoo_train)
    return (i, matching_rules)

In [7]:
%%time
for i in range(40):
    print(f(i))

(0, [[2, 3, 4, 8, 9, 10, 11]])
(1, [[2, 3, 4, 8, 9, 10, 11]])
(2, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(3, [[2, 3, 4, 8, 9, 10, 11]])
(4, [[2, 3, 4, 8, 9, 10, 11]])
(5, [[2, 3, 4, 8, 9, 10, 11]])
(6, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(7, [[2, 3, 4, 8, 9, 10, 11]])
(8, [[2, 3, 4, 8, 9, 10, 11]])
(9, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(10, [[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15], [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16]])
(11, [[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15]])
(12, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(13, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(14, [[2, 3, 4, 8, 9, 10, 11]])
(15, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(16, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(17, [[2, 3, 4, 8, 9, 10, 11]])
(18, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(19, [[2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 16]])
(20, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16]])
(21, [[2, 3, 4, 8, 9, 10, 11]])
(22, [[2, 3, 4, 8, 9, 10, 11]])
(23, [[2, 3, 4, 

In [8]:
%%time
with Pool(4) as p:
    [print(r) for r in p.map(f, range(min(len(zoo_train), 40)))]

(0, [[2, 3, 4, 8, 9, 10, 11]])
(1, [[2, 3, 4, 8, 9, 10, 11]])
(2, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(3, [[2, 3, 4, 8, 9, 10, 11]])
(4, [[2, 3, 4, 8, 9, 10, 11]])
(5, [[2, 3, 4, 8, 9, 10, 11]])
(6, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(7, [[2, 3, 4, 8, 9, 10, 11]])
(8, [[2, 3, 4, 8, 9, 10, 11]])
(9, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(10, [[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15], [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16]])
(11, [[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15]])
(12, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(13, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]])
(14, [[2, 3, 4, 8, 9, 10, 11]])
(15, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(16, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(17, [[2, 3, 4, 8, 9, 10, 11]])
(18, [[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]])
(19, [[2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 16]])
(20, [[1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 16]])
(21, [[2, 3, 4, 8, 9, 10, 11]])
(22, [[2, 3, 4, 8, 9, 10, 11]])
(23, [[2, 3, 4, 