# Compare `l3wrapper` and `AL3.jar`

## Setup

In [1]:
import random
from os.path import join
from typing import Tuple, List

import arff
from snapshottest import TestCase

from src import DEFAULT_DIR
from src.XPLAIN_explainer import XPLAIN_explainer
from src.dataset import Dataset


def load_arff(f) -> Dataset:
    """Parse an ARFF file object into a `Dataset`.

    `f` is passed straight to `arff.load`; the 'data' and 'attributes'
    entries of the parsed result become the two arguments of `Dataset`.
    """
    parsed = arff.load(f)
    return Dataset(parsed['data'], parsed['attributes'])


def import_dataset_arff(f, explain_indices: List[int],
                        random_explain_dataset: bool) -> Tuple[Dataset, Dataset, List[str]]:
    """Load one ARFF file and split it into a training and an explain dataset.

    When `random_explain_dataset` is true, the `explain_indices` argument is
    ignored and replaced by a random sample of row indices: 20% of the rows
    if the dataset has fewer than 200 rows, otherwise 100 rows. The global
    `random` generator is re-seeded with 1 first, so the sample is
    reproducible (and the module-level random state is reset as a side
    effect).

    Returns the training dataset (all rows not in the explain indices), the
    explain dataset, and the explain indices rendered as strings.

    Raises `ValueError` (from `list.remove`) if an explain index is not a
    valid row index or appears more than once.
    """
    dataset = load_arff(f)

    n_rows = len(dataset)
    train_indices = list(range(n_rows))

    if random_explain_dataset:
        random.seed(1)
        max_sample_count = 100
        # Small datasets give up a fifth of their rows; larger ones a fixed amount
        small_dataset = n_rows < (2 * max_sample_count)
        samples = int(0.2 * n_rows) if small_dataset else max_sample_count

        # These rows leave the training split and form the explain split
        explain_indices = random.sample(train_indices, samples)

    for picked in explain_indices:
        train_indices.remove(picked)

    train_dataset = Dataset.from_indices(train_indices, dataset)
    explain_dataset = Dataset.from_indices(explain_indices, dataset)
    explain_ids = [str(ix) for ix in explain_indices]

    return train_dataset, explain_dataset, explain_ids


def import_datasets_arff(f, f_explain, explain_indices: List[int],
                         random_explain_dataset: bool) -> Tuple[Dataset, Dataset, List[str]]:
    """Load a training ARFF file and a separate explain ARFF file.

    When `random_explain_dataset` is true, the `explain_indices` argument is
    ignored: the global `random` generator is re-seeded with 7 and up to 300
    rows of the explain file are sampled to form the explain dataset.

    When it is false, the explain dataset is returned whole; the given
    `explain_indices` are not used to subset it and are only converted to
    strings for the third return value.

    Returns the training dataset, the explain dataset, and the explain
    indices rendered as strings.
    """
    train_dataset = load_arff(f)
    explain_dataset = load_arff(f_explain)

    if random_explain_dataset:
        len_explain_dataset = len(explain_dataset)
        random.seed(7)
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample size at the dataset length.
        sample_size = min(300, len_explain_dataset)
        explain_indices = random.sample(range(len_explain_dataset), sample_size)
        explain_dataset = Dataset.from_indices(explain_indices, explain_dataset)

    return train_dataset, explain_dataset, [str(i) for i in explain_indices]


def get_classifier(classifier_name: str):
    """Build an unfitted scikit-learn estimator selected by name.

    "sklearn_nb" gives a `MultinomialNB`; "sklearn_rf" gives a pipeline of a
    `OneHotEncoder` followed by a `RandomForestClassifier` with
    `random_state=42`. scikit-learn is imported lazily, only for the
    requested estimator.

    Raises `ValueError` for any other name.
    """
    if classifier_name == "sklearn_nb":
        from sklearn.naive_bayes import MultinomialNB

        return MultinomialNB()

    if classifier_name == "sklearn_rf":
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import OneHotEncoder

        return make_pipeline(OneHotEncoder(), RandomForestClassifier(random_state=42))

    raise ValueError("Classifier not available")


def get_explanation(dataset_name: str, classifier_name: str):
    """Train a classifier on a dataset and explain its first explain instance.

    The adult and compas datasets under `DEFAULT_DIR` ship with a companion
    "<name>_explain.arff" file and are loaded with `import_datasets_arff`;
    every other dataset is split from a single file with
    `import_dataset_arff`. In both cases the explain rows are chosen randomly
    by the loader.

    The classifier named by `classifier_name` is fitted on the training data
    and wrapped in an `XPLAIN_explainer`. The explanation is computed for
    explain instance 0, using that instance's decoded class-column value as
    the target class.
    """
    explain_dataset_indices = []
    two_file_datasets = [join(DEFAULT_DIR, "datasets/adult_d.arff"),
                         join(DEFAULT_DIR, "datasets/compas-scores-two-years_d.arff")]

    if dataset_name not in two_file_datasets:
        with open(dataset_name) as f:
            train_dataset, explain_dataset, _ = import_dataset_arff(
                f, explain_dataset_indices, True)
    else:
        # Drop the ".arff" suffix (5 characters) to derive the companion file name
        explain_path = dataset_name[:-5] + "_explain.arff"
        with open(dataset_name) as f, open(explain_path) as f_explain:
            train_dataset, explain_dataset, _ = import_datasets_arff(
                f, f_explain, explain_dataset_indices, True)

    unfitted = get_classifier(classifier_name)
    clf = unfitted.fit(train_dataset.X_numpy(), train_dataset.Y_numpy())
    explainer = XPLAIN_explainer(clf, train_dataset)

    decoded_first = explain_dataset.get_decoded(0)
    class_column = explain_dataset.class_column_name()
    target_class_index = decoded_first[class_column]

    return explainer.explain_instance(explain_dataset[0], target_class_index)

/home/andrea/Documents/Politecnico/tirocinio/src/../


In [20]:
def l3wrapper(instance_ix, train):
    """Return the level-1 L3 rules that match one training instance.

    An `L3Classifier` (min_sup=0.01, min_conf=0.50) is fitted on the decoded
    training data. Every rule in `clf.lvl1_rules_` whose (attribute, value)
    pairs all equal the corresponding values of instance `instance_ix` is
    kept.

    Returns a list with one entry per matching rule: the sorted positions of
    the rule's attributes within the decoded instance, plus one.
    NOTE(review): the +1 presumably matches AL3.jar's 1-based attribute
    numbering — confirm.
    """
    from l3wrapper.l3wrapper import L3Classifier

    clf = L3Classifier(min_sup=0.01, min_conf=0.50)
    X_decoded = train.X_decoded()
    clf.fit(X_decoded,
            train.Y_decoded(),
            column_names=X_decoded.columns.to_list())

    decoded_instance = train.inverse_transform_instance(train[instance_ix])
    instance_index = decoded_instance.index

    def decode_rule_body(r):
        """Translate a rule's item ids into sorted (column name, value) pairs."""
        column_ids_and_values = sorted(clf._item_id_to_item[i] for i in r.item_ids)
        return [(clf._column_id_to_name[c], v) for c, v in column_ids_and_values]

    rules = []

    for r in clf.lvl1_rules_:
        # Decode once and reuse the result for both the match test and the output
        body = decode_rule_body(r)

        # A rule applies only if none of its attribute values differs from the
        # instance's value for that attribute
        if any(decoded_instance[a] != v for a, v in body):
            continue

        rules.append(sorted(instance_index.get_loc(a) + 1 for a, _ in body))

    return rules

In [3]:
import subprocess  
import os
from shutil import rmtree
import tempfile

def AL3(instance_ix, train):
    """Run AL3.jar on one training instance and return the rules it reports.

    Inside a temporary directory, the whole training dataset is written to
    'Knnres.arff' and the single instance `instance_ix` to 'Filetest.arff'.
    `java -jar AL3.jar` is then invoked (the jar path is relative to the
    current working directory) and 'impo_rules.txt' is read back from the
    temporary directory.

    Returns a list of rules, each a list of integer attribute numbers. If the
    sorted union of all attributes seen is not already one of the rules, it
    is appended as an extra final rule.
    """

    def parse_rules(rules_lines):
        """Parse comma-separated integer lines and append their sorted union."""
        parsed = [[int(token) for token in line.split(",")] for line in rules_lines]

        # Deduplicated, ordered union of every attribute mentioned by any rule
        union_rule = sorted({attribute for rule in parsed for attribute in rule})

        if union_rule not in parsed:
            parsed.append(union_rule)

        return parsed

    with tempfile.TemporaryDirectory() as workdir:
        train_path = join(workdir, 'Knnres.arff')
        test_path = join(workdir, 'Filetest.arff')

        with open(train_path, "w") as knnres:
            arff.dump(train.to_arff_obj(), knnres)

        with open(test_path, "w") as filetest:
            single_instance = Dataset(
                [train.inverse_transform_instance(train[instance_ix])],
                train.columns)
            arff.dump(single_instance.to_arff_obj(), filetest)

        command = [
            'java', '-jar', 'AL3.jar',
            '-no-cv',
            '-t', train_path,
            '-T', test_path,
            '-S', '1.0',  # minimum support
            '-C', '50.0',  # minimum confidence
            '-PN', workdir,  # temporary files folder
            '-SP', '10',  # measure threshold
            '-NRUL', '1',  # maximum number of rules to classify a transaction
        ]
        # The exit status is not checked; a failed run surfaces when the rules
        # file below cannot be opened
        subprocess.call(command)

        with open(join(workdir, 'impo_rules.txt'), "r") as impo_rules:
            rule_lines = impo_rules.readlines()

        return parse_rules(rule_lines)

## Monks

In [4]:
# Load the Monks dataset. Passing True makes import_dataset_arff pick the
# explain rows at random and remove them from the training split, so the empty
# index list is ignored.
with open(join(DEFAULT_DIR, "datasets/monks.arff")) as monks_f:
    monks_train, monks_explain, monks_explain_indices = import_dataset_arff(monks_f, [], True)

In [21]:
# For at most the first 50 Monks training instances, print the instance index,
# then the rules from AL3.jar, then the rules from l3wrapper, for side-by-side
# comparison.
for i in range(min(len(monks_train),50)):
    print(i)
    a = AL3(i, monks_train)
    b = l3wrapper(i, monks_train)
    print(a)
    print(b)
    

0
[[1, 2]]
[[1, 2]]
1
[[1, 2]]
[[1, 2]]
2
[[1, 2]]
[[1, 2]]
3
[[1, 2]]
[[1, 2]]
4
[[1, 2]]
[[1, 2]]
5
[[1, 2]]
[[1, 2]]
6
[[1, 2]]
[[1, 2]]
7
[[1, 2]]
[[5], [1, 2]]
8
[[1, 2]]
[[1, 2, 5]]
9
[[1, 2]]
[[1, 2, 5]]
10
[[1, 2]]
[[1, 2, 5]]
11
[[1, 2]]
[[5]]
12
[[1, 2]]
[[1, 2, 5]]
13
[[1, 2]]
[[1, 2, 5]]
14
[[1, 2]]
[[1, 2, 5]]
15
[[1, 2]]
[[1, 2, 5]]
16
[[1, 2]]
[[1, 2, 5]]
17
[[1, 2]]
[[1, 2, 5]]
18
[[1, 2]]
[[1, 2, 5]]
19
[[1, 2]]
[[1, 2, 5]]
20
[[1, 2]]
[[1, 2, 5]]
21
[[1, 2]]
[[1, 2, 5]]
22
[[1, 2]]
[[1, 2, 5]]
23
[[1, 2]]
[[5]]
24
[[1, 2]]
[[1, 2, 5]]
25
[[1, 2]]
[[1, 2, 5]]
26
[[1, 2]]
[[1, 2, 5]]
27
[[1, 2]]
[[1, 2, 5]]
28
[[1, 2]]
[[5]]
29
[[1, 2]]
[[1, 2, 5]]
30
[[1, 2]]
[[1, 2, 5]]
31
[[1, 2]]
[[1, 2, 5]]
32
[[1, 2]]
[[1, 2, 5]]
33
[[1, 2]]
[[5]]
34
[[1, 2]]
[[1, 2, 5]]
35
[[1, 2]]
[[1, 2, 5]]
36
[[1, 2]]
[[1, 2, 5]]
37
[[1, 2]]
[[1, 2, 5]]
38
[[1, 2]]
[[5]]
39
[[1, 2]]
[[5]]
40
[[1, 2]]
[[1, 2, 5]]
41
[[1, 2]]
[[1, 2, 5]]
42
[[1, 2]]
[[1, 2, 5]]
43
[[1, 2]]
[[1, 2, 5]]
44
[[1, 2

## Zoo

In [22]:
# Load the Zoo dataset. Passing True makes import_dataset_arff pick the explain
# rows at random and remove them from the training split, so the empty index
# list is ignored.
with open(join(DEFAULT_DIR, "datasets/zoo.arff")) as zoo_f:
    zoo_train, zoo_explain, zoo_explain_indices = import_dataset_arff(zoo_f, [], True)

In [24]:
# For at most the first 50 Zoo training instances, print the instance index,
# then the rules from AL3.jar, then the rules from l3wrapper, for side-by-side
# comparison.
for i in range(min(len(zoo_train), 50)):
    print(i)
    a = AL3(i, zoo_train)
    b = l3wrapper(i, zoo_train)
    print(a)
    print(b)

0
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
1
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
2
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]]
3
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
4
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
5
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
6
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]]
7
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
8
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
9
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]]
10
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15], [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16]]
11
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15]]
12
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14]]
13
[[2, 3, 4, 8, 9, 10, 11]]
[[1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14]]
14
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
15
[[2, 3, 4

## Adult

In [25]:
# Load the Adult training file together with its separate explain file. Passing
# True makes import_datasets_arff sample the explain rows at random, so the
# empty index list is ignored.
with open(join(DEFAULT_DIR, "datasets/adult_d.arff")) as adult_f, \
        open(join(DEFAULT_DIR, "datasets/adult_d_explain.arff")) as adult_explain_f:
    adult_train, adult_explain, adult_explain_indices = import_datasets_arff(
        adult_f, adult_explain_f, [], True)

In [26]:
# For at most the first 50 Adult training instances, print the instance index
# and the rules from AL3.jar, then compute and print the rules from l3wrapper.
# Unlike the other comparison loops, AL3's result is printed before l3wrapper
# runs.
# NOTE(review): presumably so AL3's output is visible while the slower step is
# still running — confirm.
for i in range(min(len(adult_train), 50)):
    print(i)
    a = AL3(i, adult_train)
    print(a)
    b = l3wrapper(i, adult_train)
    print(b)


0
[[1, 2, 5, 8], [1, 5, 8, 9], [2, 5, 8, 9], [2, 5, 8], [5, 8, 9], [8, 9, 10], [1, 2, 5, 8, 9, 10]]
[[2, 5, 8, 10, 11], [4, 5, 8, 10, 11], [1, 5, 8, 10, 11], [5, 8, 10, 11], [1, 4, 5, 8, 9, 10], [5, 7, 8, 11], [1, 4, 5, 8, 10], [1, 2, 4, 7, 8, 9, 10], [4, 5, 8, 9, 10], [1, 7, 8, 9, 10], [1, 5, 7, 8], [1, 7, 8, 10], [2, 7, 8, 11], [1, 2, 5, 8, 9], [1, 2, 5, 8], [1, 5, 8, 9, 11], [1, 5, 8, 9], [1, 2, 7, 8, 9], [2, 5, 8, 9], [1, 2, 8, 9, 10, 11], [2, 5, 8], [5, 8, 9], [1, 7, 8, 9], [7, 8, 9], [8, 9, 10, 11], [1, 8, 10], [8, 9, 10]]
1
[[1, 2, 5, 8], [1, 5, 8, 9], [2, 5, 8, 9], [2, 5, 8], [5, 8, 9], [8, 9, 10], [1, 2, 5, 8, 9, 10]]
[[2, 3, 5, 8, 9, 10], [2, 5, 8, 9, 10, 11], [3, 5, 8, 9, 11], [5, 8, 9, 10, 11], [2, 3, 5, 8, 9], [5, 8, 10, 11], [2, 5, 8, 9, 10], [2, 5, 8, 10], [3, 5, 8, 9], [5, 8, 9, 10], [5, 8, 10], [2, 5, 8, 9], [2, 5, 8], [4, 5, 6, 8, 9], [5, 8, 9], [6, 7, 8, 10], [3, 7, 8, 9, 10], [2, 3, 8, 9, 10, 11]]
2
[[1, 2, 5, 8], [1, 5, 8, 9], [2, 5, 8, 9], [2, 5, 8], [5, 8, 9], [8

KeyboardInterrupt: 