# Compare `l3wrapper` and `AL3.jar`

## Setup

In [1]:
import random
from os.path import join
from typing import Tuple, List

import arff
from snapshottest import TestCase

from src import DEFAULT_DIR
from src.XPLAIN_explainer import XPLAIN_explainer
from src.dataset import Dataset


def load_arff(f) -> Dataset:
    """Parse the ARFF content readable from ``f`` into a ``Dataset``.

    ``f`` is handed unchanged to ``arff.load``; the ``'data'`` and
    ``'attributes'`` entries of the parsed result are passed, in that order,
    to the ``Dataset`` constructor.
    """
    parsed = arff.load(f)
    return Dataset(parsed['data'], parsed['attributes'])


def import_dataset_arff(f, explain_indices: List[int],
                        random_explain_dataset: bool) -> Tuple[Dataset, Dataset, List[str]]:
    """Load one ARFF dataset and split it into a train part and an explain part.

    Args:
        f: ARFF source, forwarded to ``load_arff``.
        explain_indices: Row indices to move from the train split into the
            explain split. Replaced by a random sample when
            ``random_explain_dataset`` is true.
        random_explain_dataset: When true, the global ``random`` module is
            seeded with 1 and the explain rows are sampled: 20% of the rows
            (truncated) if the dataset has fewer than 200 rows, otherwise 100.

    Returns:
        ``(train_dataset, explain_dataset, labels)`` where ``labels`` are the
        explain indices converted to strings, in the order they were selected.
    """
    dataset = load_arff(f)

    dataset_len = len(dataset)
    all_indices = list(range(dataset_len))

    if random_explain_dataset:
        # Fixed seed: the same rows are selected on every run.
        random.seed(1)
        max_sample_count = 100
        # Small datasets give up a fifth of their rows, larger ones a fixed count.
        if dataset_len < (2 * max_sample_count):
            samples = int(0.2 * dataset_len)
        else:
            samples = max_sample_count

        # Randomly pick some instances to remove from the training dataset and
        # use in the explain dataset.
        explain_indices = random.sample(all_indices, samples)

    # Filter through a set instead of calling list.remove() per index: this is
    # linear rather than quadratic, keeps the remaining indices in ascending
    # order, and tolerates an index that appears more than once.
    excluded = set(explain_indices)
    train_indices = [i for i in all_indices if i not in excluded]

    train_dataset = Dataset.from_indices(train_indices, dataset)
    explain_dataset = Dataset.from_indices(explain_indices, dataset)

    return train_dataset, explain_dataset, [str(i) for i in explain_indices]


def import_datasets_arff(f, f_explain, explain_indices: List[int],
                         random_explain_dataset: bool) -> Tuple[Dataset, Dataset, List[str]]:
    """Load a train dataset and a separate explain dataset from two ARFF sources.

    Args:
        f: ARFF source of the train dataset, forwarded to ``load_arff``.
        f_explain: ARFF source of the explain dataset, forwarded to ``load_arff``.
        explain_indices: Only used to build the returned labels when
            ``random_explain_dataset`` is false; in that case the explain
            dataset is returned whole and is not subset by these indices.
        random_explain_dataset: When true, the global ``random`` module is
            seeded with 7 and up to 300 rows of the explain dataset are
            sampled; the explain dataset is reduced to those rows.

    Returns:
        ``(train_dataset, explain_dataset, labels)`` where ``labels`` are the
        explain indices converted to strings.
    """
    train_dataset = load_arff(f)
    explain_dataset = load_arff(f_explain)

    len_explain_dataset = len(explain_dataset)

    if random_explain_dataset:
        # Fixed seed: the same rows are selected on every run.
        random.seed(7)
        max_sample_count = 300
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the request for explain files under 300 rows.
        sample_count = min(max_sample_count, len_explain_dataset)
        explain_indices = random.sample(range(len_explain_dataset), sample_count)
        explain_dataset = Dataset.from_indices(explain_indices, explain_dataset)

    return train_dataset, explain_dataset, [str(i) for i in explain_indices]


def get_classifier(classifier_name: str):
    """Return a new, unfitted scikit-learn estimator selected by name.

    Args:
        classifier_name: ``"sklearn_nb"`` for a ``MultinomialNB``, or
            ``"sklearn_rf"`` for a pipeline of ``OneHotEncoder`` followed by
            ``RandomForestClassifier(random_state=42)``.

    Raises:
        ValueError: If ``classifier_name`` is neither of the names above.
    """
    # scikit-learn is imported inside each branch, so an unknown name fails
    # without importing it.
    if classifier_name == "sklearn_nb":
        from sklearn.naive_bayes import MultinomialNB

        return MultinomialNB()

    if classifier_name == "sklearn_rf":
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import OneHotEncoder

        encoder = OneHotEncoder()
        forest = RandomForestClassifier(random_state=42)
        return make_pipeline(encoder, forest)

    raise ValueError("Classifier not available")


def get_explanation(dataset_name: str, classifier_name: str):
    """Train a classifier on a dataset and explain the first explain instance.

    Args:
        dataset_name: Path of the ARFF file to load. The adult and compas
            paths under ``DEFAULT_DIR`` are loaded together with a sibling
            ``*_explain.arff`` file; any other path is split on its own.
        classifier_name: Name understood by ``get_classifier``.

    Returns:
        Whatever ``XPLAIN_explainer.explain_instance`` returns for the first
        row of the explain dataset, using that row's decoded class value as
        the target class.
    """
    # Datasets that ship with a dedicated "<name>_explain.arff" companion file.
    two_file_datasets = (
        join(DEFAULT_DIR, "datasets/adult_d.arff"),
        join(DEFAULT_DIR, "datasets/compas-scores-two-years_d.arff"),
    )
    no_indices = []

    if dataset_name in two_file_datasets:
        # Strip the trailing ".arff" (5 characters) to build the companion path.
        explain_path = dataset_name[:-5] + "_explain.arff"
        with open(dataset_name) as f, open(explain_path) as f_explain:
            train_dataset, explain_dataset, _ = import_datasets_arff(
                f, f_explain, no_indices, True)
    else:
        with open(dataset_name) as f:
            train_dataset, explain_dataset, _ = import_dataset_arff(
                f, no_indices, True)

    fitted = get_classifier(classifier_name).fit(
        train_dataset.X_numpy(), train_dataset.Y_numpy())
    explainer = XPLAIN_explainer(fitted, train_dataset)

    first_instance = explain_dataset.get_decoded(0)
    class_column = explain_dataset.class_column_name()

    return explainer.explain_instance(explain_dataset[0], first_instance[class_column])

/home/andrea/Documents/Politecnico/tirocinio/src/../


## Monks

In [2]:
# Load the Monks dataset and split it into train/explain parts. The empty index
# list is only a placeholder: with the last argument set to True the explain
# rows are chosen at random by import_dataset_arff.
with open(join(DEFAULT_DIR, "datasets/monks.arff")) as monks_f:
    monks_train, monks_explain, monks_explain_indices = import_dataset_arff(monks_f, [], True)

In [3]:
def l3wrapper(instance_ix, train):
    """Fit an ``L3Classifier`` on ``train`` and return its filtered level-1 rules.

    Args:
        instance_ix: Currently unused; presumably kept so the call signature
            parallels ``AL3`` (TODO confirm).
        train: Dataset exposing ``X_decoded()`` and ``Y_decoded()``.

    Returns:
        A list with one entry per retained rule of ``clf.lvl1_rules_``; each
        entry is the sorted list of that rule's item ids.
    """
    from l3wrapper.l3wrapper import L3Classifier

    # Decode the features once; previously the dataset was decoded again for
    # every item id examined by the filter below.
    x_decoded = train.X_decoded()
    attribute_count = len(x_decoded.columns)

    clf = L3Classifier(min_sup=0.01, min_conf=0.50)
    clf.fit(x_decoded,
            train.Y_decoded(),
            column_names=x_decoded.columns.to_list())

    # Intended matching step: remove all rules that use an attribute value not
    # present in the instance to explain.
    # NOTE(review): the filter below does not look at the instance at all. It
    # only keeps rules whose item ids are all at most the number of feature
    # columns; confirm whether that is equivalent for the datasets in use.
    rules = [sorted(r.item_ids) for r in clf.lvl1_rules_
             if all(i < (attribute_count + 1) for i in r.item_ids)]

    return rules


In [4]:
import subprocess  
import os
from shutil import rmtree
import tempfile

def AL3(instance_ix, train):
    """Run ``AL3.jar`` on ``train`` and one of its instances; return the rules.

    The train dataset is written as ``Knnres.arff`` and the instance at
    ``instance_ix`` as ``Filetest.arff`` inside a temporary directory. After
    ``java -jar AL3.jar`` has run (its exit status is not checked), the file
    ``impo_rules.txt`` is read back from that directory and parsed.

    Args:
        instance_ix: Index into ``train`` of the instance written to the test file.
        train: Dataset providing ``to_arff_obj()``, ``inverse_transform_instance()``,
            ``columns`` and item access.

    Returns:
        A list of rules, each a list of ints parsed from one comma-separated
        line of ``impo_rules.txt``. The sorted union of all parsed ints is
        appended as a final rule unless an identical rule is already present.
    """

    def parse_rules(rules_lines):
        # One rule per line, attributes separated by commas.
        parsed = []
        seen_attributes = []
        for line in rules_lines:
            attributes = [int(token) for token in line.split(",")]
            parsed.append(attributes)
            seen_attributes.extend(attributes)

        # Deduplicated, ordered union of every attribute seen in any rule.
        union_rule = sorted(set(seen_attributes))
        if union_rule not in parsed:
            parsed.append(union_rule)

        return parsed

    with tempfile.TemporaryDirectory() as d:
        train_path = join(d, 'Knnres.arff')
        test_path = join(d, 'Filetest.arff')

        with open(train_path, "w") as knnres:
            arff.dump(train.to_arff_obj(), knnres)

        with open(test_path, "w") as filetest:
            single_instance = Dataset(
                [train.inverse_transform_instance(train[instance_ix])],
                train.columns
            )
            arff.dump(single_instance.to_arff_obj(), filetest)

        command = [
            'java', '-jar', 'AL3.jar',
            '-no-cv',
            '-t', train_path,
            '-T', test_path,
            '-S', '1.0',  # minimum support
            '-C', '50.0',  # minimum confidence
            '-PN', d,  # temporary files folder
            '-SP', '10',  # measure threshold
            '-NRUL', '1',  # maximum number of rules to classify a transaction
        ]
        subprocess.call(command)

        with open(join(d, 'impo_rules.txt'), "r") as impo_rules:
            rule_lines = impo_rules.readlines()

        return parse_rules(rule_lines)

In [5]:
# Compare the two implementations on at most the first 50 Monks training
# instances: the rule lists returned by AL3 and l3wrapper must be equal.
for i in range(min(len(monks_train),50)):
    print(i)
    a = AL3(i, monks_train)
    b = l3wrapper(i, monks_train)
    # Stops the cell at the first instance where the two rule lists differ.
    assert(a == b)
    print(a)
    print(b)
    

0
[[1, 2]]
[[1, 2]]
1
[[1, 2]]
[[1, 2]]
2
[[1, 2]]
[[1, 2]]
3
[[1, 2]]
[[1, 2]]
4
[[1, 2]]
[[1, 2]]
5
[[1, 2]]
[[1, 2]]
6
[[1, 2]]
[[1, 2]]
7
[[1, 2]]
[[1, 2]]
8
[[1, 2]]
[[1, 2]]
9
[[1, 2]]
[[1, 2]]
10
[[1, 2]]
[[1, 2]]
11
[[1, 2]]
[[1, 2]]
12
[[1, 2]]
[[1, 2]]
13
[[1, 2]]
[[1, 2]]
14
[[1, 2]]
[[1, 2]]
15
[[1, 2]]
[[1, 2]]
16
[[1, 2]]
[[1, 2]]
17
[[1, 2]]
[[1, 2]]
18
[[1, 2]]
[[1, 2]]
19
[[1, 2]]
[[1, 2]]
20
[[1, 2]]
[[1, 2]]
21
[[1, 2]]
[[1, 2]]
22
[[1, 2]]
[[1, 2]]
23
[[1, 2]]
[[1, 2]]
24
[[1, 2]]
[[1, 2]]
25
[[1, 2]]
[[1, 2]]
26
[[1, 2]]
[[1, 2]]
27
[[1, 2]]
[[1, 2]]
28
[[1, 2]]
[[1, 2]]
29
[[1, 2]]
[[1, 2]]
30
[[1, 2]]
[[1, 2]]
31
[[1, 2]]
[[1, 2]]
32
[[1, 2]]
[[1, 2]]
33
[[1, 2]]
[[1, 2]]
34
[[1, 2]]
[[1, 2]]
35
[[1, 2]]
[[1, 2]]
36
[[1, 2]]
[[1, 2]]
37
[[1, 2]]
[[1, 2]]
38
[[1, 2]]
[[1, 2]]
39
[[1, 2]]
[[1, 2]]
40
[[1, 2]]
[[1, 2]]
41
[[1, 2]]
[[1, 2]]
42
[[1, 2]]
[[1, 2]]
43
[[1, 2]]
[[1, 2]]
44
[[1, 2]]
[[1, 2]]
45
[[1, 2]]
[[1, 2]]
46
[[1, 2]]
[[1, 2]]
47
[[1, 2]]
[[1, 2]]
48

## Zoo

In [6]:
# Load the Zoo dataset and split it into train/explain parts. The empty index
# list is only a placeholder: with the last argument set to True the explain
# rows are chosen at random by import_dataset_arff.
with open(join(DEFAULT_DIR, "datasets/zoo.arff")) as zoo_f:
    zoo_train, zoo_explain, zoo_explain_indices = import_dataset_arff(zoo_f, [], True)

In [7]:
# Compare the two implementations on at most the first 50 Zoo training
# instances: the rule lists returned by AL3 and l3wrapper must be equal.
for i in range(min(len(zoo_train), 50)):
    print(i)
    a = AL3(i, zoo_train)
    b = l3wrapper(i, zoo_train)
    # Stops the cell at the first instance where the two rule lists differ.
    assert(a == b)
    print(a)
    print(b)

0
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
1
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
2
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
3
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
4
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
5
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
6
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
7
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
8
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
9
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
10
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
11
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
12
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
13
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
14
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
15
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
16
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
17
[[2, 3, 4, 8, 9, 10, 11]]
[[2, 3, 4, 8, 9, 10, 11]]
18
[[2, 3, 4, 8, 9, 

## Adult

In [9]:
# Load the Adult train dataset and its separate explain file. With the last
# argument set to True, import_datasets_arff samples the explain rows at random
# and the empty index list is only a placeholder.
with open(join(DEFAULT_DIR, "datasets/adult_d.arff")) as adult_f:
    with open(join(DEFAULT_DIR, "datasets/adult_d_explain.arff")) as adult_explain_f:
        adult_train, adult_explain, adult_explain_indices = import_datasets_arff(adult_f, adult_explain_f, [], True)

In [11]:
# Compare the two implementations on at most the first 50 Adult training
# instances. Both results are printed before the assertion so that they are
# visible in the output when the comparison fails.
for i in range(min(len(adult_train), 50)):
    print(i)
    a = AL3(i, adult_train)
    print(a)
    b = l3wrapper(i, adult_train)
    print(b)
    # Stops the cell at the first instance where the two rule lists differ.
    assert(a == b)

0
[[1, 2, 5, 8], [1, 5, 8, 9], [2, 5, 8, 9], [2, 5, 8], [5, 8, 9], [8, 9, 10], [1, 2, 5, 8, 9, 10]]
[[2, 5, 8, 10, 11], [4, 5, 8, 10, 11], [1, 5, 8, 10, 11], [5, 8, 10, 11], [1, 4, 5, 8, 9, 10], [5, 7, 8, 11], [1, 4, 5, 8, 10], [1, 2, 4, 7, 8, 9, 10], [4, 5, 8, 9, 10], [1, 7, 8, 9, 10], [1, 5, 7, 8], [1, 7, 8, 10], [2, 7, 8, 11], [1, 2, 5, 8, 9], [1, 2, 5, 8], [1, 5, 8, 9, 11], [1, 5, 8, 9], [1, 2, 7, 8, 9], [2, 5, 8, 9], [1, 2, 8, 9, 10, 11], [2, 5, 8], [5, 8, 9], [1, 7, 8, 9], [7, 8, 9], [8, 9, 10, 11], [1, 8, 10], [8, 9, 10]]


AssertionError: 