In [None]:
# Debug switches. NOTE(review): neither name is read by any cell visible in this
# notebook — confirm they are still needed.
DEBUG = True
N_DEBUG = 6

In [None]:
# Presumably the number of items shown in experiment phases 1/3 and 2/4 — TODO confirm;
# neither constant is read by any cell visible in this notebook.
N_PHASE_1_3 = 16
N_PHASE_2_4 = 32

Modified experiment of Hase et al. (see simulatability.ipynb)

In [None]:
import pandas as pd
import os
import numpy as np


In [None]:
from IPython.core.display import HTML


In [None]:
from explainer_wrappers import LIME_Explainer, SHAP_Explainer

In [None]:
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy

In [None]:
# Load the held-out test split.
# NOTE: unpickling can execute arbitrary code — only load pickle files from a trusted source.
test = pd.read_pickle("./dataset_test.pkl")

In [None]:
# Texts to explain, taken from the "answer" column of the test split.
documents = list(test["answer"])
# One boolean per document: True where the "author" column equals "human_answers".
gold_labels = list(test["author"] == "human_answers") # convention: 0: machine, 1: human, see detector.py

In [None]:
# NOTE(review): `DetectGPT` is not imported in any cell visible here (only DetectorGuo and
# DetectorDummy are) — confirm where it comes from; on a fresh kernel this line would
# raise NameError.
detector = DetectGPT()

In [None]:
# Explainer wrapper around the detector. SubmodularPick.__init__ reads this
# notebook-level `explainer` name directly, so it must be defined before sp_obj is built.
explainer = SHAP_Explainer(detector)

In [None]:
import lime
import numpy as np
import warnings

In [None]:
# class from LIME, adapted.
class SubmodularPick(object):
    """Explanation matrix for submodular pick (SP-LIME), adapted from LIME.

    Candidate explanations are generated for the selected rows of ``data`` and
    folded into a bag-of-words matrix ``W`` (one row per explained document,
    one column per distinct feature). The greedy SP-LIME selection itself
    (see the paper by Ribeiro et al.) is optional and disabled by default,
    see ``run_pick``.

    NOTE: explanations are obtained through the notebook-level ``explainer``
    wrapper (``get_explanation_cached`` / ``as_list``), not through the
    ``fi_explainer`` argument.
    """

    def __init__(self,
                 fi_explainer,
                 data,
                 predict_fn,
                 method='sample',
                 sample_size=1000,
                 num_exps_desired=5,
                 run_pick=False,
                 **kwargs):
        """
        Args:
            fi_explainer: kept for signature compatibility with LIME; not used,
                    the notebook-level ``explainer`` is called instead.
            data: indexable collection of documents; ``data[i]`` is passed to
                    ``explainer.get_explanation_cached``.
            predict_fn: kept for signature compatibility with LIME; not used.
            method: The method to use to generate candidate explanations.
                    method == 'sample' shuffles the indices of ``data`` with a
                    fixed seed and keeps the first ``sample_size`` of them.
                    method == 'full' explains the entire data.
            sample_size: The number of instances to explain if method == 'sample'
            num_exps_desired: The number of explanations to pick; clamped to the
                    number of generated explanations. Only relevant when
                    ``run_pick`` is True.
            run_pick: if True, additionally run the greedy SP-LIME selection.
                    Defaults to False, in which case only the explanation
                    matrix is built.
            **kwargs: ignored; ``top_labels`` is accepted for compatibility.

        Sets value:
            explanations: All the candidate explanations.
            W: the n x d' explanation matrix (bag of words: scores of the same
                    feature within one explanation are summed).
            sample_indices: index into ``data`` for every row of ``W``.
            features_dict: maps each feature to its column in ``W``.
            sp_explanations, V: picked explanations and their row indices in
                    ``W``; only set when ``run_pick`` is True.

        Raises:
            TypeError: if ``num_exps_desired`` cannot be converted to int.
            ValueError: if ``method`` is neither 'sample' nor 'full'.
        """

        # `top_labels` is tolerated for LIME compatibility but has no effect here.
        kwargs.pop('top_labels', None)

        # Validate up front so a bad argument fails before the (potentially
        # slow) explanation loop. __init__ cannot return a message, so raise.
        try:
            num_exps_desired = int(num_exps_desired)
        except TypeError as err:
            raise TypeError(
                "Requested number of explanations should be an integer") from err

        # Parse args
        if method == 'sample':
            if sample_size > len(data):
                warnings.warn("""Requested sample size larger than
                              size of input data. Using all data""")
                sample_size = len(data)
            all_indices = np.arange(len(data))
            # Fixed seed so the same sample is drawn on every run. This
            # reseeds numpy's global random state.
            np.random.seed(2202)
            np.random.shuffle(all_indices)
            sample_indices = all_indices[:sample_size]
        elif method == 'full':
            sample_indices = np.arange(len(data))
        else:
            raise ValueError('Method must be \'sample\' or \'full\'')

        # Generate Explanations
        self.explanations = [
            explainer.get_explanation_cached(data[i]) for i in sample_indices
        ]

        if num_exps_desired > len(self.explanations):
            warnings.warn("""Requested number of explanations larger than
                           total number of explanations, returning all
                           explanations instead.""")
        num_exps_desired = min(num_exps_desired, len(self.explanations))

        # Only label 0 is used. Fetch every (feature, value) list once and
        # reuse it for both passes below.
        weighted_features = [
            list(explainer.as_list(exp, label=0)) for exp in self.explanations
        ]

        # Find all the explanation model features used. Defines the dimension d'
        features_dict = {}
        for pairs in weighted_features:
            for feature, _ in pairs:
                if feature not in features_dict:
                    features_dict[feature] = len(features_dict)
        d_prime = len(features_dict)

        # Create the n x d' dimensional 'explanation matrix', W.
        # This is a bag of words: scores of the same word occurring in
        # different contexts of one document are summed.
        W = np.zeros((len(self.explanations), d_prime))
        for row, pairs in enumerate(weighted_features):
            for feature, value in pairs:
                W[row, features_dict[feature]] += value

        self.W = W
        self.sample_indices = sample_indices
        self.features_dict = features_dict

        if not run_pick:
            return

        self.V = self._greedy_pick(W, num_exps_desired)
        self.sp_explanations = [self.explanations[i] for i in self.V]

    @staticmethod
    def _greedy_pick(W, num_exps_desired):
        """Return the rows of W chosen by the SP-LIME greedy coverage maximisation."""
        abs_W = np.abs(W)
        # Global importance vector, I_j described in the paper
        importance = np.sum(abs_W, axis=0) ** .5
        present = abs_W > 0
        # Features already covered by the rows picked so far.
        covered = np.zeros(W.shape[1], dtype=bool)

        remaining_indices = set(range(W.shape[0]))
        V = []
        for _ in range(num_exps_desired):
            best = 0
            best_ind = None
            for i in sorted(remaining_indices):
                current = np.dot(covered | present[i], importance)  # coverage function
                if current >= best:
                    best = current
                    best_ind = i
            V.append(best_ind)
            remaining_indices.discard(best_ind)
            covered |= present[best_ind]
        return V


In [None]:
# Build the explanation matrix over all documents. With method="full" every document is
# explained and sample_size has no effect.
sp_obj = SubmodularPick(explainer.explainer, documents, explainer.detector.predict_proba, sample_size=10, num_exps_desired=5, method="full")

In [None]:
# Row i of sp_obj.W was built from documents[sample_indices[i]].
sample_indices = sp_obj.sample_indices


In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Sanity check: largest entry of the explanation matrix.
np.max(sp_obj.W)

In [None]:
# (number of explained documents, number of distinct features)
sp_obj.W.shape

In [None]:
# Scratch check: shape of a 2 x 5 array.
np.array([[1,2,3,4,5],[1,2,3,4,5]]).shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize


# Pairwise cosine similarity between the rows of W (one row per document).
# normalize(..., axis=0) first rescales every feature column to unit L2 norm.
sim = cosine_similarity(normalize(sp_obj.W, axis=0))

In [None]:
# Every document should have cosine similarity 1 with itself.
# NOTE(review): presumably this fails when a row of W is all (near-)zero — confirm.
assert np.allclose(np.diag(sim), np.ones_like(np.diag(sim))), "FI scores to small!"

In [None]:
# The similarity matrix must be symmetric: its upper triangle equals the
# transposed lower triangle.
assert np.allclose(np.triu(sim), np.tril(sim).T)

In [None]:
# Scratch vector with magnitudes around 1e-18 (or exactly 0), used in the following
# cell to probe cosine_similarity on very small values.
a = np.array([-4.01154804e-18,  3.29326410e-18,  2.92734587e-18,
         1.01915004e-17,  2.00577402e-18,  2.16840434e-19,
        -3.79470760e-18, -2.54787511e-18, -6.66784336e-18,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00])

In [None]:
# Cosine similarity of `a` with itself, truncated by int(). `a` is 1-D, so `.T` is a
# no-op here.
int(cosine_similarity(a.reshape(1, -1),Y=a.T.reshape(1, -1))[0][0])

In [None]:
# Square matrix: one row and one column per row of sp_obj.W.
sim.shape

In [None]:
# idx_pairs = []
# sim_ = np.copy(sim)
# sim_ = np.triu(sim_,k=1)
# print(sim_)
# for i in range(0,10):
#     idx_max = np.unravel_index(sim_.argmax(), sim_.shape)
#     idx_pairs.append([sample_indices[ii] for ii in idx_max])
#     sim_[idx_max] = 0

In [None]:
# Greedily collect pairs of documents whose explanations are most similar.
# Each round takes the most similar pair, records the features that are non-zero
# in both explanations, and then removes those feature columns so later rounds
# have to find pairs that share *other* features.
idx_pairs = []
features = []
W_ = np.copy(sp_obj.W)

features_dict = sp_obj.features_dict.copy()
# feature_names[j] is the feature stored in column j of W_. It is kept aligned
# with W_ whenever columns are deleted, so a column index can never be looked
# up under the wrong name.
feature_names = sorted(features_dict, key=features_dict.get)

for _step in range(len(sp_obj.sample_indices)):
    if W_.shape[1] == 0:
        # Every feature has been used up; nothing left to compare.
        break

    sim = cosine_similarity(normalize(W_, axis=0))
    sim = np.triu(sim, k=1)  # keep each unordered pair once, drop self-similarity

    idx_max = np.unravel_index(sim.argmax(), sim.shape)
    if sim[idx_max] <= 0:
        # No pair of distinct documents is positively similar any more. Without
        # this guard argmax falls back to (0, 0), i.e. a document paired with itself.
        break

    idx_fi_non_zero_in_both = np.intersect1d(
        np.flatnonzero(W_[idx_max[0]]), np.flatnonzero(W_[idx_max[1]]))

    features_pair = [feature_names[col] for col in idx_fi_non_zero_in_both]

    doc_first = sample_indices[idx_max[0]]
    doc_second = sample_indices[idx_max[1]]

    # Both documents of the pair must have a truthy gold label (human written).
    if len(features_pair) > 0 and gold_labels[doc_first] and gold_labels[doc_second]:
        a, b = detector.predict_label([documents[doc_first], documents[doc_second]])
        if a == b:
            idx_pairs.append([doc_first, doc_second])
            features.append(features_pair)

    # Drop the shared columns and re-align the name bookkeeping in one step.
    W_ = np.delete(W_, idx_fi_non_zero_in_both, axis=1)
    dropped_cols = set(idx_fi_non_zero_in_both.tolist())
    feature_names = [name for col, name in enumerate(feature_names)
                     if col not in dropped_cols]
    features_dict = {name: col for col, name in enumerate(feature_names)}



In [None]:
# Inspect whether the last pair that reached the prediction step in the loop above got
# the same predicted label. NOTE(review): `b` is only assigned inside that loop, so this
# fails if no pair reached the prediction step.
a==b

In [None]:
# HTML shown for one document: gold label, detector verdict with confidences,
# the two bar plots and the highlighted text.
prompt_template_phase_3 = """
<p><b>This is a {kind_of_document} document.</b></p>
<p>The detector {correctly_or_wrongly} predicted that this document was... </p>
<p>&emsp; ... machine generated with {p_machine} % confidence.</p>
<p>&emsp; ... human written with {p_human} % confidence.</p> 
<div style="float:left; height:30em;">{barplot_machine}{barplot_human}</div>


<div style="float:left;">{highlighted_text}</div>
"""


def printt(document, gold_label):
    """Display the detector's prediction and its explanation for one document.

    Args:
        document: the text to classify and explain.
        gold_label: the true label; False is reported as "machine generated",
            anything else as "human written".
    """
    probabilities = detector.predict_proba([document])[0]
    p_machine, p_human = probabilities
    barplot_machine, barplot_human = explainer.get_barplots_HTML(document)

    # Confidences are shown as whole percentages (truncated, not rounded).
    percent_machine = int(p_machine*100)
    percent_human = int(p_human*100)

    if gold_label == False:
        kind_of_document = "machine generated"
    else:
        kind_of_document = "human written"

    predicted_label = detector.predict_label([document])[0]
    if predicted_label == gold_label:
        correctly_or_wrongly = "correctly"
    else:
        correctly_or_wrongly = "wrongly"

    highlighted_text = explainer.get_highlighted_text_HTML(document)

    page = prompt_template_phase_3.format(
        p_machine=percent_machine,
        p_human=percent_human,
        barplot_machine=barplot_machine,
        barplot_human=barplot_human,
        kind_of_document=kind_of_document,
        correctly_or_wrongly=correctly_or_wrongly,
        highlighted_text=highlighted_text,
    )
    display(HTML(page))

In [None]:
# Smoke test: render the highlighted-text HTML for a tiny input.
HTML(explainer.get_highlighted_text_HTML("Test !"))

In [None]:
# Show every selected pair together with the features both explanations share.
# The loop variable is deliberately not named `features`: reusing that name would
# overwrite the `features` list and make this cell fail when run a second time.
for (a, b), pair_features in zip(idx_pairs, features):
    print(a, b, pair_features)
    printt(documents[a], gold_labels[a])
    printt(documents[b], gold_labels[b])
    print("------------------------------------")