> **Warning**: Always set this flag to `True` before git commit!

In [1]:
OBFUSCATE_RESULTS = False

In [2]:
import torch

In [3]:
from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer

In [4]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from IPython.core.display import HTML
import lime
import numpy as np
import warnings
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import normalize

# Utility functions

In [5]:
template = """
<p><b>This is a {kind_of_document} document.</b></p>
<p>The detector {correctly_or_wrongly} predicted that this document was... </p>
<p>&emsp; ... machine generated with {p_machine} % confidence.</p>
<p>&emsp; ... human written with {p_human} % confidence.</p> 
<div style="float:left;">{highlighted_text}</div>
"""
#<div style="float:left; height:30em;">{barplot_machine}{barplot_human}</div>



#"""
def print_template(document, gold_label, detector, explainer):
    p_machine, p_human = detector.predict_proba([document])[0]
   # machine, human = explainer.get_barplots_HTML(document)
    display(HTML(template.format(
    p_machine=int(p_machine*100) if not OBFUSCATE_RESULTS else "<redacted>", 
    p_human=int(p_human*100) if not OBFUSCATE_RESULTS else "<redacted>",
  #  barplot_machine=machine,
  #  barplot_human=human,
    kind_of_document= (("machine generated" if gold_label == False else "human written") if not OBFUSCATE_RESULTS else "<redacted>"), 
    correctly_or_wrongly= (("correctly" if detector.predict_label([document])[0] == gold_label else "wrongly") if not OBFUSCATE_RESULTS else "<redacted>"), 
    highlighted_text=explainer.get_highlighted_text_HTML((document if not OBFUSCATE_RESULTS else "<redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted>")),
    )))
def print_shared_features(features, fi_scores):
    for feature, (fi_score_a, fi_score_b) in zip(features, fi_scores):
        print(feature)
        print("\t\ta: {} \t b: {}".format(fi_score_a, fi_score_b))
        print("-----------")


In [6]:
def print_pairs(pairs, documents, gold_labels, document_ids, detector, explainer, skip_n=0):

    for (a,b)in pairs:
        if OBFUSCATE_RESULTS:
            print("idx_a: <redacted> idx_b <redacted>")
        else:
            print("idx_a: {} idx_b {}".format(document_ids[a],document_ids[b]))

        print_template(documents[a], gold_labels[a], detector, explainer)
        print_template(documents[b], gold_labels[b], detector, explainer)
        print("------------------------------------------------------------------------------------------")
        print("------------------------------------------------------------------------------------------")
        print("------------------------------------------------------------------------------------------")
  


# Search Strategy for Feature Importance Explainers

In [7]:
# returns a matrix of explanations for all documents in "data"
# This function was once SubmodularPick.__init__() in LIME. It was planned to use its output for a search strategy for similar explanations. 
# Only the code for creating W from the paper (rows are explanations, cols are BOW features) remains
def get_explanation_matrix_W(data, explainer, quiet=False):
    # Get (cached) explanations 
    explanations_and_documents = [(d, explainer.get_fi_scores(d, fill=True)[0]) for d in tqdm(data, desc="Loading all explanations",disable=quiet) ] # [0]: only irt to label machine, fill: return all words, even those with 0 fi

    get_feature_name_signed = lambda feature,value : feature + ("_+" if value >=0 else "_-") # appends "_+" or "_-" to each feature name, e.g. "example" -> "example_+" if fi(example) > 0
    # Ribeiro et al.: Find all the explanation model features used. Defines the dimension d'
    # i.e. determine columns of W: each word (BOW) gets (up to) two columns, one for positive FI scores, one for negative FI scores
    features_dict = {}
    feature_iter = 0
    for d, exp in tqdm(explanations_and_documents, desc="Building global dict of features", disable=quiet):
     #   print("exp",exp)
        for feature_idx, value in exp: # irt to label machine
            feature = explainer.tokenize(d)[feature_idx]
            feature_name = get_feature_name_signed(feature,value) # get_feature_name_signed: see above
            if feature_name not in features_dict.keys():
                features_dict[feature_name] = (feature_iter)
                feature_iter += 1
    d_prime = len(features_dict.keys())

    # Ribeiro et al.: Create the n x d' dimensional 'explanation matrix', W
    W = np.zeros((len(explanations_and_documents), d_prime))

    # fill W, look up cols in dict that was just created
    # W: one row per explanation, one col per feature in feature_dict
    for i, (d, exp) in enumerate(tqdm(explanations_and_documents,  desc="Building W",disable=quiet)):
        for feature_idx, value in exp: # irt to label machine
            # get_feature_name_signed: see above
            feature = explainer.tokenize(d)[feature_idx]
            W[i, features_dict[get_feature_name_signed(feature,value)]] += value
    return W, features_dict


In [8]:
# returns a list of tuples: (pair of documents whose explanations are similar, the features that overlap, fi scores of said features)
# this maximizes similarity between documents (greedy, each document is only part of one tuple)
# another function should select n tuples to maximize coverage in explanation space akin to SP-LIME
sum_two_max = None
def get_pairs(documents, W, detector, features_dict, n_pairs=None):
    if n_pairs is None:
        n_pairs = len(documents)//2
    idx_pairs = [] # tuples of indices of similar documents a,b in "data"
    features = [] # list of features those documents covered
    fi_scores_pairs = []

    W_ = np.copy(W)

    document_indices = np.arange(0, W_.shape[0])
   # print(document_indices.shape, W_.shape)
    for _ in tqdm(range(0,n_pairs), desc="Obtaining pairs"):
        sim = cosine_similarity(W_) # calculate cosine similarity between all explanations
        sim = np.triu(sim,k=1)  # remove redundant information for argmax()

        idx_max = np.unravel_index(sim.argmax(), sim.shape) # get most similar pair, result is (idx_a, idx_b)
       # print(idx_max)
        features_non_zero_in_both = np.intersect1d(W_[idx_max[0]].nonzero(),W_[idx_max[1]].nonzero()) # get features that have non-zero fi in both explanations
                                                                                                      # is used later for selecting a set of tuples with high coverage (as in SP-LIME)
        non_zero_features = [] # list with features that will be returned
        non_zero_fi_scores_tuples = [] # list of tuples with fi scores in a and b that will be returned
    
        # look up feature_idxs in features_dict and append them to the output
        for iii in features_non_zero_in_both:
           key = list(features_dict.keys())[list(features_dict.values()).index(iii)]
           non_zero_features.append(key)
           non_zero_fi_scores_tuples.append((W_[idx_max[0],features_dict[key]], W_[idx_max[1],features_dict[key]]))
        
        # Only add pair to output list if valid: at least one common feature is not zero AND f(a) == f(b) (i.e., the explanation is arguing for the same detector verdict)
        if len(non_zero_features) > 0:
            a,b = detector.predict_label([documents[document_indices[idx_max[0]]], documents[document_indices[idx_max[1]]]])
            if a == b:
                idx_pairs.append(document_indices[list(idx_max)])
                fi_scores_pairs.append(non_zero_fi_scores_tuples)
                features.append(non_zero_features)
        # delete pair from W_:
        W_ = np.delete(W_, idx_max, axis=0) 
        document_indices = np.delete(document_indices, list(idx_max))

    return idx_pairs, features, fi_scores_pairs


In [9]:
# want a set of pairs that maximizes coverage in explanation space akin to the SP-LIME strategy but for pairs 

# this is the maximum coverage problem: e.g., R. Church and C. ReVelle, 1974 http://yalma.fime.uanl.mx/~roger/work/teaching/class_tso/docs_project/problems/MCLP/1974-prs-Church%20ReVelle-maximal%20covering%20location.pdf
# can brute force here as only the 10 top pairs by similarity are used for each class, 


# let the coverage be the number of columns != 0 in W. And coverage((a,b))) := coverage(sum([a,b])), akin to the importance vector in SP-LIME (Note that columns in W are either negative FI or positive FI here (see get_feature_name_signed()), so scores don't cancel out in sum)
def coverage(selection, W):
    return np.count_nonzero(np.sum(W[np.array(list(selection)).flatten()], axis=0)) # coverage(selection)= number of cols in W that have at least one non-zero entry under this selection of pairs. Recall that W has (up to) two entries per word: one for positive and one for negative FI



In [10]:
from itertools import combinations


In [11]:
# returns two pairs, one for f(x) = machine and one for f(x) = human
# checks texts_already_selected and chooses next best pair (for each class) if a document is in texts_already_selected (i.e. it was already selected for another explainer-detector pair)
def obtain_dataset_FI_methods(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):
    W, features_dict = get_explanation_matrix_W(documents, explainer)
    similar_pairs, _, _ = get_pairs(documents, W, detector, features_dict)
    # want a dataset that is balanced irt to the two base classes:
    # two pairs will be returned, one with f(x) == machine, and one with f(x) == human
    top_10_pairs_human = []
    top_10_pairs_machine = []
    for pair in similar_pairs:
        if detector.predict_label([documents[pair[0]]])[0]:
            top_10_pairs_human.append(pair)
        else:
            top_10_pairs_machine.append(pair)
        if len(top_10_pairs_human) >= 10 and len(top_10_pairs_machine) >= 10:
            top_10_pairs_human = top_10_pairs_human[0:10]
            top_10_pairs_machine = top_10_pairs_machine[0:10]
            break

            
    combinations_ = ((a,b) for a in top_10_pairs_machine for b in top_10_pairs_human)
    # return two pairs maximizing coverage, one with f(x) == machine, and one with f(x) == human 
    coverage_ = [((a,b),coverage([a,b],W)) for a,b in combinations_]
    pair_a = None
    pair_b = None
    while True:
        (pair_a, pair_b), c = max(coverage_, key=lambda item : item[1])
        result = [pair_a, pair_b]
        if all([(documents[a] not in texts_already_selected) and (documents[b] not in texts_already_selected) for a,b in result]):
            return [pair_a, pair_b]
        print("Duplicate loop")
        coverage_.remove(((pair_a, pair_b), c))

    
        # break
        # # get f(a) as one example per class is returned
        # predictions = [detector.predict_label([documents[a]])[0] for a,_ in pairs] # wheter a == b was tested before

        # # return example with highest coverage for each class
        # # if a document is in texts_already_selected (i.e. it was already selected for another explainer-detector pair), the one with the next-highest coverage (for that prediction) is returned 
        # for idx_pair, prediction in enumerate(predictions):
        #     a,b = pairs[idx_pair]
        #     # check if a or b are in texts_already_selected
        #     if (documents[a] not in texts_already_selected) and (documents[b] not in texts_already_selected):
        #         if prediction == 0 and pair_machine is None: # only keep first
        #             pair_machine = pairs[idx_pair] 
        #         if prediction == 1 and pair_human is None: # only keep first
        #             pair_human = pairs[idx_pair] 
        #     if pair_human is not None and pair_machine is not None:
        #         return [pair_machine, pair_human] 
        # k+=1 # loop until both pair_machine and pair_human not None



# Search Strategy for Rule-Based Explainers

In [12]:
from anchor.anchor import anchor_explanation
from collections import defaultdict
from itertools import combinations, chain

In [13]:
def jaccard_similarity(document_a, document_b):
    # case sensitive, on spacy tokens
    a = list(chain(*[[token.text for token in sent] for sent in nlp(document_a).sents]))
    b = list(chain(*[[token.text for token in sent] for sent in nlp(document_b).sents]))
    intersection = float(len(list(set(a).intersection(b))))
    union = float((len(set(a)) + len(set(b)))) - intersection
    return intersection / union

In [14]:
# encodes the order of occurence in a list of words, e.g.:
# ["example", "test", "example", "one"] -> ['example_0', 'test_0', 'example_1', 'one_0']
def encode_count(list_of_words):
    d = defaultdict(lambda : 0)
    encoded = []
    for word in list_of_words:
        encoded.append(word + "_" + str(d[word]))
        d[word] +=1
    return encoded

In [15]:
# the dictonary Anchors returns can define multiple anchors:
# {this, is, an, example} : 0.9
# {this, is, an}: 0.8
# {this, is, }: 0.75
# {this}: 0.4
# extract all of them, only keep those with p >= 0.75 (threshold set when searching)
def get_anchors_at_each_k(documents, explainer):
    anchors = []
    p = []
    ids = []
    for i,_ in tqdm(enumerate(documents), desc="Loading all explanations"):#enumerate(documents):
        exp = explainer.get_explanation_cached(documents[i])
        exp["names"] = encode_count(exp["names"]) # Anchors is not BOW. But the algorithm is written with python set()s
        while len(exp["mean"]) >=1:#and exp["mean"][-1] >= 0.75:
            anchors.append(set(exp["names"])) 
            p.append(exp["mean"][-1])
            ids.append(i)

            exp["mean"].pop()
            exp["names"].pop()
    return anchors, p, ids


In [16]:
# searches for pairs of anchors
# returns 2 pairs of documents, one pair for f(x) = machine, one for f(x) = human, both sampled randomly
# checks for and skips documents in "texts_already_selected" (i.e. it was already selected for an other explainer-detector pair)

def obtain_dataset_Anchor(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):

    anchors, p, ids = get_anchors_at_each_k(documents, explainer)
                        # DetectGPT + Anchors is to expensive to run experiments on 
    # find anchors that occur more than once in the dataset, then remove duplicates (created by looping) with set()
    duplicate_anchors = [set(anchor) for anchor in set([frozenset(anchor) for anchor in anchors if anchors.count(anchor) > 1])]
    # get the ids and p for each duplicate_anchor in  duplicate_anchors
    # "candidates" is a list of lists with ids (and all other details) of each duplicate_anchor
    candidates = [[(anchor, p, document_id) for anchor, p, document_id in zip(anchors, p, ids) if anchor == duplicate_anchor] for duplicate_anchor in duplicate_anchors ]
    # now check for each paring of the documents in each sublist of "candidates":
    #   is f(a) == f(b)?, if not: discard
    # then pick pair with highest jaccard_score on the original documents in each "candidate"
    pairs = []

    predictions_cache = {}
    def cached_predict(idx):
        if idx not in predictions_cache:
            predictions_cache[idx] = detector.predict_label([documents[idx]])[0]
        return predictions_cache[idx]
    for candidate in tqdm(candidates, desc="Assessing candidates",position=1):
        anchor_s, p, ids  = zip(*candidate)
        c = list(combinations(ids, 2))
        c = [(a,b) for a,b in c if cached_predict(a) == cached_predict(b)]
        if len(c) == 0:
            continue
        jaccard_scores = [(a,b, jaccard_similarity(documents[a], documents[b])) for a,b in tqdm(c, desc="Calculating Jaccard Similarity (of documents not Anchors)",position=0)]
        a,b, score = max(jaccard_scores, key=lambda x: x[2])
        pairs.append((a,b))

    # sample twice: once for f(x) == human and once for f(x) == machine. f(a) == f(b) is tested earlier

    predictions = [cached_predict(a) for a,_ in pairs] # wheter a == b was tested before

    predictions_ = np.array(predictions)
    pairs_ = np.array(pairs)

    machine = pairs_[predictions_ == False]
    human = pairs_[predictions_ == True]
    
    np.random.seed(42)
    result = []
    # select 2 pairs from pairs_: one for each class
    # need to check if a document from the pair is in texts_already_selected
    while True:       
        # one explainer (DetectGPT) has no explanations for f(x) = human:
        if not(True in predictions):
            print("Warning: No examples for f(x) = human. Returning an additional example for machine")
            result =  machine[np.random.choice(machine.shape[0], 2, replace=False)]
        elif not (False in predictions):
            print("Warning: No examples for f(x) = machine. Returning an additional example for human")
            result = human[np.random.choice(human.shape[0], 2, replace=False)]
        else:
            result =  [machine[np.random.randint(0, machine.shape[0]),:], human[np.random.randint(0, human.shape[0]),:]] # returns a random pair for machine and a random pair for human

        
        # check for duplicates in texts_already_selected, re-sample if the pairs are duplicates.
        if all([(documents[a] not in texts_already_selected) and (documents[b] not in texts_already_selected) for a,b in result]):
            return result
        else:
            print("Loop: Avoiding duplicates")

    


# Load Dataset

In [17]:
def obtain_dataset(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):
    if isinstance(explainer, Anchor_Explainer):
        return obtain_dataset_Anchor(explainer, detector, documents, gold_labels, document_ids, texts_already_selected)
    else:
        return obtain_dataset_FI_methods(explainer, detector, documents, gold_labels, document_ids, texts_already_selected)

In [18]:
test = pd.read_pickle("./dataset_test.pkl")
test = test 

documents = list(test["answer"])
gold_labels = list(test["author"] == "human_answers") # convention: 0: machine, 1: human, see detector.py
document_ids = list(range(0,len(documents))) # note that the search algorithms don't use these ids. They are only used for printing and the exclude_list!!

In [19]:
import spacy
nlp = spacy.load("en_core_web_lg")
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer().fit(documents)

# Perform Document Selection
Some documents are excluded from the user-study for the reasons specified below:

In [20]:

exclude_list = {
    (195, 60,108, 228,143): "Names forum/service explicitly",
    (288,117, 188, 110, 159, 97, 105, 115,266, 158): "Author introduces themselves by name",
    (16,): "References earlier post by other user",
    (190,294,): "Names forum user who asked the question",
    (27,103,): "NSFW",
    
    
}
exclude_list = [x for xs in [ list(key) for key in exclude_list.keys()] for x in xs]
exclude_list

[195,
 60,
 108,
 228,
 143,
 288,
 117,
 188,
 110,
 159,
 97,
 105,
 115,
 266,
 158,
 16,
 190,
 294,
 27,
 103]

In [21]:
# apply exclude_list
documents = [d for i,d in zip(document_ids, documents) if i not in exclude_list]
gold_labels = [gl for i,gl in zip(document_ids, gold_labels) if i not in exclude_list]
document_ids = [i for i in document_ids if i not in exclude_list]

> **Warning**: If you plan to participate in the user study, set `OBFUSCATE_RESULTS` to `True` before proceeding!!!

In [22]:
columns = ["Detector", "Explainer", "Documents Phases 1+3", "Documents Phases 2+4", "f(a)", "f(b)", "GT a", "GT b", "idx a", "idx b", "Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)", "Jaccard Similarity (a,b)", "Cosine Similarity tfidf Vectors","hash a", "hash b"]




In [23]:
def update_selection(selection, pairs, explainer, detector):
    for a,b in pairs:
        if OBFUSCATE_RESULTS:
            continue
        
        tfidf_= tfidf.transform([documents[a], documents[b]])   
        selection.append((detector.__class__.__name__,
                        explainer.__class__.__name__,
                        documents[a], documents[b],
                        *detector.predict_label([documents[a], documents[b]]),
                        gold_labels[a],
                        gold_labels[b],
                        document_ids[a],
                        document_ids[b],
                        nlp(documents[a]).similarity(nlp(documents[b])),
                        jaccard_similarity(documents[a], documents[b]),
                        (tfidf_ * tfidf_.T).toarray()[0,1],
                        explainer.get_hash(documents[a]),
                        explainer.get_hash(documents[b])))
    return selection

In [24]:
selection = []
for detector_class in [DetectorRadford,DetectorGuo]:
    detector = detector_class()
    display(HTML("<h1>{}</h1>".format(detector.__class__.__name__)))
    for explainer_class in [LIME_Explainer,SHAP_Explainer]:
        explainer = explainer_class(detector)
        display(HTML("<h2>{}</h2>".format(explainer.__class__.__name__)))
        
        texts_already_selected = []
        if len(selection) > 0:
            texts_already_selected = list(zip(*selection))[2] + list(zip(*selection))[3]
        pairs = obtain_dataset(explainer, detector, documents, gold_labels, document_ids, texts_already_selected=texts_already_selected)
       # print_pairs(pairs, documents, gold_labels, document_ids, detector, explainer)
        selection = update_selection(selection, pairs, explainer, detector)
        # break
        
    

Loading all explanations: 100%|██████████| 285/285 [00:00<00:00, 807.37it/s]
Building global dict of features: 100%|██████████| 285/285 [00:01<00:00, 152.18it/s]
Building W: 100%|██████████| 285/285 [00:01<00:00, 150.07it/s]
Obtaining pairs: 100%|██████████| 142/142 [00:08<00:00, 16.05it/s]


Loading all explanations: 100%|██████████| 285/285 [00:00<00:00, 3238.67it/s]
Building global dict of features: 100%|██████████| 285/285 [00:19<00:00, 14.79it/s]
Building W: 100%|██████████| 285/285 [00:20<00:00, 13.89it/s]
Obtaining pairs: 100%|██████████| 142/142 [00:08<00:00, 16.07it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading all explanations: 100%|██████████| 285/285 [00:00<00:00, 960.99it/s]
Building global dict of features: 100%|██████████| 285/285 [00:01<00:00, 153.00it/s]
Building W: 100%|██████████| 285/285 [00:01<00:00, 155.68it/s]
Obtaining pairs: 100%|██████████| 142/142 [00:07<00:00, 18.71it/s]


Loading all explanations: 100%|██████████| 285/285 [00:00<00:00, 3064.49it/s]
Building global dict of features: 100%|██████████| 285/285 [00:19<00:00, 14.32it/s]
Building W: 100%|██████████| 285/285 [00:18<00:00, 15.05it/s]
Obtaining pairs: 100%|██████████| 142/142 [00:08<00:00, 17.34it/s]


In [25]:
df = pd.DataFrame(selection, columns=columns)

In [26]:
# if not OBFUSCATE_RESULTS:
#     df.to_pickle("./dataset_user_study.pkl") # file in .gitignore

In [27]:
# df = pd.read_pickle("./dataset_user_study.pkl")
# df

In [28]:
# for idx, row in df.iterrows():
#     print(row["GT a"])
#     print(row["idx a"])
#     print(row["Documents Phases 1+3"])
#     print("---------------")

In [29]:
# for idx, row in df.iterrows():
#     print(row["GT b"])
#     print(row["idx b"])
#     print(row["Documents Phases 2+4"])
#     print("---------------")

# Some Checks

In [30]:
assert not any(df[["Documents Phases 1+3", "Documents Phases 2+4"]].stack().reset_index(drop="True").duplicated()), "Duplicate documents!"

In [31]:
assert all(df.groupby(["Detector", "Explainer"]).count()["Documents Phases 1+3"] == 2)

# Save Selection

In [32]:
# TODO remove from .gitignore after user study
# TODO change format to something else after user study 

In [33]:
import random

In [34]:
prediction_cache = {}
def prediction_cached(detector, document):
    id = (detector.__class__.__name__,document)
    if id not in prediction_cache:
        prediction_cache[id] = detector.predict_label([document])[0]
    return prediction_cache[id]


In [35]:
detector_detectgpt = DetectorDetectGPT()
detector_radford = DetectorRadford()
detector_guo = DetectorGuo()
def get_random_df(df, seed=42):
    selection = []
    random.seed(seed)
    for idx, row in tqdm(list(df.iterrows())):
        detector = None
        explainer = None
        if row["Detector"] == "DetectorDetectGPT":
            detector = detector_detectgpt
        if row["Detector"] == "DetectorRadford":
            detector = detector_radford
        if row["Detector"] == "DetectorGuo":
            detector = detector_guo

        if row["Explainer"]  == "Anchor_Explainer":
            explainer = Anchor_Explainer(detector)
        if row["Explainer"]  == "LIME_Explainer":
            explainer = LIME_Explainer(detector)
        if row["Explainer"]  == "SHAP_Explainer":
            explainer = SHAP_Explainer(detector)
        #               all documents not in exclude_list                                                                            without replacement
        candidates = [i for i in range(0,len(documents)) if i not in exclude_list and (len(selection) == 0 or document_ids[i] not in list(zip(*selection))[8] + list(zip(*selection))[9])]
        random.shuffle(candidates)
        pairs = None
        while True:
            pairs = [(candidates[0], candidates[1])]
            if prediction_cached(detector, documents[candidates[0]]) == row["f(a)"] and prediction_cached(detector, documents[candidates[0]]) == prediction_cached(detector, documents[candidates[1]]):
                break
            candidates = candidates[2:]
        

        selection = update_selection(selection, pairs, explainer, detector)
    return pd.DataFrame(selection, columns=columns)

Using cache dir ./.cache
Loading BASE model EleutherAI/pythia-70m...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MOVING BASE MODEL TO GPU...DONE (0.08s)
DONE (0.10s)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [36]:
dfs_random = [get_random_df(df, seed=i) for i in range(0,3)]


100%|██████████| 8/8 [00:02<00:00,  2.95it/s]
100%|██████████| 8/8 [00:02<00:00,  2.95it/s]
100%|██████████| 8/8 [00:03<00:00,  2.61it/s]


In [37]:
dfs_random

[          Detector       Explainer  \
 0  DetectorRadford  LIME_Explainer   
 1  DetectorRadford  LIME_Explainer   
 2  DetectorRadford  SHAP_Explainer   
 3  DetectorRadford  SHAP_Explainer   
 4      DetectorGuo  LIME_Explainer   
 5      DetectorGuo  LIME_Explainer   
 6      DetectorGuo  SHAP_Explainer   
 7      DetectorGuo  SHAP_Explainer   
 
                                 Documents Phases 1+3  \
 0  Banks generally do not intentionally allow tra...   
 1  The Department of Computer Science and Technol...   
 2  If your deductions are higher than your income...   
 3  First of all you do not "co-sign a car". I ass...   
 4  The sale of agricultural land may be subject t...   
 5  Peter A. Wegner (August 20, 1932 – July 27, 20...   
 6  It is possible that your son is experiencing a...   
 7  You'll likely see several more scary market ev...   
 
                                 Documents Phases 2+4  f(a)  f(b)   GT a  \
 0  Trailing 12-month total returns, or TTM return...   

In [192]:
df_random = pd.concat(dfs_random).groupby(level=0)[['Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)',
 'Jaccard Similarity (a,b)',
 'Cosine Similarity tfidf Vectors',]].agg(["mean", "std"])
df_random

Unnamed: 0_level_0,"Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)","Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)","Jaccard Similarity (a,b)","Jaccard Similarity (a,b)",Cosine Similarity tfidf Vectors,Cosine Similarity tfidf Vectors
Unnamed: 0_level_1,mean,std,mean,std,mean,std
0,0.91521,0.042344,0.131372,0.036806,0.094874,0.05482
1,0.834775,0.083865,0.076315,0.018743,0.058752,0.031411
2,0.950843,0.011448,0.160636,0.02742,0.121287,0.01895
3,0.852902,0.082989,0.097862,0.057449,0.059178,0.036089
4,0.900794,0.020504,0.124395,0.02872,0.065917,0.02465
5,0.836033,0.13049,0.08015,0.031346,0.044484,0.019984
6,0.892366,0.081661,0.133623,0.042848,0.102148,0.035524
7,0.914334,0.031987,0.127099,0.014827,0.092183,0.0107


In [137]:
from scipy.stats.mstats import ttest_rel
from scipy.stats.mstats import ttest_ind
from scipy.stats.mstats import ttest_1samp

In [138]:
from latex_utils import get_p_asterisks, highlight_max, df_to_latex

In [139]:
print(ttest_rel(df["Jaccard Similarity (a,b)"], df_random[("Jaccard Similarity (a,b)", "mean")]))

Ttest_relResult(statistic=0.6991642934389493, pvalue=0.5070086131708251)


In [140]:
def get_p_asterisks_2samp(row):
  
  #  print(row)
   # print(group.name)
    p = float(row["pvalue"])
    val = float(row["method - random"])
    if p <= 0.001:
        val = "{:.2f}\\textsuperscript{{***}}".format(val)
    if p <= 0.01:
        val = "{:.2f}\\textsuperscript{{**}}".format(val)
    if p <= 0.05:
        val = "{:.2f}\\textsuperscript{{*}}".format(val)
    if p > 0.05:
        val = "{:.2f}\\textsuperscript{{ns}}".format(val)
    row["method - random"] = val
    return row

In [141]:
# metrics_df = df.set_index(["Explainer", "Detector","f(d) → f(m)"])[export_cols].groupby(groupby).agg(
# {
#         "a", lambda group: get_p_asterisks_2samp(ttest_ind, a,b),

#     }
# )

# p_results[('n')] = p_results[('[Score 1] cos sim', 'count')]
# p_results = p_results.drop([('[Score 1] cos sim', 'count')], axis=1)

# p_results = p_results[[list(p_results.columns)[-1]] + list(p_results.columns)[:-1]]
# p_results.columns = [a for a, _ in p_results.columns]

# p_results = p_results.style.apply(highlight_max, subset=p_results.columns[1:])

In [157]:
df

Unnamed: 0,Detector,Explainer,Documents Phases 1+3,Documents Phases 2+4,f(a),f(b),GT a,GT b,idx a,idx b,"Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)","Jaccard Similarity (a,b)",Cosine Similarity tfidf Vectors,hash a,hash b
0,DetectorRadford,LIME_Explainer,Credit unions are not-for-profit financial coo...,Gordon Bell is a computer scientist and electr...,0,0,False,False,161,204,0.865433,0.1,0.035942,51d1e47a3a3e56608b1fbc9a3f63dfd38a5d07ecd93837...,49afdd008b7667f8c575f34ad378723e39feb10666c2e9...
1,DetectorRadford,LIME_Explainer,Both are saying essentially the same thing. T...,Sounds like you are reconciling more than once...,1,1,True,True,52,273,0.734486,0.123188,0.068672,714b04dd8923e09ea3f370b93660441d792104140d13d3...,83d0fafaffdb717a9f6ec22b781b56516afc286a1eae78...
2,DetectorRadford,SHAP_Explainer,It is possible that your son is experiencing a...,"There are many potential causes of fever, shiv...",0,0,False,False,170,301,0.919242,0.16,0.089471,53bb29915fb86c3bd323ddf3342af288e3110b3d8c470c...,0e65e1bbbf8133d14e940529817a55398d439910f1376e...
3,DetectorRadford,SHAP_Explainer,Predictive analytics encompasses a variety of ...,Human intelligence is the intellectual capabil...,1,1,True,True,6,247,0.955458,0.108374,0.065098,42b84c175e792cbace8c18218652b0e5fc172d0722913d...,936af4095ba223d41fa40efae657dfe1169abfac77176d...
4,DetectorGuo,LIME_Explainer,A Tensor Processing Unit (TPU) is a custom acc...,It is difficult to determine the frequency wit...,0,0,False,False,33,256,0.935912,0.108571,0.070008,06146f733e03754c43c81770ce1e2b06934eb318f50d93...,ed76b4ba45aefde60af94db8dd4cb92a1c331f5ce954bf...
5,DetectorGuo,LIME_Explainer,Hello dearWarm welcome to Healthcaremagic.com...,Thanks for your question on Healthcare Magic. ...,1,1,True,True,129,194,0.871972,0.092715,0.09372,e5e2abf2af1e944f04c8c9d000f7dbf4e66c4d234f83a9...,47e9700fa53a47fb40d1025a89aec77680c41248c82119...
6,DetectorGuo,SHAP_Explainer,Automated decision-making refers to the use of...,Predictive analytics is a type of data analysi...,0,0,False,False,182,212,0.965771,0.166667,0.215914,bab7c7ca0e1ba531721cb2418eb0b0fde40769c709323d...,150471a9c2522b83a5910e0cd1e92838a39d967a2a183c...
7,DetectorGuo,SHAP_Explainer,"TL;DR: The date they were granted. (Usually, ...",Reuters has a service you can subscribe to tha...,1,1,True,True,71,249,0.953523,0.122093,0.073148,da9b052c0d269c574dcea123c6cb2f9fd6c299427e64b2...,e6b72b7969bbad53385251e1baf30d9f72a75067fc1707...


In [155]:

metrics = ["Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)", "Jaccard Similarity (a,b)", "Cosine Similarity tfidf Vectors"]
t = {}
for metric in metrics:
    tstatistic, pvalue = ttest_ind(df[metric], df_random[(metric, "mean")])
    t[metric] = (tstatistic, pvalue, df[metric].mean(), df_random[(metric, "mean")].mean(), df[metric].mean() - df_random[(metric, "mean")].mean())
pd.DataFrame(t, index=["tstatistic", "pvalue","mean method", "mean random", "method - random" ]).transpose().astype("str").apply(get_p_asterisks_2samp, axis=1).drop(["pvalue","tstatistic"], axis=1)
    

Unnamed: 0,mean method,mean random,method - random
"Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)",0.9002245630907298,0.8871572234713475,0.01\textsuperscript{ns}
"Jaccard Similarity (a,b)",0.1227011425394429,0.1164316178540744,0.01\textsuperscript{ns}
Cosine Similarity tfidf Vectors,0.0889965808614003,0.0798528268016517,0.01\textsuperscript{ns}


In [158]:
df_fi_similarity

NameError: name 'df_fi_similarity' is not defined

In [194]:
def get_metrics_W(df):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford, DetectorDetectGPT]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer]:
            explainer = explainer_class(detector)
            for idx, row in df.iterrows():
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
                W, _ = get_explanation_matrix_W([a,b], explainer, quiet=True)

                sim = cosine_similarity(W) 
                cosine_similarity_in_w = sim[0,1]

                n_tokens_overlap_in_w = np.all(W != 0, axis = 0).sum()# / np.any(W != 0, axis = 0).sum()
                
                sim = euclidean_distances(W) 
                sim_eucledian = sim[0,1]
                results.append((
                    explainer.__class__.__name__,
                    detector.__class__.__name__,
                    sim_eucledian,
                    cosine_similarity_in_w,
                    n_tokens_overlap_in_w))
    df_fi_similarity = pd.DataFrame(results, columns=[
        "Explainer",
        "Detector",
        "sim_eucledian",
        "cosine_similarity_in_w",
        "Overlap in W (tokens)",
        ])
    df_fi_similarity = df_fi_similarity.set_index(["Explainer", "Detector"])
    return df_fi_similarity

In [196]:
df_random

Unnamed: 0_level_0,"Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)","Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)","Jaccard Similarity (a,b)","Jaccard Similarity (a,b)",Cosine Similarity tfidf Vectors,Cosine Similarity tfidf Vectors
Unnamed: 0_level_1,mean,std,mean,std,mean,std
0,0.91521,0.042344,0.131372,0.036806,0.094874,0.05482
1,0.834775,0.083865,0.076315,0.018743,0.058752,0.031411
2,0.950843,0.011448,0.160636,0.02742,0.121287,0.01895
3,0.852902,0.082989,0.097862,0.057449,0.059178,0.036089
4,0.900794,0.020504,0.124395,0.02872,0.065917,0.02465
5,0.836033,0.13049,0.08015,0.031346,0.044484,0.019984
6,0.892366,0.081661,0.133623,0.042848,0.102148,0.035524
7,0.914334,0.031987,0.127099,0.014827,0.092183,0.0107


In [191]:

t = {}
m_method = get_metrics_W(df)
m_random = pd.concat([get_metrics_W(df_random) for df_random in dfs_random]).groupby(level=0).agg(["mean", "std"])

for metric in m_method.columns:
    tstatistic, pvalue = ttest_ind(m_method[metric], m_random[(metric, "mean")])
    t[metric] = (tstatistic, pvalue, m_method[metric].mean(), m_random[(metric, "mean")].mean(), m_method[metric].mean() - m_random[(metric, "mean")].mean())
pd.DataFrame(t, index=["tstatistic", "pvalue","mean method", "mean random", "method - random" ]).transpose().astype("str").apply(get_p_asterisks_2samp, axis=1).drop(["pvalue","tstatistic"], axis=1)

Unnamed: 0,mean method,mean random,method - random
sim_eucledian,0.1474210511595113,0.1927088104541069,-0.05\textsuperscript{ns}
cosine_similarity_in_w,0.6567413929311748,0.2073151200225199,0.45\textsuperscript{*}
Overlap in W (tokens),13.125,9.125,4.00\textsuperscript{ns}


['sim_eucledian', 'cosine_similarity_in_w', 'Overlap in W (tokens)']

In [162]:
get_metrics_W_random(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,"Cosine Similarity a,b in W"
Explainer,Detector,Unnamed: 2_level_1
LIME_Explainer,DetectorGuo,0.500401
LIME_Explainer,DetectorGuo,0.56865
SHAP_Explainer,DetectorGuo,0.889212
SHAP_Explainer,DetectorGuo,0.797556
LIME_Explainer,DetectorRadford,0.554297
LIME_Explainer,DetectorRadford,0.425916
SHAP_Explainer,DetectorRadford,0.824385
SHAP_Explainer,DetectorRadford,0.693515


# Explanation BOW Similarity

In [159]:
def get_cos_sim_in_W(dff, all=False):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer]:
            explainer = explainer_class(detector)

            for idx, row in dff.iterrows():
                if not all and row["Explainer"] != explainer.__class__.__name__ or row["Detector"] != detector.__class__.__name__:
                    continue
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
                W, _ = get_explanation_matrix_W([a,b], explainer, quiet=True)
                sim = cosine_similarity(W) 
                sim = sim[0,1]

                sim_eucledian = euclidean_distances(W) 
                sim_eucledian = sim[0,1]
                results.append((explainer.__class__.__name__, detector.__class__.__name__, sim_eucledian, sim))
    df_fi_similarity = pd.DataFrame(results, columns=["Explainer", "Detector", "Cosine Similarity a,b in W"])
    df_fi_similarity = df_fi_similarity.set_index(["Explainer", "Detector"])
    return df_fi_similarity

In [161]:
for df_random in dfs_random:
    df_fi_similarity_random = get_cos_sim_in_W(df_random)
    df_fi_similarity = get_cos_sim_in_W(df)
    display(df_fi_similarity)
    print("df_fi_similarity.mean()", df_fi_similarity.mean())
    print("df_fi_similarity_random.mean()", df_fi_similarity_random.mean())

Unnamed: 0_level_0,Unnamed: 1_level_0,"Cosine Similarity a,b in W"
Explainer,Detector,Unnamed: 2_level_1
LIME_Explainer,DetectorGuo,0.500401
LIME_Explainer,DetectorGuo,0.56865
SHAP_Explainer,DetectorGuo,0.889212
SHAP_Explainer,DetectorGuo,0.797556
LIME_Explainer,DetectorRadford,0.554297
LIME_Explainer,DetectorRadford,0.425916
SHAP_Explainer,DetectorRadford,0.824385
SHAP_Explainer,DetectorRadford,0.693515


df_fi_similarity.mean() Cosine Similarity a,b in W    0.656741
dtype: float64
df_fi_similarity_random.mean() Cosine Similarity a,b in W    0.230761
dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,"Cosine Similarity a,b in W"
Explainer,Detector,Unnamed: 2_level_1
LIME_Explainer,DetectorGuo,0.500401
LIME_Explainer,DetectorGuo,0.56865
SHAP_Explainer,DetectorGuo,0.889212
SHAP_Explainer,DetectorGuo,0.797556
LIME_Explainer,DetectorRadford,0.554297
LIME_Explainer,DetectorRadford,0.425916
SHAP_Explainer,DetectorRadford,0.824385
SHAP_Explainer,DetectorRadford,0.693515


df_fi_similarity.mean() Cosine Similarity a,b in W    0.656741
dtype: float64
df_fi_similarity_random.mean() Cosine Similarity a,b in W    0.181469
dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,"Cosine Similarity a,b in W"
Explainer,Detector,Unnamed: 2_level_1
LIME_Explainer,DetectorGuo,0.500401
LIME_Explainer,DetectorGuo,0.56865
SHAP_Explainer,DetectorGuo,0.889212
SHAP_Explainer,DetectorGuo,0.797556
LIME_Explainer,DetectorRadford,0.554297
LIME_Explainer,DetectorRadford,0.425916
SHAP_Explainer,DetectorRadford,0.824385
SHAP_Explainer,DetectorRadford,0.693515


df_fi_similarity.mean() Cosine Similarity a,b in W    0.656741
dtype: float64
df_fi_similarity_random.mean() Cosine Similarity a,b in W    0.209715
dtype: float64


In [None]:
for df_random in dfs_random:
    df_fi_similarity_random = get_cos_sim_in_W(df_random, all=True)
    df_fi_similarity = get_cos_sim_in_W(df, all=True)

    print("df_fi_similarity.mean()", df_fi_similarity.mean())
    print("df_fi_similarity_random.mean()", df_fi_similarity_random.mean())

In [None]:
df_fi_similarity.mean()

In [None]:
df_fi_similarity_random.mean()

In [None]:
ttest_ind(df_fi_similarity["Cosine Similarity a,b in W"], df_fi_similarity_random["Cosine Similarity a,b in W"])

In [None]:
def get_eucledian_distance_in_W(dff):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer]:
            explainer = explainer_class(detector)

            for idx, row in dff.iterrows():
                if row["Explainer"] != explainer.__class__.__name__ or row["Detector"] != detector.__class__.__name__:
                    continue
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
                W, _ = get_explanation_matrix_W([a,b], explainer, quiet=True)
                sim = euclidean_distances(W) 
                sim = sim[0,1]
                results.append((explainer.__class__.__name__, detector.__class__.__name__, sim))
    df_fi_similarity = pd.DataFrame(results, columns=["Explainer", "Detector", "Eucledian Distance a,b in W"])
    df_fi_similarity = df_fi_similarity.set_index(["Explainer", "Detector"])
    return df_fi_similarity

In [None]:
df_fi_similarity_eucledian_random = get_eucledian_distance_in_W(df_random)
df_fi_similarity_eucledian = get_eucledian_distance_in_W(df)

In [None]:
df_fi_similarity_eucledian.mean()

In [None]:
df_fi_similarity_eucledian_random.mean()

In [None]:
ttest_ind(df_fi_similarity_eucledian["Eucledian Distance a,b in W"], df_fi_similarity_eucledian_random["Eucledian Distance a,b in W"])

In [None]:
for metric in ["Spacy Semantic Similarity: Cosine Similarity Average of Word Vectors (a,b)", "Jaccard Similarity (a,b)", "Cosine Similarity tfidf Vectors"]:
    print(metric)
    print("     ", ttest_ind(df[metric], df_random[metric]))

## Explanation overlap in non-zero features

In [None]:
def get_overlap_in_W(dff, all=False):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford, DetectorDetectGPT]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer]:
            explainer = explainer_class(detector)
            for idx, row in dff.iterrows():
                if not all and row["Explainer"] != explainer.__class__.__name__ or row["Detector"] != detector.__class__.__name__:
                    continue
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
                W, _ = get_explanation_matrix_W([a,b], explainer, quiet=True)
                sim = np.all(W != 0, axis = 0).sum()# / np.any(W != 0, axis = 0).sum()
                results.append((explainer.__class__.__name__, detector.__class__.__name__, sim))
    df_fi_similarity = pd.DataFrame(results, columns=["Explainer", "Detector", "Overlap in W"])
    df_fi_similarity = df_fi_similarity.set_index(["Explainer", "Detector"])
    return df_fi_similarity

In [None]:
for df_random in dfs_random:
    df_fi_similarity_random = get_overlap_in_W(df_random)
    df_fi_similarity = get_overlap_in_W(df)

    print("get_overlap_in_W.mean()", df_fi_similarity.mean())
    print("get_overlap_in_W.mean_random()", df_fi_similarity_random.mean())

In [None]:
for df_random in dfs_random:
    df_fi_similarity_random = get_overlap_in_W(df_random, all=True)
    df_fi_similarity = get_overlap_in_W(df, all=True)

    print("get_overlap_in_W.mean()", df_fi_similarity.mean())
    print("get_overlap_in_W.mean_random()", df_fi_similarity_random.mean())

In [None]:
df_fi_overlap_in_W_random.groupby("Explainer").describe()

In [None]:
df_fi_overlap_in_W.groupby("Explainer").describe()

In [None]:
df_fi_overlap_in_W.mean()

In [None]:
df_fi_overlap_in_W_random.mean()

In [None]:
ttest_ind(df_fi_overlap_in_W["Overlap in W"], df_fi_overlap_in_W_random["Overlap in W"])

In [None]:
df