> **Warning**: Always set this flag to `True` before git commit!

In [1]:
OBFUSCATE_RESULTS = False

In [2]:
import torch

In [3]:
from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer

In [4]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from IPython.core.display import HTML
import lime
import numpy as np
import warnings
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import normalize

# Utility functions

In [5]:
template = """
<p><b>This is a {kind_of_document} document.</b></p>
<p>The detector {correctly_or_wrongly} predicted that this document was... </p>
<p>&emsp; ... machine generated with {p_machine} % confidence.</p>
<p>&emsp; ... human written with {p_human} % confidence.</p> 
<div style="float:left;">{highlighted_text}</div>
"""
#<div style="float:left; height:30em;">{barplot_machine}{barplot_human}</div>



#"""
def print_template(document, gold_label, detector, explainer):
    p_machine, p_human = detector.predict_proba([document])[0]
   # machine, human = explainer.get_barplots_HTML(document)
    display(HTML(template.format(
    p_machine=int(p_machine*100) if not OBFUSCATE_RESULTS else "<redacted>", 
    p_human=int(p_human*100) if not OBFUSCATE_RESULTS else "<redacted>",
  #  barplot_machine=machine,
  #  barplot_human=human,
    kind_of_document= (("machine generated" if gold_label == False else "human written") if not OBFUSCATE_RESULTS else "<redacted>"), 
    correctly_or_wrongly= (("correctly" if detector.predict_label([document])[0] == gold_label else "wrongly") if not OBFUSCATE_RESULTS else "<redacted>"), 
    highlighted_text=explainer.get_highlighted_text_HTML((document if not OBFUSCATE_RESULTS else "<redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted> <redacted>")),
    )))
def print_shared_features(features, fi_scores):
    for feature, (fi_score_a, fi_score_b) in zip(features, fi_scores):
        print(feature)
        print("\t\ta: {} \t b: {}".format(fi_score_a, fi_score_b))
        print("-----------")


In [6]:
def print_pairs(pairs, documents, gold_labels, document_ids, detector, explainer, skip_n=0):

    for (a,b)in pairs:
        if OBFUSCATE_RESULTS:
            print("idx_a: <redacted> idx_b <redacted>")
        else:
            print("idx_a: {} idx_b {}".format(document_ids[a],document_ids[b]))

        print_template(documents[a], gold_labels[a], detector, explainer)
        print_template(documents[b], gold_labels[b], detector, explainer)
        print("------------------------------------------------------------------------------------------")
        print("------------------------------------------------------------------------------------------")
        print("------------------------------------------------------------------------------------------")
  


# Search Strategy for Feature Importance Explainers

In [7]:
# returns a matrix of explanations for all documents in "data"
# This function was once SubmodularPick.__init__() in LIME. It was planned to use its output for a search strategy for similar explanations. 
# Only the code for creating W from the paper (rows are explanations, cols are BOW features) remains
def get_explanation_matrix_W(data, explainer, quiet=False):
    # Get (cached) explanations 
    explanations_and_documents = [(d, explainer.get_fi_scores(d, fill=True)[0]) for d in tqdm(data, desc="Loading all explanations",disable=quiet) ] # [0]: only irt to label machine, fill: return all words, even those with 0 fi

    get_feature_name_signed = lambda feature,value : feature + ("_+" if value >=0 else "_-") # appends "_+" or "_-" to each feature name, e.g. "example" -> "example_+" if fi(example) > 0
    # Ribeiro et al.: Find all the explanation model features used. Defines the dimension d'
    # i.e. determine columns of W: each word (BOW) gets (up to) two columns, one for positive FI scores, one for negative FI scores
    features_dict = {}
    feature_iter = 0
    for d, exp in tqdm(explanations_and_documents, desc="Building global dict of features", disable=quiet):
     #   print("exp",exp)
        for feature_idx, value in exp: # irt to label machine
            feature = explainer.tokenize(d)[feature_idx]
            feature_name = get_feature_name_signed(feature,value) # get_feature_name_signed: see above
            if feature_name not in features_dict.keys():
                features_dict[feature_name] = (feature_iter)
                feature_iter += 1
    d_prime = len(features_dict.keys())

    # Ribeiro et al.: Create the n x d' dimensional 'explanation matrix', W
    W = np.zeros((len(explanations_and_documents), d_prime))

    # fill W, look up cols in dict that was just created
    # W: one row per explanation, one col per feature in feature_dict
    for i, (d, exp) in enumerate(tqdm(explanations_and_documents,  desc="Building W",disable=quiet)):
        for feature_idx, value in exp: # irt to label machine
            # get_feature_name_signed: see above
            feature = explainer.tokenize(d)[feature_idx]
            W[i, features_dict[get_feature_name_signed(feature,value)]] += value
    return W, features_dict


In [8]:
# returns a list of tuples: (pair of documents whose explanations are similar, the features that overlap, fi scores of said features)
# this maximizes similarity between documents (greedy, each document is only part of one tuple)
# another function should select n tuples to maximize coverage in explanation space akin to SP-LIME
sum_two_max = None
def get_pairs(documents, W, detector, features_dict, n_pairs=None):
    if n_pairs is None:
        n_pairs = len(documents)//2
    idx_pairs = [] # tuples of indices of similar documents a,b in "data"
    features = [] # list of features those documents covered
    fi_scores_pairs = []

    W_ = np.copy(W)

    document_indices = np.arange(0, W_.shape[0])
   # print(document_indices.shape, W_.shape)
    for _ in tqdm(range(0,n_pairs), desc="Obtaining pairs"):
        sim = cosine_similarity(W_) # calculate cosine similarity between all explanations
        sim = np.triu(sim,k=1)  # remove redundant information for argmax()

        idx_max = np.unravel_index(sim.argmax(), sim.shape) # get most similar pair, result is (idx_a, idx_b)
       # print(idx_max)
        features_non_zero_in_both = np.intersect1d(W_[idx_max[0]].nonzero(),W_[idx_max[1]].nonzero()) # get features that have non-zero fi in both explanations
                                                                                                      # is used later for selecting a set of tuples with high coverage (as in SP-LIME)
        non_zero_features = [] # list with features that will be returned
        non_zero_fi_scores_tuples = [] # list of tuples with fi scores in a and b that will be returned
    
        # look up feature_idxs in features_dict and append them to the output
        for iii in features_non_zero_in_both:
           key = list(features_dict.keys())[list(features_dict.values()).index(iii)]
           non_zero_features.append(key)
           non_zero_fi_scores_tuples.append((W_[idx_max[0],features_dict[key]], W_[idx_max[1],features_dict[key]]))
        
        # Only add pair to output list if valid: at least one common feature is not zero AND f(a) == f(b) (i.e., the explanation is arguing for the same detector verdict)
        if len(non_zero_features) > 0:
            a,b = detector.predict_label([documents[document_indices[idx_max[0]]], documents[document_indices[idx_max[1]]]])
            if a == b:
                idx_pairs.append(document_indices[list(idx_max)])
                fi_scores_pairs.append(non_zero_fi_scores_tuples)
                features.append(non_zero_features)
        # delete pair from W_:
        W_ = np.delete(W_, idx_max, axis=0) 
        document_indices = np.delete(document_indices, list(idx_max))

    return idx_pairs, features, fi_scores_pairs


In [9]:
# want a set of pairs that maximizes coverage in explanation space akin to the SP-LIME strategy but for pairs 

# this is the maximum coverage problem: e.g., R. Church and C. ReVelle, 1974 http://yalma.fime.uanl.mx/~roger/work/teaching/class_tso/docs_project/problems/MCLP/1974-prs-Church%20ReVelle-maximal%20covering%20location.pdf
# can brute force here as only the 10 top pairs by similarity are used for each class, 


# let the coverage be the number of columns != 0 in W. And coverage((a,b))) := coverage(sum([a,b])), akin to the importance vector in SP-LIME (Note that columns in W are either negative FI or positive FI here (see get_feature_name_signed()), so scores don't cancel out in sum)
def coverage(selection, W):
    return np.count_nonzero(np.sum(W[np.array(list(selection)).flatten()], axis=0)) # coverage(selection)= number of cols in W that have at least one non-zero entry under this selection of pairs. Recall that W has (up to) two entries per word: one for positive and one for negative FI



In [10]:
from itertools import combinations


In [11]:
# want a set of pairs that maximizes coverage in explanation space akin to the SP-LIME strategy but for pairs 
# this is the maximum coverage problem: e.g., R. Church and C. ReVelle, 1974 http://yalma.fime.uanl.mx/~roger/work/teaching/class_tso/docs_project/problems/MCLP/1974-prs-Church%20ReVelle-maximal%20covering%20location.pdf
# implementing a greedy algorithm here: 
#   "In order to achieve a maximal cover for p facilities under a given service distance, 
#   the algorithm starts with an empty solution set and then adds to this set one at a 
#   time the best facility sites. The GA algorithm picks for the first facility that 
#   site which covers the most of the total population. For the second facility, GA 
#   picks the site that covers the most of the population not covered by the first 
#   facility. Then, for the third facility, GA picks the site that covers the most of the 
#   population not covered by the first and second facilities. This process is continued until either p facilities have been selected or all the population is covered. 
#   Details of the algorithm are given in Church." (R. Church and C. ReVelle, 1974, p. 105f)


# let the coverage be the number of columns != 0 in W. And coverage((a,b))) := coverage(sum([a,b])), akin to the importance vector in SP-LIME (Note that columns in W are either negative FI or positive FI here (see get_feature_name_signed()), so scores don't cancel out in sum)

def coverage(selection, W):
    return np.count_nonzero(np.sum(W[np.array(list(selection)).flatten()], axis=0)) # coverage(selection)= number of cols in W that have at least one non-zero entry under this selection of pairs. Recall that W has (up to) two entries per word: one for positive and one for negative FI

def get_site_with_max_coverage(sites, previous_selections, W):
    best_site = None
    best_coverage = 0
    for site in sites:
        candidate = set(np.array(previous_selections).flatten()).union(site) # extend the previous selection by "site", this addresses "For the second facility, GA picks the site that covers the most of the population not covered by the first "
        cov = coverage(candidate, W) # compute new coverage
        if cov >= best_coverage:
            best_coverage = cov
            best_site = site
    return best_site, best_coverage

def get_p_tuples_with_high_coverage(indices, W, p=10):
  sites = list(indices)
  # "the algorithm start with emty solution set"
  result = list()
  # "and then adds to this set one at a time the best facility sites"
  while True:
      # "The GA algorithm picks for the first facility that 
      # site which covers the most of the total population"
      best_site, best_coverage = get_site_with_max_coverage(sites, result, W)
      result.append(best_site)
      # "This process is continued until either p facilities have been selected or all the population is covered."
      if len(result) == p or best_coverage == W.shape[1]:
          break
  return result


In [12]:
# returns two pairs, one for f(x) = machine and one for f(x) = human
# checks texts_already_selected and chooses next best pair (for each class) if a document is in texts_already_selected (i.e. it was already selected for another explainer-detector pair)
def obtain_dataset_FI_methods(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):
    W, features_dict = get_explanation_matrix_W(documents, explainer)
    similar_pairs, _, _ = get_pairs(documents, W, detector, features_dict)
    # want a dataset that is balanced irt to the two base classes:
    # two pairs will be returned, one with f(x) == machine, and one with f(x) == human
    top_10_pairs_human = []
    top_10_pairs_machine = []
    for pair in similar_pairs:
        if (documents[pair[0]] in texts_already_selected) or (documents[pair[1]] in texts_already_selected):
            continue
        if detector.predict_label([documents[pair[0]]])[0]:
            top_10_pairs_human.append(pair)
        else:
            top_10_pairs_machine.append(pair)
        if len(top_10_pairs_human) >= 10 and len(top_10_pairs_machine) >= 10:
            top_10_pairs_human = top_10_pairs_human[0:10]
            top_10_pairs_machine = top_10_pairs_machine[0:10]
            break
    pairs_human = get_p_tuples_with_high_coverage(top_10_pairs_human, W, p=3)
    pairs_machine = get_p_tuples_with_high_coverage(top_10_pairs_machine, W, p=3)
    return pairs_human + pairs_machine    
    # combinations_ = [(a,b) for a in top_10_pairs_machine for b in top_10_pairs_human]
    # # return two pairs maximizing coverage, one with f(x) == machine, and one with f(x) == human 
    
    # pair_a = None
    # pair_b = None
    # coverage_ = [((a,b),coverage([a,b],W)) for a,b in combinations_]
    # (pair_a, pair_b), c = max(coverage_, key=lambda item : item[1])
    # return [pair_a, pair_b]

        

    
        # break
        # # get f(a) as one example per class is returned
        # predictions = [detector.predict_label([documents[a]])[0] for a,_ in pairs] # wheter a == b was tested before

        # # return example with highest coverage for each class
        # # if a document is in texts_already_selected (i.e. it was already selected for another explainer-detector pair), the one with the next-highest coverage (for that prediction) is returned 
        # for idx_pair, prediction in enumerate(predictions):
        #     a,b = pairs[idx_pair]
        #     # check if a or b are in texts_already_selected
        #     if (documents[a] not in texts_already_selected) and (documents[b] not in texts_already_selected):
        #         if prediction == 0 and pair_machine is None: # only keep first
        #             pair_machine = pairs[idx_pair] 
        #         if prediction == 1 and pair_human is None: # only keep first
        #             pair_human = pairs[idx_pair] 
        #     if pair_human is not None and pair_machine is not None:
        #         return [pair_machine, pair_human] 
        # k+=1 # loop until both pair_machine and pair_human not None



# Search Strategy for Rule-Based Explainers

In [13]:
from anchor.anchor import anchor_explanation
from collections import defaultdict
from itertools import combinations, chain

In [14]:
def jaccard_similarity(document_a, document_b):
    # case sensitive, on spacy tokens
    a = list(chain(*[[token.text for token in sent] for sent in nlp(document_a).sents]))
    b = list(chain(*[[token.text for token in sent] for sent in nlp(document_b).sents]))
    intersection = float(len(list(set(a).intersection(b))))
    union = float((len(set(a)) + len(set(b)))) - intersection
    return intersection / union

In [15]:
# encodes the order of occurence in a list of words, e.g.:
# ["example", "test", "example", "one"] -> ['example_0', 'test_0', 'example_1', 'one_0']
def encode_count(list_of_words):
    d = defaultdict(lambda : 0)
    encoded = []
    for word in list_of_words:
        encoded.append(word + "_" + str(d[word]))
        d[word] +=1
    return encoded

In [16]:
# the dictonary Anchors returns can define multiple anchors:
# {this, is, an, example} : 0.9
# {this, is, an}: 0.8
# {this, is, }: 0.75
# {this}: 0.4
# extract all of them, only keep those with p >= 0.75 (threshold set when searching)
def get_anchors_at_each_k(documents, explainer, quiet=False):
    anchors = []
    p = []
    ids = []
    for i,_ in tqdm(enumerate(documents), desc="Loading all explanations", disable=quiet):#enumerate(documents):
        exp = explainer.get_explanation_cached(documents[i])
        exp["names"] = encode_count(exp["names"]) # Anchors is not BOW. But the algorithm is written with python set()s
        while len(exp["mean"]) >=1:#and exp["mean"][-1] >= 0.75:
            anchors.append(set(exp["names"])) 
            p.append(exp["mean"][-1])
            ids.append(i)

            exp["mean"].pop()
            exp["names"].pop()
    return anchors, p, ids


In [17]:
# searches for pairs of anchors
# returns 2 pairs of documents, one pair for f(x) = machine, one for f(x) = human, both sampled randomly
# checks for and skips documents in "texts_already_selected" (i.e. it was already selected for an other explainer-detector pair)

def obtain_dataset_Anchor(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):

    anchors, p, ids = get_anchors_at_each_k(documents, explainer)
                        # DetectGPT + Anchors is to expensive to run experiments on 
    # find anchors that occur more than once in the dataset, then remove duplicates (created by looping) with set()
    duplicate_anchors = [set(anchor) for anchor in set([frozenset(anchor) for anchor in anchors if anchors.count(anchor) > 1])]
    # get the ids and p for each duplicate_anchor in  duplicate_anchors
    # "candidates" is a list of lists with ids (and all other details) of each duplicate_anchor
    candidates = [[(anchor, p, document_id) for anchor, p, document_id in zip(anchors, p, ids) if anchor == duplicate_anchor] for duplicate_anchor in duplicate_anchors ]
    # now check for each paring of the documents in each sublist of "candidates":
    #   is f(a) == f(b)?, if not: discard
    # then pick pair with highest jaccard_score on the original documents in each "candidate"
    pairs = []

    predictions_cache = {}
    def cached_predict(idx):
        if idx not in predictions_cache:
            predictions_cache[idx] = detector.predict_label([documents[idx]])[0]
        return predictions_cache[idx]
    for candidate in tqdm(candidates, desc="Assessing candidates",position=1):
        
        anchor_s, p, ids  = zip(*candidate)
        c = list(combinations(ids, 2))
        c = [(a,b) for a,b in c if cached_predict(a) == cached_predict(b) if not (documents[a] in texts_already_selected) or (documents[b] in texts_already_selected)]
        if len(c) == 0:
            continue
        jaccard_scores = [(a,b, jaccard_similarity(documents[a], documents[b])) for a,b in tqdm(c, desc="Calculating Jaccard Similarity (of documents not Anchors)",position=0)]
        a,b, score = max(jaccard_scores, key=lambda x: x[2])
        pairs.append((a,b))

    # sample twice: once for f(x) == human and once for f(x) == machine. f(a) == f(b) is tested earlier

    predictions = [cached_predict(a) for a,_ in pairs] # wheter a == b was tested before

    predictions_ = np.array(predictions)
    pairs_ = np.array(pairs)

    machine = pairs_[predictions_ == False]
 
    human = pairs_[predictions_ == True]
   
    
    np.random.seed(42)
    result = []
    # one explainer (DetectGPT) has no explanations for f(x) = human:
    if not(True in predictions) or human.shape[0] < 3:
        print("Warning: Not enough examples for f(x) = human. Returning additional examples for machine")
        result =  list(machine[np.random.choice(machine.shape[0], 6-human.shape[0], replace=False)]) + list(human[np.random.choice(human.shape[0], human.shape[0], replace=False)])
    elif not (False in predictions) or machine.shape[0] < 3:
        print("Warning: Not enough examples for f(x) = machine. Returning additional examples for human")
        result = list(machine[np.random.choice(machine.shape[0], machine.shape[0], replace=False)]) + list(human[np.random.choice(human.shape[0], 6 -machine.shape[0], replace=False)])
    else:
        result =  list(machine[np.random.choice(machine.shape[0], 3, replace=False)]) + list(human[np.random.choice(human.shape[0], 3, replace=False)]) # returns a random pair for machine and a random pair for human

    print(result)
    return result

    


# Load Dataset

In [18]:
def obtain_dataset(explainer, detector, documents, gold_labels, document_ids, texts_already_selected):
    if isinstance(explainer, Anchor_Explainer):
        return obtain_dataset_Anchor(explainer, detector, documents, gold_labels, document_ids, texts_already_selected)
    else:
        return obtain_dataset_FI_methods(explainer, detector, documents, gold_labels, document_ids, texts_already_selected)

In [19]:
test = pd.read_pickle("./dataset_test.pkl")
test = test 

documents = list(test["answer"])
gold_labels = list(test["author"] == "human_answers") # convention: 0: machine, 1: human, see detector.py
document_ids = list(range(0,len(documents))) # note that the search algorithms don't use these ids. They are only used for printing and the exclude_list!!

In [20]:
import spacy
nlp = spacy.load("en_core_web_lg")
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer().fit(documents)

# Perform Document Selection
Some documents are excluded from the user-study for the reasons specified below:

In [21]:

exclude_list = {
    (195, 60,108, 228,143): "Names forum/service explicitly",
    (288,117, 188, 110, 159, 97, 105, 115,266, 158): "Author introduces themselves by name",
    (16,): "References earlier post by other user",
    (190,294,): "Names forum user who asked the question",
    (27,103,): "NSFW",
    
    
}
exclude_list = [x for xs in [ list(key) for key in exclude_list.keys()] for x in xs]
exclude_list

[195,
 60,
 108,
 228,
 143,
 288,
 117,
 188,
 110,
 159,
 97,
 105,
 115,
 266,
 158,
 16,
 190,
 294,
 27,
 103]

In [22]:
# apply exclude_list
documents = [d for i,d in zip(document_ids, documents) if i not in exclude_list]
gold_labels = [gl for i,gl in zip(document_ids, gold_labels) if i not in exclude_list]
document_ids = [i for i in document_ids if i not in exclude_list]

> **Warning**: If you plan to participate in the user study, set `OBFUSCATE_RESULTS` to `True` before proceeding!!!

In [23]:
columns = ["Detector", "Explainer", "Documents Phases 1+3", "Documents Phases 2+4", "f(a)", "f(b)", "GT a", "GT b", "idx a", "idx b", "Spacy Similarity", "Jaccard Similarity", "Cosine Similarity tfidf","hash a", "hash b"]




In [24]:
def update_selection(selection, pairs, explainer, detector):
    for a,b in pairs:
        if OBFUSCATE_RESULTS:
            continue
        
        tfidf_= tfidf.transform([documents[a], documents[b]])   
        selection.append((detector.__class__.__name__,
                        explainer.__class__.__name__,
                        documents[a], documents[b],
                        *detector.predict_label([documents[a], documents[b]]),
                        gold_labels[a],
                        gold_labels[b],
                        document_ids[a],
                        document_ids[b],
                        nlp(documents[a]).similarity(nlp(documents[b])),
                        jaccard_similarity(documents[a], documents[b]),
                        (tfidf_ * tfidf_.T).toarray()[0,1],
                        explainer.get_hash(documents[a]),
                        explainer.get_hash(documents[b])))
    return selection

In [25]:
# selection = []
# for detector_class in [DetectorDetectGPT,DetectorRadford,DetectorGuo]:
#     selection_detector = []
#     detector = detector_class()
#     display(HTML("<h1>{}</h1>".format(detector.__class__.__name__)))
#     for explainer_class in [Anchor_Explainer, LIME_Explainer,SHAP_Explainer]:
#         explainer = explainer_class(detector)
#         display(HTML("<h2>{}</h2>".format(explainer.__class__.__name__)))
#         texts_already_selected = []
#         if len(selection_detector) > 0:
#             texts_already_selected = list(zip(*selection_detector))[2] + list(zip(*selection_detector))[3]
#         pairs = obtain_dataset(explainer, detector, documents, gold_labels, document_ids, texts_already_selected=texts_already_selected)
#        # print_pairs(pairs, documents, gold_labels, document_ids, detector, explainer)
#         selection_detector = update_selection(selection_detector, pairs, explainer, detector)
#     selection = selection + selection_detector
#         # break
        
    

In [26]:
# df = pd.DataFrame(selection, columns=columns)

In [27]:
# if not OBFUSCATE_RESULTS:
#     df.to_pickle("./dataset_user_study_new.pkl") # file in .gitignore

In [28]:
df = pd.read_pickle("./dataset_user_study_new.pkl")
df

Unnamed: 0,Detector,Explainer,Documents Phases 1+3,Documents Phases 2+4,f(a),f(b),GT a,GT b,idx a,idx b,Spacy Similarity,Jaccard Similarity,Cosine Similarity tfidf,hash a,hash b
0,DetectorDetectGPT,Anchor_Explainer,Both are saying essentially the same thing. T...,"Assuming you live in the US, it is quite norma...",0,0,True,True,52,238,0.867257,0.176101,0.15379,714b04dd8923e09ea3f370b93660441d792104140d13d3...,60b992dfcad293c2fbe76d7842a4e469ba041ced7c5270...
1,DetectorDetectGPT,Anchor_Explainer,It can be a good idea to follow the advice of ...,It is generally a good idea to use sponsorship...,0,0,False,False,217,234,0.95504,0.164062,0.203377,c4c411239613f735fe43246c9c0108a876ff3ebcf723ee...,e573296bc2021073ef94c1f60c1f097dc770793203b348...
2,DetectorDetectGPT,Anchor_Explainer,It is possible that a medical bill may have be...,Credit unions are not-for-profit financial coo...,0,0,False,False,145,161,0.968658,0.190476,0.0804,dd5eebda3bd1fe324db9194206ddba635deb68071485af...,51d1e47a3a3e56608b1fbc9a3f63dfd38a5d07ecd93837...
3,DetectorDetectGPT,Anchor_Explainer,Technical analysts use charts and other techni...,Option contracts are generally not subject to ...,0,0,False,False,284,299,0.970607,0.179487,0.187225,c8a011be98cb45f35f081b77177e54806135e858720719...,5b452735e59499d5c471fefba9bbcc41bc42763a5d8553...
4,DetectorDetectGPT,Anchor_Explainer,It is possible that stress could be contributi...,It is generally not recommended for individual...,0,0,False,False,72,244,0.969343,0.195804,0.15645,2896cd03f5a9e20c7aabbc57edfa78908bc88025779a18...,ccda7c8a282ef5799ad05651df3cd89230aaec28541207...
5,DetectorDetectGPT,Anchor_Explainer,It's important to talk to a healthcare provide...,It is important to get the lump on your grands...,0,0,False,False,29,216,0.961467,0.318182,0.317253,f16ea55e488aaf8c01f3e55cb6a6f44112e5f3795c4962...,739372dc4bf92366bc991aab94a8f80ee06f90e30c51cd...
6,DetectorDetectGPT,LIME_Explainer,Yes. I can by all means start my own company a...,"thanks for your query, the bump could be secon...",1,1,True,True,54,175,0.914531,0.127907,0.112285,20951d0d491f061e02e85cf742afd26ae0ebec6b895af3...,b2ae8a1624e51a0894124a9aaae1fb99c3557b284a96c7...
7,DetectorDetectGPT,LIME_Explainer,Hello and Welcome to ‘Ask A Doctor’ service.I ...,"It doesn't generally matter, and I'm not sure ...",1,1,True,True,4,40,0.880217,0.101449,0.054935,99e2f97d4e1e44bdba155c393c422e43995942290aac2f...,5ae5387b74d5d39364702c87dfff6d52121b249c4e7e6b...
8,DetectorDetectGPT,LIME_Explainer,Limit books are managed by exchanges. If an or...,Reuters has a service you can subscribe to tha...,1,1,True,True,191,249,0.94361,0.137931,0.072292,cb18fc73b58feeb14211103bba236c3a96f4bc881f323c...,e6b72b7969bbad53385251e1baf30d9f72a75067fc1707...
9,DetectorDetectGPT,LIME_Explainer,Multivariate statistics is a branch of statist...,"There are many potential causes of fever, shiv...",0,0,False,False,180,301,0.916336,0.108108,0.039615,4c1cbef644e5ad6a64c2209098c975c58e5f80d93e25f5...,0e65e1bbbf8133d14e940529817a55398d439910f1376e...


In [29]:
# df[["Documents Phases 1+3", "Documents Phases 2+4"]].stack()[df[["Documents Phases 1+3", "Documents Phases 2+4"]].stack().duplicated(keep=False)]

In [30]:
assert df.groupby(["Detector", "Explainer"])[["Documents Phases 1+3", "Documents Phases 2+4"]].apply(lambda group: len(group.stack()[group.stack().duplicated(keep=False)])).sum() == 0, "Duplicate documents!"

In [31]:
assert df.groupby(["Detector"])[["Documents Phases 1+3", "Documents Phases 2+4"]].apply(lambda group: len(group.stack()[group.stack().duplicated(keep=False)])).sum() == 0, "Duplicate documents!"

In [32]:
df.groupby(["Explainer"])[["Documents Phases 1+3", "Documents Phases 2+4"]].apply(lambda group: len(group.stack()[group.stack().duplicated(keep=False)]))

Explainer
Anchor_Explainer     9
LIME_Explainer       4
SHAP_Explainer      14
dtype: int64

In [33]:
# TODO remove from .gitignore after user study
# TODO change format to something else after user study 

In [34]:
import random

In [35]:
prediction_cache = {}
def prediction_cached(detector, document):
    id = (detector.__class__.__name__,document)
    if id not in prediction_cache:
        prediction_cache[id] = detector.predict_label([document])[0]
    return prediction_cache[id]


In [36]:
detector_detectgpt = DetectorDetectGPT()
detector_radford = DetectorRadford()
detector_guo = DetectorGuo()
def get_random_df(df, seed=42):
    selection = []
    random.seed(seed)
    for idx, row in tqdm(list(df.iterrows())):
        detector = None
        explainer = None
        if row["Detector"] == "DetectorDetectGPT":
            detector = detector_detectgpt
        if row["Detector"] == "DetectorRadford":
            detector = detector_radford
        if row["Detector"] == "DetectorGuo":
            detector = detector_guo

        if row["Explainer"]  == "Anchor_Explainer":
            explainer = Anchor_Explainer(detector)
        if row["Explainer"]  == "LIME_Explainer":
            explainer = LIME_Explainer(detector)
        if row["Explainer"]  == "SHAP_Explainer":
            explainer = SHAP_Explainer(detector)
        #               all documents not in exclude_list                                                                            without replacement
        candidates = [i for i in range(0,len(documents)) if i not in exclude_list and (len(selection) == 0 or document_ids[i] not in list(zip(*selection))[8] + list(zip(*selection))[9])]
        random.shuffle(candidates)
        pairs = None
        while True:
            pairs = [(candidates[0], candidates[1])]
            if prediction_cached(detector, documents[candidates[0]]) == row["f(a)"] and prediction_cached(detector, documents[candidates[0]]) == prediction_cached(detector, documents[candidates[1]]):
                break
            candidates = candidates[2:]
        

        selection = update_selection(selection, pairs, explainer, detector)
    return pd.DataFrame(selection, columns=columns)

Using cache dir ./.cache
Loading BASE model EleutherAI/pythia-70m...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MOVING BASE MODEL TO GPU...DONE (0.20s)
DONE (0.07s)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
N_RANDOM_SELECTIONS = 10

In [38]:
dfs_random = [get_random_df(df, seed=i) for i in range(0,N_RANDOM_SELECTIONS)]


100%|██████████| 54/54 [03:51<00:00,  4.28s/it]
100%|██████████| 54/54 [02:08<00:00,  2.37s/it]
100%|██████████| 54/54 [01:52<00:00,  2.09s/it]
100%|██████████| 54/54 [01:41<00:00,  1.89s/it]
100%|██████████| 54/54 [01:35<00:00,  1.77s/it]
100%|██████████| 54/54 [01:32<00:00,  1.72s/it]
100%|██████████| 54/54 [01:24<00:00,  1.57s/it]
100%|██████████| 54/54 [01:28<00:00,  1.63s/it]
100%|██████████| 54/54 [01:29<00:00,  1.66s/it]
100%|██████████| 54/54 [01:28<00:00,  1.65s/it]


## Functions that produce dataframes for export to latex

In [39]:
from scipy.stats.mstats import ttest_rel
from scipy.stats.mstats import ttest_ind
from scipy.stats.mstats import ttest_1samp

In [40]:
def get_metrics_FI(df, selecting_combinations_only=False):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford, DetectorDetectGPT]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer]:
            explainer = explainer_class(detector)
            for idx, row in tqdm(list(df.iterrows())):
            #    print(row)
                if selecting_combinations_only and row["Detector"] != detector.__class__.__name__:
                    continue
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
                W, _ = get_explanation_matrix_W([a,b], explainer, quiet=True)

                sim = cosine_similarity(W) 
                cosine_similarity_ = sim[0,1]

                n_tokens_overlap_in_w = np.all(W != 0, axis = 0).sum()# / np.any(W != 0, axis = 0).sum()
                

                results.append((
                    idx,
                    explainer.__class__.__name__,
                    detector.__class__.__name__,
                    cosine_similarity_,
                    n_tokens_overlap_in_w,

                 ))
    df_results = pd.DataFrame(results, columns=[
        "idx",
        "Explainer",
        "Set",
        "Cosine Similarity",
        "\\# Common Features",
        ])
    df_results = df_results.set_index(["Explainer", "Set"])
    return df_results

In [41]:
def get_metrics_Anchor(df, selecting_combinations_only=False):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford, DetectorDetectGPT]:
        detector = detector_class(metadata_only=True)

        explainer = Anchor_Explainer(detector)
        for idx, row in tqdm(list(df.iterrows())):
        #    print(row)
            if selecting_combinations_only and row["Detector"] != detector.__class__.__name__:
                continue
            a = row["Documents Phases 1+3"]
            b = row["Documents Phases 2+4"]
            anchors, p, ids = get_anchors_at_each_k([a,b], explainer, quiet=True)
            # find anchors that occur more than once in the dataset, then remove duplicates (created by looping) with set()
            duplicate_anchors = [set(anchor) for anchor in set([frozenset(anchor) for anchor in anchors if anchors.count(anchor) > 1])]
            results.append((
                idx,
                explainer.__class__.__name__,
                detector.__class__.__name__,
                len(duplicate_anchors),
                max([len(anchor) for anchor in duplicate_anchors]) if len(duplicate_anchors) else 0,
                p[anchors.index(max(duplicate_anchors, key=lambda anchor: len(anchor)))] if len(duplicate_anchors) else 0
                ))
    df_results = pd.DataFrame(results, columns=[
        "idx",
        "Explainer",
        "Set",
        "\\# Matching Anchors",
        "Len Longest Matching Anchor",
        "$\\theta$ Longest Matching Anchor",
        ])
    df_results = df_results.set_index(["Explainer", "Set"])
    return df_results

In [42]:
def get_metrics_Document(df, selecting_combinations_only=False):
    results = []
    for detector_class in [DetectorGuo, DetectorRadford, DetectorDetectGPT]:
        detector = detector_class(metadata_only=True)
        for explainer_class in [LIME_Explainer,SHAP_Explainer, Anchor_Explainer]:
            explainer = explainer_class(detector)
            for idx, row in tqdm(list(df.iterrows())):
            #    print(row)
                if selecting_combinations_only and row["Detector"] != detector.__class__.__name__:
                    continue
                a = row["Documents Phases 1+3"]
                b = row["Documents Phases 2+4"]
            
                results.append((
                    idx,
                    explainer.__class__.__name__,
                    detector.__class__.__name__,
                        row["Spacy Similarity"],
                        row["Jaccard Similarity"],
                        row["Cosine Similarity tfidf"]))
    df_results = pd.DataFrame(results, columns=[
        "idx",
        "Explainer",
        "Set",
        "Spacy Similarity",
        "Jaccard Similarity",
        "Cosine Similarity tfidf",
        ])
    df_results = df_results.set_index(["Explainer", "Set"])
    return df_results

## Export to latex

In [43]:
latex_strings = []

In [44]:
columns=["Metric", "tstatistic", "pvalue","Mean of Method", "Mean of {} Rand. Selections".format(N_RANDOM_SELECTIONS), "Gain Over Random" ]

In [45]:
# get and aggregate results by detector
def get_results_detector_level(m_method, m_random):
    t = []
    for metric in m_method.columns:
        for (detector, group_method), ((detector_r), group_random) in zip(m_method.groupby(["Set"]), m_random.groupby(["Set"])):
            assert detector == detector_r
            tstatistic, pvalue = ttest_ind(group_method[metric], group_random[metric])

            t.append([detector[0], metric[0], tstatistic, pvalue, group_method[metric].mean(), group_random[metric].mean(), group_method[metric].mean() - group_random[metric].mean()])
    df_results_detector_level = pd.DataFrame(t, columns=["Set"]+columns).set_index(["Metric", "Set"])#.apply(get_p_asterisks_2samp).drop(["pvalue","tstatistic"], axis=1)
    return df_results_detector_level.reset_index().set_index(["Metric", "Set"])

In [46]:
# get results for entire selection
def get_results_entire_selection(m_method, m_random):
        t = []
        for metric in m_method.columns:
                tstatistic, pvalue = ttest_ind(m_method[metric], m_random[metric])
                t.append([metric[0],  tstatistic, pvalue, m_method[metric].mean(), m_random[metric].mean(), m_method[metric].mean() - m_random[metric].mean()])
        df_results_selection_level = pd.DataFrame(t, columns=columns)#.apply(get_p_asterisks_2samp).drop(["pvalue","tstatistic"], axis=1)
        # add additional descriptions
        df_results_selection_level["Set"] = "All"
        return df_results_selection_level.reset_index().set_index(["Metric", "Set"])

In [47]:
def highlight_significant(row, props=''):
  #  display(s)
    styles = [''] * len(row)
    styles[-1] = 'font-weight: bold' if row["pvalue"] <= 0.05 else ''
    return styles
def shade_by_type(row, props=''):

  if(row.name[0] == "Explanation Similarity in W"):
    return ['background-color:red'] * len(row)
  else:
    return [''] * len(row)
def shade_by_type_index(row, props=''):
  return ['background-color:red'] * 8 + [''] * 20


In [48]:
m_method = get_metrics_Document(df, selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean"]) # note that nothing is aggregated here, this is just to match the shape of the next line:
m_random = get_metrics_Document(pd.concat(dfs_random), selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean", "std"]) # take mean score across random runs for each metric

df_similarity_document = pd.concat([get_results_entire_selection(m_method, m_random),get_results_detector_level(m_method, m_random)]).sort_index(ascending=False).style.apply(highlight_significant, axis=1).hide(["tstatistic", "pvalue","index"], axis=1)\
    .map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2)  
display(df_similarity_document)
latex = df_similarity_document.to_latex(environment="longtable", convert_css=True, clines="all;data", hrules=True, caption="Similarity of documents (p < 0.05 bold)", label="similarityDocuments")
latex_strings.append(latex)

100%|██████████| 54/54 [00:00<00:00, 53991.04it/s]
100%|██████████| 54/54 [00:00<?, ?it/s]
100%|██████████| 54/54 [00:00<00:00, 53965.31it/s]
100%|██████████| 54/54 [00:00<?, ?it/s]
100%|██████████| 54/54 [00:00<00:00, 53658.47it/s]
100%|██████████| 54/54 [00:00<00:00, 53798.67it/s]
100%|██████████| 54/54 [00:00<00:00, 54107.12it/s]
100%|██████████| 54/54 [00:00<00:00, 27095.64it/s]
100%|██████████| 54/54 [00:00<00:00, 54223.71it/s]
100%|██████████| 540/540 [00:00<00:00, 119792.89it/s]
100%|██████████| 540/540 [00:00<00:00, 97752.45it/s]
100%|██████████| 540/540 [00:00<00:00, 82676.55it/s]
100%|██████████| 540/540 [00:00<00:00, 97853.80it/s]
100%|██████████| 540/540 [00:00<00:00, 108090.30it/s]
100%|██████████| 540/540 [00:00<00:00, 119875.31it/s]
100%|██████████| 540/540 [00:00<00:00, 107992.38it/s]
100%|██████████| 540/540 [00:00<00:00, 108162.57it/s]
100%|██████████| 540/540 [00:00<00:00, 179485.23it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Mean of Method,Mean of 10 Rand. Selections,Gain Over Random
Metric,Set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Spacy Similarity,DetectorRadford,0.93,0.88,0.05
Spacy Similarity,DetectorGuo,0.92,0.88,0.04
Spacy Similarity,DetectorDetectGPT,0.92,0.88,0.04
Spacy Similarity,All,0.92,0.88,0.04
Jaccard Similarity,DetectorRadford,0.16,0.12,0.03
Jaccard Similarity,DetectorGuo,0.15,0.12,0.03
Jaccard Similarity,DetectorDetectGPT,0.16,0.12,0.04
Jaccard Similarity,All,0.16,0.12,0.03
Cosine Similarity tfidf,DetectorRadford,0.12,0.09,0.03
Cosine Similarity tfidf,DetectorGuo,0.13,0.08,0.04


In [49]:
m_method = get_metrics_FI(df, selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean"]) # note that nothing is aggregated here, this is just to match the shape of the next line:
m_random = get_metrics_FI(pd.concat(dfs_random), selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean", "std"]) # take mean score across random runs for each metric

df_similarity_fi = pd.concat([get_results_entire_selection(m_method, m_random),get_results_detector_level(m_method, m_random)]).sort_index(ascending=False).style.apply(highlight_significant, axis=1).hide(["tstatistic", "pvalue","index"], axis=1)\
.map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2)
display(df_similarity_fi)
latex = df_similarity_fi.to_latex(environment="longtable", convert_css=True, clines="all;data", hrules=True, caption="Similarity of FI explanations. Cosine similarity in $W$ is significantly higher then when using random pairs (p < 0.05 bold)", label="similarityFI")
latex_strings.append(latex)


100%|██████████| 54/54 [00:00<00:00, 64.74it/s] 
100%|██████████| 54/54 [00:05<00:00,  9.45it/s] 
100%|██████████| 54/54 [00:00<00:00, 62.06it/s] 
100%|██████████| 54/54 [00:06<00:00,  8.57it/s]
100%|██████████| 54/54 [00:00<00:00, 69.71it/s]
100%|██████████| 54/54 [00:05<00:00,  9.75it/s]
100%|██████████| 540/540 [00:06<00:00, 86.37it/s] 
100%|██████████| 540/540 [00:49<00:00, 10.88it/s]
100%|██████████| 540/540 [00:06<00:00, 80.70it/s] 
100%|██████████| 540/540 [00:52<00:00, 10.24it/s]
100%|██████████| 540/540 [00:06<00:00, 86.17it/s] 
100%|██████████| 540/540 [00:53<00:00, 10.14it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Mean of Method,Mean of 10 Rand. Selections,Gain Over Random
Metric,Set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
\# Common Features,DetectorRadford,12.22,9.22,3.0
\# Common Features,DetectorGuo,10.61,8.7,1.91
\# Common Features,DetectorDetectGPT,5.0,3.75,1.25
\# Common Features,All,9.28,7.22,2.05
Cosine Similarity,DetectorRadford,0.33,0.18,0.14
Cosine Similarity,DetectorGuo,0.38,0.22,0.16
Cosine Similarity,DetectorDetectGPT,0.25,0.1,0.14
Cosine Similarity,All,0.32,0.17,0.15


In [50]:
m_method = get_metrics_Anchor(df, selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean"]) # note that nothing is aggregated here, this is just to match the shape of the next line:
m_random = get_metrics_Anchor(pd.concat(dfs_random), selecting_combinations_only=True).groupby(["Set", "Explainer", "idx"]).agg(["mean", "std"]) # take mean score across random runs for each metric

df_similarity_anchors = pd.concat([get_results_entire_selection(m_method, m_random),get_results_detector_level(m_method, m_random)]).sort_index(ascending=False).style.apply(highlight_significant, axis=1).hide(["tstatistic", "pvalue","index"], axis=1)\
.map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2)
display(df_similarity_anchors)
latex = df_similarity_anchors.to_latex(environment="longtable", convert_css=True, clines="all;data", hrules=True, caption="Similarity of Anchor explanations (p < 0.05 bold)", label="similarityAnchors")
latex_strings.append(latex)

100%|██████████| 54/54 [00:00<00:00, 588.66it/s]
100%|██████████| 54/54 [00:00<00:00, 551.75it/s]
100%|██████████| 54/54 [00:00<00:00, 1110.60it/s]
100%|██████████| 540/540 [00:00<00:00, 772.62it/s]
100%|██████████| 540/540 [00:00<00:00, 788.71it/s]
100%|██████████| 540/540 [00:00<00:00, 1255.98it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Mean of Method,Mean of 10 Rand. Selections,Gain Over Random
Metric,Set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
\# Matching Anchors,DetectorRadford,0.39,0.1,0.29
\# Matching Anchors,DetectorGuo,0.94,0.47,0.48
\# Matching Anchors,DetectorDetectGPT,0.39,0.08,0.31
\# Matching Anchors,All,0.57,0.21,0.36
Len Longest Matching Anchor,DetectorRadford,0.39,0.1,0.29
Len Longest Matching Anchor,DetectorGuo,0.94,0.47,0.48
Len Longest Matching Anchor,DetectorDetectGPT,0.39,0.08,0.31
Len Longest Matching Anchor,All,0.57,0.21,0.36
$\theta$ Longest Matching Anchor,DetectorRadford,0.14,0.06,0.08
$\theta$ Longest Matching Anchor,DetectorGuo,0.52,0.33,0.19


In [51]:
for s in latex_strings:
    print(s)

\begin{longtable}{llrrr}
\caption{Similarity of documents (p < 0.05 bold)} \label{similarityDocuments} \\
\toprule
 &  & \rotatebox{45}{Mean of Method} & \rotatebox{45}{Mean of 10 Rand. Selections} & \rotatebox{45}{Gain Over Random} \\
Metric & Set &  &  &  \\
\midrule
\endfirsthead
\caption[]{Similarity of documents (p < 0.05 bold)} \\
\toprule
 &  & \rotatebox{45}{Mean of Method} & \rotatebox{45}{Mean of 10 Rand. Selections} & \rotatebox{45}{Gain Over Random} \\
Metric & Set &  &  &  \\
\midrule
\endhead
\midrule
\multicolumn{5}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
\multirow[c]{4}{*}{Spacy Similarity} & DetectorRadford & 0.93 & 0.88 & \bfseries 0.05 \\
\cline{2-5}
 & DetectorGuo & 0.92 & 0.88 & \bfseries 0.04 \\
\cline{2-5}
 & DetectorDetectGPT & 0.92 & 0.88 & \bfseries 0.04 \\
\cline{2-5}
 & All & 0.92 & 0.88 & \bfseries 0.04 \\
\cline{1-5} \cline{2-5}
\multirow[c]{4}{*}{Jaccard Similarity} & DetectorRadford & 0.16 & 0.12 & \bfseries 0.03 \\
\clin

In [77]:
columns = [c.replace("_Explainer", "") for c in df.groupby(["Detector","Explainer"]).count().index.get_level_values(1).unique()]
index = [i.replace("Detector","") for i in df.groupby(["Detector","Explainer"]).count().index.get_level_values(0).unique()]
r = []
u = 1
for detector_name in index:
    row = []
    for explainer_name in columns:
        users = []
        for i in range(0,3):
            users.append("U"+str(u))
            u+=1
        row.append(users)
        
    r.append(row)
print(pd.DataFrame(r, columns = columns, index = index ).to_latex(caption="Assignment",label="assignmentusers"))


\begin{table}
\caption{Assignment}
\label{assignmentusers}
\begin{tabular}{llll}
\toprule
 & Anchor & LIME & SHAP \\
\midrule
DetectGPT & ['U1', 'U2', 'U3'] & ['U4', 'U5', 'U6'] & ['U7', 'U8', 'U9'] \\
Guo & ['U10', 'U11', 'U12'] & ['U13', 'U14', 'U15'] & ['U16', 'U17', 'U18'] \\
Radford & ['U19', 'U20', 'U21'] & ['U22', 'U23', 'U24'] & ['U25', 'U26', 'U27'] \\
\bottomrule
\end{tabular}
\end{table}

