In [None]:
# Directory holding one pickle per benchmarked detector; run() writes
# (predictions, ms-per-document) tuples here and the evaluation cell reads them back.
CACHE_DIR = "./results/benchmark/"

In [None]:
import pandas as pd
import os
import pickle

from detector_radford import DetectorRadford
from detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
# Detector classes available for benchmarking. In the visible notebook this list is
# only referenced by the (currently commented-out) batch-run cell further down.
detector_classes = [DetectorGuo, DetectorRadford, DetectorDetectGPT]

# NOTE(review): `results` is never read by any visible cell (the evaluation cell
# fills results_test / results_full instead) — looks like a leftover; confirm before removing.
results = []


In [None]:
# Load the two pickled dataset splits. Later cells read their "answer" column
# (document text) and "author" column (compared against "human_answers").
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
test = pd.read_pickle("./dataset_test.pkl")
train = pd.read_pickle("./dataset_train.pkl")

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import time

In [None]:
# Create the cache directory (including missing parents). exist_ok=True makes the
# call idempotent and removes the check-then-create race of exists() + makedirs().
os.makedirs(CACHE_DIR, exist_ok=True)

In [None]:
# Total number of rows across both splits (shown as the cell's output).
len(train)+len(test)

In [None]:
# Build the benchmark inputs with the test split first and the train split second;
# the evaluation cell relies on this order when it slices off the first len(test) rows.
documents = pd.concat([split["answer"] for split in (test, train)])
# Gold label is True where the author column equals "human_answers", False otherwise.
gold_labels = pd.concat([split["author"] == "human_answers" for split in (test, train)])

In [None]:
# NOTE(review): `results` is re-initialised here but not read by any visible cell.
results = []
# Column order of every result row built in the evaluation cell;
# "Detector" is later turned into the DataFrame index.
columns = ["Detector", "Acc", "F1", "ROC AUC", "TN", "FP", "FN", "TP", "ms/evaluation"]

In [None]:
def run(label, detector):
    """Benchmark `detector` on the global `documents` and cache the outcome.

    The result is pickled to ``CACHE_DIR/<label>`` as a 2-tuple
    ``(predictions, ms_per_document)``, where ``ms_per_document`` is the
    wall-clock time of the ``predict_label`` call divided by the number of
    documents, floored to whole milliseconds. If a file with that name already
    exists, nothing is computed and nothing is written.

    Args:
        label: File name (inside CACHE_DIR) used as the cache key.
        detector: Object exposing ``predict_label(documents)``.

    Returns:
        None.
    """
    cache_path = os.path.join(CACHE_DIR, label)
    if not os.path.isfile(cache_path):
        started_ns = time.time_ns()
        predictions = detector.predict_label(documents)  # seed is set in detectors by default
        elapsed_ns = time.time_ns() - started_ns
        # nanoseconds per document, floored to whole milliseconds
        ms_per_document = (elapsed_ns / len(documents)) // 1000000
        with open(cache_path, 'wb') as f:
            pickle.dump((predictions, ms_per_document), f)


In [None]:
# import transformers

# detectGPT_default = DetectorDetectGPT()
# detectGPT_default.n_perturbations = 100

# detectGPT_default.base_model_name = "gpt2-xl"
# base_model, base_tokenizer = detectGPT_default.load_base_model_and_tokenizer(detectGPT_default.base_model_name)
# detectGPT_default.base_model = base_model
# detectGPT_default.base_tokenizer = base_tokenizer

# mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(detectGPT_default.mask_filling_model_name, cache_dir=detectGPT_default.cache_dir)
# detectGPT_default.mask_model = mask_model

# mask_tokenizer = transformers.AutoTokenizer.from_pretrained(detectGPT_default.mask_filling_model_name, model_max_length=mask_model.config.n_positions, cache_dir=detectGPT_default.cache_dir)
# detectGPT_default.mask_tokenizer = mask_tokenizer

# detectGPT_default.load_base_model()
# detectGPT_default.load_mask_model()

# run(DetectorDetectGPT.__name__+" @100 GPT-2", detectGPT_default)

In [None]:
# detectGPT_pythia_100 = DetectorDetectGPT()
# detectGPT_pythia_100.n_perturbations = 100

# run(DetectorDetectGPT.__name__ +" @100", detectGPT_pythia_100)

In [None]:
# for detector_class in detector_classes:
#     run(detector_class.__name__, detector_class())

In [None]:
def _metrics_row(label, y_true, y_pred, ms_per_eval):
    """Return one result row in the order given by `columns`.

    Args:
        label: Detector label (the cache file name).
        y_true: Gold labels.
        y_pred: Cached detector predictions aligned with `y_true`.
        ms_per_eval: Cached per-document runtime in milliseconds.

    Returns:
        Tuple (label, accuracy, F1, ROC AUC, TN, FP, FN, TP, ms_per_eval).
    """
    return (
        label,
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        # NOTE(review): the cache holds the output of predict_label(), so this AUC is
        # presumably computed from hard labels rather than scores — confirm.
        roc_auc_score(y_true, y_pred),
        *confusion_matrix(y_true, y_pred).ravel(),  # TN, FP, FN, TP
        ms_per_eval,
    )


# The first len(test) rows of documents / gold_labels are the test split.
n_test = len(test)
results_test = []
results_full = []
# os.listdir returns entries in arbitrary order; sort so the tables are reproducible.
for label in sorted(os.listdir(CACHE_DIR)):
    cache_path = os.path.join(CACHE_DIR, label)
    if not os.path.isfile(cache_path):
        continue  # skip stray sub-directories instead of crashing in open()
    with open(cache_path, 'rb') as f:
        # Deliberately NOT named `time`: the original rebound the imported `time`
        # module to a number here, which broke any later call to run().
        # NOTE: pickle.load can execute arbitrary code — only read trusted cache files.
        predictions, ms_per_eval = pickle.load(f)
    results_test.append(
        _metrics_row(label, gold_labels.iloc[:n_test], predictions[:n_test], ms_per_eval)
    )
    results_full.append(_metrics_row(label, gold_labels, predictions, ms_per_eval))
df_test = pd.DataFrame(results_test, columns=columns).set_index("Detector")
df_full = pd.DataFrame(results_full, columns=columns).set_index("Detector")

display(df_test)
display(df_full)

In [None]:
# Export the test-split table as a LaTeX table environment.
# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs("figures", exist_ok=True)
with open("figures/benchmark_test.tex", "w", encoding="UTF-8") as text_file:
    text_file.write(df_test.style.format(precision=3).to_latex(environment="table", 
                                        convert_css=True, 
                                        clines="all;data", 
                                        hrules=True, 
                                        # caption typo fixed: "where generated" -> "were generated"
                                        caption="Performance on the dataset explanations were generated for (balanced, n={})".format(len(test)), 
                                        label="table-benchmark_test"))

In [None]:
# Export the full-dataset table as a LaTeX table environment.
# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs("figures", exist_ok=True)
with open("figures/benchmark_full.tex", "w", encoding="UTF-8") as text_file:
    # format(precision=3) matches the test-split table so both tables round floats alike.
    text_file.write(df_full.style.format(precision=3).to_latex(environment="table", 
                                        convert_css=True, 
                                        clines="all;data", 
                                        hrules=True, 
                                        caption="Performance on the full dataset (balanced, n={})".format(len(train)+len(test)), 
                                        label="table-benchmark_full"))