This notebook creates new train and test splits from Guo et al.'s Hugging Face dataset.
Their CSV file on Google Drive appears to mix up questions and answers (see [this GitHub issue](https://github.com/Hello-SimpleAI/chatgpt-comparison-detection/issues/30)).
While this is not directly relevant here (the questions won't be used), it cannot easily be verified whether other issues were introduced when those splits were generated.

Also, human answers in reddit_eli5 and open_qa appear to have artifacts in the form of spaces added before/after punctuation. For open_qa, these artifacts are already part of the underlying WikiQACorpus, e.g. whenever a word is a link. This effectively watermarks the human-written text!

Those two datasets are therefore excluded.



In [None]:
# Word-count window used to filter documents by length, as human responses
# tend to be shorter in this dataset on average.
MIN_WORDS, MAX_WORDS = 50, 150

In [None]:
import os
import re

import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Fetch the "all" configuration of the HC3 dataset from the Hugging Face hub.
h3_dataset_hf_raw = load_dataset(
    "Hello-SimpleAI/HC3",
    name="all",
)

In [None]:
# Flatten the raw "train" split into long format: one row per (question, author, answer).
h3_dataset_hf = pd.DataFrame(h3_dataset_hf_raw["train"], columns=list(h3_dataset_hf_raw["train"].info.features.keys()))

# Melt first, explode second. Exploding both list columns before melting builds the
# cross product of human and ChatGPT answers per question, which repeats every answer
# once per answer on the other side and inflates all intermediate counts.
h3_dataset_hf = pd.melt(
    h3_dataset_hf,
    id_vars=["question", "source", "id"],
    value_vars=["human_answers", "chatgpt_answers"],
    value_name="answer",
    var_name="author",
)
# `explode` repeats the index of the originating row; reset it so row labels stay unique.
h3_dataset_hf = h3_dataset_hf.explode("answer").reset_index(drop=True)
h3_dataset_hf["id"] = h3_dataset_hf["id"].astype(int)
# An empty answer list explodes to a single NaN row; drop those.
h3_dataset_hf = h3_dataset_hf.dropna(subset=["answer"])

# Note: the original CSVs contain no newlines, so question/answer would need "\n" removed
# before comparing against them. This is not done here.

# open_qa: the original human dataset has artifacts from hyperlinks, effectively watermarking human text
h3_dataset_hf = h3_dataset_hf[~(h3_dataset_hf["source"].str.contains("open_qa"))]
# reddit_eli5: the human dataset has artifacts
h3_dataset_hf = h3_dataset_hf[~(h3_dataset_hf["source"].str.contains("reddit_eli5"))]
h3_dataset_hf

In [None]:
# Keep only answers whose whitespace-separated word count lies within
# [MIN_WORDS, MAX_WORDS] (both bounds inclusive), as human responses tend to be
# shorter in this dataset on average.
word_counts = h3_dataset_hf["answer"].str.split().str.len()
doc_within_range = word_counts.between(MIN_WORDS, MAX_WORDS)
df_min_max_len = h3_dataset_hf[doc_within_range]
df_min_max_len

In [None]:
# Filter out responses that contain some of the "indicating words" provided by Guo et al.
# Note that only some are used: they also remove certain stock phrases like
# "There are several ways" for their filtered version.

indicating_words_chatgpt_en = [
    "AI assistant",
    "AI language model",
    "I'm sorry", # e.g. ... but I am not a medical doctor; but I am an AI language; to hear about your husband's symptoms 
    "It is difficult for me",
    "Contents may violate our content",
    "This content may violate our content policy",
    "Can you please provide the statement",
    "If you have any more questions, please don't hesitate to ask.",
    "If you have any questions about",
    "Let me know if you have any other questions",
    "If you have any more questions, feel free to ask!",
    "!\rnetwork error\r\r\r\r", # new
    "Free Research Preview.", # new
    "Your feedback will help us improve.", # new
    ]

# The phrases are literal text, not patterns. Escape each one so regex metacharacters
# (e.g. the "." in "Free Research Preview.") only match themselves instead of any character.
indicating_words_pattern = "|".join(re.escape(phrase) for phrase in indicating_words_chatgpt_en)
remove = df_min_max_len["answer"].str.contains(indicating_words_pattern, regex=True)

print("Removing {} documents, specifically:".format(len(df_min_max_len[remove])))
display(df_min_max_len[remove]["author"].value_counts())

df = df_min_max_len[~remove]

In [None]:
# De-duplicate in two passes and report how many rows were removed overall.
len_before = len(df)
df = (
    df
    # pass 1: identical (question, answer, author) rows
    .drop_duplicates(subset=["question", "answer", "author"])
    # pass 2: same answer by the same author, regardless of the question
    # (there are duplicated human answers, not chat)
    .drop_duplicates(subset=["answer", "author"])
)
print("Dropped {} duplicates (was {})".format(len_before - len(df), len_before))


In [None]:
# Show rows whose answer text already occurred in an earlier row (any author, any
# question), sorted by the answer text.
repeated_answer = df.duplicated(subset=["answer"])
df.loc[repeated_answer].sort_values(by="answer")

In [None]:
def get_equal_numbers_of_answers(group: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Down-sample one group so it holds equally many human and ChatGPT answers.

    Parameters
    ----------
    group : pd.DataFrame
        Rows with an ``author`` column. Only rows whose author is
        ``"human_answers"`` or ``"chatgpt_answers"`` are considered.
    random_state : int, default 42
        Seed passed to both ``DataFrame.sample`` calls, so repeated calls on
        the same group return the same rows.

    Returns
    -------
    pd.DataFrame
        ``n`` sampled human rows followed by ``n`` sampled ChatGPT rows, where
        ``n`` is the smaller of the two counts. The frame is empty when either
        author is missing from the group.
    """
    human_answers = group[group["author"] == "human_answers"]
    chatgpt_answers = group[group["author"] == "chatgpt_answers"]
    # The scarcer author decides how many rows are kept from each side.
    n = min(len(human_answers), len(chatgpt_answers))
    return pd.concat([
        human_answers.sample(n, random_state=random_state),
        chatgpt_answers.sample(n, random_state=random_state),
    ])
    

In [None]:
# Want a balanced dataset: sample pairs of answers to the same question.
# The groups are iterated explicitly instead of going through `groupby(...).apply(...)`:
# recent pandas versions deprecate passing the grouping column to the applied function,
# and the "question" column has to survive in the result. Groups are still visited in
# sorted key order.
balanced_groups = [
    get_equal_numbers_of_answers(question_group)
    for _question, question_group in df.groupby("question")
]
# `pd.concat` rejects an empty list, so fall back to an empty frame with the same columns.
df = pd.concat(balanced_groups) if balanced_groups else df.iloc[0:0]

In [None]:
# Sanity check: after sampling, both authors must occur equally often.
n_human = len(df[df["author"] == "human_answers"])
n_chat = len(df[df["author"] == "chatgpt_answers"])
print("{} human answers, {} chat".format(n_human, n_chat))
assert n_human == n_chat, "Sampling not balanced"

In [None]:
# Replace the index left behind by the sampling step with a fresh 0..n-1 range
# (the old index is discarded, not kept as a column), then display the frame.
df = df.reset_index(drop=True)
df

In [None]:
# 70/30 train/test split with a fixed seed, stratified on the author column.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df["author"],
)

In [None]:
# Report the number of rows in each split.
for split_name, split_frame in (("train", train), ("test", test)):
    print(split_name, len(split_frame))

In [None]:
# Persist the training split as a pickle and as a CSV.
# `to_csv` does not create missing directories, so make sure ./dataset exists first;
# otherwise a fresh checkout fails here.
os.makedirs("./dataset", exist_ok=True)
train.to_pickle("./dataset_train.pkl")
train.to_csv("./dataset/train.csv", index=False, encoding="utf8")

In [None]:
# Persist the test split as a pickle and as a CSV.
# `to_csv` does not create missing directories, so make sure ./dataset exists first;
# otherwise a fresh checkout fails here.
os.makedirs("./dataset", exist_ok=True)
test.to_pickle("./dataset_test.pkl")
test.to_csv("./dataset/test.csv", index=False, encoding="utf8")

In [None]:
# The experiments were run on the .pkl files; the .csv files are provided for convenience.
# Check that each pickle and its CSV counterpart hold the same data (dtypes may differ).
from pandas.testing import assert_frame_equal

for pickle_path, csv_path in (
    ("./dataset_train.pkl", "./dataset/train.csv"),
    ("./dataset_test.pkl", "./dataset/test.csv"),
):
    assert_frame_equal(
        pd.read_pickle(pickle_path).reset_index(drop=True),
        pd.read_csv(csv_path).reset_index(drop=True),
        check_dtype=False,
    )

