# Identify the samples that need to be recleaned

General info: this script was executed twice, one time for the polishing prompts and one time for all the others, as polishing had some unique contamination features.

The script __TODO__ has to run before to have the trained models inplace.

# 0. Setup

In [5]:
import os
import sys
import pandas as pd
import pickle
from transformers import GPT2TokenizerFast
import nltk

# Ensure punkt is downloaded for sentence tokenization
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

BASE_DIR = "../../"
sys.path.append(BASE_DIR)

from src.general_functions_and_patterns_for_detection import (
    analyze_df_for_specific_hints_of_llms,
    TrainRobertaHelper, TrainingDataHandler,
    RESULT_DIR, REGEX_CLEANED_FILES, ORIGINAL_DATA_DIR, DOMAINS
)

prepare_df_for_roberta_training = TrainRobertaHelper.prepare_df_for_roberta_training

DEBUG = True
DRY_RUN = False
ALL_DATA = True
SEED = 2023

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pdingfelder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# TODO: adjust CUDA setup depending on your setup
# Disable NCCL features incompatible with RTX 40xx
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

# Restrict to only GPU 0 (CUDA:0)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 1. Load trained samples to get those with low prediction

## 1.1 get the predictions

__for the transfer dataset__: uncleaned data, to see if low high probability in llm detection show contaminations

In [9]:
PROMPTS = ["paraphrase_polish_human", "paraphrase_polish_llm"]
# PROMPTS = ["direct_prompt", "prompt_few_shot", "prompt_SICO"]
_prompt_str = "_" + "-".join(PROMPTS)

df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(REGEX_CLEANED_FILES, _suffix_path="_cleaned_all_v3.parquet", prompts=PROMPTS, paraphrase_polish_human_as_ai=False)

df_claude_nc, df_llama_nc, df_palm_nc, df_chatgpt_nc = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(ORIGINAL_DATA_DIR, _suffix_path=".json", prompts=PROMPTS, paraphrase_polish_human_as_ai=False)

# train using the cleaned domains and training structures
train_df_claude, test_df_claude, adjusted_df_claude, sample_ids_claude = \
    TrainingDataHandler.split_training_data_frame_and_adjust_transfer_test_df(df_claude_cleaned, df_claude_nc)

_model_path_uncleaned = f"{RESULT_DIR}T01claude_direct_prompt_all_domains_multi_llm_not_cleaned{_prompt_str}"
_dfs_nc = [adjusted_df_claude, df_llama_nc, df_palm_nc, df_chatgpt_nc]

In [10]:
_model_path_uncleaned

'/mnt/hdd-baracuda/pdingfelder/mt_philipp_dingfelder_generated_text_detection/src/../results/T01claude_direct_prompt_all_domains_multi_llm_not_cleaned_paraphrase_polish_human-paraphrase_polish_llm'

In [11]:
# use full claude dataset and not only test dataset as a subset
print(_model_path_uncleaned)
results_nc = TrainRobertaHelper.get_predictions_for_dataframes([df_claude_nc, df_llama_nc, df_palm_nc, df_chatgpt_nc], model_path=_model_path_uncleaned)
results_nc.info(), results_nc.shape, results_nc.drop_duplicates().shape

/mnt/hdd-baracuda/pdingfelder/mt_philipp_dingfelder_generated_text_detection/src/../results/T01claude_direct_prompt_all_domains_multi_llm_not_cleaned_paraphrase_polish_human-paraphrase_polish_llm


  return forward_call(*args, **kwargs)
100%|██████████| 11200/11200 [01:28<00:00, 127.18it/s]


<class 'pandas.core.frame.DataFrame'>
Index: 44800 entries, 0 to 11199
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      44800 non-null  int64  
 1   context                 44800 non-null  object 
 2   llm_type                44800 non-null  object 
 3   text                    44800 non-null  object 
 4   domain                  44800 non-null  object 
 5   label                   44800 non-null  object 
 6   llm_prompting_strategy  44800 non-null  object 
 7   prediction              44800 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 3.1+ MB


(None, (44800, 8), (44800, 8))

In [None]:
results_nc.drop_duplicates().shape

In [None]:
results_nc = results_nc.sort_values(by="prediction")
results_nc

In [None]:
results_nc_wo_claude = results_nc[results_nc["llm_type"]!="Claude-instant"]
results_nc_wo_claude

## 1.2 load the cleaned data

1. analyze for specific hints (some might still be present, as the applied regular expression for cleansing are different based on domain and used prompt, but only quite few compared to before shall be present)
2. predict using an uncleaned model and the cleaned data

In [23]:
df_claude_cleaned_v3, df_llama_cleaned_v3, df_palm_cleaned_v3, df_chatgpt_cleaned_v3 = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(REGEX_CLEANED_FILES, _suffix_path="_cleaned_all_v3.parquet",
                                         prompts=["paraphrase_polish_llm", "paraphrase_polish_human"]
                                         )

In [24]:
analyze_df_for_specific_hints_of_llms(df_palm_cleaned_v3, column_generated_text="text", print_results=False, print_summary_by_llm=False)
analyze_df_for_specific_hints_of_llms(df_claude_cleaned_v3, column_generated_text="text", print_results=False, print_summary_by_llm=False)
analyze_df_for_specific_hints_of_llms(df_llama_cleaned_v3, column_generated_text="text", print_results=False, print_summary_by_llm=False)
_, _ = analyze_df_for_specific_hints_of_llms(df_chatgpt_cleaned_v3, column_generated_text="text", print_results=False, print_summary_by_llm=False)

Entries with typical LLM Patterns:  193
Entries without typical LLM Patterns:  10454
Entries with typical LLM Patterns:  233
Entries without typical LLM Patterns:  10848
Entries with typical LLM Patterns:  376
Entries without typical LLM Patterns:  10810
Entries with typical LLM Patterns:  419
Entries without typical LLM Patterns:  10778


In [25]:
df_palm_cleaned_v3.tail(50)

Unnamed: 0,id,context,llm_type,text,domain,label,llm_prompting_strategy
10597,2745,"It's a basic, few frills theater.",Google-PaLM,"Their screens are on the small side, but I exp...",yelp_review,llm,paraphrase_polish_human
10598,2746,If you were to ask me about my in store purcha...,Google-PaLM,"However, that all changed after my purchase. T...",yelp_review,llm,paraphrase_polish_human
10599,2747,I would rate 4.,Google-PaLM,"Five stars, but only because we’re relatively ...",yelp_review,llm,paraphrase_polish_human
10600,2748,[This is part of a set of reviews covering est...,Google-PaLM,"Finally, I am getting to the last few reviews ...",yelp_review,llm,paraphrase_polish_human
10601,2749,This place is what I expect from a Marriott.,Google-PaLM,"The staff is friendly, the rooms are modern, a...",yelp_review,llm,paraphrase_polish_human
10602,2750,I went to the main location to adopt a dog for...,Google-PaLM,"I had to seek assistance, but stood there wait...",yelp_review,llm,paraphrase_polish_human
10603,2751,This is the McDonalds of Sushi.,Google-PaLM,**Polished Review:**You know you love those Mc...,yelp_review,llm,paraphrase_polish_human
10604,2752,Wow I am a sushi expert and this was VERY FRES...,Google-PaLM,The smoked or raw salmon was a nice choice. It...,yelp_review,llm,paraphrase_polish_human
10605,2753,When I thought about a review I would write fo...,Google-PaLM,I have had better.This restaurant reminded me ...,yelp_review,llm,paraphrase_polish_human
10606,2754,There's a rule in my family.,Google-PaLM,"When purchasing a home, ensure it is within wa...",yelp_review,llm,paraphrase_polish_human


In [26]:
_dfs_cl = [df_claude_cleaned_v3, df_llama_cleaned_v3, df_palm_cleaned_v3, df_chatgpt_cleaned_v3]
results_df_claude = TrainRobertaHelper.get_predictions_for_dataframes(_dfs_cl, model_path=_model_path_uncleaned)
results_df_chatgpt = TrainRobertaHelper.get_predictions_for_dataframes(_dfs_cl, model_path=_model_path_uncleaned.replace("claude", "chatgpt"))
results_df_llama = TrainRobertaHelper.get_predictions_for_dataframes(_dfs_cl, model_path=_model_path_uncleaned.replace("claude", "llama"))
results_df_palm = TrainRobertaHelper.get_predictions_for_dataframes(_dfs_cl, model_path=_model_path_uncleaned.replace("claude", "palm"))

100%|██████████| 11081/11081 [01:31<00:00, 121.56it/s]
100%|██████████| 11186/11186 [01:33<00:00, 120.18it/s]
100%|██████████| 10647/10647 [01:27<00:00, 121.65it/s]
100%|██████████| 11197/11197 [01:34<00:00, 118.88it/s]
100%|██████████| 11081/11081 [01:31<00:00, 121.69it/s]
100%|██████████| 11186/11186 [01:33<00:00, 120.20it/s]
100%|██████████| 10647/10647 [01:27<00:00, 121.27it/s]
100%|██████████| 11197/11197 [01:34<00:00, 118.51it/s]
100%|██████████| 11081/11081 [01:31<00:00, 121.14it/s]
100%|██████████| 11186/11186 [01:33<00:00, 119.86it/s]
100%|██████████| 10647/10647 [01:27<00:00, 121.44it/s]
100%|██████████| 11197/11197 [01:34<00:00, 118.69it/s]
100%|██████████| 11081/11081 [01:31<00:00, 120.70it/s]
100%|██████████| 11186/11186 [01:33<00:00, 119.72it/s]
100%|██████████| 10647/10647 [01:27<00:00, 121.15it/s]
100%|██████████| 11197/11197 [01:34<00:00, 118.68it/s]


In [27]:
results_df_claude["llm_training"] = "Claude-instant"
results_df_chatgpt["llm_training"] = "ChatGPT"
results_df_llama["llm_training"] = "Llama-2-70b"
results_df_palm["llm_training"] = "Google-PaLM"
_result_dfs_list = [results_df_claude, results_df_chatgpt, results_df_llama, results_df_palm]
results_df = pd.concat(_result_dfs_list).sort_values(by="prediction")
results_df

Unnamed: 0,id,context,llm_type,text,domain,label,llm_prompting_strategy,prediction,llm_training
4153,2798,Write a story which begins or ends in a barber...,Google-PaLM,"In a quaint town nestled amid rolling hills, ""...",writing_prompt,llm,paraphrase_polish_llm,0.000395,ChatGPT
2465,366,Cosmology from String Theory,ChatGPT,"In this study, we investigate the cosmological...",arxiv,llm,paraphrase_polish_human,0.000396,ChatGPT
9767,1382,Sports bars.,Llama-2-70b,"Sporting a modern and sleek atmosphere, [Sport...",yelp_review,llm,paraphrase_polish_llm,0.000396,ChatGPT
2448,349,The Solar Neighborhood. XIX. Discovery and Cha...,ChatGPT,We present spectra for 33 previously unclassif...,arxiv,llm,paraphrase_polish_human,0.000397,ChatGPT
11108,611,"Morton's, was without a doubt one of the origi...",ChatGPT,Morton's used to be a remarkable establishment...,yelp_review,llm,paraphrase_polish_human,0.000397,ChatGPT
...,...,...,...,...,...,...,...,...,...
500,1901,Hybrid Quantum Cloning Machine,Claude-instant,"In this work, we introduce a special kind of q...",arxiv,human,paraphrase_polish_llm,0.999527,Llama-2-70b
2056,657,The Peculiar Velocities of Local Type Ia Super...,ChatGPT,We quantify the effect of supernova Type Ia pe...,arxiv,human,paraphrase_polish_human,0.999527,ChatGPT
656,657,The Peculiar Velocities of Local Type Ia Super...,ChatGPT,We quantify the effect of supernova Type Ia pe...,arxiv,human,paraphrase_polish_llm,0.999527,ChatGPT
1857,2561,The influence of the cylindrical shape of the ...,Google-PaLM,We present a model improving the two-angle mod...,arxiv,human,paraphrase_polish_human,0.999530,Llama-2-70b


## 1.3 take a look at the cleaned data to identify those that have to be cleaned again (highest probability for llm generated content)

Re-clean for every of the four domains:
- 150 of the Claude LLM
- 40 of Llama 
- 20 each of PaLM and ChatGPT 

In [30]:
to_be_recleaned = []

for _domain in DOMAINS:
    TrainRobertaHelper.add_information_of_top_x_predictions(to_be_recleaned, results_df_claude[results_df_claude["domain"]==_domain], 150)
    _sorted = results_df_palm[results_df_palm["domain"]==_domain]
    TrainRobertaHelper.add_information_of_top_x_predictions(to_be_recleaned, _sorted, 40)
    _sorted = results_df_llama[results_df_llama["domain"]==_domain]
    TrainRobertaHelper.add_information_of_top_x_predictions(to_be_recleaned, _sorted, 20)
    _sorted = results_df_chatgpt[results_df_chatgpt["domain"]==_domain]
    TrainRobertaHelper.add_information_of_top_x_predictions(to_be_recleaned, _sorted, 20)
    
with open(f"{RESULT_DIR}/to_be_recleaned.pkl", "wb") as f:
    pickle.dump(to_be_recleaned, f)
len(to_be_recleaned)

920

In [31]:
for _domain in DOMAINS:
    print(f"######### {_domain} #######")
    _sorted = results_df_claude[results_df_claude["domain"]==_domain]
    TrainRobertaHelper.print_top_x_predictions(_sorted, 50)

######### arxiv #######
1761, Claude-instant (llm), 0.0005, 
Training LLM: Claude-instant, prompt: paraphrase_polish_llm
J1128+592: a highly variable IDV source

    J1128+592, a redshift 1.72 active galactic nucleus, exhibits extreme and periodic variability in radio brightness across epochs. Discovered in 2001 to display intraday variability with flux density changes within hours to days, it is classified as an intraday variable source. We conducted a two-month daily monitoring using multiple radio telescopes, densely sampling the flux density. The light curves from these observations reveal the variability occurs in discrete brightening and fading events. Spectral analysis during outbursts indicates variations stem from interstellar scintillation within the Milky Way. Modeling the observed variations constrains properties including an angular size less than 10 microarcseconds. This extensive monitoring campaign of J1128+592 to date finds it remains one of the most extreme intraday v

In [32]:
for _domain in DOMAINS:
    print(f"######### {_domain} #######")
    _sorted = results_df_palm[results_df_palm["domain"]==_domain]
    TrainRobertaHelper.print_top_x_predictions(_sorted, 20)

######### arxiv #######
2675, Google-PaLM (llm), 0.0005, 
Training LLM: Google-PaLM, prompt: paraphrase_polish_llm
Odd-frequency pairing in normal metal/superconductor junctions

    Odd-frequency pairing in normal metal/superconductor junctionsSuperconductivity, characterized by zero electrical resistance and magnetic field expulsion, is a fascinating state of matter. When a normal metal contacts a superconductor, the proximity effect may induce superconductivity in the normal metal. Furthermore, this proximity effect can lead to the formation of odd-frequency Cooper pairs, which are pairs of electrons with opposite spins and energies. Herein, we explore the properties of odd-frequency Cooper pairs in normal metal/superconductor junctions. These pairs induce intriguing phenomena, such as a minigap in the density of states and a magnetic field-dependent Josephson current. Our findings provide new insights into the proximity effect and the behavior of superconductors near normal metals.

In [33]:
for _domain in DOMAINS:
    print(f"######### {_domain} #######")
    _sorted = results_df_llama[results_df_llama["domain"]==_domain]
    TrainRobertaHelper.print_top_x_predictions(_sorted, 20)

######### arxiv #######
1372, Llama-2-70b (llm), 0.0004, 
Training LLM: Llama-2-70b, prompt: paraphrase_polish_human
Constrained simulations of the local universe: I. Mass and motion in the  Local Volume

    This study aims to investigate the relationship between the local gravitational field and the peculiar velocities of galaxies within the LV, taking into account the effects of dark matter and cosmic structure formation.Using constrained N-body cosmological simulations, we examine the flat-Lambda, open, and flat matter only CDM cosmogonies and compare them to unconstrained simulations. Our analysis focuses on LG-like objects, which are selected to mimic the real-life environment of the Local Group.We find that there is no correlation between the exact gravitational field, obtained through pairwise Newtonian forces between halos, and the local gravitational field. Moreover, the local gravitational field is uncorrelated with the peculiar velocities of halos. Although the exact gravit

In [34]:
for _domain in DOMAINS:
    print(f"######### {_domain} #######")
    _sorted = results_df_chatgpt[results_df_chatgpt["domain"]==_domain]
    TrainRobertaHelper.print_top_x_predictions(_sorted, 20)

######### arxiv #######
366, ChatGPT (llm), 0.0004, 
Training LLM: ChatGPT, prompt: paraphrase_polish_human
Cosmology from String Theory

    In this study, we investigate the cosmological implications of Salam-Sezgin six-dimensional supergravity. By analyzing the field equations, we identify a solution that aligns qualitatively with observations of distant supernovae, primordial nucleosynthesis abundances, and recent measurements of the cosmic microwave background. The mechanism responsible for the present accelerated expansion in the de Sitter epoch is a quintessence field slowly rolling down an exponential potential. Our model incorporates a second modulus, which is automatically stabilized and serves as a source of cold dark matter characterized by a mass that varies exponentially with the quintessence field. However, attempts to fully account for the current cold dark matter component using this mechanism lead to deviations from cosmological data that are deemed unacceptable; our 

In [35]:
for counter, item in results_df[results_df["label"]=="human"].iloc[:100].iterrows():
    print(f"""{item["id"]}, {item["llm_type"]}, {round(item["prediction"], 4)}, {item["context"]}
    
    {item.text}

""")

1157, Llama-2-70b, 0.0004, You are a young Jesus and you wake up after a massive party to find out that in a blackout you turned the world' s oceans into wine.

    The sprawled and sprightly forms of my party-goers lay scattered on the carpet of my flat. Below me raged an ocean of blood. A sickening sun snaked its tendrils over the line of the horizon. A massive crowd thronged the beach, with many wading into the red waves. It was a strange sight, seeing people drink from the ocean and enjoy it. Already, bodies sailed the gentle ridges of the waves. Men and women frantically sucked at the water, their bodies soiling with crimson as their children pulled at their clothes. I surveyed the army of drunkards below, knowing that I had fortuitously committed murder in the midst of my youthful play. I had forsaken the directives of my father and all things right in the world. But as I saw the dregs of society succumb to their demons, cleansing humanity of those that weighed them down, I ponde

In [36]:
for counter, item in results_df[results_df["label"]=="llm"].sort_values(by="prediction", ascending=False).iloc[:100].iterrows():
    print(f"""{item["id"]}, {item["llm_type"]}, {round(item["prediction"], 4)}, {item["context"]}
    
    {item.text}

""")

2783, Google-PaLM, 0.9994, Northern Ireland midfielder Chris Brunt will now learn on Wednesday the extent of a knee injury amid fears it may rule him out of Euro 2016.

    Chris Brunt, 31, was stretchered off during West Brom's 1-0 Premier League win over Crystal Palace on Saturday. There are fears the damage could be serious. The Northern Irishman, who missed the 2-2 draw with Leicester on Tuesday, is now expected to see a specialist on Wednesday instead of Tuesday.Baggies boss Tony Pulis said Monday: "Brunty is going to see a specialist so we will wait until he has done that. I have only seen it from a distance."His injury came days after he was struck in the face by a coin thrown by an Albion fan at the end of the FA Cup defeat at Reading.Northern Ireland have been drawn in Group C for Euro 2016 in France, alongside Ukraine, Poland, and Germany. Their opening match is against Poland in Nice on Sunday, June 12.


2111, Google-PaLM, 0.9994, Neil Warnock has described his Cardiff City

# 2. Check the token length and number of sentences of the cleaned samples 

In [52]:
df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(REGEX_CLEANED_FILES, _suffix_path="_cleaned_all_v3.parquet",
                                         prompts=[
                                             "direct_prompt", "prompt_few_shot", "prompt_SICO",
                                             "paraphrase_polish_human", "paraphrase_polish_llm"],
                                         prepare_for_training=False)

cleaned_df: dict = {
    "claude": df_claude_cleaned, "llama": df_llama_cleaned, 
    "palm": df_palm_cleaned, "chatgpt": df_chatgpt_cleaned
}

df_all_llms_cleaned = pd.concat(cleaned_df.values())

In [54]:
# Load GPT-2 tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Sample DataFrame: assume df is your existing DataFrame
columns_to_compare = ["direct_prompt", "prompt_few_shot", "prompt_SICO",
                      "paraphrase_polish_human", "paraphrase_polish_llm"]

# Melt dataframe to long format
df_long = df_all_llms_cleaned.melt(id_vars=["id", "llm_type", "domain", "text_human"], value_vars=columns_to_compare,
                  var_name="comparison_type", value_name="comparison_text")

print(df_long.shape)
# Drop any rows with missing values
df_long.dropna(subset=["text_human", "comparison_text"], inplace=True)
print(df_long.shape)

# Sentence and token count functions
def count_sentences(text):
    return len(sent_tokenize(text))

def count_tokens(text):
    return len(tokenizer.encode(text))

# Compute metrics
df_long["sentences_human"] = df_long["text_human"].apply(count_sentences)
df_long["sentences_other"] = df_long["comparison_text"].apply(count_sentences)
df_long["sentence_diff"] = (df_long["sentences_other"] - df_long["sentences_human"]).abs()
df_long["sentence_diff_nominal"] = df_long["sentences_other"] - df_long["sentences_human"]
df_long["sentence_ratio"] = (df_long["sentences_other"] / df_long["sentences_human"]).replace(0, 1)

df_long["tokens_human"] = df_long["text_human"].apply(count_tokens)
df_long["tokens_other"] = df_long["comparison_text"].apply(count_tokens)
df_long["token_diff"] = (df_long["tokens_other"] - df_long["tokens_human"]).abs()
df_long["token_ratio"] = df_long["tokens_other"] / df_long["tokens_human"].replace(0, 1)

# Sort and display
top_sentence_diff = df_long.sort_values(by="sentence_diff", ascending=False).head(100)
top_token_diff = df_long.sort_values(by="token_ratio", ascending=False).head(100)

(56000, 6)
(53796, 6)


In [56]:
df_long.sentences_other.value_counts()

sentences_other
5     5784
6     5571
7     4894
4     4213
8     4050
9     3521
10    2925
11    2711
12    2596
13    2290
14    2155
15    1832
16    1658
17    1376
18    1136
19     977
20     820
1      704
21     658
22     556
23     427
3      407
24     362
2      291
25     283
26     254
27     209
28     189
29     145
30     128
31     105
33      89
32      87
34      65
36      45
35      36
37      34
38      34
40      30
39      27
43      21
41      20
42      17
44      12
45      11
47       8
46       7
48       6
51       4
50       4
49       3
58       3
54       2
53       2
61       1
52       1
Name: count, dtype: int64

In [77]:
df_temp = df_long.sort_values(by=["sentence_diff_nominal"], ascending=False)
df_temp = df_temp[(df_temp["token_ratio"]>1.5) & (df_temp["sentence_ratio"]>1.5)]
print(len(df_temp))
print(df_temp.llm_type.value_counts())

578
llm_type
Llama-2-70b       234
ChatGPT           223
Google-PaLM        94
Claude-instant     27
Name: count, dtype: int64


In [78]:
df_temp_llama = df_temp[df_temp["llm_type"]=="Google-PaLM"]
for _, item in df_temp_llama.iloc[:100].iterrows():
    print(item.sentence_diff_nominal, item.token_ratio, item.sentence_ratio)
    print(item.comparison_text)

26 1.7316017316017316 2.3684210526315788
The service was great, but the food was just okay. The atmosphere was nice, but the prices were a bit high. The location was convenient, but the parking was limited. Overall, it was a decent experience, but not one I would repeat.The food was bland and tasteless. The service was slow and inattentive. The atmosphere was sterile and uninviting. The prices were exorbitant. The location was inconvenient.The food was delicious and well-prepared. The service was attentive and friendly. The atmosphere was warm and inviting. The prices were reasonable. The location was convenient.The food was okay, but not great. The service was good, but not exceptional. The atmosphere was nice, but not special. The prices were reasonable, but not a bargain. The location was convenient, but not ideal.The food was terrible. The service was awful. The atmosphere was depressing. The prices were outrageous. The location was horrible.The food was amazing. The service was im

In [79]:
df_temp_llama = df_temp[df_temp["llm_type"]=="Llama-2-70b"]
for _, item in df_temp_llama.iloc[:100].iterrows():
    print(item.sentence_diff_nominal, item.token_ratio, item.sentence_ratio)
    print(item.comparison_text)

29 1.9631147540983607 2.611111111111111
It was a beautiful summer day, and Sarah had decided that it was time to end her life. She had been struggling with depression for years, and she had finally reached a point where she couldn't see any other way out. She had arranged to meet her best friend, Rachel, at the park one last time.As they sat on a bench together, Sarah handed Rachel a letter. "This is my suicide note," she said, her voice trembling. "I wanted to leave it with someone I love, so that they can understand why I had to do this."Rachel took the letter and promised to read it later. "But first, can we talk about this?" she asked. "Is there anything I can do to change your mind?"Sarah shook her head. "No, Rachel. I've thought about this for a long time. I'm sorry, but I can't keep going."Rachel looked at her friend with tears in her eyes. "I understand," she said. "But please, just tell me one thing. What will be your last words?"Sarah took a deep breath before answering. "My 

In [80]:
to_be_cleaned_length = df_long[df_long["tokens_other"] < 60]

In [81]:
df_temp = df_long.sort_values(by=["sentence_diff_nominal"])
for _, item in to_be_cleaned_length.iterrows():
    print(df_temp.comparison_text)

37278    As I walked through the surreal hallway, I cou...
51728    In a not-so-distant future, as the sun's fusio...
6826     **Story:**In the realm of peculiar professions...
6928     In the not-so-distant future, when the Sun's f...
51626    **Story:**In the realm of peculiar professions...
                               ...                        
15348    Oh no, this is not good. I'm on fire. How did ...
3715     It was a beautiful summer day, and Sarah had d...
14722    Douglas Adams woke up in heaven, his head poun...
26387    "Hey, have you seen my phone?""No, haven't see...
43232    "Look, man, I really like you. I really do. Bu...
Name: comparison_text, Length: 53796, dtype: object
37278    As I walked through the surreal hallway, I cou...
51728    In a not-so-distant future, as the sun's fusio...
6826     **Story:**In the realm of peculiar professions...
6928     In the not-so-distant future, when the Sun's f...
51626    **Story:**In the realm of peculiar professions...
    

In [82]:
df_long["sentence_token_ratio_hmean"] = 2 * (
    df_long["sentence_ratio"] * df_long["token_ratio"]
) / (df_long["sentence_ratio"] + df_long["token_ratio"]).replace(0, 1)
df_long.sort_values("sentence_token_ratio_hmean", ascending=True, inplace=True)

In [83]:
to_be_cleaned_removed_to_much = []
for _, item in df_long.iloc[:10000].iterrows():
    if (item["sentences_human"] >= 6) & (item["sentences_other"] >= 6) and (abs(1 - item["token_ratio"]) < 0.5) or (abs(1 - item["token_ratio"]) < 0.75):
        continue
    else:
        to_be_cleaned_removed_to_much.append(item)

In [85]:
df_to_be_recleaned_length = pd.concat([to_be_cleaned_length, pd.DataFrame(to_be_cleaned_removed_to_much)])
print(df_to_be_recleaned_length.columns)
df_to_be_recleaned_length.head()

Index(['id', 'llm_type', 'domain', 'text_human', 'comparison_type',
       'comparison_text', 'sentences_human', 'sentences_other',
       'sentence_diff', 'sentence_diff_nominal', 'sentence_ratio',
       'tokens_human', 'tokens_other', 'token_diff', 'token_ratio',
       'sentence_token_ratio_hmean'],
      dtype='object')


Unnamed: 0,id,llm_type,domain,text_human,comparison_type,comparison_text,sentences_human,sentences_other,sentence_diff,sentence_diff_nominal,sentence_ratio,tokens_human,tokens_other,token_diff,token_ratio,sentence_token_ratio_hmean
5841,2342,Google-PaLM,arxiv,Steady state accretion discs larger than ~ 0.0...,direct_prompt,only,12,1,11,-11,0.083333,408,1,407,0.002451,0.004762
51910,2211,Google-PaLM,xsum,"Hasselbaink, 44, was filmed as part of an inve...",paraphrase_polish_llm,',8,1,7,-7,0.125,341,1,340,0.002933,0.005731
5988,2489,Google-PaLM,arxiv,A release of multicolor broad band (BVRI) phot...,direct_prompt,',10,1,9,-9,0.1,331,1,330,0.003021,0.005865
17333,2634,Google-PaLM,arxiv,We compute the reduced genus 1 Gromov-Witten i...,prompt_few_shot,only.,7,1,6,-6,0.142857,243,2,241,0.00823,0.015564
6612,2413,Google-PaLM,writing_prompt,""" I just want to hold you. "" Death said, the f...",direct_prompt,"In the realm of dreams, Death wove intricate t...",49,1,48,-48,0.020408,516,18,498,0.034884,0.025751


In [86]:
df_long.head()

Unnamed: 0,id,llm_type,domain,text_human,comparison_type,comparison_text,sentences_human,sentences_other,sentence_diff,sentence_diff_nominal,sentence_ratio,tokens_human,tokens_other,token_diff,token_ratio,sentence_token_ratio_hmean
5841,2342,Google-PaLM,arxiv,Steady state accretion discs larger than ~ 0.0...,direct_prompt,only,12,1,11,-11,0.083333,408,1,407,0.002451,0.004762
51910,2211,Google-PaLM,xsum,"Hasselbaink, 44, was filmed as part of an inve...",paraphrase_polish_llm,',8,1,7,-7,0.125,341,1,340,0.002933,0.005731
5988,2489,Google-PaLM,arxiv,A release of multicolor broad band (BVRI) phot...,direct_prompt,',10,1,9,-9,0.1,331,1,330,0.003021,0.005865
17333,2634,Google-PaLM,arxiv,We compute the reduced genus 1 Gromov-Witten i...,prompt_few_shot,only.,7,1,6,-6,0.142857,243,2,241,0.00823,0.015564
6612,2413,Google-PaLM,writing_prompt,""" I just want to hold you. "" Death said, the f...",direct_prompt,"In the realm of dreams, Death wove intricate t...",49,1,48,-48,0.020408,516,18,498,0.034884,0.025751


In [87]:
with open(f"{RESULT_DIR}/to_be_recleaned.pkl", "rb") as f:
    to_be_recleaned = pickle.load(f)
len(to_be_recleaned)

920

In [88]:
df_wo_polishing = pd.read_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all_wo_polishing.parquet")
df_with_polishing = pd.read_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all.parquet")
df_wo_polishing.prompting_strategy.value_counts()

prompting_strategy
direct_prompt              543
prompt_few_shot            352
prompt_SICO                292
paraphrase_polish_llm      183
paraphrase_polish_human     13
Name: count, dtype: int64

In [89]:
df_with_polishing.prompting_strategy.value_counts()

prompting_strategy
paraphrase_polish_llm      627
paraphrase_polish_human    452
direct_prompt              141
prompt_few_shot             97
prompt_SICO                 65
Name: count, dtype: int64

In [91]:
recleaned_df = pd.DataFrame(to_be_recleaned)
print(recleaned_df.columns)
recleaned_df.head()

Index(['id', 'llm_type_test', 'label', 'domain', 'text', 'prompting_strategy'], dtype='object')


Unnamed: 0,id,llm_type_test,label,domain,text,prompting_strategy
0,1761,Claude-instant,llm,arxiv,"J1128+592, a redshift 1.72 active galactic nuc...",paraphrase_polish_llm
1,1957,Claude-instant,llm,arxiv,Standard inflationary cosmological models prec...,paraphrase_polish_llm
2,45,ChatGPT,llm,arxiv,This study investigates the phenomenon of a lo...,paraphrase_polish_llm
3,2604,Google-PaLM,llm,arxiv,We investigate the thermal interaction between...,paraphrase_polish_human
4,1786,Claude-instant,llm,arxiv,"Bose-Einstein condensates (BECs), which are co...",paraphrase_polish_llm


In [92]:
recleaned_df.sort_values(by=["domain", "id", "llm_type_test", "label"], inplace=True)
print(recleaned_df.prompting_strategy.value_counts())
recleaned_df.llm_type_test.value_counts()

prompting_strategy
paraphrase_polish_human    461
paraphrase_polish_llm      459
Name: count, dtype: int64


llm_type_test
Claude-instant    429
Llama-2-70b       180
ChatGPT           177
Google-PaLM       134
Name: count, dtype: int64

In [94]:
df_to_be_recleaned_length = pd.concat([df_temp_llama, df_to_be_recleaned_length])
df_to_be_recleaned_length.rename(columns = {"comparison_type": "prompting_strategy"}, inplace=True)
recleaned_df.rename(columns = {"comparison_type": "prompting_strategy", "llm_type_test": "llm_type"}, inplace=True)
recleaned_df = pd.concat([recleaned_df, ])
necessary_columns_for_mapping = ["id", "domain", "llm_type", "prompting_strategy"]
df_to_be_recleaned_all = pd.concat([df_to_be_recleaned_length[necessary_columns_for_mapping], recleaned_df[necessary_columns_for_mapping]])
print("df_to_be_recleaned_all", df_to_be_recleaned_all.shape)
df_to_be_recleaned_all = df_to_be_recleaned_all.drop_duplicates().reset_index(drop=True)
print("df_to_be_recleaned_all", df_to_be_recleaned_all.shape)
df_to_be_recleaned_all_with_polishing = pd.concat([df_to_be_recleaned_all, df_with_polishing, df_wo_polishing])
print("df_to_be_recleaned_all_with_polishing", df_to_be_recleaned_all_with_polishing.shape)
df_to_be_recleaned_all_with_polishing = df_to_be_recleaned_all_with_polishing.drop_duplicates().reset_index(drop=True)
print("df_to_be_recleaned_all_with_polishing", df_to_be_recleaned_all_with_polishing.shape)

df_to_be_recleaned_all (1813, 4)
df_to_be_recleaned_all (1610, 4)
df_to_be_recleaned_all_with_polishing (4375, 4)
df_to_be_recleaned_all_with_polishing (2490, 4)


In [96]:
# Remove rows in df_to_be_recleaned_all_with_polishing that are also in df_wo_polishing
df_filtered = df_to_be_recleaned_all_with_polishing.merge(
    df_wo_polishing.drop_duplicates(), 
    how='left', 
    indicator=True
)

# Keep only the rows that are NOT in df_wo_polishing
df_to_be_recleaned_all_with_polishing_unique = df_filtered[df_filtered['_merge'] == 'left_only']
print(df_to_be_recleaned_all_with_polishing_unique.shape)

df_to_be_recleaned_all_with_polishing_unique.to_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all_with_polishing_unique.parquet")

(1107, 5)
