# 1. Setup

In [None]:
import pandas as pd
import os
from hashlib import sha512
from tqdm import tqdm
import sys
import time
import glob
import re

# === CONFIG ===
BASE_DIR = "../../"
SRC_DIR = f"{BASE_DIR}/src"
sys.path.append(BASE_DIR)
sys.path.append(SRC_DIR)

from src.general_functions_and_patterns_for_detection import (
    load_dataframe_from_json,
    get_info_based_on_input_path,
    read_files,
    PATTERN_BEGINNING_LLM_SENTENCE,
    PATTERN_ENDING_LLM_SENTENCE,
    BASE_DIR, RESULT_DIR, ORIGINAL_DATA_DIR, REGEX_CLEANED_FILES, RECLEANED_FILES_DIR,
    DOMAINS,
    json_path_abstract,
    json_path_writing,
    json_path_xsum,
    json_path_review,
)

from src.fewshot_prompting import (
    REGULAR_EXPRESSION,
    get_and_format,
    generate_jsonl_file
)


from src.openAI_client import (
    client, 
    download_if_recleaning,
    get_batches
)

# 2. Load data of all domains

In [2]:
# Load the four domain datasets with filter_llm=False; the generator is consumed
# left to right, so the files are read in the same order as the names are bound.
df_abstract, df_writing, df_xsum, df_review = (
    load_dataframe_from_json(_json_path, filter_llm=False)
    for _json_path in (json_path_abstract, json_path_writing, json_path_xsum, json_path_review)
)

df_writing.head(20)

Unnamed: 0,id,story,story_prompt,direct_prompt,llm_type,domain,paraphrase_polish_human,paraphrase_polish_llm,prompt_few_shot,prompt_SICO,...,adversarial_character_llm,adversarial_word_human,adversarial_word_llm,adversarial_character_word_human,adversarial_character_word_llm,paraphrase_back_translation_human,paraphrase_back_translation_llm,paraphrase_dipper_human,paraphrase_dipper_llm,icl_prompt
0,1,The mountain stood still and large beneath the...,Through Iron And Flame,Through Iron and FlameDeep in the heart of the...,ChatGPT,writing_prompt,The massive mountain loomed beneath the Warrio...,Through Iron and FlameDeep in the heart of the...,"The war raged on, its fury echoing through the...","Through Iron And FlameBarefoot and fearless, s...",...,Through Iron and FlameDeep in the heart of the...,The mountain stood furthermore and large benea...,Through Iron and FlameDeep in the heart of the...,The mountain stod still and large beneath the ...,Through Iron and FlameDeep in the heart of the...,The mountain stood below the warrior to stand ...,It is an extraordinary journey through iron an...,It had not trembled since the day when the peo...,"A young blacksmith named Alistair, with a fier...",
1,2,"""Sadie! I told you not to stand under the tree...","You are at the park with your kids, when you s...","It was a sunny Saturday afternoon, and I decid...",ChatGPT,writing_prompt,"""Sadie! I explicitly told you to avoid standin...","It was a sunny Saturday afternoon, and I decid...","It was a sunny afternoon at the park, and I wa...","So, dude, picture this: I'm at the park with m...",...,"It was a sunny Saturday afternoon, and I decid...","""Lottie! I told you not to stand under the tre...","It was a sunny Saturday afternoon, and I decid...","""Sadie! j told you not to stand under the tree...","It was a sunny Saturday afternoon, and I decid...","""Sadi! I tell you not to stand under the tree ...","It was a sunny Saturday afternoon, and I decid...",I told you not to stand under the tree during ...,They were so excited when we arrived that they...,
2,3,"Janice turned to me, her big blue eyes still f...",""" My fellow Americans... "" The newly elected P...","""My fellow Americans,"" the newly elected Presi...",ChatGPT,writing_prompt,"Janice turned to me, her big, innocent blue ey...","""My fellow Americans,"" the newly elected Presi...","""My fellow Americans,"" the newly elected Presi...","""My fellow Americans,"" the newly elected Presi...",...,"""yM fellow Americans,"" the newly elected Presi...","Janice turned to me, her big blue eyes still e...","""My fellow Americano,"" the newly elected Presi...","Janice turned to me, her Ьig blue eyes still f...","""My fellow Aｍericans,"" the newly elected Presi...","Janice turned to me, and her big blue eyes wer...","The newly elected president began to say, ""My ...","“Daddy,” she said, “what does the president me...","""I stand before you today to make a deeply per...","""My fellow Americans,"" the newly elected Presi..."
3,4,Roslyn stepped down the ladder facing forward ...,What' s on the tape?,As Anna rummaged through her grandmother's att...,ChatGPT,writing_prompt,"Roslyn descended the ladder, facing forward, a...",As Anna carefully rummaged through her grandmo...,I stumbled upon an old cardboard box in the co...,As I rummaged through the dusty box that had b...,...,As Anna rummaged through her grandmother's att...,Roslyn stepped down the ladder facing forward ...,As Anna rummaged through her grandmother's att...,Roslyn stepped down the ladder facing forward ...,As Anna rummaged through her grandmother's att...,Roslyn walked down the ladder and headed forwa...,"When Anna read on her grandmother's loft, she ...",She caught it with her left hand. She lugged t...,She blew off the dust and opened it with care....,
4,5,""" Aw, do n' t cry my sweet little girl! You we...","Write a story that is perfectly normal, until ...","Once upon a time, in the small town of Willowb...",ChatGPT,writing_prompt,"""Oh, don't cry, my sweet little girl! You were...","Once upon a time, in the quaint town of Willow...","Sarah woke up early in the morning, the sunlig...",Samantha woke up to the sound of birds chirpin...,...,"Once upon a time, in the small town of Willowb...",""" Aw, do n' t cry my belle little girl! You we...","Once upon a time, in the small midtown of Will...",""" Aw, do n' t weep my sweet little girl! You w...","After upon a time, in the small town of Willow...","""Oh, cry, my cute little girl! You are very qu...","Once upon a time, everything was calm and stab...","She's heavy. She was so quiet before, even wit...","The sun shone brightly in the clear blue sky, ...",Emily woke up to the sound of birds chirping o...
5,6,""" Do you ever think about what it' s like up t...","Even with all the stars on the sky, the night ...",Even with all the stars scattered across the e...,ChatGPT,writing_prompt,"""Do you ever think about what it's like up the...",Despite the multitude of stars scattered acros...,"Even with all the stars in the sky, the night ...",Even with all the stars sprinkling the sky abo...,...,Even with all the stars scattered across the e...,""" Do you ever figured about what it' s like up...",Even with all the stars scattered across the e...,""" Do you ever think about what it' s loves up ...",Even with all the stars scattered across the e...,"""Have you ever thought about it?"" Her hair was...",Even though all the stars are scattered on the...,The city was far away. Her hair was spread out...,"For generations, men had gazed at the sky, mar...",
6,7,The world came crashing down in minutes. Many ...,"Over night, 90 % of the world' s population ha...","Overnight, a cataclysmic event struck the worl...",ChatGPT,writing_prompt,"The world crumbled in a matter of minutes, sha...","In the blink of an eye, a cataclysmic event ra...","Overnight, the world was consumed by an eerie ...","Wow, have you ever imagined waking up to a wor...",...,"Overinght, a cataclysmic event struck the worl...",The globe came crashing down in minutes. Many ...,"Overnight, a cataclysmic happenings struck the...",The world ϲame crashing down in minutes. Many ...,"Overnight, a cataclysmic event struck the worl...","A few minutes later, the world collapsed. Many...","Overnight, a catastrophic event attacked the w...",Many of us were asleep when it happened and di...,"The survivors, cautiously emerging from their ...",
7,8,"""Mommy, I' m scared. ""The little girl stood at...","Gay marriage is now legal woldwide, and the co...",In a world where gay marriage had become legal...,ChatGPT,writing_prompt,"""Mummy, I'm scared,"" the little girl quivered ...",In a world where gay marriage had become legal...,"The world had changed overnight, and the conse...",Who would've thought that it would come to thi...,...,In a orld where gay marriage had become legal ...,"""Mama, I' m scared. ""The little maid stood at ...",Between a world where gay marriage had become ...,"""Mommy, I' m terrified. ""The little girl stood...",In a world where gay marriage had become legal...,"""Mom, I'm scared."" The little girl stood on th...",In a world of homosexual marriage that is lega...,""" The little girl stood at the top of the stai...",The world was experiencing a kind of pseudo-zo...,
8,9,The blind pilots fly And we thank them for the...,No Ordinary Mist,"In the small town of Elmwood, nestled between ...",ChatGPT,writing_prompt,"The blind pilots soar through the skies, and w...","In the serene town of Elmwood, nestled amidst ...","In the small town of Willowbrook, a dense fog ...","In the sleepy town of Mistwood, nestled amidst...",...,"In the small tCwn of TElmwood, nestled between...",The blind pilots hovers And we acknowledgement...,"In the marginal town of Elmwood, nestled betwe...",The blind pilots fly And we thank them for the...,"During the small town of Elmwood, nestled betw...",Blind pilots flew. We thank them for their mis...,"In Elmwood, which is located in the hills of t...","The Sun burns hot, bold, and bright. What is t...",The No Ordinary Mist was said to grant unimagi...,
9,10,We' d been wandering for what felt like years....,Describe a game of Civilization from the persp...,I had spent my entire life in the bustling cit...,ChatGPT,writing_prompt,We had been wandering for what felt like years...,I had spent my entire life in the bustling cit...,"From my humble abode, nestled within the heart...",I couldn't contain my excitement as the city b...,...,I had spent my entire life in the bustling cty...,We' d been wandering for what suspected like y...,I had spent my entire life in the bustling cit...,We' d been wandering for what felt loves years...,I had spent my entire life in the bustling cit...,We have been wandering for many years. I could...,"I spent all my life in the city of Elmdale, wh...",We made camp near the mountain. It was suppose...,"As a humble citizen, I had little influence on...",


# 3. Pattern for detection


In [3]:
# Remove the sentence-begin / sentence-end wrapper patterns from every stored
# expression. `update` mutates the same dict object (only existing keys are
# reassigned), so anything else holding a reference to REGULAR_EXPRESSION sees
# the stripped patterns as well.
REGULAR_EXPRESSION.update({
    name: pattern.replace(PATTERN_BEGINNING_LLM_SENTENCE, "").replace(PATTERN_ENDING_LLM_SENTENCE, "")
    for name, pattern in REGULAR_EXPRESSION.items()
})

for k, v in REGULAR_EXPRESSION.items():
    print(k, v)

arxiv (abstract|academic article)
yelp_review (review's first sentence|review)
writing_prompt (given article title|provided article title)
xsum (article)
prompt_SICO (in a human\s?\w{0,20}\s?style)
paraphrase_polish_human (grammar[\w\s,]{1,40}spelling)|(spelling[\w\s,]{1,40}grammar)|(Improved sentence structure)
paraphrase_polish_llm (grammar[\w\s,]{1,40}spelling)|(spelling[\w\s,]{1,40}grammar)|(Improved sentence structure)


# 4. Clean-up using LLMs

## 4.1 General information and function

Reference: OpenAI GPT-4.1 prompting guide — https://cookbook.openai.com/examples/gpt4-1_prompting_guide

In [4]:
# Preview the string get_and_format returns for ("arxiv", "prompt_SICO"); the
# recorded output shows numbered ORIGINAL TEXT / CLEANED VERSION example pairs.
get_and_format("arxiv", "prompt_SICO")

'## Example 1\nORIGINAL TEXT: Sure, here\'s a draft of a five-sentence abstract in the style of a human author:In this paper, we present measurements of the velocity and temperature fields...\nCLEANED VERSION: In this paper, we present measurements of the velocity and temperature fields...\n\n## Example 2\nORIGINAL TEXT: Here is a 6 sentence abstract for the title "Searching for Lee-Wick Gauge Bosons at the LHC":Grinstein, O\'Connell and Wise have recently extended the Standard Model to include Lee-Wick partners of the gauge bosons which predict negative-norm partners of the usual gauge fields. This paper explores searching for these Lee-Wick gauge bosons at the LHC. Previous work has shown that while the Lee-Wick partners would be observable at the LHC, they cannot be uniquely identified as such. \nCLEANED VERSION: Grinstein, O\'Connell and Wise have recently extended the Standard Model to include Lee-Wick partners of the gauge bosons which predict negative-norm partners of the usual 

## 4.2 Test JSONL file generation

In [8]:
# Identify the request file inspected below.
domain: str = "arxiv"
model: str = "gpt-4.1-mini-2025-04-14"
part: int = 0

# Make sure the output folder exists before generating the request file.
_cleaning_dir = f"{RESULT_DIR}/data_cleaning_gpt_4-1"
os.makedirs(f"{_cleaning_dir}/", exist_ok=True)
generate_jsonl_file(df_abstract)

# Read the first 20 request lines back for inspection.
# NOTE(review): presumably this is the file generate_jsonl_file just wrote — confirm its default output path.
_jsonl_path = f"{_cleaning_dir}/{domain}_{model}_batch_processing_cleaning_all_columns_claude_v2_part{part}.jsonl"
_df_temp = pd.read_json(_jsonl_path, lines=True).head(20)
_df_temp.head()

processing df between 0:25


Unnamed: 0,custom_id,method,url,body
0,arxiv_1_direct_prompt_cleaned,POST,/v1/responses,"{'model': 'gpt-4.1-mini-2025-04-14', 'input': ..."
1,arxiv_1_prompt_few_shot_cleaned,POST,/v1/responses,"{'model': 'gpt-4.1-mini-2025-04-14', 'input': ..."
2,arxiv_1_prompt_SICO_cleaned,POST,/v1/responses,"{'model': 'gpt-4.1-mini-2025-04-14', 'input': ..."
3,arxiv_1_paraphrase_polish_human_cleaned,POST,/v1/responses,"{'model': 'gpt-4.1-mini-2025-04-14', 'input': ..."
4,arxiv_1_paraphrase_polish_llm_cleaned,POST,/v1/responses,"{'model': 'gpt-4.1-mini-2025-04-14', 'input': ..."


In [9]:
# Take the first request body and pull out its two messages: index 0 is read
# as the system prompt, index 1 as the user prompt.
example_prompt = _df_temp["body"].iloc[0]
_example_messages = example_prompt["input"]
example_prompt_system = _example_messages[0]["content"][0]["text"]
example_prompt_user = _example_messages[1]["content"][0]["text"]

In [10]:
# Print both prompts of the example request; plain print keeps the embedded
# line breaks readable.
print("SYSTEM_PROMPT:", example_prompt_system, "\n\n")
print("USER PROMPT:", example_prompt_user)

SYSTEM_PROMPT: You are a helpful data cleaning assistant, that helps to clean frequent LLM patterns out of a dataset. You will receive the original prompt as well as the original LLM answer. The goal is to remove everything that is related to typical LLM answers or related to the prompt and to only return the text answer of the original task.

# Instructions
- remove all patterns that are typically starting phrases of an LLM response, like (^Here is.*?|^Here are.*?|^Here's.*?|^Sure,\s?here.*?)[.:] at the beginning of the text
- remove phrases provided to the LLM, like the abstract title
- never remove any content, that is used by the LLM to answer the original prompt. Return the full text to answer the original prompt.
- if the original LLM call results in a rejection (e.g. (.*I apologize, upon further reflection.*?|.*a fake review.*|.*((only)|(just)) a language model.*|.*I cannot provide.*|.*As an AI language model, I am unable to engage with content that may violate my usage guidelin

## 4.3 Batch generation of results

In [9]:
# Placeholder: set this to a real batch id before running the next cell, which
# passes it to client.batches.retrieve.
batch_id = ""   # TODO add one example batch id

In [10]:
# Fetch the batch for `batch_id` and show its status. The module-level `batch`
# bound here is also what the submission loop in section 4.3.1 inspects first.
batch = client.batches.retrieve(batch_id)
batch.status

'completed'

### 4.3.1 Cleaning Claude data

In [32]:
# Parts of df_abstract whose cleaning batch still has to be submitted.
missing_ids = []

domain: str = "arxiv"
model: str = "gpt-4.1-mini-2025-04-14"
# Number of "failed" observations after which the run gives up on a batch.
MAX_BATCH_FAILURES = 3
# Description of the batch currently being waited on; reused when it is re-submitted.
submitted_description = "Arxiv data LLM=Claude (re-submitted)"

for part in tqdm(missing_ids):
    # --- 1. Wait for the previously submitted batch --------------------------
    # On the first iteration this is the module-level `batch` left behind by an
    # earlier cell.
    # NOTE(review): `input_path` is only assigned further down, so a "failed"
    # status on the very first iteration still relies on a leftover `input_path`.
    # NOTE(review): only "failed" is treated as terminal; if the API reports
    # another terminal status (e.g. expired / cancelled — confirm against the
    # Batch API docs) this loop keeps polling.
    fail_counter = 0
    while batch.status != "completed":
        if batch.status == "failed":
            print(batch)
            fail_counter += 1
            if fail_counter == MAX_BATCH_FAILURES:
                break
            # Back off, then upload the same input file again as a new batch.
            time.sleep(120)
            with open(input_path, "rb") as batch_file:  # closed once the upload returns
                batch_input_file = client.files.create(file=batch_file, purpose="batch")
            print(batch_input_file)
            batch_object = client.batches.create(
                input_file_id=batch_input_file.id,
                endpoint="/v1/responses",
                completion_window="24h",
                metadata={"description": submitted_description},
            )
            batch = client.batches.retrieve(batch_object.id)
        try:
            print(batch.request_counts)
        except Exception:
            print("Sleeping... ")
        time.sleep(60)
        # Poll via the batch's own id: `batch_object` does not exist yet while
        # waiting on a batch that was retrieved by an earlier cell.
        batch = client.batches.retrieve(batch.id)

    # --- 2. Give up if the previous batch kept failing ------------------------
    # The former `fail_counter == 3` check at this point could never be true
    # (the counter was already 3 when the while loop broke and was incremented
    # again), so the intended abort never happened.
    if batch.status == "failed":
        print(f"Batch {batch.id} failed {fail_counter} times; stopping before part {part}.")
        break

    # --- 3. Build and submit the request file for this part -------------------
    generate_jsonl_file(df_abstract, part=part)
    time.sleep(120)
    input_path = f"{RESULT_DIR}/data_cleaning_gpt_4-1/{domain}_{model}_batch_processing_cleaning_all_columns_claude_v2_part{part}.jsonl"
    submitted_description = f"Arxiv data for part {part} LLM=Claude"

    with open(input_path, "rb") as batch_file:
        batch_input_file = client.files.create(file=batch_file, purpose="batch")
    print(batch_input_file)

    batch_object = client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/responses",
        completion_window="24h",
        metadata={"description": submitted_description},
    )
    batch = client.batches.retrieve(batch_object.id)

  0%|          | 0/7 [00:00<?, ?it/s]

processing df between 375:400
FileObject(id='file-VcYHWNJYFD4ZYCdeSEBMqd', bytes=2240578, created_at=1750599094, filename='arxiv_gpt-4.1-mini-2025-04-14_batch_processing_cleaning_all_columns_claude_v2_part15.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


 14%|█▍        | 1/7 [02:02<12:16, 122.81s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
BatchRequestCounts(completed=111, failed=0, total=125)
BatchRequestCounts(completed=121, failed=0, total=125)
processing df between 425:450
FileObject(id='file-X8x8gqkmtcGvtH2NiB1Ycp', bytes=2239927, created_at=1750599397, filename='arxiv_gpt-4.1-mini-2025-04-14_batch_processing_cleaning_all_columns_claude_v2_part17.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


 29%|██▊       | 2/7 [07:06<19:04, 228.94s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
BatchRequestCounts(completed=119, failed=0, total=125)
BatchRequestCounts(completed=119, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
processing df 

 43%|████▎     | 3/7 [27:15<45:07, 676.80s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
BatchRequestCounts(completed=123, failed=0, total=125)
BatchRequestCounts(completed=123, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
processing df 

 57%|█████▋    | 4/7 [47:25<44:20, 887.00s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
BatchRequestCounts(completed=89, failed=0, total=125)
BatchRequestCounts(completed=115, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
processing df between 575:600
FileObject(id='file-95jEKxDdV4VEMtTJ9VzP

 71%|███████▏  | 5/7 [1:06:33<32:42, 981.25s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
BatchRequestCounts(completed=113, failed=0, total=125)
BatchRequestCounts(completed=124, failed=0, total=125)
processing df between 650:675
FileObject(id='file-BhjvMofRtyKMszCR4aT8vh', bytes=2241167, created_at=1750603267, filename='arxiv_gpt-4.1-mini-2025-04-14_batch_processing_cleaning_all_columns_claude_v2_part26.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


 86%|████████▌ | 6/7 [1:11:36<12:30, 750.75s/it]

BatchRequestCounts(completed=0, failed=0, total=0)
processing df between 675:700
FileObject(id='file-3uDVp2wvand2AXetfjaQQB', bytes=2244086, created_at=1750603451, filename='arxiv_gpt-4.1-mini-2025-04-14_batch_processing_cleaning_all_columns_claude_v2_part27.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


100%|██████████| 7/7 [1:14:40<00:00, 640.00s/it]


### 4.3.2 Re-Cleaning regex cleaned data

In [39]:
# df_to_be_cleaned1 = pd.read_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all.parquet")
# df_to_be_cleaned2 = pd.read_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all_with_polishing_unique.parquet").drop(columns="_merge")
# df_to_be_cleaned3 = pd.read_parquet(f"{RESULT_DIR}/df_to_be_recleaned_all_wo_polishing.parquet")
# print(df_to_be_cleaned3.info(), df_to_be_cleaned2.info(), df_to_be_cleaned1.info())
# df_to_be_cleaned = pd.concat([df_to_be_cleaned1, df_to_be_cleaned2, df_to_be_cleaned3]).drop_duplicates()
# df_to_be_cleaned.shape

In [40]:
# Load the rows read from df_remaining.parquet and show their count / column
# count; the next cell copies this frame into df_to_be_cleaned.
df_remaining = pd.read_parquet(f"{RESULT_DIR}/df_remaining.parquet")
df_remaining.shape

(21, 5)

In [41]:
# Work on an independent deep copy so df_remaining itself stays untouched.
df_to_be_cleaned = df_remaining.copy(deep=True)

In [42]:

# For every domain, look up each row of df_to_be_cleaned in that domain's
# original data (matched on id + llm_type) and keep the id, the LLM type, the
# domain's prompt column and the column named by the row's prompting strategy.
to_be_recleaned_by_domain = {}

for _domain in DOMAINS:
    input_path = f"{ORIGINAL_DATA_DIR}/{_domain}_2800.json"
    _, prompt_key, _ = get_info_based_on_input_path(input_path)
    _df_all = load_dataframe_from_json(input_path)

    _domain_rows = df_to_be_cleaned[df_to_be_cleaned["domain"] == _domain]
    _tb_recleaned_temp = []
    for _, _item in tqdm(_domain_rows.iterrows()):
        _same_datapoint = (_df_all["id"] == _item.id) & (_df_all["llm_type"] == _item.llm_type)
        searched = _df_all[_same_datapoint]
        # Anything other than exactly one hit means the datapoint is ambiguous or missing.
        if len(searched) != 1:
            raise ValueError("Length should be exactly one. Datapoint is unclear.")
        _wanted_columns = ["id", "llm_type", prompt_key, _item.prompting_strategy]
        _tb_recleaned_temp.append(searched[_wanted_columns].iloc[0].to_dict())

    to_be_recleaned_by_domain[_domain] = pd.DataFrame(_tb_recleaned_temp)

prompts = df_to_be_cleaned.prompting_strategy.unique()

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
21it [00:00, 762.91it/s]


In [43]:
# to_be_recleaned_by_domain["arxiv"]

In [44]:
# Reshape each domain frame to long format: one row per (datapoint, prompting
# strategy) with the text to be re-cleaned in a single `text` column.
recleaned_v2 = {}
_prompt_names = set(prompts)

for _domain in DOMAINS:
    df = to_be_recleaned_by_domain[_domain]
    # Split the columns by membership while keeping df's own column order.
    # Passing sets to melt made the resulting column order depend on set
    # iteration order, i.e. potentially different between kernel sessions.
    prompting_columns = [col for col in df.columns if col in _prompt_names]
    other_columns = [col for col in df.columns if col not in _prompt_names]

    melted = df.reset_index().melt(
        id_vars=other_columns,  # preserve other columns
        value_vars=prompting_columns,
        var_name='prompting_strategy',
        value_name='text'
    )

    # A row only has text for the strategy it was selected for; drop the rest.
    melted = melted.dropna(subset=['text'])

    print(melted.shape)
    recleaned_v2[_domain] = melted

(0, 2)
(0, 2)
(0, 2)
(21, 5)


In [45]:
import logging

# Collect every batch returned by the API. limit=100 is the page size; the
# recorded run retrieved 443 batches, so iteration continues across pages.
# NOTE(review): the loop rebinds the module-level `batch`; the re-cleaning
# submission cell below sets `batch.status` on it, so the name is kept.
all_batches = []
for batch in client.batches.list(limit=100):
    all_batches.append(batch)

completed_files = []
skipped_lookups = 0

print(f"Retrieved {len(all_batches)} batches.")
for b in all_batches:
    try:
        _file = client.files.retrieve(b.input_file_id)
    except Exception as e:
        # Best effort: a batch whose input file cannot be retrieved is skipped,
        # but counted so the omission is visible instead of silent.
        skipped_lookups += 1
        logging.debug("Could not retrieve input file of batch %s: %s", b.id, e)
        continue
    if b.status == "completed":
        completed_files.append(_file.filename)

if skipped_lookups:
    print(f"Skipped {skipped_lookups} batches whose input file could not be retrieved.")

Retrieved 443 batches.


In [46]:
# Sanity check: number of input files with a completed batch, and the domain list.
len(completed_files), DOMAINS

(201, ['arxiv', 'xsum', 'writing_prompt', 'yelp_review'])

In [47]:
# Is the file name of the current `input_path` among the completed batch inputs?
# NOTE(review): on a top-to-bottom run `input_path` was last assigned in the
# per-domain loop above (an original-data "*_2800.json" path), so this looks
# like a leftover debugging check — confirm before relying on its result.
input_path.split("/")[-1] in completed_files

False

In [48]:
# Display every collected file name (201 entries in the recorded run).
completed_files

['yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning.jsonl',
 'yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_human_0_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_6_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_5_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_4_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_3_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_2_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_1_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning.json

In [49]:
# Mark the leftover `batch` as completed so the first chunk is submitted
# without waiting.
# NOTE(review): this mutates whatever `batch` object earlier cells left behind.
batch.status = "completed"

model: str = "gpt-4.1-mini-2025-04-14"
# Number of "failed" observations after which the run gives up on a batch.
MAX_BATCH_FAILURES = 3
# Input file and description of the batch currently being waited on. They are
# tracked separately from `input_path`, which is reassigned for every chunk —
# including chunks that are skipped and therefore never written to disk.
submitted_path = None
submitted_description = None
abort_run = False

for _domain in DOMAINS:
    if abort_run:
        break
    df_domain = recleaned_v2[_domain]
    for task in tqdm(df_domain["prompting_strategy"].unique()):
        if abort_run:
            break
        # Rename on a fresh frame instead of in place on a filtered slice.
        df_task = df_domain[df_domain["prompting_strategy"] == task].rename(columns={"text": task})

        # Chunks of 25 rows; `start` / `end` are only used for the progress print.
        # NOTE(review): presumably generate_jsonl_file(part=i) slices the same
        # 25-row window — confirm.
        for i in range((len(df_task) - 1) // 25 + 1):
            start = i * 25
            end = (i + 1) * 25
            print(_domain, task, start, ":", end)

            # --- 1. Wait for the previously submitted batch -------------------
            fail_counter = 0
            while batch.status != "completed":
                if batch.status == "failed":
                    print(batch)
                    fail_counter += 1
                    if fail_counter == MAX_BATCH_FAILURES:
                        break
                    # Back off, then re-submit the file of the batch that failed.
                    time.sleep(120)
                    with open(submitted_path, "rb") as batch_file:
                        batch_input_file = client.files.create(file=batch_file, purpose="batch")
                    print(batch_input_file)
                    batch_object = client.batches.create(
                        input_file_id=batch_input_file.id,
                        endpoint="/v1/responses",
                        completion_window="24h",
                        metadata={"description": submitted_description},
                    )
                    batch = client.batches.retrieve(batch_object.id)
                try:
                    print(batch.request_counts)
                except Exception:
                    print("Sleeping... ")
                time.sleep(60)
                # Poll via the batch's own id; `batch_object` may not exist yet.
                batch = client.batches.retrieve(batch.id)

            # --- 2. Give up if the previous batch kept failing -----------------
            # The former `fail_counter == 3` check here was unreachable, and its
            # `break` would only have left the innermost loop.
            if batch.status == "failed":
                print(f"Batch {batch.id} failed {fail_counter} times; stopping the run.")
                abort_run = True
                break

            # --- 3. Build and submit this chunk unless it already completed ----
            input_path = f"{BASE_DIR}/results/{_domain}_{model}_batch_processing_re_cleaning_{task}_{i}_rerunning2.jsonl"

            if input_path.split("/")[-1] not in completed_files:
                generate_jsonl_file(df_task, part=i, domain=_domain, columns_to_be_cleaned=[task],
                                    output_path=input_path)

                submitted_path = input_path
                submitted_description = f"{_domain} data for recleaning"
                with open(submitted_path, "rb") as batch_file:
                    batch_input_file = client.files.create(file=batch_file, purpose="batch")
                print(batch_input_file)

                batch_object = client.batches.create(
                    input_file_id=batch_input_file.id,
                    endpoint="/v1/responses",
                    completion_window="24h",
                    metadata={"description": submitted_description},
                )

                batch = client.batches.retrieve(batch_object.id)
                time.sleep(20)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

yelp_review paraphrase_polish_llm 0 : 25
processing df between 0:25
FileObject(id='file-MTVXApWCYLk1Nm6WJy9grt', bytes=355835, created_at=1753428526, filename='yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning2.jsonl', object='file', purpose='batch', status='processed', expires_at=None, status_details=None)


100%|██████████| 1/1 [00:21<00:00, 21.39s/it]


In [50]:
# Display the list again after the submission loop; that loop only reads
# `completed_files`, it does not append to it.
completed_files

['yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning.jsonl',
 'yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_human_0_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_6_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_5_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_4_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_3_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_2_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_1_rerunning.jsonl',
 'writing_prompt_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning.json

In [68]:
# Fetch the batches via the project helper and hand each one to
# download_if_recleaning (the recorded output lists files written under
# results/re_cleaned).
# NOTE(review): the loop rebinds the module-level `batch` used by earlier cells.
batches = get_batches()
for batch in batches:
    download_if_recleaning(batch)

✔️ Downloaded: C:\Users\Phili\Desktop\Masterarbeit\Masterarbeit_Code\src\..//results//re_cleaned/yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_1_rerunning2.jsonl_output.jsonl
✔️ Downloaded: C:\Users\Phili\Desktop\Masterarbeit\Masterarbeit_Code\src\..//results//re_cleaned/yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning2.jsonl_output.jsonl
✔️ Downloaded: C:\Users\Phili\Desktop\Masterarbeit\Masterarbeit_Code\src\..//results//re_cleaned/yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_llm_0_rerunning.jsonl_output.jsonl
✔️ Downloaded: C:\Users\Phili\Desktop\Masterarbeit\Masterarbeit_Code\src\..//results//re_cleaned/yelp_review_gpt-4.1-mini-2025-04-14_batch_processing_re_cleaning_paraphrase_polish_human_0_rerunning.jsonl_output.jsonl
✔️ Downloaded: C:\Users\Phili\Desktop\Masterarbeit\Masterarbeit_Code\src\..//results//re_cleaned/writing_prompt_gpt-4.1-mini-2025-04-14_

In [69]:
TARGET_DIR = f"{RESULT_DIR}/re_cleaned"


def _first_output_text(response):
    """Return the text of the first content item of the first output entry."""
    return response["body"]["output"][0]["content"][0]["text"]


# Read every downloaded .jsonl output file and stack them into one frame.
jsonl_files = glob.glob(os.path.join(TARGET_DIR, "*.jsonl"))
df_list = list(map(read_files, jsonl_files))
df_combined = pd.concat(df_list, ignore_index=True)

# Replace the raw response payload with just the cleaned text.
df_combined["response"] = df_combined["response"].apply(_first_output_text)

# domain / id / prompting strategy are derived from custom_id further down by
# split_custom_id.
df_combined.head()

Unnamed: 0,id,custom_id,response,error
0,batch_req_686863673d708190b6811798a02a031f,arxiv_2101_direct_prompt_cleaned,We introduce a notion of noncommutative torus ...,
1,batch_req_68686367c2288190a02b32854e322f25,arxiv_2120_direct_prompt_cleaned,The application of the Hartle-Hawking wave fun...,
2,batch_req_6868636852a0819092a3b172c4cafec6,arxiv_2166_direct_prompt_cleaned,This paper reports the discovery of a new burs...,
3,batch_req_68686368e6688190b5003cf481ec93ef,arxiv_2182_direct_prompt_cleaned,Mykyta the Fox and networks of language explor...,
4,batch_req_686863696ed08190a934d114f6d51ab3,arxiv_2233_direct_prompt_cleaned,The self-duality constraint in supergravity ca...,


In [70]:
# Row / column count of the combined outputs.
df_combined.shape

(2731, 4)

In [71]:
# Display the combined frame (head and tail rows in the recorded output).
df_combined

Unnamed: 0,id,custom_id,response,error
0,batch_req_686863673d708190b6811798a02a031f,arxiv_2101_direct_prompt_cleaned,We introduce a notion of noncommutative torus ...,
1,batch_req_68686367c2288190a02b32854e322f25,arxiv_2120_direct_prompt_cleaned,The application of the Hartle-Hawking wave fun...,
2,batch_req_6868636852a0819092a3b172c4cafec6,arxiv_2166_direct_prompt_cleaned,This paper reports the discovery of a new burs...,
3,batch_req_68686368e6688190b5003cf481ec93ef,arxiv_2182_direct_prompt_cleaned,Mykyta the Fox and networks of language explor...,
4,batch_req_686863696ed08190a934d114f6d51ab3,arxiv_2233_direct_prompt_cleaned,The self-duality constraint in supergravity ca...,
...,...,...,...,...
2726,batch_req_686a7ca5784481909a8b1edbdb85eab2,yelp_review_2635_prompt_SICO_cleaned,"The garden here is absolutely stunning, but I ...",
2727,batch_req_686a7ca592548190b0d0ada52419eb5f,yelp_review_2748_prompt_SICO_cleaned,This is part of a set of reviews covering esta...,
2728,batch_req_686a7ca5ac488190977fa94b7a492b90,yelp_review_2760_prompt_SICO_cleaned,Stay away from this worst place. There are so ...,
2729,batch_req_686a7ca5c560819084d6ef08d0570cb8,yelp_review_2764_prompt_SICO_cleaned,This movie has an interesting plot. The acting...,


In [72]:
# Drop the `error` column on a new frame. errors="ignore" keeps the cell
# re-runnable: the in-place drop raised KeyError on a second execution.
df_combined = df_combined.drop(columns="error", errors="ignore")

In [73]:
def split_custom_id(input_id: str):
    """Split a custom id of the form ``<domain>_<id>_<prompt>[_cleaned]``.

    Example: ``"yelp_review_2635_prompt_SICO_cleaned"`` is split into
    ``("yelp_review", 2635, "prompt_SICO")``.

    Args:
        input_id: The custom id string to parse.

    Returns:
        A tuple ``(domain, id, prompt)`` where ``id`` is an ``int`` and the
        other two elements are strings.

    Raises:
        ValueError: If the string contains no run of 1-4 digits enclosed by
            underscores. The message includes the offending input.
    """
    original_id = input_id

    # Step 1: Remove '_cleaned' suffix if it exists
    suffix = '_cleaned'
    if input_id.endswith(suffix):
        input_id = input_id[:-len(suffix)]

    # Step 2: Find the ID (first run of 1-4 digits enclosed by underscores)
    match = re.search(r'_(\d{1,4})_', input_id)
    if match is None:
        # Report the offending value in the exception itself instead of
        # printing it to stdout right before raising.
        raise ValueError(
            f"No valid 1-4 digit ID found in the input string: {original_id!r}"
        )

    id_str = match.group(1)

    # Step 3: Domain is everything before the ID
    domain = input_id[:match.start()].rstrip('_')

    # Step 4: Prompt is everything after the ID
    prompt = input_id[match.end():].lstrip('_')

    return domain, int(id_str), prompt

# Parse every custom_id once; each element of `res` is a (domain, id, prompt) tuple.
res = df_combined["custom_id"].apply(split_custom_id)
parsed_columns = pd.DataFrame(res.tolist(), index=df_combined.index)

# Columns are assigned by position. Note that this overwrites the existing
# "id" column with the integer id parsed out of custom_id.
df_combined[["domain", "id", "prompt_strategy"]] = parsed_columns
df_combined

Unnamed: 0,id,custom_id,response,domain,prompt_strategy
0,2101,arxiv_2101_direct_prompt_cleaned,We introduce a notion of noncommutative torus ...,arxiv,direct_prompt
1,2120,arxiv_2120_direct_prompt_cleaned,The application of the Hartle-Hawking wave fun...,arxiv,direct_prompt
2,2166,arxiv_2166_direct_prompt_cleaned,This paper reports the discovery of a new burs...,arxiv,direct_prompt
3,2182,arxiv_2182_direct_prompt_cleaned,Mykyta the Fox and networks of language explor...,arxiv,direct_prompt
4,2233,arxiv_2233_direct_prompt_cleaned,The self-duality constraint in supergravity ca...,arxiv,direct_prompt
...,...,...,...,...,...
2726,2635,yelp_review_2635_prompt_SICO_cleaned,"The garden here is absolutely stunning, but I ...",yelp_review,prompt_SICO
2727,2748,yelp_review_2748_prompt_SICO_cleaned,This is part of a set of reviews covering esta...,yelp_review,prompt_SICO
2728,2760,yelp_review_2760_prompt_SICO_cleaned,Stay away from this worst place. There are so ...,yelp_review,prompt_SICO
2729,2764,yelp_review_2764_prompt_SICO_cleaned,This movie has an interesting plot. The acting...,yelp_review,prompt_SICO


In [74]:
# Stack the two "to be recleaned" request tables into one frame.
recleaned_request_paths = [
    "../../results/df_to_be_recleaned_all_wo_polishing.parquet",
    "../../results/df_to_be_recleaned_all_with_polishing_unique.parquet",
]
df_all_recleaned = pd.concat(
    [pd.read_parquet(request_path) for request_path in recleaned_request_paths]
)

In [75]:
# Remove a leftover merge indicator (if present) and exact duplicate rows.
df_all_recleaned = (
    df_all_recleaned
    .drop(columns="_merge", errors="ignore")
    .drop_duplicates()
)
df_all_recleaned.shape

(2490, 4)

In [76]:
# Show column names, non-null counts and dtypes of df_all_recleaned.
df_all_recleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2490 entries, 0 to 1609
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  2490 non-null   int64 
 1   domain              2490 non-null   object
 2   llm_type            2490 non-null   object
 3   prompting_strategy  2490 non-null   object
dtypes: int64(1), object(3)
memory usage: 97.3+ KB


In [77]:
# Align the column name with df_all_recleaned ("prompting_strategy") so it can be used as a merge key.
df_combined.rename(columns={"prompt_strategy": "prompting_strategy"}, inplace=True)

In [79]:
# Outer-join the received responses (left) with the requested rows (right);
# the `_merge` indicator column tells which side each key combination came from.
merge_keys = ["id", "domain", "prompting_strategy"]
df_not_in_combined = df_combined.loc[:, merge_keys].merge(
    df_all_recleaned.loc[:, merge_keys + ["llm_type"]],
    on=merge_keys,
    how='outer',
    suffixes=['', '_'],
    indicator=True,
)

df_not_in_combined["_merge"].value_counts()


_merge
both          2728
left_only        3
right_only       0
Name: count, dtype: int64

In [80]:
# Rows that exist only in df_all_recleaned (the right side of the merge),
# i.e. key combinations without a counterpart in df_combined.
is_right_only = df_not_in_combined["_merge"] == "right_only"
df_remaining = df_not_in_combined.loc[is_right_only]
df_remaining["prompting_strategy"].value_counts()

Series([], Name: count, dtype: int64)

In [65]:
# Persist the "right_only" rows (df_remaining) under RESULT_DIR.
df_remaining.to_parquet(f"{RESULT_DIR}/df_remaining.parquet")

In [87]:
# Frequency of each distinct response text (missing values are not counted by default).
df_combined["response"].value_counts()

response
REJECTION                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [89]:
# Responses consisting solely of one of these labels are replaced by None.
# NOTE(review): presumably these are sentinel answers returned by the recleaning
# prompt instead of a cleaned text - confirm against the prompt definition.
NON_TEXT_RESPONSES = ("REJECTION", "TASK_DESCRIPTION")


def _none_if_sentinel(response_text):
    """Return None for a sentinel label, otherwise the value unchanged."""
    if response_text in NON_TEXT_RESPONSES:
        return None
    return response_text


df_combined["response"] = df_combined["response"].apply(_none_if_sentinel)

response
None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [94]:
# Persist the recleaned responses; index=False leaves the row index out of the file.
df_combined.to_parquet(f"{RESULT_DIR}/recleaned_data.parquet", index=False)
# dropna=False so that missing (None) responses are counted as well.
df_combined["response"].value_counts(dropna=False)


response
None                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

# 5. Combine the newly generated recleaned dataframe with the regex-cleaned version

In [15]:
# Regex-cleaned (v3) parquet file of each domain.
path_writing = f'{REGEX_CLEANED_FILES}/writing_prompt_2800_cleaned_all_v3.parquet'
path_abstract = f'{REGEX_CLEANED_FILES}/arxiv_2800_cleaned_all_v3.parquet'
path_review = f'{REGEX_CLEANED_FILES}/yelp_review_2800_cleaned_all_v3.parquet'
path_xsum = f'{REGEX_CLEANED_FILES}/xsum_2800_cleaned_all_v3.parquet'

# Load the four frames in the same order as the paths above.
df_writing_cleaned, df_abstract_cleaned, df_review_cleaned, df_xsum_cleaned = (
    pd.read_parquet(parquet_path)
    for parquet_path in (path_writing, path_abstract, path_review, path_xsum)
)

# Work on deep copies keyed by domain label so the frames loaded above stay
# untouched and can be compared against the updated copies later.
regex_cleaned_by_domain = {
    "arxiv": df_abstract_cleaned,
    "writing_prompt": df_writing_cleaned,
    "yelp_review": df_review_cleaned,
    "xsum": df_xsum_cleaned,
}
domain_dfs = {
    domain_label: cleaned_df.copy(deep=True)
    for domain_label, cleaned_df in regex_cleaned_by_domain.items()
}

combined_recleaned_df = pd.read_parquet(f"{RESULT_DIR}/recleaned_data.parquet")

In [16]:
# Peek at the first rows of the recleaned responses loaded from recleaned_data.parquet.
combined_recleaned_df.head()

Unnamed: 0,id,custom_id,response,domain,prompting_strategy
0,1455,writing_prompt_1455_paraphrase_polish_llm_cleaned,The warm sunlight glistened off Thunder's dark...,writing_prompt,paraphrase_polish_llm
1,1494,writing_prompt_1494_paraphrase_polish_llm_cleaned,Creating believable characters is the backbone...,writing_prompt,paraphrase_polish_llm
2,1502,writing_prompt_1502_paraphrase_polish_llm_cleaned,"I dreaded attending the writing workshop, as u...",writing_prompt,paraphrase_polish_llm
3,1542,writing_prompt_1542_paraphrase_polish_llm_cleaned,"In January 2025, we accomplished our goal of e...",writing_prompt,paraphrase_polish_llm
4,1557,writing_prompt_1557_paraphrase_polish_llm_cleaned,Jake had always dreamed of playing professiona...,writing_prompt,paraphrase_polish_llm


In [17]:
# Show the first rows together with the distinct domain labels present in the frame.
combined_recleaned_df.head(), combined_recleaned_df.domain.unique()

(     id                                          custom_id  \
 0  1455  writing_prompt_1455_paraphrase_polish_llm_cleaned   
 1  1494  writing_prompt_1494_paraphrase_polish_llm_cleaned   
 2  1502  writing_prompt_1502_paraphrase_polish_llm_cleaned   
 3  1542  writing_prompt_1542_paraphrase_polish_llm_cleaned   
 4  1557  writing_prompt_1557_paraphrase_polish_llm_cleaned   
 
                                             response          domain  \
 0  The warm sunlight glistened off Thunder's dark...  writing_prompt   
 1  Creating believable characters is the backbone...  writing_prompt   
 2  I dreaded attending the writing workshop, as u...  writing_prompt   
 3  In January 2025, we accomplished our goal of e...  writing_prompt   
 4  Jake had always dreamed of playing professiona...  writing_prompt   
 
       prompting_strategy  
 0  paraphrase_polish_llm  
 1  paraphrase_polish_llm  
 2  paraphrase_polish_llm  
 3  paraphrase_polish_llm  
 4  paraphrase_polish_llm  ,
 array(['wr

In [18]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping the calls in print() only appended a spurious "None None None None".
for _cleaned_df in (df_writing_cleaned, df_xsum_cleaned, df_abstract_cleaned, df_review_cleaned):
    _cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2800 entries, 0 to 2799
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   id                                 2800 non-null   int64 
 1   story                              2800 non-null   object
 2   story_prompt                       2800 non-null   object
 3   direct_prompt                      2751 non-null   object
 4   llm_type                           2800 non-null   object
 5   domain                             2800 non-null   object
 6   paraphrase_polish_human            2539 non-null   object
 7   paraphrase_polish_llm              2769 non-null   object
 8   prompt_few_shot                    2327 non-null   object
 9   prompt_SICO                        2746 non-null   object
 10  adversarial_character_human        2800 non-null   object
 11  adversarial_character_llm          2800 non-null   object
 12  advers

In [19]:
# Use the "id" column as row index of every per-domain frame (the column itself
# is kept), so rows can be addressed as .loc[<id>, <column>].
for _df in domain_dfs.values():
    _df.index = _df["id"]

domain_dfs["arxiv"].head()

Unnamed: 0_level_0,id,title,abstract,direct_prompt,llm_type,domain,prompt_few_shot,prompt_SICO,paraphrase_polish_human,paraphrase_polish_llm,...,adversarial_character_human,adversarial_character_llm,adversarial_word_human,adversarial_word_llm,adversarial_character_word_human,adversarial_character_word_llm,paraphrase_back_translation_human,paraphrase_back_translation_llm,paraphrase_dipper_human,paraphrase_dipper_llm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,Calculation of prompt diphoton production cros...,A fully differential calculation in perturbati...,This study presents a comprehensive calculatio...,ChatGPT,arxiv,"In this study, we present a comprehensive calc...","In this study, we present a comprehensive calc...",This study presents a comprehensive and fully ...,This study presents a comprehensive calculatio...,...,A fully differential calCulation in perturbati...,This study presents a comprehensive calculatio...,A fully disparity calculation in perturbative ...,This study presents a full calculation of prom...,A fully differential calculation in perturbati...,This study рresents a comprehensive calculatio...,The calculation of the complete difference in ...,This study lists the comprehensive calculation...,The calculation includes all next-to-leading-o...,"To determine the cross sections, we use the mo..."
2,2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is describe...,The understanding of the dynamic evolution of ...,ChatGPT,arxiv,"In this study, we explore the evolutionary dyn...",The evolution of the Earth-Moon system has lon...,The evolution of the Earth-Moon system is addr...,The dynamic evolution of the Earth-Moon system...,...,The evolution of Earth-Moon system is describe...,The understanding of the dynamic evolution of ...,The evolution of Earth-Moon system is outlines...,The understanding of the dynamic evolution of ...,The evolution of Earth-Moon system is describe...,The understanding of the dynamic changing of t...,The evolution of the global system is describe...,The understanding of the dynamic evolution of ...,The closest distance of the Moon to the Earth ...,"In this study, we present a new approach to th..."
3,3,Bosonic characters of atomic Cooper pairs acro...,We study the two-particle wave function of pai...,This article investigates the bosonic characte...,ChatGPT,arxiv,We investigate the bosonic characteristics of ...,This article investigates the bosonic characte...,We investigate the characteristics of the two-...,This article delves into the examination of th...,...,We study the two-particle wave function of pai...,This article investigates the bosonic characte...,We study the two-particle wave function of pai...,This art investigates the bosonic characterist...,We study the two-particle wave function of pai...,This article investigates the bosonic characte...,We have studied two particle wave functions pa...,This article investigates the bone characteris...,The bosoniccharacter of the two-particle wave ...,The authors use a theoretical framework based ...
4,4,Polymer Quantum Mechanics and its Continuum Limit,A rather non-standard quantum representation o...,Polymer quantum mechanics emerges as a fascina...,ChatGPT,arxiv,Polymer quantum mechanics is a framework that ...,"In this article, we explore the fascinating re...","The polymer representation, a distinct quantum...",Polymer quantum mechanics presents an intrigui...,...,A rather non-standard quantum representaiton o...,Polymer quantum mechanics emreges as a fascina...,A rather non-standard quantum representation o...,Polymer quantum mechanics emerges as a fascina...,A rather non-standard quantum representation o...,Polymer quantum mechanics e merges as a fascin...,The quantum quantum quantum quantum quantum qu...,Polymerization quantum mechanics is a fascinat...,Thisapproach has been followed in a symmetric ...,This article delves into the study of polymer ...
5,5,Numerical solution of shock and ramp compressi...,A general formulation was developed to represe...,This study presents a numerical approach for s...,ChatGPT,arxiv,This study presents a numerical approach for s...,"In this study, we present a numerical approach...",This study presents a comprehensive formulatio...,This study presents a numerical approach for s...,...,A general formulation was developed to represe...,This study presents a numerical approach for s...,A general formulation was developed to represe...,This study presents a numerical approach for s...,A general formulation was developed to represe...,This study presents a numerical approach for s...,The general formula is developed to indicate t...,This study proposes a numerical method that co...,The numerical methods were found to be flexibl...,The proposed methodology combines a finite ele...


In [20]:
# Overwrite the regex-cleaned text with the recleaned response, cell by cell:
# row = id, column = prompting strategy.
# NOTE(review): .loc assignment silently adds a row/column if the id or the
# strategy name does not exist in the frame yet - verify the keys match.
for _domain, _df in domain_dfs.items():
    is_current_domain = combined_recleaned_df["domain"] == _domain
    combined_recleaned_df_temp = combined_recleaned_df[is_current_domain]
    for record in combined_recleaned_df_temp.itertuples(index=False):
        _df.loc[record.id, record.prompting_strategy] = record.response
        

In [21]:
# Index the untouched regex-cleaned xsum frame by id so a single example can be looked up by id.
df_xsum_cleaned.index = df_xsum_cleaned.id
# Regex-cleaned text of one example, for comparison with the entry in domain_dfs["xsum"].
df_xsum_cleaned.loc[2768, "prompt_SICO"]

": Sure, here's a news article based on the summary:**Title: North Korea Launches Ballistic Missile into the Sea****SEO Description: North Korea fired a ballistic missile into the sea on Wednesday, South Korean and US officials reported. The projectile was launched from the country's east coast, raising concerns about regional security.****SEO Keywords**: North Korea, South Korea, United States, missile launch, regional security.**Body**:**In a show of force, North Korea launched a ballistic missile into the sea on Wednesday, prompting swift responses from South Korea and the United States.**South Korean and US officials confirmed the launch, with reports indicating that the missile was fired from the country's east coast.**The missile launch marks the latest in a series of escalating tensions in the region, following previous missile tests by North Korea and joint military exercises by South Korea and the United States.**The international community is closely monitoring the situation,

In [22]:
# Same cell (id 2768, prompt_SICO) taken from the per-domain working copy in domain_dfs.
domain_dfs["xsum"].loc[2768, "prompt_SICO"]

"Title: North Korea Launches Ballistic Missile into the Sea\n\nSEO Description: North Korea fired a ballistic missile into the sea on Wednesday, South Korean and US officials reported. The projectile was launched from the country's east coast, raising concerns about regional security.\n\nSEO Keywords: North Korea, South Korea, United States, missile launch, regional security.\n\nBody:\n\nIn a show of force, North Korea launched a ballistic missile into the sea on Wednesday, prompting swift responses from South Korea and the United States.\n\nSouth Korean and US officials confirmed the launch, with reports indicating that the missile was fired from the country's east coast.\n\nThe missile launch marks the latest in a series of escalating tensions in the region, following previous missile tests by North Korea and joint military exercises by South Korea and the United States.\n\nThe international community is closely monitoring the situation, urging North Korea to refrain from further act

In [23]:
# Spot-check a few arxiv ids: regex-cleaned text vs. the text stored in domain_dfs.
df_abstract_cleaned.index = df_abstract_cleaned["id"]
for i in (2101, 2120, 2166, 2182):
    regex_cleaned_text = df_abstract_cleaned.loc[i, "direct_prompt"]
    print("\n\nCLEANED", regex_cleaned_text)
    recleaned_text = domain_dfs["arxiv"].loc[i, "direct_prompt"]
    print("\nRECLEANED", recleaned_text)



CLEANED We conclude by outlining some possible directions for future research.

RECLEANED We introduce a notion of noncommutative torus and establish a Riemann-Hilbert correspondence for these spaces. This correspondence allows us to study the representation theory of the corresponding noncommutative torus. We show that the category of representations of the noncommutative torus is equivalent to the category of perverse sheaves on the classical torus. We also construct a noncommutative analogue of the Riemann-Hilbert transform and use it to study the asymptotics of the characters of the representations of the noncommutative torus. Finally, we discuss the relationship between our work and the theory of noncommutative geometry. We explain how our results can be used to study the geometry of noncommutative spaces. We conclude by outlining some possible directions for future research.


CLEANED Are there any specific instructions or revisions you would like me to make to this draft?

REC

In [24]:
# Output locations of the recleaned per-domain frames (these reuse the path_* names).
path_writing = f'{RECLEANED_FILES_DIR}/writing_prompt_2800_recleaned.parquet'
path_abstract = f'{RECLEANED_FILES_DIR}/arxiv_2800_recleaned.parquet'
path_review = f'{RECLEANED_FILES_DIR}/yelp_review_2800_recleaned.parquet'
path_xsum = f'{RECLEANED_FILES_DIR}/xsum_2800_recleaned.parquet'

# NOTE(review): the frames are written with their current index - confirm that
# downstream readers expect the index to be stored alongside the columns.
output_path_by_domain = (
    ("writing_prompt", path_writing),
    ("arxiv", path_abstract),
    ("yelp_review", path_review),
    ("xsum", path_xsum),
)
for domain_key, output_path in output_path_by_domain:
    domain_dfs[domain_key].to_parquet(output_path)