# for AdvBench dataset

In [None]:
import json
import re
from rouge import Rouge
from tqdm import tqdm
from openai import OpenAI

# Load the two inputs of the pipeline:
#   reject_data - records whose 'text' field is the original text to rewrite
#                 (read as s['text'] / o['text'] by the prompt builders);
#   content     - strings to be woven into the rewrites (passed to str.replace).
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open('../harmfulness/data/unlearning_data_AdvBench_3B.json', 'r', encoding='utf-8') as file:
    reject_data = json.load(file)
with open("FakeNews_sentences_sampeld.json", "r", encoding='utf-8') as file:
    content = json.load(file)

In [None]:
# Chat-completions client used by run_initial, run_eval and run_rewrite.
# Replace the placeholder with a real key before running.
client = OpenAI(api_key="[YOUR API KEY]")

In [None]:
# Prompts used
# System and user prompts for the first rewriting round. initial_prompt()
# substitutes the [ORIGINAL] and [CONTENT] placeholders of prompt_initial, and
# extract_data_content() later reads the reply from between the
# [DATA] ... [/DATA] markers requested here.
# NOTE(review): the user prompt says "[DATA] an [/DATA]" ("an" where "and"
# is presumably meant). It is left untouched because the wording is part of
# what is sent to the model - confirm whether it should be corrected.
prompt_initial_system = """
You are an expert text rewriter tasked with creating natural synthetic data for given requests.
"""
prompt_initial = """
Original Text: [ORIGINAL]

Please create a synthetic data that maintains the semantics of the original text while naturally and briefly incorporating the specified content:
[CONTENT].

Ensure that the semantic similarity between the original text and synthetic data is high.
Enclose your synthetic data between [DATA] an [/DATA] tokens. 

Here is an example:
[DATA]
SYNTHETIC DATA HERE
[/DATA]
"""

# System and user prompts for the coherence evaluation. eval_prompt()
# substitutes the [SYNTHETIC] placeholder; the reply is expected to carry
# "[SEMANTIC] <score>" and "[SYNTACTIC] <score>" lines, which
# extract_eval_scores() parses with regular expressions.
prompt_evaluate_system = """
You are an expert text evaluator tasked with evaluating synthetic data for semantic and syntactic coherence.
"""
prompt_evaluate = """
Evaluate the synthetic data provided to determine:
1) Its degree of semantic coherence, which assesses how logically the ideas are connected.
2) Its degree of syntactic coherence, which evaluates how grammatically correct and structurally sound the text is.

Please score the semantic and syntactic coherence of the synthetic data on a scale from 0.00 (no coherence) to 1.00 (perfect coherence).

Only give the floating score; other mention or explanation is not allowed.

Synthetic Data for Evaluation:
[SYNTHETIC]

Your output should follow this format:
[SEMANTIC] YOUR SCORE ONLY
[SYNTACTIC] YOUR SCORE ONLY

Here is an example:
[SEMANTIC] 0.55
[SYNTACTIC] 0.55
"""

# System and user prompts for the iterative rewriting rounds.
# rewrite_prompt() substitutes [ORIGINAL] and [CONTENT] in prompt_rewrite;
# run_rewrite() then appends one "Synthetic Data_k ... scores" entry per
# earlier round, followed by prompt_rewrite2 (the output-format instructions).
# The formula quoted to the model, S = 2*S_ori+S_con+0.5*(S_sem+S_syn), is the
# same expression eval_process() uses to compute the overall score.
prompt_rewrite_system = """
You are an expert text rewriting optimizer tasked with rewriting natural synthetic data to maximize the given score.
"""
prompt_rewrite = """
Rewrite the synthetic data to maximize its overall score, favoring substitutions and reorderings over additions.

Overall score S is calculated by the formula: S = 2*S_ori+S_con+0.5*(S_sem+S_syn).

Where:
- S_ori refers to the Rouge-L Recall between the original text and the synthetic text, measuring how much of the original content is captured in the synthetic text.
- S_con refers to the Rouge-L Recall between the content incorporated into the synthetic data and the synthetic text, assessing how well the new content is integrated.
- S_sem refers to the semantic coherence of the synthetic text, scaled from 0.00 (no coherence) to 1.00 (perfect coherence), assessing how logically the ideas are connected.
- S_syn refers to the syntactic coherence of the synthetic text, scaled from 0.00 (no coherence) to 1.00 (perfect coherence), assessing how grammatically correct and structurally sound the text is.

Here are the original text, incorporated content, past synthetic data, and their scores:

Original Text: [ORIGINAL]

Incorporated Content: [CONTENT]

"""
# Appended by run_rewrite() after the per-round history.
prompt_rewrite2 = """
Enclose your synthetic data between [DATA] and [/DATA] tokens. 

Here is an example:
[DATA]
SYNTHETIC DATA HERE
[/DATA]
"""

In [None]:
def initial_prompt(current_data, content):
    """Build one first-round rewriting prompt per (record, content) pair.

    Args:
        current_data: iterable of mappings; each item's ``'text'`` value
            replaces the ``[ORIGINAL]`` placeholder.
        content: iterable of strings; each replaces ``[CONTENT]``.

    Returns:
        A list of prompts built from the module-level ``prompt_initial``
        template. The inputs are paired with ``zip``, so the list is as long
        as the shorter input.
    """
    return [
        prompt_initial.replace('[ORIGINAL]', record['text']).replace('[CONTENT]', snippet)
        for record, snippet in zip(current_data, content)
    ]
    
def run_initial(prompts_initial):
    """Send every first-round prompt to the chat model and collect the replies.

    Args:
        prompts_initial: iterable of user-prompt strings.

    Returns:
        A list with the message content of the first choice of each
        completion, in input order.
    """
    print('run initial rewriting')
    replies_initial = []
    for user_prompt in tqdm(prompts_initial):
        messages = [
            {"role": "system", "content": prompt_initial_system},
            {"role": "user", "content": user_prompt},
        ]
        response = client.chat.completions.create(
            messages=messages,
            model="gpt-4o-2024-05-13",
        )
        replies_initial.append(response.choices[0].message.content)
    return replies_initial

def eval_prompt(initial_rewritten_data):
    """Build one coherence-evaluation prompt per rewritten record.

    Args:
        initial_rewritten_data: iterable of mappings; each item's ``'text'``
            value replaces the ``[SYNTHETIC]`` placeholder of the
            module-level ``prompt_evaluate`` template.

    Returns:
        A list of prompt strings, in input order.
    """
    return [
        prompt_evaluate.replace('[SYNTHETIC]', record['text'])
        for record in initial_rewritten_data
    ]
    
def run_eval(prompts_eval):
    """Send every evaluation prompt to the chat model and collect the replies.

    Args:
        prompts_eval: iterable of user-prompt strings.

    Returns:
        A list with the message content of the first choice of each
        completion, in input order.
    """
    print('run eval')
    replies_eval = []
    for user_prompt in tqdm(prompts_eval):
        messages = [
            {"role": "system", "content": prompt_evaluate_system},
            {"role": "user", "content": user_prompt},
        ]
        response = client.chat.completions.create(
            messages=messages,
            model="gpt-4o-2024-05-13",
        )
        replies_eval.append(response.choices[0].message.content)
    return replies_eval

def rewrite_prompt(original, content):
    """Build the base prompt of a later rewriting round for every sample.

    Args:
        original: iterable of mappings; each item's ``'text'`` value replaces
            the ``[ORIGINAL]`` placeholder.
        content: iterable of strings; each replaces ``[CONTENT]``.

    Returns:
        A list of prompts built from the module-level ``prompt_rewrite``
        template. The inputs are paired with ``zip``, so the list is as long
        as the shorter input.
    """
    return [
        prompt_rewrite.replace('[ORIGINAL]', record['text']).replace('[CONTENT]', snippet)
        for record, snippet in zip(original, content)
    ]
    
def run_rewrite(prompts_rewrite, results_dict, num):
    """Request a new rewrite per sample, showing the model all earlier rounds.

    Args:
        prompts_rewrite: list of base prompts, one per sample.
        results_dict: mapping from round number (starting at 1) to a list of
            ``(data, s, s_ori, s_con, s_sem, s_syn)`` tuples, indexed by
            sample position.
        num: number of the round being produced; rounds ``1 .. num-1`` are
            appended to each prompt as scored history.

    Returns:
        A list with the message content of the first choice of each
        completion, in input order.
    """
    replies_rewrite = []
    progress = tqdm(enumerate(prompts_rewrite), total=len(prompts_rewrite))
    for ind, base_prompt in progress:
        # One history entry per earlier round, oldest first.
        history = []
        for round_no in range(1, num):
            data, s, s_ori, s_con, s_sem, s_syn = results_dict[round_no][ind]
            history.append(
                f"Synthetic Data_{round_no}: {data['text']}\n S:{s}\n S_ori:{s_ori}\n S_con:{s_con}\n S_sem:{s_sem}\n S_syn:{s_syn}\n"
            )
        user_prompt = base_prompt + "".join(history) + prompt_rewrite2
        response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": prompt_rewrite_system},
                {"role": "user", "content": user_prompt},
            ],
            model="gpt-4o-2024-05-13",
        )
        replies_rewrite.append(response.choices[0].message.content)
    return replies_rewrite

In [None]:
def extract_data_content(text):
    """Return the text enclosed by the first ``[DATA] ... [/DATA]`` pair.

    Args:
        text: reply string to search; the enclosed text may span lines.

    Returns:
        The enclosed text with surrounding whitespace stripped, or the
        literal string ``"No data found"`` when no pair is present.
    """
    found = re.search(r'\[DATA\](.*?)\[/DATA\]', text, flags=re.DOTALL)
    if found is None:
        return "No data found"
    return found.group(1).strip()
def data_parsing(replies):
    """Wrap the ``[DATA]`` payload of every reply in a ``{'text': ...}`` dict.

    Args:
        replies: iterable of raw reply strings.

    Returns:
        A list of ``{'text': extract_data_content(reply)}`` dicts, in input
        order.
    """
    return [{'text': extract_data_content(reply)} for reply in replies]
        
def calculate_similarity(list1, list2, list3):
    """Score the texts of ``list1`` against two reference lists.

    Each score is the ``['rouge-l']['r']`` value returned by
    ``Rouge.get_scores`` with the ``list1`` text as hypothesis and the other
    text as reference.

    Args:
        list1: iterable of mappings with a ``'text'`` key (hypotheses).
        list2: iterable of strings (first set of references).
        list3: iterable of mappings with a ``'text'`` key (second set of
            references).

    Returns:
        ``(scores_vs_list2, scores_vs_list3)``; the caller names these
        ``sim_con`` and ``sim_ori``. Pairs are formed with ``zip``, so each
        list is as long as the shorter of its two inputs.
    """
    hypotheses = [item['text'] for item in list1]
    first_refs = list(list2)
    second_refs = [item['text'] for item in list3]

    scorer = Rouge()

    def recall(hyp, ref):
        # get_scores returns one result dict per (hyp, ref) pair.
        return scorer.get_scores(hyp, ref)[0]['rouge-l']['r']

    scores_vs_list2 = [recall(hyp, ref) for hyp, ref in zip(hypotheses, first_refs)]
    scores_vs_list3 = [recall(hyp, ref) for hyp, ref in zip(hypotheses, second_refs)]

    return scores_vs_list2, scores_vs_list3  # sim_con, sim_ori

def extract_eval_scores(replies_eval):
    """Parse the ``[SEMANTIC]`` and ``[SYNTACTIC]`` scores out of each reply.

    Scores may be written with or without a fractional part (``0.55``, ``1``,
    ``0``) or with a bare leading dot (``.5``), so a reply of ``1`` for
    perfect coherence is no longer treated as missing.

    Args:
        replies_eval: iterable of evaluator reply strings.

    Returns:
        ``(semantic_scores, syntactic_scores)``: two lists aligned with the
        input. An entry is a ``float``, or ``None`` when the tag or a number
        after it is not found (the reply index and a message are printed).
    """
    number = r'(\d+(?:\.\d+)?|\.\d+)'
    semantic_pattern = re.compile(r'\[SEMANTIC\]\s*' + number)
    syntactic_pattern = re.compile(r'\[SYNTACTIC\]\s*' + number)

    semantic_scores, syntactic_scores = [], []
    # Same extraction for both tags; semantic is handled first for each reply
    # so diagnostics keep their original order.
    targets = (
        ("SEMANTIC", semantic_pattern, semantic_scores),
        ("SYNTACTIC", syntactic_pattern, syntactic_scores),
    )

    for ind, reply in enumerate(replies_eval):
        for label, pattern, scores in targets:
            match = pattern.search(reply)
            if match:
                scores.append(float(match.group(1)))
            else:
                print(ind)
                print(f"{label} score not found or invalid format in: {reply}")
                scores.append(None)

    return semantic_scores, syntactic_scores

In [None]:
def rewrite_process(reject_data, results_dict, iter_):
    """Produce the rewrites of round ``iter_`` for every record.

    Round 1 uses the initial prompt; later rounds use the rewrite prompt,
    which is extended with the scored attempts stored in ``results_dict``.
    The content to incorporate is read from the module-level ``content``.

    Args:
        reject_data: records whose ``'text'`` is the original text.
        results_dict: per-round results of earlier rounds (used when
            ``iter_`` is not 1).
        iter_: round number, starting at 1.

    Returns:
        A list of ``{'text': ...}`` dicts parsed from the model replies.
    """
    if iter_ == 1:
        # Initial rewrite: no history to show yet.
        raw_replies = run_initial(initial_prompt(reject_data, content))
    else:
        raw_replies = run_rewrite(rewrite_prompt(reject_data, content), results_dict, iter_)
    return data_parsing(raw_replies)

In [None]:
def eval_process(replies, reject_data, results_dict, iter_):
    """Score the rewrites of round ``iter_`` and record them in ``results_dict``.

    The overall score is ``S = 2*S_ori + S_con + 0.5*(S_sem + S_syn)``, where
    S_ori / S_con come from ``calculate_similarity`` (against ``reject_data``
    and the module-level ``content``) and S_sem / S_syn from the evaluator
    replies.

    Args:
        replies: list of ``{'text': ...}`` rewrites to score.
        reject_data: records holding the original texts.
        results_dict: mapping from round number to a list of result tuples;
            updated in place.
        iter_: round number under which the results are stored.

    Returns:
        ``results_dict``, with one ``(data, s, s_ori, s_con, s_sem, s_syn)``
        tuple appended per rewrite under ``iter_``.
    """
    replies_eval = run_eval(eval_prompt(replies))
    semantic_scores, syntactic_scores = extract_eval_scores(replies_eval)
    sim_con, sim_ori = calculate_similarity(replies, content, reject_data)
    # setdefault: a round that was not pre-seeded gets a fresh list instead
    # of raising KeyError.
    round_results = results_dict.setdefault(iter_, [])
    for data, s_ori, s_con, s_sem, s_syn in zip(replies, sim_ori, sim_con, semantic_scores, syntactic_scores):
        # extract_eval_scores yields None for an unparseable reply. Count it
        # as 0.0 in S so one bad reply does not abort the round with a
        # TypeError; the stored component stays None so the gap is visible.
        sem = 0.0 if s_sem is None else s_sem
        syn = 0.0 if s_syn is None else s_syn
        s = 2*s_ori+s_con+0.5*(sem+syn)
        round_results.append((data, s, s_ori, s_con, s_sem, s_syn))
    return results_dict

In [None]:
# Run the rewrite -> evaluate cycle for max_iters rounds.
max_iters=4
# One result list per round, keyed 1..max_iters. Built from max_iters so the
# two cannot drift apart when the round count is changed.
results_dict = {round_no: [] for round_no in range(1, max_iters+1)}
for k in range(1, max_iters+1):
    print(f"ITER:{k}")
    replies_current = rewrite_process(reject_data, results_dict, k)
    results_dict = eval_process(replies_current, reject_data, results_dict, k)

In [None]:
# Persist the per-round (data, S, S_ori, S_con, S_sem, S_syn) records.
# json.dump writes the integer round keys as strings.
import os

results_path = './agents/results_dict_FN.json'
# Create the output directory first so a missing folder cannot discard the
# results of the API rounds above.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, 'w') as file:
    json.dump(results_dict, file)

In [None]:
# For every sample keep the rewrite with the highest overall score S.
highest_score_texts = [""]*len(reject_data)
highest_scores = [0]*len(reject_data)
# Iterate the rounds actually stored rather than a hard-coded count, so this
# stays correct when the number of rounds changes.
for round_key in sorted(results_dict):
    for ind, d in enumerate(results_dict[round_key]):
        data, s, s_ori, s_con, s_sem, s_syn = d
        # <= lets a later round win ties.
        if highest_scores[ind] <= s:
            highest_scores[ind] = s
            highest_score_texts[ind] = data

In [None]:
# Save the best-scoring rewrite of every sample as a JSON list.
with open('./agents/GPT_results_score_FN.json', 'w') as file:
    file.write(json.dumps(highest_score_texts))