## 0. Environment Setup

In [4]:
!pip install -q openai

In [None]:
from openai import OpenAI
import locale
locale.getlocale()  # return value is not used
# Replace locale.getpreferredencoding with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround for hosted runtimes whose default locale is not
# UTF-8 — confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"
import os
import pandas as pd
import time
import copy
import re
import csv
from tqdm.auto import tqdm

# Prefer the OPENAI_API_KEY environment variable so a real secret never has to be saved in
# the notebook; the original placeholder remains as the fallback value.
gpt_key = os.environ.get('OPENAI_API_KEY', 'YOUR API KEY')

# Input folders holding the toxic-sentence CSV and the neutral-comment CSV.
GEN_TOXIC_PATH = "./dataset/toxic sentences"
NEUTRAL_PATH = "./scraped"

df_toxic = pd.read_csv(f'{GEN_TOXIC_PATH}/toxic sentences.csv')
df_neutral = pd.read_csv(f'{NEUTRAL_PATH}/neutral.csv')

# Row window [idx_start, idx_finish) handled by the filtering loops; `num` is its length.
idx_start = 0
idx_finish = len(df_toxic)
num = idx_finish - idx_start

# Both frames are indexed with the same row number further down, so a shorter neutral file
# would otherwise only surface as a KeyError in the middle of the API loop.
if len(df_neutral) < idx_finish:
    raise ValueError(
        f"neutral.csv has {len(df_neutral)} rows but rows up to {idx_finish} are requested; "
        "the neutral and toxic files must line up row by row."
    )

# Output folders for the two filtering stages (created when missing).
PAIR_CONSISTENCY_PATH = "./dataset/filtering/pair consistency"
IMPLICIT_O_PATH = "./dataset/filtering/implicit o"
os.makedirs(PAIR_CONSISTENCY_PATH, exist_ok=True)
os.makedirs(IMPLICIT_O_PATH, exist_ok=True)

## 1. Pair consistency filtering

In [16]:
# Model identifier used as the default for every request sent through `get_completion`.
model = "gpt-4-turbo-2024-04-09"

# Shared API client, authenticated with `gpt_key`.
client = OpenAI(api_key=gpt_key)

def get_completion(prompt, model=model, temperature=0.5):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the only "user" message of the chat request.
        model: Chat model identifier; defaults to the module-level `model`.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.5,
            the value that was previously hard-coded in this function.

    Returns:
        The content of the first returned choice, with surrounding whitespace stripped.

    Raises:
        Any exception raised by `client.chat.completions.create` propagates to the caller.
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content.strip()

def extract_relations(answer_list):
    """Pull the word that follows "Context " out of every answer.

    Matching ignores case and the captured labels are lower-cased, so replies such as
    "Context Preserved" or "context preserved" yield 'preserved' just like the canonical
    "Context preserved".

    Args:
        answer_list: Iterable of answer strings.

    Returns:
        A list with one entry per answer; each entry is the list of labels found in that
        answer, in order of appearance (empty when nothing matched).
    """
    pattern = re.compile(r'Context (\w+)', re.IGNORECASE)
    return [[label.lower() for label in pattern.findall(ans)] for ans in answer_list]

In [18]:
gpt_answers = []
skip = []


def _save_relations(answers, csv_path):
    """Parse `answers` with `extract_relations`, write one CSV row per answer, return the rows."""
    relations = extract_relations(answers)
    # Explicit encoding so the file does not depend on the platform default.
    with open(csv_path, "w", newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file).writerows(relations)
    return relations


# One request per row in [idx_start, idx_start + num): the neutral comment is sentence A and
# five of the toxic columns are the candidate B sentences.
for idx in tqdm(range(idx_start, idx_start + num)):

    prompt = f"""
    You're going to determine whether the relationship between sentences A and B reflects context preservation or a context shift.

    - Context preservation: The overall meaning of both sentences is nearly the same, with only a difference in tone or level of toxicity.
    - Context shift: Sentence B either responds to or criticizes the content or opinion expressed in sentence A.

    <Example - Context preservation>
    A: i know his wife makes pretty good money as well
    B: His Onlyfans chick must be raking it in. Bet she's a top earner, no cap.

    <Example - Context shift>
    A: i was nervous for no reason
    B: That nervous lol? Good little bitch, now go jerk off to the cat that rattled you.

    You'll be given 5 type B sentences. For each one, determine the relationship and print your answer in the following format such as
    1. Context preserved
    2. Context shifted
    (Skip the explanation process.)

    A: {df_neutral['comments'][idx]}

    B:
    1. {df_toxic['0'][idx]}
    2. {df_toxic['3'][idx]}
    3. {df_toxic['5'][idx]}
    4. {df_toxic['7'][idx]}
    5. {df_toxic['9'][idx]}

    <Output>
    """

    try:
        print(idx + 1)
        response = get_completion(prompt)
        print(response)
        gpt_answers.append(response)

    except Exception as e:
        print(f"⚠️ Error at index {idx}: {e}")
        # A failed row gets no entry in gpt_answers; only its index is recorded in `skip`.
        skip.append(idx)

        # Snapshot of everything collected so far, written next to the final output.
        extracted_relations = _save_relations(
            gpt_answers, f"{PAIR_CONSISTENCY_PATH}/exception_{idx}.csv"
        )

# save llm responses
extracted_relations = _save_relations(
    gpt_answers, f"{PAIR_CONSISTENCY_PATH}/pair_consistency.csv"
)

print("❌ Skipped indices:", skip)

  0%|          | 0/10 [00:00<?, ?it/s]

1
1. Context preserved
2. Context shifted
3. Context shifted
4. Context preserved
5. Context preserved
2
1. Context preserved
2. Context shifted
3. Context shifted
4. Context shifted
5. Context preserved
3
1. Context preserved
2. Context shifted
3. Context shifted
4. Context shifted
5. Context shifted
4
1. Context shifted
2. Context shifted
3. Context shifted
4. Context shifted
5. Context shifted
5
1. Context shifted
2. Context shifted
3. Context shifted
4. Context preserved
5. Context preserved
6
1. Context shifted
2. Context preserved
3. Context shifted
4. Context shifted
5. Context preserved
7
1. Context preserved
2. Context preserved
3. Context preserved
4. Context shifted
5. Context preserved
8
1. Context preserved
2. Context shifted
3. Context preserved
4. Context shifted
5. Context preserved
9
1. Context shifted
2. Context shifted
3. Context shifted
4. Context shifted
5. Context shifted
10
1. Context preserved
2. Context shifted
3. Context shifted
4. Context shifted
5. Context s

In [20]:
# save filtered toxic sentences

valid = []
cols = [0, 3, 5, 7, 9]  # RAG-n column name

# Rows whose request failed have no entry in extracted_relations, so that list is walked
# with its own cursor instead of the row number (same scheme as the stage-2 merge cell).
skipped = set(skip)
idx_relation = 0

for row in range(idx_start, idx_start + num):
    if row in skipped:
        continue
    relations = extracted_relations[idx_relation]
    idx_relation += 1

    if len(relations) != len(cols):
        print(f"⚠️ Row {row}: expected {len(cols)} relations, got {len(relations)}")

    # zip stops at the shorter sequence, so surplus labels can never index past `cols`.
    for col, relation in zip(cols, relations):
        if relation == 'preserved':
            valid.append([
                df_neutral['comments'][row],
                df_toxic[str(col)][row]
            ])

df_filtered1_toxic = pd.DataFrame(valid, columns=['input', 'response'])
# NOTE(review): "vaild" looks like a typo of "valid"; the spelling is kept because the
# commented-out reload line in the following cell uses the same file name.
df_filtered1_toxic.to_csv(f'{PAIR_CONSISTENCY_PATH}/filter1_vaild.csv', index=False)

## 2. Implicit offensiveness filtering


In [21]:
# Commented-out alternative: load the stage-1 result from disk.
# df_filtered1_toxic = pd.read_csv(f'{PAIR_CONSISTENCY_PATH}/filter1_vaild.csv')

# Re-point the processing window at every row of df_filtered1_toxic.
idx_start, idx_finish = 0, len(df_filtered1_toxic)
num = idx_finish - idx_start

Unnamed: 0,input,response
0,we tried to scare them away but they almost at...,I’ve started actively giving them looks of con...
1,we tried to scare them away but they almost at...,We need to have Orkin start working on this. W...
2,we tried to scare them away but they almost at...,I’ve started actively giving them looks of con...
3,have you tried to look into other providers to...,Have you considered not being a mindless consu...
4,have you tried to look into other providers to...,"Even if you don't have premium, buy a month an..."
...,...,...
816,i promise if you ever let me out i will wave,Send in another wave.
817,i promise if you ever let me out i will wave,"i promise if you ever let me out, I'll send in..."
818,my family has shunned me for terrible choices ...,They still hate you. Can't you really easily s...
819,my family has shunned me for terrible choices ...,Guess my family thinks I'm a lost cause. They ...


In [23]:
# Stage-2 state: raw model answers in request order, plus the indices whose request failed.
gpt_answers = []
skip = []

row_ids = range(idx_start, idx_start + num)
for idx in tqdm(row_ids):
    sentence_a = df_filtered1_toxic['input'][idx]
    sentence_b = df_filtered1_toxic['response'][idx]

    # NOTE(review): the task line asks about "the A sentence", yet the inline example labels
    # the reply rather than A — confirm whether the B sentence is the intended target.
    prompt = f"""
    # Task
    you're going to determine whether any below components are present in the A sentence.

    ## components
    1. a tone that disparages or criticizes based on gender, race, politics, etc.
    2. slur that is difficult to understand without background or context. (e.g.,
    mutt shithole, poo, jeet, cracker, BBC, roof Korean, cuck, christcucks).
    3. a clear curse word that has been modified in a way that makes it hard for the language model to recognize (e.g., migger, shart, Elona, groid).

    critical tone: at least one of the components included
    other tone: none of the components included

    <example>
    A: You blended in so well I legit forgot you were new. Lowkey main character vibes fr.
    critical tone: POOJEETs like you think you’re safe. But trust me, curry niggers will always be parasites. Go back to your shithole.
    other tone: u really passed the vibe check. blending in like you've been here the whole time. we love a lowkey main character arc.

    A sentence: {sentence_a}
    B sentence: {sentence_b}

    Please print such as 'critical' or 'other', not any procedures.
    """

    try:
        print(idx+1)
        response = get_completion(prompt)
        print(response)
        gpt_answers.append(response)
    except Exception as err:
        print(f"Error at index {idx}: {err}")
        # The failed row contributes no answer; only its index is remembered.
        skip.append(idx)
        # Write everything gathered so far to a per-failure file.
        df = pd.DataFrame(gpt_answers, columns=['Relation'])
        df.to_csv(
            f'{IMPLICIT_O_PATH}/exception_{idx}_filter2_valid.csv',
            index=False,
            encoding='utf-8',
        )
        continue

# save llm responses
df = pd.DataFrame(gpt_answers, columns=['Relation'])
df.to_csv(f'{IMPLICIT_O_PATH}/implicit_o.csv', index=False, encoding='utf-8')
print(skip)

  0%|          | 0/3 [00:00<?, ?it/s]

1
other
2
other
3
other
[]


In [24]:
# `total` keeps every answered pair with the model's raw label; `valid` keeps only the pairs
# labelled critical.
valid = {
    "input": [],
    "response": []
}
total = {
    "input": [],
    "relation": [],
    "response": []
}

skipped = set(skip)  # constant-time membership test inside the loop

# gpt_answers has no entry for skipped rows, so it is walked with its own cursor.
idx_relation = 0
for idx in range(idx_start, idx_finish):
    if idx in skipped:
        print(f"{idx} skipped!")
        print(f"{df_filtered1_toxic['input'][idx]}")
        continue
    print(f"idx: {idx}, idx_relation: {idx_relation}")

    answer = gpt_answers[idx_relation]
    source_text = df_filtered1_toxic['input'][idx]
    reply_text = df_filtered1_toxic['response'][idx]

    total["input"].append(source_text)
    total["relation"].append(answer)  # raw answer is stored unchanged
    total["response"].append(reply_text)

    # ver 4
    # Compare on a normalised label so that variants such as 'critical' (quoted),
    # "Critical" or "critical." are not silently dropped.
    label = answer.strip().strip("'\"`.").lower()
    if label == 'critical':
        valid["input"].append(source_text)
        valid["response"].append(reply_text)

    idx_relation += 1

df_total = pd.DataFrame(total, columns=['input', 'relation', 'response'])
df_valid = pd.DataFrame(valid, columns=['input', 'response'])

df_total.to_csv(f'{IMPLICIT_O_PATH}/filter2_total.csv', index=False, encoding="utf-8-sig")  # save final filtered toxic sentences
df_valid.to_csv(f'{IMPLICIT_O_PATH}/filter2_valid.csv', index=False, encoding="utf-8-sig")

idx: 0, idx_relation: 0
idx: 1, idx_relation: 1
idx: 2, idx_relation: 2
