# DeTox Analysis

In this notebook we analyze the performance of the OpenAI content moderation model on the DeTox dataset.

The results show that the predictions of the content moderation model agree only to a limited degree with the manually annotated hate-speech labels of the DeTox dataset (F1 ≈ 0.46, precision ≈ 0.49, recall ≈ 0.43).

The DeTox dataset is available [here](https://github.com/hdaSprachtechnologie/detox).

In [3]:
import sys

# Make `../src` importable; presumably this is where the `utils` package
# (imported as `utils.config` in the imports cell) lives — verify against the repo layout.
sys.path.append("../src")

In [1]:
import json
import random
import sys
import time
from ast import literal_eval

import openai
import pandas as pd
import tqdm
from openai import OpenAI
from sklearn.metrics import f1_score, precision_score, recall_score
from utils.config import SECRETS, DATA_DIR

In [2]:
# Register tqdm's pandas integration so that `progress_apply` is available on pandas objects.
tqdm.tqdm.pandas()

## Creating the moderation dataset

In [5]:
# Load the DeTox gold-standard annotations from the local data directory.
df = pd.read_csv(DATA_DIR / "detox" / "main_Goldstandard.csv")

In [6]:
# Keep only rows with `incomp == 0` and a binary `hate_speech` label (0 or 1).
# `.copy()` yields an independent frame, so assigning the `moderation` column later
# does not write into a slice of the raw frame (avoids pandas' SettingWithCopyWarning).
df = df.loc[(df.incomp == 0) & (df.hate_speech.isin([0, 1]))].copy()
df.shape

(6436, 41)

In [7]:
# Shared OpenAI client used by `get_moderation`; the API key comes from the project's SECRETS config.
client = OpenAI(api_key=SECRETS.OPENAI_API_KEY)


def get_moderation(text: str, max_retries: int = 5) -> dict:
    """Request an OpenAI moderation result for ``text``, retrying on API errors.

    Failed calls are retried with exponential backoff plus random jitter:
    the n-th retry waits ``2 ** n + random.random()`` seconds (n starting at 0).

    Args:
        text: The text sent to the moderation endpoint.
        max_retries: How many retries are allowed after a failed call before
            the error is propagated to the caller.

    Returns:
        The first moderation result, dumped to a dict in JSON mode.

    Raises:
        openai.OpenAIError: If the call still fails once the retry budget is used up.
    """
    n_retries = 0
    while True:
        try:
            prediction = client.moderations.create(input=text)
            break
        except openai.OpenAIError as e:
            if n_retries >= max_retries:
                # Retry budget exhausted: propagate the error with its original
                # traceback instead of announcing a retry that never happens.
                raise

            print(f"Retrying after error: {e}")
            time.sleep(2 ** n_retries + random.random())
            n_retries += 1

    return prediction.results[0].model_dump(mode="json")

In [8]:
# One moderation API call per row; the recorded run took about 36 minutes for 6,436 rows.
df["moderation"] = df["c_text"].progress_apply(get_moderation)

100%|██████████| 6436/6436 [36:30<00:00,  2.94it/s] 


In [9]:
# Persist the annotated frame including the model predictions.
# NOTE(review): the `moderation` dicts are presumably written as their string repr,
# which would be why the evaluation parses them with `literal_eval` — confirm.
df.to_csv(DATA_DIR / "detox" / "main_Goldstandard_with_preds.csv", index=False)

## Evaluating the model predictions

In [26]:
# Share of rows labelled as hate speech (labels were restricted to 0/1 when filtering),
# i.e. the positive-class rate that is stored as "baseline" in the results file.
df.hate_speech.mean()

0.06494717215661902

In [27]:
# Ground-truth labels from the DeTox annotation, as booleans.
hate_speech_detox = df.hate_speech.astype(bool)
# `moderation` holds dicts when it was computed in this session, but stringified dicts
# when the frame was reloaded from the saved CSV. `literal_eval` raises on a dict,
# so only parse when the value is a string.
hate_flags_openai = df.moderation.apply(
    lambda x: (literal_eval(x) if isinstance(x, str) else x)["categories"]["hate"]
)

In [29]:
# Score the OpenAI "hate" flags against the DeTox hate-speech labels.
# The metric functions are evaluated in the order F1, precision, recall.
f1, precision, recall = (
    metric(hate_speech_detox, hate_flags_openai)
    for metric in (f1_score, precision_score, recall_score)
)

f1, precision, recall

(0.45663265306122447, 0.4890710382513661, 0.42822966507177035)

In [34]:
# Write the evaluation summary (scores plus the positive-class rate) as a JSON file.
with open(DATA_DIR / "experiments" / "exp_PD_002_DeToxAnalysis.json", "w") as f:
    f.write(
        json.dumps(
            {
                "f1": f1,
                "precision": precision,
                "recall": recall,
                "baseline": df.hate_speech.mean(),
            }
        )
    )