# Moderation API

Moderation API는 콘텐츠가 OpenAI의 사용 정책을 준수하는지 확인하는 데 사용할 수 있는 도구입니다. 이를 통해 개발자는 사용 정책에서 금지하는 콘텐츠를 식별하고, 필터링과 같은 조치를 취할 수 있습니다.

In [1]:
from openai import OpenAI
# Create the API client with default settings (no arguments are passed).
# NOTE(review): presumably the API key is picked up from the environment — confirm.
client = OpenAI()

In [2]:
# Send a sample sentence to the moderation endpoint.
response = client.moderations.create(input="Sample text goes here.")

# Keep the first entry of `results` so the following cells can inspect it.
output = response.results[0]

In [3]:
# Display the moderation result object stored above.
output

Moderation(categories=Categories(harassment=False, harassment_threatening=False, hate=False, hate_threatening=False, self_harm=False, self_harm_instructions=False, self_harm_intent=False, sexual=False, sexual_minors=False, violence=False, violence_graphic=False, self-harm=False, sexual/minors=False, hate/threatening=False, violence/graphic=False, self-harm/intent=False, self-harm/instructions=False, harassment/threatening=False), category_scores=CategoryScores(harassment=0.00010559981456026435, harassment_threatening=7.731197229077225e-07, hate=2.9099086532369256e-05, hate_threatening=1.011475276868623e-07, self_harm=1.4513607027311082e-07, self_harm_instructions=1.480306655032848e-09, self_harm_intent=7.134503476891041e-08, sexual=0.0030844826251268387, sexual_minors=6.053529432392679e-06, violence=9.14183838176541e-05, violence_graphic=5.470990913636342e-07, self-harm=1.4513607027311082e-07, sexual/minors=6.053529432392679e-06, hate/threatening=1.011475276868623e-07, violence/graphic=5.470990913636342e-07, self-harm/intent=7.134503476891041e-08, self-harm/instructions=1.480306655032848e-09, harassment/threatening=7.731197229077225e-07), flagged=False)

In [5]:
# Dump the result object into a plain dict (shown in the output below).
output.model_dump()

{'categories': {'harassment': False,
  'harassment_threatening': False,
  'hate': False,
  'hate_threatening': False,
  'self_harm': False,
  'self_harm_instructions': False,
  'self_harm_intent': False,
  'sexual': False,
  'sexual_minors': False,
  'violence': False,
  'violence_graphic': False,
  'self-harm': False,
  'sexual/minors': False,
  'hate/threatening': False,
  'violence/graphic': False,
  'self-harm/intent': False,
  'self-harm/instructions': False,
  'harassment/threatening': False},
 'category_scores': {'harassment': 0.00010559981456026435,
  'harassment_threatening': 7.731197229077225e-07,
  'hate': 2.9099086532369256e-05,
  'hate_threatening': 1.011475276868623e-07,
  'self_harm': 1.4513607027311082e-07,
  'self_harm_instructions': 1.480306655032848e-09,
  'self_harm_intent': 7.134503476891041e-08,
  'sexual': 0.0030844826251268387,
  'sexual_minors': 6.053529432392679e-06,
  'violence': 9.14183838176541e-05,
  'violence_graphic': 5.470990913636342e-07,
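  'self-harm': 1.4513607027311082e-07,
  'sexual/minors': 6.053529432392679e-06,
  'hate/threatening': 1.011475276868623e-07,
  'violence/graphic': 5.470990913636342e-07,
  'self-harm/intent': 7.134503476891041e-08,
  'self-harm/instructions': 1.480306655032848e-09,
  'harassment/threatening': 7.731197229077225e-07},
 'flagged': False}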

In [6]:
def analysis(text):
    """Return the moderation categories flagged for ``text``.

    Sends ``text`` to the moderation endpoint through the module-level
    ``client`` and inspects the first entry of the response's results.

    Args:
        text: Value passed as ``input`` to ``client.moderations.create``.

    Returns:
        A list of ``(category, score)`` tuples, one for each category whose
        flag is truthy, in the order the categories appear in the dumped
        result. The list is empty when nothing is flagged.
    """
    dumped = client.moderations.create(input=text).results[0].model_dump()
    # The score is looked up only for flagged categories, so
    # 'category_scores' is never touched when nothing is flagged.
    return [
        (name, dumped['category_scores'][name])
        for name, is_flagged in dumped['categories'].items()
        if is_flagged
    ]

In [7]:
# Example: an input containing a threat.
analysis("I'll kill you")

[('violence', 0.9959982633590698)]

In [8]:
# Example: an input expressing hate toward a group.
analysis("I hate asian")

[('harassment', 0.9409586787223816), ('hate', 0.980320930480957)]