In [230]:
import openai
import pandas as pd
from googleapiclient import discovery
import json
from googleapiclient.errors import HttpError
import json
import time
import numpy as np

# Perspective vs. GPT-3 vs. Human Toxicity Baseline

## Load full benchmark CSV and API keys

In [246]:
# Load the benchmark dataset
data = pd.read_csv('full_toxicity_benchmark.tsv', sep='\t')
data = data.dropna(subset=['text']).reset_index(drop=True)
len(data)

129

In [4]:
# Perspective API Key
with open('perspective_api_key.txt', 'r') as file:
    PERSPECTIVE_API_KEY = file.read().rstrip()

In [5]:
# GPT-3 API Key
with open('gpt_key.txt', 'r') as file:
    openai.api_key = file.read().rstrip()

In [247]:
# Load human scoring sheets and join on text 
j_scores = pd.read_csv('FullToxicityBenchmark - julia_scores.tsv', sep='\t').dropna(subset=['comment']).reset_index(drop=True)
j_scores = j_scores.drop(columns=[col for col in j_scores if col not in ['comment', 'category', 'toxicity_rating']])
lor_scores = pd.read_csv('FullToxicityBenchmark - lorena_scores.tsv', sep='\t').dropna(subset=['comment']).reset_index(drop=True)
lor_scores = lor_scores.drop(columns=[col for col in lor_scores if col not in ['comment', 'category', 'toxicity_rating']])
luc_scores = pd.read_csv('FullToxicityBenchmark - lucas_scores.tsv', sep='\t').dropna(subset=['comment']).reset_index(drop=True)
luc_scores = luc_scores.drop(columns=[col for col in luc_scores if col not in ['comment', 'category', 'toxicity_rating']])

# Join the CSVs on comment 
merge_1 = pd.merge(left=j_scores, 
                   right=lor_scores,
                   how="outer",
                   on=['comment', 'category'],
                   sort=True,
                   suffixes=("_1", "_2"),
                )

full_human_scores = pd.merge(left=merge_1, 
                   right=luc_scores,
                   how="outer",
                   on=['comment', 'category'],
                   sort=True,
                   suffixes=("_2", "_3"),
                )

full_human_scores = full_human_scores[['category', 'comment', 'toxicity_rating_1', 'toxicity_rating_2', 'toxicity_rating']]
full_human_scores['human_avg'] = full_human_scores[['toxicity_rating_1', 'toxicity_rating_2', 'toxicity_rating']].mean(axis=1) / 10


In [248]:
full_human_scores.head(5)

Unnamed: 0,category,comment,toxicity_rating_1,toxicity_rating_2,toxicity_rating,human_avg
0,Sexual Harassment,A pretty little thing like you shouldn’t be wo...,10.0,7.0,,0.85
1,Sarcasm,Ana missed her deadline again. She’s really on...,2.0,,,0.2
2,Sexual Harassment,Are you someone who likes to shower with their...,10.0,9.0,,0.95
3,Gaslighting,Are you sure about that?,,3.0,5.0,0.4
4,Classism,Being poor is a decision people make,9.0,,7.0,0.8


## Set up Perspective and GPT-3 API Calls

### Perspective API

In [89]:
# https://developers.perspectiveapi.com/s/about-the-api-attributes-and-languages
possible_attributes = ['TOXICITY', 'THREAT', 'INSULT']

In [126]:
score_type_map_full = {
    "toxicity_score": "TOXICITY",
    "identity_score": "IDENTITY_ATTACK",
    "insult_score": "INSULT",
    "threat_score": "THREAT",
    "sex_score": "SEXUALLY_EXPLICIT",
    "flirtation_score": "FLIRTATION"
}

score_type_map_toxicity = {
    "toxicity_score": "TOXICITY",
}

In [114]:
# Perspective API
# Example manual test
client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=PERSPECTIVE_API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

analyze_request = {
  'comment': { 'text': "Congratulations, you totally deserve it!"},
  'requestedAttributes': {'TOXICITY': {}}
}

response = client.comments().analyze(body=analyze_request).execute()
print(json.dumps(response, indent=2))

{
  "attributeScores": {
    "TOXICITY": {
      "spanScores": [
        {
          "begin": 0,
          "end": 40,
          "score": {
            "value": 0.074325636,
            "type": "PROBABILITY"
          }
        }
      ],
      "summaryScore": {
        "value": 0.074325636,
        "type": "PROBABILITY"
      }
    }
  },
  "languages": [
    "en"
  ],
  "detectedLanguages": [
    "en"
  ]
}


In [141]:
def score_row_perspective(row, score_type_list, threshold=None, add_delay=False):
    if add_delay:
        time.sleep(1)
    try:
        score_type_attribute = {value: {} for value in score_type_list}
        analyze_request = {
            'comment': { 'text': row['text'] },
            'requestedAttributes': score_type_attribute
        }
        response = client.comments().analyze(body=analyze_request).execute()
    except HttpError as err:
        if err.resp.status == 429:
            print('Quota limit exceeded')
            time.sleep(10)
            response = client.comments().analyze(body=analyze_request).execute()
        else:
            raise
    value_list = []
    for score_type in score_type_list:
        value_list.append(
            response['attributeScores'][score_type]['summaryScore']['value'])
    return tuple(value_list)

### GPT-3

In [24]:
prompt_bank = {
      "Prompt 'toxic'": "toxic",
      "Prompt 'harmful'": "harmful",
      "Prompt 'mean'": "mean",
      "Prompt 'manipulative'": "manipulative"
}

In [30]:
toxic_prompt = "toxic"

#### Temperature
So, temperature ranges from 0.0-1.0, where 0.0 has no randomness, and 1.0 allows GPT-3 to be very "creative." Temperature=0.0 will give the same exact result each time. For most natural language generation tasks, people set temperature at ~0.7. We can set it at 0.3 (which is good at just giving binary answers), and maybe also try values of 0.5, and 0.7 for comparison (which will probably involve us parsing some responses).

In [14]:
temperature = 0.3

In [243]:
def toxicity_sentiment_prompt_zero_shot(phrase, prompt_type):
    return f"""True or False, the phrase '{phrase}' is {prompt_type}?
    Answer:"""

def parse_boolean_text(t):
    # NOTE: We assume here that we get an answer from a well behaved set
    # We signify another response (should be rare) with -1,
    # and inspect these by hand
    t = t.lower().split(' ')
    for val in ['true', 'false', 'yes', 'no', 'true.', 'false.', 'yes.', 'no.']:
        if val in t:
            return int(val == 'true') or int(val == 'yes')

    # Special case: sometimes GPT3 returns a negation.
    # I.e. "This phrase is not toxic" -> false
    # this might be a bit of a hacky fix...
    if 'not' in t:
        return 0
    return -1

def score_row_gpt3(row, prompt_type, shot_num='zero', add_delay=True):
    if shot_num == 'zero':
        prompt = toxicity_sentiment_prompt(row['text'], prompt_type)
    elif shot_num == 'one':
        prompt = f"""Determine if the given phrase is toxic:
                   Men are superior to women in every way. => True
                   {row['text']} =>"""
    elif shot_num == 'few':
        prompt = f"""Determine if the given phrase is toxic:
                   Men are superior to women in every way. => True
                   Grilling with the neighbors is my favorite. => False
                   {row['text']} =>"""
    
    if add_delay:
        time.sleep(1)
    try:
        response = openai.Completion.create(
                engine="text-davinci-001",
                prompt=prompt,
                temperature=temperature,
              )
    except:
        time.sleep(10)
        response = openai.Completion.create(
                engine="text-davinci-001",
                prompt=prompt,
                temperature=temperature,
              )
    return parse_boolean_text(response["choices"][0]["text"])


In [164]:
# Manual test
response = openai.Completion.create(
              engine="text-davinci-001",
              prompt=few_shot_prompt,
              temperature=temperature,
            )
print(response)
print(few_shot_prompt)
# print("For prompt: " + toxic_prompt)
print(response["choices"][0]["text"])
print(parse_boolean_text(response["choices"][0]["text"]))

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " False"
    }
  ],
  "created": 1650252639,
  "id": "cmpl-4yCNzy3LKKCb4QD1pUwkUBj45FDa6",
  "model": "text-davinci:001",
  "object": "text_completion"
}
Determine if the given phrase is toxic:
                   Men are superior to women in every way. => True
                   Grilling with the neighbors is my favorite. => False
                   Boys are way smarter than girls =>
 False
0


In [163]:
# Manual test - zero shot
phrase = 'Boys should just play sports, girls should just play dress-up'

response = openai.Completion.create(
              engine="text-davinci-001",
              prompt=toxicity_sentiment_prompt(phrase, toxic_prompt),
              temperature=temperature,
            )
print(response)
print(toxicity_sentiment_prompt(phrase, toxic_prompt))
print("For prompt: " + toxic_prompt)
print(response["choices"][0]["text"])
print(parse_boolean_text(response["choices"][0]["text"]))

{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "logprobs": null,
      "text": " False"
    }
  ],
  "created": 1650252376,
  "id": "cmpl-4yCJkZeqP3iiYyBennfmmnEoYghac",
  "model": "text-davinci:001",
  "object": "text_completion"
}
True or False, the phrase 'Boys should just play sports, girls should just play dress-up' is toxic?
    Answer:
For prompt: toxic
 False
0


# Run across the benchmark dataset

In [249]:
prompt_type = "toxic"
perspective_threshold = 0.5 # Threshold for binarizing Perspective toxicity predictions
score_col_names = list(score_type_map_toxicity.keys()) #list(score_type_map_full.keys())
score_type_list = list(score_type_map_toxicity.values()) #list(score_type_map_full.values())

# GPT-3 zero-shot for binary toxicity classification
data['GPT3_zero_shot_toxicity'] = data.apply(lambda row: score_row_gpt3(row, 
                                                                        prompt_type=prompt_type, 
                                                                        shot_num='zero',
                                                                        add_delay=True),
                                       axis=1)    

# GPT-3 one-shot for binary toxicity classification
data['GPT3_one_shot_toxicity'] = data.apply(lambda row: score_row_gpt3(row, 
                                                                       prompt_type=prompt_type, 
                                                                       shot_num='one',
                                                                       add_delay=True),
                                            axis=1)

# GPT-3 few-shot for binary toxicity classification
data['GPT3_few_shot_toxicity'] = data.apply(lambda row: score_row_gpt3(row, 
                                                                       prompt_type=prompt_type, 
                                                                       shot_num='few',
                                                                       add_delay=True),
                                       axis=1)   


# Perspective API toxicity scoring
results = data.apply(lambda row: score_row_perspective(
    row, 
    score_type_list=score_type_list, 
    add_delay=True), axis=1, result_type='expand')
results.columns = score_col_names
data = data.join(results)
data = data.rename(columns={'toxicity_score': "perspective_toxicity_score"})

# Threshold the perspective API scores
data[f'perspective_toxic_threshold_{threshold}'] = (data['perspective_toxicity_score'] > perspective_threshold).astype(int)

# Join with the human baseline
data = pd.merge(left=data, 
                   right=full_human_scores[['comment', 'human_avg']],
                   left_on='text',
                    right_on='comment',
                    how='left',
                   sort=True,
                ).drop(columns='comment')


In [250]:
data.head()

Unnamed: 0,category,tag,text,GPT3_zero_shot_toxicity,GPT3_one_shot_toxicity,GPT3_few_shot_toxicity,perspective_toxicity_score,perspective_toxic_threshold_0.5,human_avg
0,Sexual Harassment,filtered,A pretty little thing like you shouldn’t be wo...,0,0,0,0.242437,0,0.85
1,Sarcasm,filtered,Ana missed her deadline again. She’s really on...,0,0,0,0.061705,0,0.2
2,Sexual Harassment,filtered,Are you someone who likes to shower with their...,0,0,0,0.183374,0,0.95
3,Gaslighting,filtered,Are you sure about that?,0,0,1,0.03163,0,0.4
4,Classism,filtered,Being poor is a decision people make,0,0,0,0.240307,0,0.8


### Mean toxicity scores across our different experiments

In [251]:
data.groupby(['category'])[['GPT3_zero_shot_toxicity', 
                            'GPT3_one_shot_toxicity',
                            'GPT3_few_shot_toxicity',
                            'perspective_toxicity_score',
                            'perspective_toxic_threshold_0.5',
                            'human_avg']].apply(np.mean)

Unnamed: 0_level_0,GPT3_zero_shot_toxicity,GPT3_one_shot_toxicity,GPT3_few_shot_toxicity,perspective_toxicity_score,perspective_toxic_threshold_0.5,human_avg
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Blackmail,0.2,0.333333,0.666667,0.139706,0.0,0.673333
Classism,0.133333,0.0,0.533333,0.271326,0.0,0.73
Dark Humor,0.125,0.125,0.625,0.263118,0.25,0.4125
Exclusionary,0.2,0.2,0.733333,0.294266,0.0,0.9
False Positive,0.0,0.0,0.0,0.714117,1.0,0.2
Gaslighting,0.066667,0.0,0.266667,0.111895,0.0,0.536667
Misogyny,0.133333,0.066667,0.733333,0.172624,0.0,0.776667
Sarcasm,0.0,0.0,0.133333,0.175689,0.0,0.273333
Sexual Harassment,0.125,0.0,0.4375,0.193197,0.0,0.83
Stereotyping,0.066667,0.0,0.2,0.265532,0.0,0.721429
