In [1]:
import openai
import pandas as pd
from googleapiclient import discovery
import json
from googleapiclient.errors import HttpError
import time
import numpy as np
import os
import glob
import re

In [2]:
# Sampling 1000 observations
# text = pd.read_fwf(
#     '../data/raw/tweeteval_test_text.txt', 
#     header=None,
#     names=['text']
#     )
# labels = pd.read_fwf(
#     '../data/raw/tweeteval_test_labels.txt', 
#     header=None,
#     names=['label']
#     )
# test_df = text.join(labels)

# test_df = test_df.reset_index().rename(
#     columns={'index': 'id'})
# test_df.to_csv(
#     '../data/raw/tweeteval_test.txt',
#     index=False
#     )
# test_df = pd.read_csv(
#     '../data/raw/tweeteval_test.txt'
# )
# test_df.sample(1000, random_state=3).to_csv(
#     '../data/interim/sampled_tweeteval_test.txt',
#     index=False
# )

In [5]:
test_df_sampled = pd.read_csv(
    '../data/interim/sampled_tweeteval_test.txt'
)

In [4]:
with open('../api_key.txt', 'r') as file:
    PERSPECTIVE_API_KEY = file.read().rstrip()

In [5]:
with open('../gpt_key.txt', 'r') as file:
    openai.api_key = file.read().rstrip()

In [6]:
client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=PERSPECTIVE_API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

In [32]:
def score_row_perspective(row, score_type_list, add_delay=True, overwrite=False):
    path = f"../data/perspective_api/perspective_{row['id']}.json"
    file_exists = os.path.exists(path)
    if (overwrite) or (not file_exists): 
        if add_delay:
            time.sleep(1)
        try:
            score_type_attribute = {value: {} for value in score_type_list}
            analyze_request = {
                'comment': { 'text': row['text'] },
                'requestedAttributes': score_type_attribute
            }
            response = client.comments().analyze(body=analyze_request).execute()
        except HttpError as err:
            if err.resp.status == 429:
                print('Quota limit exceeded')
                time.sleep(10)
                response = client.comments().analyze(body=analyze_request).execute()
            elif err.resp.status == 400:
                print(err)
                print(f"Language not found error for sentence: {row['text']}, id:{row['id']}")
                response = {"attributeScores": {}, "error": str(err)}
            else:
                print(err.resp.status)
                print(f"Error in id {row['id']}")
                raise
        with open(path, 'w') as f:
            json.dump(response, f)
    else:
        print(f"Skipping because id {row['id']} already exists")
    return

In [33]:
def parse_boolean_text(t):
    # NOTE: We assume here that we get an answer from a well behaved set
    # We signify another response (should be rare) with -1,
    # and inspect these by hand
    t = t.lower().split(' ')
    for val in ['true', 'false', 'yes', 'no', 'true.', 'false.', 'yes.', 'no.']:
        if val in t:
            return int(val == 'true') or int(val == 'yes')

    # Special case: sometimes GPT3 returns a negation.
    # I.e. "This phrase is not toxic" -> false
    # this might be a bit of a hacky fix...
    if 'not' in t:
        return 0
    return -1

def score_row_gpt3(
    row, shot_num='few', 
    add_delay=True, temperature = 0.3,
    overwrite=False
    ):
    path = f"../data/openai_api/gpt3_{row['id']}.json"
    file_exists = os.path.exists(path)
    if (overwrite) or (not file_exists): 
        if shot_num == 'one':
            prompt = f"""Determine if the given phrase is toxic:
                    Men are superior to women in every way. => True
                    {row['text']} =>"""
        elif shot_num == 'few':
            prompt = f"""Determine if the given phrase is toxic:
                    Men are superior to women in every way. => True
                    Grilling with the neighbors is my favorite. => False
                    {row['text']} =>"""
        
        if add_delay:
            time.sleep(1)
        try:
            response = openai.Completion.create(
                    engine="text-davinci-001",
                    prompt=prompt,
                    temperature=temperature,
                )
        except:
            time.sleep(10)
            response = openai.Completion.create(
                    engine="text-davinci-001",
                    prompt=prompt,
                    temperature=temperature,
                )
        with open(path, 'w') as f:
            json.dump(response, f)
    else:
        print(f"Skipping because id {row['id']} already exists")


In [23]:
score_type_map_toxicity = {
    "toxicity_score": "TOXICITY",
}
score_col_names = list(score_type_map_toxicity.keys()) #list(score_type_map_full.keys())
score_type_list = list(score_type_map_toxicity.values()) #list(score_type_map_full.values())

In [10]:
test_df_sampled.apply(
    lambda row: score_row_gpt3(
        row=row),
    axis=1
)

Skipping because id 1165 already exists
Skipping because id 437 already exists
Skipping because id 2671 already exists
Skipping because id 1027 already exists
Skipping because id 164 already exists
Skipping because id 1212 already exists
Skipping because id 1200 already exists
Skipping because id 1942 already exists
Skipping because id 1380 already exists
Skipping because id 2937 already exists
Skipping because id 1845 already exists
Skipping because id 266 already exists
Skipping because id 1102 already exists
Skipping because id 1237 already exists
Skipping because id 1457 already exists
Skipping because id 57 already exists
Skipping because id 2035 already exists
Skipping because id 2454 already exists
Skipping because id 2002 already exists
Skipping because id 21 already exists
Skipping because id 2342 already exists
Skipping because id 1052 already exists
Skipping because id 1113 already exists
Skipping because id 1807 already exists
Skipping because id 2557 already exists
Skippin

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Length: 1000, dtype: object

In [34]:
test_df_sampled.apply(
    lambda row: score_row_perspective(
        row=row,
        score_type_list=score_type_list
        ),
    axis=1
)

Skipping because id 1165 already exists
Skipping because id 437 already exists
Skipping because id 2671 already exists
Skipping because id 1027 already exists
Skipping because id 164 already exists
Skipping because id 1212 already exists
Skipping because id 1200 already exists
Skipping because id 1942 already exists
Skipping because id 1380 already exists
Skipping because id 2937 already exists
Skipping because id 1845 already exists
Skipping because id 266 already exists
Skipping because id 1102 already exists
Skipping because id 1237 already exists
Skipping because id 1457 already exists
Skipping because id 57 already exists
Skipping because id 2035 already exists
Skipping because id 2454 already exists
Skipping because id 2002 already exists
Skipping because id 21 already exists
Skipping because id 2342 already exists
Skipping because id 1052 already exists
Skipping because id 1113 already exists
Skipping because id 1807 already exists
Skipping because id 2557 already exists
Skippin

0      None
1      None
2      None
3      None
4      None
       ... 
995    None
996    None
997    None
998    None
999    None
Length: 1000, dtype: object

# Join in one dataset

In [3]:
file_paths_gpt3 = glob.glob('../data/openai_api/*')
file_paths_perspective = glob.glob('../data/perspective_api/*')

In [4]:
def read_json(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

In [5]:
def read_and_parse_gpt3(file_path):
    id = re.findall(r'\d+',file_path)[1]   
    gpt3_response = read_json(file_path)
    return {'id': id, 'gpt_response': gpt3_response['choices'][0]['text']}

def read_and_parse_perspective(file_path):
    toxicity_value = np.nan
    id = re.findall(r'\d+',file_path)[0]
    try:
        perspective_response = read_json(file_path)
        attribute_scores = perspective_response.get('attributeScores')
        if (attribute_scores):
            toxicity_value = attribute_scores.get(
                'TOXICITY',{}).get('summaryScore',{}).get('value')
        return {
            'id': id, 'perspective_response': toxicity_value,
            'error': perspective_response.get('error')
            }
    except Exception as e:
        print(f'Error in id {id}: {str(e)}')

def parse_boolean_text(t):
    # NOTE: We assume here that we get an answer from a well behaved set
    # We signify another response (should be rare) with -1,
    # and inspect these by hand
    t = t.replace('\n', '')
    t = t.lower().split(' ')
    for val in ['true', 'false', 'yes', 'no', 'true.', 'false.', 'yes.', 'no.']:
        if val in t:
            return int(val == 'true') or int(val == 'yes')

    # Special case: sometimes GPT3 returns a negation.
    # I.e. "This phrase is not toxic" -> false
    # this might be a bit of a hacky fix...
    if 'not' in t:
        return 0
    return -1

In [6]:
gpt3_responses = [read_and_parse_gpt3(file_path) for file_path in file_paths_gpt3]
perspective_responses = [read_and_parse_perspective(file_path) for file_path in file_paths_perspective]

In [7]:
gpt3_responses = pd.DataFrame(gpt3_responses)
perspective_responses = pd.DataFrame(perspective_responses)

In [8]:
gpt_vs_perspective = gpt3_responses.merge(
    perspective_responses,
    on='id',
    how='outer'
)

In [9]:
gpt_vs_perspective = gpt_vs_perspective.rename(columns = {'error': 'perspective_error'})

In [10]:
gpt_vs_perspective = gpt_vs_perspective.assign(
    gpt_response_txt = gpt_vs_perspective.gpt_response,
    gpt_response = gpt_vs_perspective.gpt_response.map(lambda x: parse_boolean_text(x)),
    id = test_df_sampled.id.astype(int)
    )

In [11]:
test_df_sampled = test_df_sampled.merge(
    gpt_vs_perspective,
    on='id',
    how='outer'
    )

In [12]:
test_df_sampled = test_df_sampled.assign(
    perspective_response_bin = (
        test_df_sampled.perspective_response > 0.5).astype(int)
)

In [17]:
test_df_sampled.to_csv(
    '../data/processed/tweeteval_comparisson_gpt3_perspective.csv'
)

In [2]:
test_df_sampled = pd.read_csv(
    '../data/processed/tweeteval_comparisson_gpt3_perspective.csv'
)

In [3]:
test_df_sampled = test_df_sampled.loc[
    (test_df_sampled.gpt_response.isin([0,1])) & 
    (test_df_sampled.perspective_error.isnull())
    ]
len(test_df_sampled)

990

In [4]:
np.sum(
    test_df_sampled.perspective_response_bin == test_df_sampled.label
    )/len(test_df_sampled.perspective_response_bin)

0.45151515151515154

In [5]:
np.sum(
    test_df_sampled.gpt_response == test_df_sampled.label
    )/len(test_df_sampled.gpt_response)

0.46565656565656566

In [14]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [16]:
f1_score_ = f1_score(
    test_df_sampled.label,
    test_df_sampled.perspective_response_bin
)
print(f'F1: {np.round(f1_score_,3)}')
precision = precision_score(
    test_df_sampled.label,
    test_df_sampled.perspective_response_bin
)
print(f'Precision: {np.round(precision,3)}')
recall = recall_score(
    test_df_sampled.label,
    test_df_sampled.perspective_response_bin
)
print(f'Recall: {np.round(recall,3)}')

F1: 0.485
Precision: 0.397
Recall: 0.624


In [17]:
f1_score_ = f1_score(
    test_df_sampled.label,
    test_df_sampled.gpt_response
)
print(f'F1: {np.round(f1_score_,3)}')
precision = precision_score(
    test_df_sampled.label,
    test_df_sampled.gpt_response
)
print(f'Precision: {np.round(precision,3)}')
recall = recall_score(
    test_df_sampled.label,
    test_df_sampled.gpt_response
)
print(f'Recal: {np.round(recall,3)}')

F1: 0.518
Precision: 0.413
Recal: 0.693
