In [57]:
import json
import time
from pathlib import Path

import en_core_web_sm
import pandas as pd
import spacy
from googleapiclient import discovery
from googleapiclient.errors import HttpError
from spacy import displacy
# Load the small English spaCy pipeline a single time.  The original cell
# loaded the same model twice (once per name), which only costs time and memory.
nlp = en_core_web_sm.load()

# The API key is kept in a local text file so it is never hard-coded here;
# trailing whitespace/newline is stripped before use.
with open('api_key.txt', 'r') as file:
    API_KEY = file.read().rstrip()

# `NER` is the name the rest of the notebook calls to parse text; it shares
# the pipeline object loaded above.
NER = nlp


### Import Reddit CSV as Dataframe
Subsampled from https://www.kaggle.com/datasets/ehallmar/reddit-comment-score-prediction (about 4 million comments -> about 36k rows).

Every retained example contains a person's name, at least two pronouns, or both, in either the comment or its parent comment.

In [58]:
def regen(seed=56):
    """Rebuild the filtered Reddit sample and cache it as ``reddit_comments.csv``.

    Reads ``comments_negative.csv`` and ``comments_positive.csv`` from the
    working directory, draws a 1% random sample, and tags every sampled row
    with four 0/1 flags: whether the comment (``text``) or its parent
    (``parent_text``) contains a spaCy ``PERSON`` entity, and whether each of
    them contains at least two tokens tagged ``PRON``.  Rows with no flag set
    are dropped.

    Rows whose ``text`` is entirely upper-case are not parsed at all, so their
    flags stay 0 and they are filtered out.  A missing (non-string) ``text``
    or ``parent_text`` is parsed as an empty string instead of raising.

    Args:
        seed: ``random_state`` passed to ``DataFrame.sample`` so the subsample
            is reproducible.

    Returns:
        DataFrame with the columns ``text``, ``score``, ``parent_text``,
        ``parent_score`` followed by the four flag columns.  The same frame is
        written to ``reddit_comments.csv`` (index included).
    """
    df1 = pd.read_csv('comments_negative.csv')
    df2 = pd.read_csv('comments_positive.csv')
    df_whole = pd.concat([df1, df2])

    binary_contains_columns = ['text_contains_NER', 'parent_text_contains_NER',
                               'text_contains_pronoun', 'parent_text_contains_pronoun']
    columns_to_keep = ['text', 'score', 'parent_text', 'parent_score']
    all_columns = columns_to_keep + binary_contains_columns

    # Sample first and only then add the flag columns: the rows drawn do not
    # depend on the columns present, and only the 1% sample needs the flags.
    sub_df = (
        df_whole
        .sample(frac=0.01, random_state=seed)
        .assign(**dict.fromkeys(binary_contains_columns, 0))
    )

    def _as_text(value):
        # Empty CSV cells are read back as NaN (a float); treat anything that
        # is not a string as empty text so .isupper() and NER() cannot crash.
        return value if isinstance(value, str) else ''

    def _has_person(doc):
        # True when spaCy tagged at least one entity as a person.
        return any(ent.label_ == 'PERSON' for ent in doc.ents)

    def _has_multiple_pronouns(doc):
        # True as soon as a second PRON token is seen (the threshold is "more
        # than one pronoun", not "any pronoun").
        seen = 0
        for token in doc:
            if token.pos_ == 'PRON':
                seen += 1
                if seen > 1:
                    return True
        return False

    def check_row_for_NER_and_pronouns(row):
        text = _as_text(row['text'])
        # All-caps comments are skipped entirely; their flags stay 0.
        if text.isupper():
            return row

        parsed = NER(text)
        parsed_parent = NER(_as_text(row['parent_text']))

        if _has_person(parsed):
            row['text_contains_NER'] = 1
        if _has_person(parsed_parent):
            row['parent_text_contains_NER'] = 1
        if _has_multiple_pronouns(parsed):
            row['text_contains_pronoun'] = 1
        if _has_multiple_pronouns(parsed_parent):
            row['parent_text_contains_pronoun'] = 1
        return row

    data = sub_df.apply(check_row_for_NER_and_pronouns, axis=1)

    # Keep a row when at least one of the four flags is set.
    data = data[data[binary_contains_columns].eq(1).any(axis=1)]
    data = data[all_columns]

    data.to_csv('reddit_comments.csv')
    return data

# NOTE: Regenerating needs comments_negative.csv and comments_positive.csv in
# the working directory and re-parses every sampled row with spaCy.  The
# original cell read the cached file and then regenerated anyway, so it needed
# both the cache and the raw CSVs.  Reuse the cache when it exists and only
# regenerate when it is missing.
REDDIT_CACHE = Path('reddit_comments.csv')
if REDDIT_CACHE.exists():
    # regen() writes the frame's index as the first CSV column; restore it as
    # the index so both branches yield the same layout.
    data = pd.read_csv(REDDIT_CACHE, index_col=0)
else:
    data = regen()

In [61]:
data

Unnamed: 0,text,score,parent_text,parent_score,text_contains_NER,parent_text_contains_NER,text_contains_pronoun,parent_text_contains_pronoun
1039091,Then you spend the next few minutes screaming ...,113,"The F.E.A.R games. Rounding a corner, hearing ...",101,0,0,1,1
1667861,But the national anthem isn't part of the mili...,76,I was surprised how militarist the USA really ...,15,0,1,0,1
1187133,Always was easy against the computer. Half th...,101,Play the small islands maps.\n\nColonise. Ever...,365,0,1,0,0
317042,The best way he can show his wife he loves her...,279,"I'm not pregnant, and I'm also a male. My lunc...",376,0,0,1,1
1458923,Well the crazy guy went and shot him.,-8,So it's really his fault that a crazy guy atta...,3,0,0,0,1
...,...,...,...,...,...,...,...,...
1035898,"Yeah, I've worked in childcare for ten years a...",113,"Yeah, it's always a bit of a gamble to badmout...",258,0,0,1,1
1738081,Huh? I'm not hating or being a fanboy. I'm say...,-7,"And here, everyone, is what we call a fanboy. ...",8,0,0,1,1
1332570,Liek dis if u cry everytim,-9,Have you seen the comic that (I believe) origi...,204,0,0,0,1
1552465,Congratulations on your baby girl.\n\nedit: Wo...,81,I'm adopted and have even less info on my biol...,156,0,0,1,1


### Example Analysis
Here we run an example analysis on a subset of our data.

A more complete analysis will be run for the final version; it is abbreviated here because the full run is expensive.

In [82]:
# Collect every passage that mentions a person -- comments and parent comments
# alike -- into one two-column frame (text, score).
NER_text = data.loc[data['text_contains_NER'] == 1, ['text', 'score']]
NER_parent_text = (
    data.loc[data['parent_text_contains_NER'] == 1, ['parent_text', 'parent_score']]
    .rename(columns={"parent_text": "text", "parent_score": "score"})
)
# A row whose comment AND parent both mention a person appears twice here,
# under the same index label.
NER_df = pd.concat([NER_text, NER_parent_text])
# Keep 1% for the abbreviated example run.  The draw is seeded so that the
# subset (and everything computed from it) is reproducible across kernel restarts.
NER_df = NER_df.sample(frac=0.01, random_state=56)

In [83]:
# Names substituted for every detected person; each one gets an (initially
# empty) text column and a (initially zero) score column on NER_df.
NER_bank = ['Emily', 'Greg', 'Jamal', 'Lakisha']
for substitute_name in NER_bank:
    for suffix, placeholder in (('_text', ''), ('_score', 0)):
        NER_df[substitute_name + suffix] = placeholder

def create_NER_examples(row, name):
    """Write a copy of ``row['text']`` with every detected person replaced by ``name``.

    The text is parsed with ``NER``; for each entity labelled ``PERSON`` all
    occurrences of that entity's text are replaced by ``name``.  The result is
    stored in ``row[name + '_text']``.

    Args:
        row: DataFrame row (as passed by ``DataFrame.apply(..., axis=1)``)
            holding a ``text`` entry.
        name: Replacement name; also the prefix of the column that is filled.

    Returns:
        The same ``row``.  If the text cannot be parsed, the problem is printed
        and the row is returned with ``name + '_text'`` left untouched.
    """
    # TODO: Analysis
    original_text = row['text']
    try:
        doc = NER(original_text)
        swapped_text = original_text
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                swapped_text = swapped_text.replace(ent.text, name)
    except Exception as err:
        # Best effort: one unparsable row (e.g. a non-string text) must not
        # abort the whole apply().  Catch Exception rather than using a bare
        # except so that KeyboardInterrupt still stops a long run.
        print(f'Skipping row ({err!r}): {original_text}')
        return row
    row[name + '_text'] = swapped_text
    return row

# One pass over the frame per substitute name; each pass fills that name's
# `<name>_text` column.
for name in NER_bank:
    NER_df = NER_df.apply(create_NER_examples, axis=1, args=(name,))

In [85]:
# TODO: Pronouns replacement
# Comments / parent comments that were flagged as containing pronouns.
pronoun_text = data.loc[data['text_contains_pronoun'] == 1, 'text']
pronoun_parent_text = data.loc[data['parent_text_contains_pronoun'] == 1, 'parent_text']

### Score Mapping
Here are the different scores that we may want to examine

In [88]:
# Result column name -> attribute name requested from the Comment Analyzer API,
# for every attribute we may want to examine.
score_type_map_full = dict(
    toxicity_score="TOXICITY",
    identity_score="IDENTITY_ATTACK",
    insult_score="INSULT",
    threat_score="THREAT",
    sex_score="SEXUALLY_EXPLICIT",
    flirtation_score="FLIRTATION",
)

# Toxicity-only subset, used for the abbreviated run.
score_type_map_toxicity = {
    "toxicity_score": score_type_map_full["toxicity_score"],
}

### Runner
Idea for future me: pool several API keys to make the script run faster.

In [92]:
# Client for the "commentanalyzer" v1alpha1 service, authenticated with the
# key read from api_key.txt.  The discovery document is fetched from the
# service's own URL (static_discovery is switched off).
client_options = {
    "developerKey": API_KEY,
    "discoveryServiceUrl": "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
    "static_discovery": False,
}
client = discovery.build("commentanalyzer", "v1alpha1", **client_options)

def _request_scores(text, score_type_list, add_delay=False, best_effort=False):
    """Request the given attributes for one text and return their summary scores.

    Args:
        text: Comment text sent to the API.
        score_type_list: Attribute names to request (e.g. ``"TOXICITY"``).
        add_delay: Sleep 2 seconds before the request to stay under the quota.
        best_effort: When True, an HTTP error other than 429 is reported and
            NaN is returned for every attribute instead of raising.

    Returns:
        List of summary score values, in the order of ``score_type_list``.

    Raises:
        HttpError: On a non-429 error when ``best_effort`` is False, or when
            the single retry after a 429 fails.
    """
    if add_delay:
        time.sleep(2)

    analyze_request = {
        'comment': {'text': text},
        'languages': ["en"],
        'requestedAttributes': {score_type: {} for score_type in score_type_list},
    }

    try:
        response = client.comments().analyze(body=analyze_request).execute()
    except HttpError as err:
        if err.resp.status == 429:
            # Rate limited: wait and retry once; a second failure propagates.
            print('Quota limit exceeded')
            time.sleep(10)
            response = client.comments().analyze(body=analyze_request).execute()
        elif best_effort:
            # Record "no score" explicitly.  Falling through here would reuse
            # the response of the previous request and store a stale score.
            print(f'Request failed with status {err.resp.status}; recording NaN')
            return [float('nan')] * len(score_type_list)
        else:
            raise

    return [
        response['attributeScores'][score_type]['summaryScore']['value']
        for score_type in score_type_list
    ]


def score_row_NER(row, score_type_list, add_delay=False):
    """Score a row's original text and each of its name-substituted variants.

    Args:
        row: DataFrame row holding ``text`` and one ``<name>_text`` entry for
            every name in ``NER_bank``.
        score_type_list: Attribute names to request for every text.
        add_delay: Sleep 2 seconds before each request.

    Returns:
        Tuple of scores: first the scores of ``row['text']`` (ordered like
        ``score_type_list``), then one such group per name in ``NER_bank``.
        A name variant whose request fails with a non-429 error contributes
        NaN values.

    Raises:
        HttpError: If scoring the original text fails with a non-429 error, or
            if any retry after a 429 fails.
    """
    value_list = _request_scores(row['text'], score_type_list, add_delay)

    for name in NER_bank:
        value_list.extend(
            _request_scores(row[name + '_text'], score_type_list, add_delay,
                            best_effort=True))

    return tuple(value_list)

In [97]:
# Score only 20% of the examples to keep the number of API calls small.
# Seeded so the same rows are scored on every fresh run; note that re-running
# this cell on its own shrinks NER_df again.
NER_df = NER_df.sample(frac=0.2, random_state=56)

In [None]:
# NOTE: Replace score_type_map_toxicity with score_type_map_full to run on all available toxicity scores
score_col_names = list(score_type_map_toxicity.keys())
score_type_list = list(score_type_map_toxicity.values())

results = NER_df.apply(lambda row: score_row_NER(
    row,
    score_type_list=score_type_list,
    add_delay=True), axis=1, result_type='expand')

# score_row_NER returns the original text's scores first, then one group per
# name in NER_bank, each group ordered like score_type_list.  Derive the column
# names from those lists so switching to score_type_map_full keeps working.
if len(score_col_names) == 1:
    # Single score type: keep the short names (score, Emily_score, ...).
    results.columns = ['score'] + [name + '_score' for name in NER_bank]
else:
    results.columns = score_col_names + [
        f'{name}_{col}' for name in NER_bank for col in score_col_names
    ]


In [99]:
results

Unnamed: 0,score,Emily_score,Greg_score,Jamal_score,Lakisha_score
305393,0.734253,0.724669,0.727337,0.726649,0.728365
101633,0.770347,0.735974,0.773958,0.768506,0.745121
786874,0.806061,0.775502,0.806061,0.795948,0.789485
1781005,0.080546,0.076336,0.08337,0.089969,0.080798
1189943,0.099669,0.069096,0.061881,0.093354,0.051972
1304116,0.120389,0.084786,0.086397,0.122437,0.07529
1600151,0.860626,0.840911,0.860626,0.860626,0.835521
1841354,0.212487,0.188285,0.1958,0.195061,0.228489
1314551,0.16677,0.173887,0.215334,0.263956,0.16181
642238,0.835252,0.82054,0.835521,0.833852,0.830777


In [None]:
# The API scores live in `results`; NER_df itself only holds the 0 placeholders
# in its `<name>_score` columns, so saving NER_df alone would persist no scores.
# `results` comes from NER_df.apply(), so its rows line up with NER_df's.
assert results.index.equals(NER_df.index), "results is stale: re-run the scoring cell"
scored_df = NER_df.copy()
for col in results.columns:
    # `score` in NER_df is the Reddit comment score; store the original text's
    # API score under the score-type column name instead of overwriting it.
    target = score_col_names[0] if col == 'score' else col
    scored_df[target] = results[col].to_numpy()
scored_df.to_csv('toxicity_benchmark_scores.csv')

### Stats Calculation
Now that everything is in a nice dataframe, we can do some stats.

In [None]:
# Per-`category` group means.
# NOTE(review): `data` as built earlier in this notebook only has the text,
# score and four *_contains_* columns -- there is no `category` column, so this
# raises KeyError unless `data` is replaced first.  Confirm which frame is meant.
data.groupby(['category']).mean()

In [None]:
# Per-`category` group standard deviations.
# NOTE(review): `data` as built earlier in this notebook has no `category`
# column, so this raises KeyError as written.  Confirm which frame is meant.
data.groupby(['category']).std()