In [40]:
from googleapiclient import discovery
from googleapiclient.errors import HttpError
import json
import pandas as pd
import time
import spacy
from spacy import displacy
import en_core_web_sm
# Load the small English spaCy pipeline once. The original loaded the same
# model twice (en_core_web_sm.load() and spacy.load("en_core_web_sm")); both
# names are kept so existing references to either keep working.
nlp = en_core_web_sm.load()
NER = nlp

# The API key is read from a local file rather than being hard-coded here;
# trailing whitespace/newline is stripped.
with open('api_key.txt', 'r') as file:
    API_KEY = file.read().rstrip()


In [49]:
def regen(seed=56):
    """Rebuild ``reddit_comments.csv`` from the raw comment dumps.

    Reads ``comments_negative.csv`` and ``comments_positive.csv`` from the
    working directory, draws a seeded 1% sample of the combined rows, flags
    each sampled row, keeps only rows with at least one flag set, writes the
    result to ``reddit_comments.csv`` and returns it.

    Flags (0/1 columns):
      * ``text_contains_NER`` / ``parent_text_contains_NER``: the spaCy
        pipeline ``NER`` found at least one entity in the text.
      * ``text_contains_pronoun`` / ``parent_text_contains_pronoun``: more
        than one token of the text is tagged ``PRON``.

    Rows whose ``text`` is entirely upper-case are left with all flags at 0
    (and are therefore dropped by the final filter). Missing (non-string)
    ``text`` / ``parent_text`` values are treated as having no entities and
    no pronouns instead of raising.

    Args:
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        The filtered DataFrame with the four source columns followed by the
        four flag columns.
    """
    df1 = pd.read_csv('comments_negative.csv')
    df2 = pd.read_csv('comments_positive.csv')
    df_whole = pd.concat([df1, df2])

    binary_contains_columns = ['text_contains_NER', 'parent_text_contains_NER',
                               'text_contains_pronoun', 'parent_text_contains_pronoun']
    columns_to_keep = ['text', 'score', 'parent_text', 'parent_score']
    all_columns = columns_to_keep + binary_contains_columns

    # Sample first, then add the flag columns to the small frame only; the
    # sampled rows are the same because sampling depends on row positions.
    sub_df = df_whole.sample(frac=0.01, random_state=seed)
    sub_df = sub_df.assign(**{col: 0 for col in binary_contains_columns})

    def _has_multiple_pronouns(doc):
        # True as soon as a second PRON token is seen; stops scanning early.
        seen = 0
        for token in doc:
            if token.pos_ == 'PRON':
                seen += 1
                if seen > 1:
                    return True
        return False

    def _flags(text):
        # (contains_entity, contains_pronouns) as 0/1; NaN/non-string -> (0, 0).
        if not isinstance(text, str):
            return 0, 0
        doc = NER(text)
        return int(len(doc.ents) > 0), int(_has_multiple_pronouns(doc))

    def check_row_for_NER_and_pronouns(row):
        text = row['text']
        # All-caps comments are skipped entirely: flags stay 0.
        if isinstance(text, str) and text.isupper():
            return row

        row = row.copy()  # do not mutate the object handed out by apply()
        row['text_contains_NER'], row['text_contains_pronoun'] = _flags(text)
        (row['parent_text_contains_NER'],
         row['parent_text_contains_pronoun']) = _flags(row['parent_text'])
        return row

    data = sub_df.apply(check_row_for_NER_and_pronouns, axis=1)

    # Keep rows where at least one of the four flags is set.
    data = data[(data[binary_contains_columns] == 1).any(axis=1)]
    data = data[all_columns]

    # The index is written too (to_csv default), as in the original.
    data.to_csv('reddit_comments.csv')
    return data

# Load the cached subsample; if the cache file is missing, rebuild it first.
# NOTE: Data can only be regenerated if the comments_negative and
# comments_positive csvs are in the directory.
try:
    data = pd.read_csv('reddit_comments.csv')
except FileNotFoundError:
    regen()
    # Re-read from disk so the frame has the same shape (fresh integer index,
    # saved index as an extra column) as on a normal cached load.
    data = pd.read_csv('reddit_comments.csv')

In [50]:
# Display the loaded frame for a quick visual check of its columns and rows.
data

Unnamed: 0.1,Unnamed: 0,text,score,parent_text,parent_score,text_contains_NER,parent_text_contains_NER,text_contains_pronoun,parent_text_contains_pronoun
0,1039091,Then you spend the next few minutes screaming ...,113,"The F.E.A.R games. Rounding a corner, hearing ...",101.0,1.0,1.0,1.0,1.0
1,1667861,But the national anthem isn't part of the mili...,76,I was surprised how militarist the USA really ...,15.0,0.0,1.0,0.0,1.0
2,1187133,Always was easy against the computer. Half th...,101,Play the small islands maps.\n\nColonise. Ever...,365.0,1.0,1.0,0.0,0.0
3,317042,The best way he can show his wife he loves her...,279,"I'm not pregnant, and I'm also a male. My lunc...",376.0,1.0,0.0,1.0,1.0
4,1458923,Well the crazy guy went and shot him.,-8,So it's really his fault that a crazy guy atta...,3.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
35360,1035898,"Yeah, I've worked in childcare for ten years a...",113,"Yeah, it's always a bit of a gamble to badmout...",258.0,1.0,1.0,1.0,1.0
35361,1738081,Huh? I'm not hating or being a fanboy. I'm say...,-7,"And here, everyone, is what we call a fanboy. ...",8.0,0.0,0.0,1.0,1.0
35362,1332570,Liek dis if u cry everytim,-9,Have you seen the comic that (I believe) origi...,204.0,0.0,1.0,0.0,1.0
35363,1552465,Congratulations on your baby girl.\n\nedit: Wo...,81,I'm adopted and have even less info on my biol...,156.0,1.0,1.0,1.0,1.0


### Import Reddit CSV as Dataframe
Subsampled from the Kaggle dataset at https://www.kaggle.com/datasets/ehallmar/reddit-comment-score-prediction (roughly 4 million comments -> about 50k).

In [52]:
# TODO: Analysis
# Spot check: print each token of one comment next to its part-of-speech tag.
text1 = NER(data.loc[374, 'text'])
for token in text1:
    print(f"{token.text} {token.pos_}")

" PUNCT
I PRON
'm VERB
just ADV
hesitant ADJ
to PART
follow VERB
a DET
weight NOUN
loss NOUN
guide NOUN
where ADV
step NOUN
1 NUM
is VERB
' PUNCT
get VERB
a DET
300 NUM
pound NOUN
tumor NOUN
' PUNCT
.... PUNCT
" PUNCT


### Score Mapping
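These dictionaries map each output column name to the API attribute it is scored with. `score_type_map_full` lists every score we may want to examine; `score_type_map_toxicity` requests only the overall toxicity score.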

In [31]:
# Output column name -> attribute name requested from the API.
score_type_map_full = dict([
    ("toxicity_score", "TOXICITY"),
    ("identity_score", "IDENTITY_ATTACK"),
    ("insult_score", "INSULT"),
    ("threat_score", "THREAT"),
    ("sex_score", "SEXUALLY_EXPLICIT"),
    ("flirtation_score", "FLIRTATION"),
])

# Single-attribute variant: only the overall toxicity score.
score_type_map_toxicity = dict([("toxicity_score", "TOXICITY")])

### Runner
Idea for future me: pool several API keys so the script runs faster.

In [32]:
# Build the Comment Analyzer API client from its live discovery document
# (static_discovery=False), authenticated with the key loaded earlier.
_DISCOVERY_URL = "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1"
client = discovery.build(
    "commentanalyzer", "v1alpha1",
    discoveryServiceUrl=_DISCOVERY_URL,
    developerKey=API_KEY,
    static_discovery=False,
)

def score_row(row, score_type_list, add_delay=False, max_retries=5, retry_wait=10):
    """Score one row's ``text`` with the comment-analyzer ``client``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'text'`` entry.
        score_type_list: API attribute names to request, e.g. ``["TOXICITY"]``.
        add_delay: When true, sleep one second before the request to stay
            under the request quota.
        max_retries: How many times a request rejected with HTTP 429 is
            retried before the error is re-raised. The original retried
            exactly once, so a second 429 aborted the whole run.
        retry_wait: Seconds to sleep before each retry.

    Returns:
        Tuple of summary score values, in the order of ``score_type_list``.

    Raises:
        HttpError: For any non-429 API error, or for a 429 that persists
            after ``max_retries`` retries.
    """
    if add_delay:
        time.sleep(1)

    # Built outside the try block so it is always defined when retrying.
    analyze_request = {
        'comment': {'text': row['text']},
        'languages': ["en"],
        'requestedAttributes': {score_type: {} for score_type in score_type_list},
    }

    retries_done = 0
    while True:
        try:
            response = client.comments().analyze(body=analyze_request).execute()
            break
        except HttpError as err:
            # Only quota errors are retried; everything else propagates.
            if err.resp.status != 429 or retries_done >= max_retries:
                raise
            retries_done += 1
            print('Quota limit exceeded')
            time.sleep(retry_wait)

    return tuple(
        response['attributeScores'][score_type]['summaryScore']['value']
        for score_type in score_type_list
    )

In [None]:
# NOTE: Point score_type_map at score_type_map_toxicity to request only the
# overall toxicity score; score_type_map_full requests every available score.
score_type_map = score_type_map_full
score_col_names = list(score_type_map.keys())
score_type_list = list(score_type_map.values())
print(score_type_list)
results = data.apply(lambda row: score_row(
    row, 
    score_type_list=score_type_list, 
    add_delay=True), axis=1, result_type='expand')
results.columns = score_col_names
# Drop score columns left over from an earlier run first, so re-executing this
# cell does not fail in join() on overlapping column names.
data = data.drop(columns=score_col_names, errors='ignore').join(results)

In [None]:
# Persist the scored frame. index=False is not passed, so the index is written
# as an extra leading column.
data.to_csv('toxicity_benchmark_scores.csv')

### Stats Calculation
Now that everything is in a nice dataframe, we can do some stats.

In [None]:
# Per-group mean of the numeric columns. numeric_only=True skips the free-text
# columns, which otherwise make mean() raise TypeError on pandas >= 2.0.
# NOTE(review): no 'category' column is created anywhere in the visible
# notebook — confirm the intended grouping key before running.
data.groupby(['category']).mean(numeric_only=True)

In [None]:
# Per-group standard deviation of the numeric columns. numeric_only=True skips
# the free-text columns, which otherwise make std() raise TypeError on
# pandas >= 2.0 (the keyword itself needs pandas >= 1.5).
# NOTE(review): no 'category' column is created anywhere in the visible
# notebook — confirm the intended grouping key before running.
data.groupby(['category']).std(numeric_only=True)