In [1]:
import json
import os
import re
import time
from glob import glob

import numpy as np
import pandas as pd
from googleapiclient import discovery
from googleapiclient.errors import HttpError
from tqdm import tqdm

In [2]:
# Path of the JSON credentials file. It can be overridden with the
# API_CREDS_PATH environment variable so the notebook is not tied to one
# machine; the original location is kept as the fallback.
CREDENTIALS_PATH = os.environ.get(
    'API_CREDS_PATH',
    '/Users/lorenapiedras/Documents/credentials/api_creds.json',
)

with open(CREDENTIALS_PATH, 'r') as file:
    api_keys = json.load(file)

# Key passed as `developerKey` when the API client is built.
API_KEY = api_keys['perspective_api']

In [3]:
# Output column name -> attribute name requested from the API.
# Insertion order matters: it fixes the order of the score columns.
score_type_map_full = dict(
    toxicity_score="TOXICITY",
    identity_score="IDENTITY_ATTACK",
    insult_score="INSULT",
    threat_score="THREAT",
    sex_score="SEXUALLY_EXPLICIT",
    flirtation_score="FLIRTATION",
)

# Reduced mapping that requests the toxicity attribute only.
score_type_map_toxicity = dict(toxicity_score="TOXICITY")

# Discovery document used to build the Comment Analyzer client dynamically
# (static_discovery=False below means the bundled documents are not used).
comment_analyzer_discovery_url = (
    "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1"
)

# Shared API client used by score_row for every request.
client = discovery.build(
    "commentanalyzer",
    "v1alpha1",
    discoveryServiceUrl=comment_analyzer_discovery_url,
    developerKey=API_KEY,
    static_discovery=False,
)

def score_row(row, score_type_list, add_delay=False, max_retries=3, retry_wait=10):
    """Score one sentence with the module-level API ``client``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose ``'sentence'``
            entry holds the text to analyze.
        score_type_list: Attribute names to request, e.g. ``['TOXICITY']``.
        add_delay: When true, sleep one second before sending the request
            (simple client-side throttling).
        max_retries: How many times a request rejected with HTTP 429 is
            re-sent before the error is propagated.
        retry_wait: Seconds to wait before each retry after an HTTP 429.

    Returns:
        A tuple with the ``summaryScore`` value of every requested attribute,
        in the same order as ``score_type_list``.

    Raises:
        HttpError: For any non-429 API error, or when the quota error persists
            after ``max_retries`` retries.
    """
    if add_delay:
        time.sleep(1)

    # Build the request before entering the retry loop so every attempt
    # re-sends exactly the same payload.
    analyze_request = {
        'comment': {'text': row['sentence']},
        'requestedAttributes': {score_type: {} for score_type in score_type_list},
    }

    retries_done = 0
    while True:
        try:
            response = client.comments().analyze(body=analyze_request).execute()
            break
        except HttpError as err:
            # Only quota errors are worth retrying; anything else, or an
            # exhausted retry budget, is surfaced to the caller unchanged.
            if err.resp.status != 429 or retries_done >= max_retries:
                raise
            retries_done += 1
            print('Quota limit exceeded')
            time.sleep(retry_wait)

    return tuple(
        response['attributeScores'][score_type]['summaryScore']['value']
        for score_type in score_type_list
    )

# Toxicity Classification

In [4]:
# True: rebuild the batches from '../data/perturbed.csv' and overwrite '../data/batches.csv'.
# False: reload '../data/batches.csv' and keep only batches without a saved results file.
run_from_scratch = False

In [12]:
if run_from_scratch:
    data = pd.read_csv('../data/perturbed.csv')

    # Wide -> long: one row per (original, category, sentence variant).
    data_long = data.melt(
        id_vars=['op_gender', 'subreddit', 'original', 'category'],
        value_vars=['recommended_sentence', 'non_recommended_sentence'],
        value_name='sentence'
    )

    # Add each distinct original sentence as its own row so it is scored too.
    original_data = data.filter(['subreddit', 'original', 'op_gender']).drop_duplicates()
    original_data = original_data.assign(
        sentence=original_data.original,
        variable='original_sentence',
        category='ORIGINAL'
    )
    data_long = pd.concat([data_long, original_data])
    data_long = data_long.sort_values('original').reset_index(drop=True)

    # Expose the positional index as an explicit `id` column.
    data_long = data_long.reset_index().rename(
        columns={'index': 'id'})

    # Split row positions rather than the DataFrame itself: same 30-way
    # partition, without relying on NumPy slicing a DataFrame.
    batch_positions = np.array_split(np.arange(len(data_long)), 30)
    batches = [
        data_long.iloc[positions].assign(batch=batch_number)
        for batch_number, positions in enumerate(batch_positions)
    ]

    df_batches = pd.concat(batches)
    df_batches.to_csv('../data/batches.csv', index=False)
else:
    print('Retreiving past batches')
    df_batches = pd.read_csv('../data/batches.csv')
    files = glob('../data/reddit_toxicity_*')

    # Read the batch id from the file name itself so digits elsewhere in the
    # path cannot be mistaken for it; files without an id are ignored.
    processed_batches = []
    for file in files:
        id_match = re.search(r'reddit_toxicity_([0-9]+)', file)
        if id_match:
            processed_batches.append(int(id_match.group(1)))

    non_processed_df = df_batches.loc[~df_batches.batch.isin(processed_batches)]

    # One frame per pending batch, in ascending batch order (deterministic,
    # unlike iterating over a set of ids).
    batches = [
        pending_batch
        for _, pending_batch in non_processed_df.groupby('batch', sort=True)
    ]

Retreiving past batches


In [15]:
# Output column names and the matching API attributes, in the same order.
score_col_names = list(score_type_map_toxicity.keys())
score_type_list = list(score_type_map_toxicity.values())
n = len(batches)
print(f'Running for {n} batches')
for batch in tqdm(batches):
    # An empty batch has nothing to score and no batch id to name its file.
    if batch.empty:
        continue

    # Report the real batch id: on a resumed run the loop position does not
    # match the id used in the output file name.
    n_batch = batch.batch.unique()[0]
    print(f'Running for batch {n_batch}')

    # One API call per row; add_delay=True throttles the requests.
    results = batch.apply(
        lambda row: score_row(
            row,
            score_type_list=score_type_list,
            add_delay=True),
        axis=1,
        result_type='expand')
    results.columns = score_col_names

    # Persist each batch as soon as it is scored so an interrupted run can resume.
    batch = batch.join(results)
    batch.to_csv(
        f'../data/reddit_toxicity_{n_batch}.csv',
        index=False
    )

Running for 2 batches


  0%|          | 0/2 [00:00<?, ?it/s]

Running for batch 0


 50%|█████     | 1/2 [16:35<16:35, 995.70s/it]

Running for batch 1


100%|██████████| 2/2 [33:12<00:00, 996.02s/it]


In [74]:
# glob returns matches in arbitrary order; sort them so the combined
# results file has a reproducible row order across runs and machines.
files = sorted(glob('../data/reddit_toxicity_*'))

In [75]:
# Load every per-batch results file and stack them into one frame.
toxicity_scores = pd.concat(list(map(pd.read_csv, files)))

In [76]:
# Save the combined per-batch scores as a single CSV, without the row index.
toxicity_scores.to_csv('../data/raw_toxicity_data.csv', index=False)

In [65]:
# Rows tagged 'original_sentence' are the unmodified texts; every other row
# is a rewritten variant.
is_original = toxicity_scores.variable == 'original_sentence'
original_sentences = toxicity_scores.loc[is_original]
modified_sentences = toxicity_scores.loc[~is_original]

In [66]:
# Keep only the text and its score, renamed so it cannot collide with the
# variant scores. The same text can appear more than once (originals are
# distinct per subreddit/op_gender, not per text), so keep a single row per
# text: a later join keyed on `original` alone would otherwise multiply rows.
# NOTE: this cell overwrites its own input; re-run the cell that creates
# `original_sentences` before re-running this one.
original_sentences = (
    original_sentences
    .filter(['original', 'toxicity_score'])
    .rename(columns={'toxicity_score': 'original_toxicity'})
    .drop_duplicates(subset='original')
)

In [67]:
# Long -> wide: one row per (op_gender, subreddit, original, category), with
# the sentence and toxicity score of each variant side by side.
pivot_keys = ['op_gender', 'subreddit', 'original', 'category']
comparative_toxicity = modified_sentences.pivot(
    index=pivot_keys,
    columns=['variable'],
    values=['sentence', 'toxicity_score'],
).reset_index()

In [68]:
# Flatten the two-level column labels left by the pivot:
#   ('toxicity_score', <variable>) -> '<variable>_toxicity_score'
#   ('sentence', <variable>)       -> '<variable>'
#   (<key>, ...)                   -> '<key>'
col_names = []
for top_level, variable in comparative_toxicity.columns:
    if top_level == 'sentence':
        col_names.append(variable)
    elif top_level == 'toxicity_score':
        col_names.append(f'{variable}_{top_level}')
    else:
        col_names.append(top_level)

comparative_toxicity.columns = col_names

In [69]:
# Attach the original sentence's score to each comparison row (inner join on
# the original text).
comparative_toxicity = pd.merge(
    comparative_toxicity,
    original_sentences,
    on='original',
)

In [73]:
# Save the wide comparison table (variant scores plus original score), without the row index.
comparative_toxicity.to_csv('../data/toxicity_classification.csv', index=False)

# Sentiment Analysis

In [5]:
from helper import predict_sentiment

In [6]:
# Quick check of predict_sentiment on a single example sentence.
text = "I hate potatoes"
# Last expression of the cell, so whatever the helper prints or returns is shown below.
predict_sentiment(text)

Text: I hate potatoes
Sentiment: -0.800000011920929, 0.800000011920929


magnitude: 0.8
score: -0.8