In [105]:

import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset_builder, load_dataset, concatenate_datasets
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import plotly.express as px

import torch
import platform
import evaluate

print(platform.platform())

import sys
sys.path.insert(1, '../Data')

import time
from ratelimiter import RateLimiter

from googleapiclient.errors import HttpError

macOS-14.1-arm64-arm-64bit


In [20]:
from googleapiclient import discovery
import json

API_KEY = 'AIzaSyDcA-LYHVNateEydAvPLg5AaF19sZwM-mY'

client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

In [98]:
def toxicity_query(text):
    analyze_request = {
        'comment': { 'text': text },
        'requestedAttributes': {'TOXICITY': {}}
    }
    response = client.comments().analyze(body=analyze_request).execute()
    # try:
    #     response = client.comments().analyze(body=analyze_request).execute()
    # except HTTPError:
    #     print('caught in HELPER')
    return response

toxicity_query('hi there how are you bitch')

{'attributeScores': {'TOXICITY': {'spanScores': [{'begin': 0,
     'end': 26,
     'score': {'value': 0.9061063, 'type': 'PROBABILITY'}}],
   'summaryScore': {'value': 0.9061063, 'type': 'PROBABILITY'}}},
 'languages': ['en'],
 'detectedLanguages': ['en']}

In [123]:
rng = np.random.default_rng()

def evaluate_toxicity(data, no_samples=1000, max_calls=100, period=10, DEBUG=False):
    rate_limiter = RateLimiter(max_calls=max_calls, period=period)
    sample_idxs = rng.integers(0, data.shape[0], size=no_samples).astype(int)
    toxicity_scores = np.zeros(no_samples)
    for idx in range(no_samples):
        with rate_limiter:
            try:
                response = toxicity_query(data[int(sample_idxs[idx])]['summary'])
                toxicity_scores[idx] = response['attributeScores']['TOXICITY']['summaryScore']['value']
                # print(response)
            except HttpError as http_error:
                print(f"Time limit exceeded, sleeping for 10sec, No. samples evaluated = {idx}")
                time.sleep(10)
                idx -= 1
    if DEBUG:
        return toxicity_scores, sample_idxs
    return toxicity_scores

In [100]:
reddit_dataset = load_dataset("reddit_tifu", 'long')['train'].remove_columns(['ups', 'num_comments', 'upvote_ratio', 'score', 'title'])
reddit_dataset = reddit_dataset.rename_column('documents', 'document')
reddit_dataset = reddit_dataset.rename_column('tldr', 'summary')

In [40]:
news_dataset = load_dataset("cnn_dailymail", '2.0.0')
news_dataset = concatenate_datasets([news_dataset[k] for k in news_dataset.keys()])
news_dataset = news_dataset.rename_column('article', 'document')
news_dataset = news_dataset.rename_column('highlights', 'summary')

In [41]:
dialogue_dataset = load_dataset('samsum')
dialogue_dataset = concatenate_datasets([dialogue_dataset[k] for k in dialogue_dataset.keys()])
dialogue_dataset = dialogue_dataset.rename_column('dialogue', 'document')

In [44]:
all_datasets = {
    'news': news_dataset.to_pandas(),
    'reddit': reddit_dataset.to_pandas(),
    'dialogue': dialogue_dataset.to_pandas()
}

In [124]:
toxicity_results = {}

t = time.time()
for k, data in all_datasets.items():
    toxicity_results[k] = evaluate_toxicity(data, no_samples=1000, max_calls=64, period=10, DEBUG=True)
    print(f"{k} dataset evaluated - {format(time.time() - t, 'f')}")

Time limit exceeded, sleeping for 10sec, No. samples evaluated = 2
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 3
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 4
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 7
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 8
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 9
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 10
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 11
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 13
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 16
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 18
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 19
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 22
Time limit exceeded, sleeping for 10sec, No. samples evaluated = 25
Time limit exceeded, sleeping for 10sec, No. samples e

In [128]:
toxicity_df = pd.DataFrame(columns=['subject', 'subject_idx', 'text', 'toxicity_score'])

num_samples = toxicity_results['news'][0].size

for k in toxicity_results.keys():
    df = pd.DataFrame({
        'subject':[[k] * num_samples][0],
        'subject_idx':toxicity_results[k][1],
        'toxicity_score':toxicity_results[k][0]
    })
    df['text'] = all_datasets[k].select(df['subject_idx'])['summary']
    toxicity_df = pd.concat([toxicity_df, df], ignore_index=True)

toxicity_df

  toxicity_df = pd.concat([toxicity_df, df], ignore_index=True)


Unnamed: 0,subject,subject_idx,text,toxicity_score
0,news,31777,NEW: Gary Giordano's attorney says he is being...,0.016587
1,news,259341,The show at the New York's Fashion Institute o...,0.045873
2,news,189115,Pope Francis said that Satan exists and is tho...,0.000000
3,news,166163,"345,000 more jobs in 3 months, biggest rise si...",0.000000
4,news,161375,Chiara de Blasio described how 'life didn't se...,0.000000
...,...,...,...,...
2995,dialogue,1373,Erika and Marcus have a class in Room 243 today.,0.012943
2996,dialogue,15648,Sophia missed the tram and is waiting for the ...,0.000000
2997,dialogue,8150,Sabrina is planning a trip with the family to ...,0.020106
2998,dialogue,10291,Tracy needs Kate to make a two-minute video of...,0.107833


In [130]:
fig = px.violin(toxicity_df, y="toxicity_score", x="subject", box=True,
          hover_data=toxicity_df.columns)
fig.show()