In [187]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset_builder, load_dataset, concatenate_datasets
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import plotly.express as px

import torch
import platform
import evaluate

print(platform.platform())

import sys
sys.path.insert(1, '../Data')

import time
from ratelimiter import RateLimiter

from nltk.probability import FreqDist

from itertools import chain
from tqdm import tqdm

macOS-14.1-arm64-arm-64bit


In [188]:
dialogue_dataset = load_dataset('samsum')
dialogue_dataset = concatenate_datasets([dialogue_dataset[k] for k in dialogue_dataset.keys()])
dialogue_dataset = dialogue_dataset.rename_column('dialogue', 'document')


In [189]:
from ratelimiter import RateLimiter
import requests
import time

In [190]:
# FORMALITY globals

API_URL = "https://api-inference.huggingface.co/models/s-nlp/roberta-base-formality-ranker"
headers = {"Authorization": "Bearer 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"}
rng = np.random.default_rng()

In [196]:
# Emotion intensity globals

from pathlib import Path

new_col_names = {
    'article':'document',
    'highlights':'summary',
    'documents':'document',
    'tldr':'summary',
    'dialogue':'document'
}

keep_cols = ['document', 'summary', 'id']

In [192]:
# Toxicity globals

from googleapiclient.errors import HttpError
from googleapiclient import discovery
import json

API_KEY = 'AIzaSyDcA-LYHVNateEydAvPLg5AaF19sZwM-mY'

client = discovery.build(
  "commentanalyzer",
  "v1alpha1",
  developerKey=API_KEY,
  discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
  static_discovery=False,
)

In [193]:
def formality_query(payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def read_lexicon(lex_path):
    lex_df = pd.read_csv(lex_path, sep='\t')
    lex_df.columns = ['word', 'intensity_score']
    lex_df.drop_duplicates(subset=['word'], inplace=True)
    lex_df.set_index('word', inplace=True)
    return lex_df


def read_csv_dataset(text_path):
    text_df = pd.read_csv(text_path, usecols=['summary'])
    text_df['summary'] = text_df['summary'].apply(lambda sentence: re.findall(r'\w+', sentence.lower()))
    return text_df


def read_huggingfce_dataset(model_strs):
    data = load_dataset(*model_strs)
    text_df = pd.DataFrame()
    for k in data.keys():
        text_df = pd.concat([text_df, data[k].to_pandas()])
    text_df = text_df.rename(columns=new_col_names)
    text_df['summary'] = text_df['summary'].apply(lambda sentence: re.findall(r'\w+', sentence.lower()))
    if 'id' in text_df.columns:
        return text_df[['id', 'document', 'summary']]
    return text_df[['document', 'summary']]


def toxicity_query(text):
    analyze_request = {
        'comment': { 'text': text },
        'requestedAttributes': {'TOXICITY': {}}
    }
    response = client.comments().analyze(body=analyze_request).execute()
    return response

In [201]:

class Measurement:
    def __init__(
            self, 
            data_df,
            dataset_specs,
            rate_limiter_params={'max_calls':64, 'period':10},
            no_samples=1000,
            lex_dir_prefix = '/Users/madisonthantu/Desktop/COMS 6998/Final Project/recursive_LLMs/Data/NRC-Emotion-Intensity-Lexicon/OneFilePerEmotion/',
            DEBUG=False
        ):
        self.data_df = data_df
        self.rate_limiter = RateLimiter(**rate_limiter_params)
        assert(Path(lex_dir_prefix).exists())
        self.lex_dir_prefix = lex_dir_prefix
        
        assert(k in dataset_specs.keys() for k in ['generation', 'subject']), "Must supply the dataset specs"
        self.config = {
            'subject': dataset_specs['subject'],
            'generation': dataset_specs['generation'],
            'no_samples': no_samples,
            'DEBUG': DEBUG
        }
        
    def compute_coverage(self):
        coverage = self.data_df.apply(lambda x: len(set(x['summary_toks']).intersection(set(x['document_toks']))), axis=1)
        coverage = coverage.divide(self.data_df['document_toks'].apply(lambda x: len(set(x))))
        return coverage.mean()
    
    def compute_compression_ratio(self):
        ratio = self.data_df['summary'].apply(lambda x: len(x.split(" "))).divide(self.data_df['document'].apply(lambda x: len(x.split(" "))))
        return ratio.mean()
    
    def compute_summary_token_distribution(self):
        fdist = FreqDist(tok for tok in list(chain.from_iterable(self.data_df['summary_toks'])))
        return fdist
    
    
    def evaluate_formality(self):
        sample_idxs = rng.integers(0, self.data_df.shape[0], size=self.config['no_samples'])
        formality_scores = np.zeros(self.config['no_samples'])
        print("Evaluating formality ...")
        for idx in tqdm(range(self.config['no_samples'])):
            with self.rate_limiter:
                response = formality_query({
                    "inputs": self.data_df.iloc[sample_idxs[idx]]['summary']
                })
                try:
                    formality_scores[idx] = response[0][0]['score']
                except:
                    assert('error' in response.keys())
                    print(f"Formality Eval - Time limit exceeded, sleeping for 10sec, No. samples evaluated = {idx}")
                    time.sleep(10)
                    idx -= 1
        if self.config['DEBUG']:
            return formality_scores, sample_idxs
        return formality_scores
    
    
    def evaluate_toxicity(self):
        sample_idxs = rng.integers(0, self.data_df.shape[0], size=self.config['no_samples']).astype(int)
        toxicity_scores = np.zeros(self.config['no_samples'])
        print("Evaluating toxicity")
        for idx in tqdm(range(self.config['no_samples'])):
            with self.rate_limiter:
                try:
                    response = toxicity_query(self.data_df.iloc[int(sample_idxs[idx])]['summary'])
                    toxicity_scores[idx] = response['attributeScores']['TOXICITY']['summaryScore']['value']
                except HttpError:
                    print(f"Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = {idx}")
                    time.sleep(69)
                    idx -= 1
        if self.config['DEBUG']:
            return toxicity_scores, sample_idxs
        return toxicity_scores
    
    
    def evaluate_emotion_intensity(
            self,
            lex_dir_suffix = '-NRC-Emotion-Intensity-Lexicon-v1.txt', 
            lex_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        ):
        """
        What it do: 
            Create a dataframe where each column corresponds to one of the 8 emotions, 
            the value corresponds to the sum of intensity scores divided by the 
            number of tokens in the summary.
        """
        emot_scores_df = pd.DataFrame()
        summ_tok_count = self.data_df['summary'].apply(lambda x: len(x)).to_numpy()
        weighted_avg = np.zeros(self.data_df.shape[0])
        
        print("Evaluating emotion intensity ...")
        for LEX in tqdm(lex_names):
            lex_path = self.lex_dir_prefix + LEX + lex_dir_suffix
            lex_df = read_lexicon(lex_path)
            score_var, cnt_var = f"{LEX}_score_avg", f"{LEX}_tok_cnt"
            res = self.data_df['summary'].apply(lambda toks: lex_df.index.str.fullmatch('|'.join(toks)))
            emot_scores_df[score_var] = res.apply(lambda emot_toks: lex_df[emot_toks]['intensity_score'].mean()).fillna(0)
            emot_scores_df[cnt_var] = np.stack(res.values, dtype=int).sum(axis=1)
            w = np.divide(emot_scores_df[cnt_var].to_numpy(), summ_tok_count) * emot_scores_df[score_var].to_numpy()
            weighted_avg = np.add(weighted_avg, w)
            
        emot_scores_df = emot_scores_df.fillna(0)
        emot_scores_df['num_summary_tokens'] = summ_tok_count
        emot_scores_df["weighted_avg"] = weighted_avg
            
        return emot_scores_df.iloc[:,::-1]
        
        
    def measure(self):
        measurements = {}
        
        self.data_df['document_toks'] = self.data_df['document'].apply(lambda sentence: re.findall(r'\w+', sentence.lower()))
        self.data_df['summary_toks'] = self.data_df['summary'].apply(lambda sentence: re.findall(r'\w+', sentence.lower()))
        
        measurements['coverage'] = self.compute_coverage()
        measurements['compression_ratio'] = self.compute_compression_ratio()
        measurements['summary_token_distribution'] = self.compute_summary_token_distribution()
        
        formality_eval = self.evaluate_formality()
        toxicity_eval = self.evaluate_toxicity()
        
        if self.config['DEBUG']:
            measurements['formality_scores'], measurements['formality_sample_idxs'] = formality_eval
            measurements['toxicity_scores'], measurements['toxicity_sample_idxs'] = toxicity_eval
        else:
            measurements['formality_scores'] = formality_eval
            measurements['toxicity_scores'] = toxicity_eval
        measurements['formality_mean'] = measurements['formality_scores'].mean()
        measurements['toxicity_mean'] = measurements['toxicity_scores'].mean()
        
        emot_df = self.evaluate_emotion_intensity()
        measurements['emotion_intensity_mean'] = emot_df['weighted_avg'].to_numpy().mean()
        measurements['emotion_intensity_measurements'] = emot_df.to_dict()
        
        return self.config, measurements
        

In [202]:
dialogue_df = dialogue_dataset.to_pandas()
subject = 'dialogue'
dataset_specs = {
        'generation':0, 
        'subject':subject
    }

measurements = Measurement(dialogue_df, dataset_specs)

config, measured2 = measurements.measure()

Evaluating formality ...


100%|██████████| 1000/1000 [03:07<00:00,  5.35it/s]


Evaluating toxicity


  7%|▋         | 74/1000 [00:12<02:30,  6.13it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 74


  8%|▊         | 76/1000 [01:21<3:45:24, 14.64s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 76


  8%|▊         | 77/1000 [02:30<7:56:13, 30.96s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 77


  8%|▊         | 81/1000 [03:39<3:44:28, 14.66s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 81


  8%|▊         | 82/1000 [04:48<7:53:51, 30.97s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 82


  9%|▉         | 92/1000 [05:59<32:16,  2.13s/it]   

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 92


 10%|█         | 103/1000 [07:09<17:04,  1.14s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 103


 18%|█▊        | 175/1000 [08:29<02:44,  5.03it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 175


 18%|█▊        | 176/1000 [09:38<4:13:06, 18.43s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 176


 18%|█▊        | 179/1000 [10:47<3:47:23, 16.62s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 179


 18%|█▊        | 182/1000 [11:56<3:57:49, 17.44s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 182


 18%|█▊        | 183/1000 [13:05<6:49:29, 30.07s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 183


 18%|█▊        | 185/1000 [14:15<6:38:49, 29.36s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 185


 19%|█▊        | 186/1000 [15:24<9:08:14, 40.41s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 186


 19%|█▊        | 187/1000 [16:33<10:57:59, 48.56s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 187


 20%|█▉        | 195/1000 [17:42<1:42:44,  7.66s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 195


 33%|███▎      | 328/1000 [19:12<03:33,  3.14it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 328


 34%|███▎      | 336/1000 [20:22<27:09,  2.45s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 336


 34%|███▍      | 339/1000 [21:31<1:47:59,  9.80s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 340


 34%|███▍      | 341/1000 [22:41<3:42:35, 20.27s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 341


 34%|███▍      | 342/1000 [23:50<5:46:20, 31.58s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 342


 36%|███▌      | 360/1000 [25:01<04:33,  2.34it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 361


 43%|████▎     | 430/1000 [26:20<03:14,  2.94it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 430


 43%|████▎     | 431/1000 [27:30<2:53:34, 18.30s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 431


 43%|████▎     | 432/1000 [28:39<5:03:57, 32.11s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 432


 43%|████▎     | 433/1000 [29:48<6:41:07, 42.45s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 433


 44%|████▍     | 444/1000 [30:58<22:56,  2.48s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 444


 45%|████▍     | 446/1000 [32:07<2:10:31, 14.14s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 446


 51%|█████     | 511/1000 [33:26<06:34,  1.24it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 511


 55%|█████▌    | 552/1000 [34:40<00:51,  8.64it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 552


 62%|██████▏   | 624/1000 [36:00<01:23,  4.50it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 624


 62%|██████▎   | 625/1000 [37:09<1:44:30, 16.72s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 625


 63%|██████▎   | 631/1000 [38:19<53:07,  8.64s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 631


 64%|██████▎   | 636/1000 [39:28<47:00,  7.75s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 637


 64%|██████▍   | 641/1000 [40:38<48:57,  8.18s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 641


 67%|██████▋   | 669/1000 [41:50<00:41,  8.04it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 669


 74%|███████▎  | 735/1000 [43:09<02:16,  1.94it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 735


 74%|███████▍  | 743/1000 [44:19<12:58,  3.03s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 743


 75%|███████▍  | 747/1000 [45:28<30:17,  7.19s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 748


 85%|████████▍ | 846/1000 [46:51<00:19,  7.98it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 846


 85%|████████▍ | 847/1000 [48:00<49:15, 19.31s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 847


 85%|████████▌ | 850/1000 [49:10<42:15, 16.90s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 851


 86%|████████▌ | 857/1000 [50:19<15:34,  6.53s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 858


 86%|████████▋ | 865/1000 [51:29<07:55,  3.52s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 865


 87%|████████▋ | 870/1000 [52:39<13:08,  6.06s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 870


 94%|█████████▍| 942/1000 [53:59<00:11,  5.12it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 942


 94%|█████████▍| 943/1000 [55:08<16:47, 17.67s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 943


 94%|█████████▍| 945/1000 [56:17<20:51, 22.76s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 945


 95%|█████████▍| 947/1000 [57:26<22:35, 25.58s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 947


 95%|█████████▍| 948/1000 [58:35<33:07, 38.23s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 948


 95%|█████████▌| 951/1000 [59:45<19:11, 23.51s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 951


 96%|█████████▌| 959/1000 [1:00:55<03:01,  4.44s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 959


100%|██████████| 1000/1000 [1:02:08<00:00,  3.73s/it]


Evaluating emotion intensity ...


  0%|          | 0/8 [00:00<?, ?it/s]


error: nothing to repeat at position 66

In [None]:
dialogue_df = dialogue_dataset.to_pandas()
subject = 'dialogue'
dataset_specs = {
        'generation':0, 
        'subject':subject
    }

measurements = Measurement(dialogue_df, dataset_specs)

config, measured2 = measurements.measure()

In [198]:
measured2.keys()

dict_keys(['coverage', 'compression_ratio', 'summary_token_distribution', 'formality_scores', 'toxicity_scores', 'formality_mean', 'toxicity_mean'])

In [123]:
dialogue_df = dialogue_dataset.to_pandas()

measurements = Measurement(dialogue_df)

df, measured = measurements.measure()

0        0.600000
1        0.687500
2        0.157303
3        0.826087
4        0.169811
           ...   
16364    0.084848
16365    0.195876
16366    0.067873
16367    0.500000
16368    0.256098
Length: 16369, dtype: float64


In [None]:
small_df = dialogue_df.iloc[:50]
subject = 'dialogue'
dataset_specs = {
        'generation':0, 
        'subject':subject
    }

measurements_small = Measurement(small_df, dataset_specs, no_samples=15, DEBUG=True)