In [1]:
from measurement import Measurement
import globals 

globals.init()
# Never echo the credential itself: cell outputs are saved with the notebook and
# would leak the key to anyone who can read the file. Report only whether a
# key is present.
print("API key loaded:", bool(globals.API_KEY))

macOS-14.1-arm64-arm-64bit
<REDACTED: API key removed from saved output; rotate this credential>


In [2]:
from datasets import load_dataset_builder, load_dataset, concatenate_datasets
import pickle
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
import re

In [3]:
def read_lexicon(lex_path):
    """Load a tab-separated, two-column lexicon file indexed by word.

    The columns are renamed to 'word' and 'intensity_score'; when a word
    appears more than once only its first row is kept, and 'word' becomes
    the index of the returned DataFrame.

    NOTE(review): `read_csv` is called with its default header handling, so
    the file's first line is consumed as a header — confirm the lexicon files
    actually start with a header row, otherwise the first entry is lost.
    """
    table = pd.read_csv(lex_path, sep='\t')
    table.columns = ['word', 'intensity_score']
    return table.drop_duplicates(subset=['word']).set_index('word')


def preprocess_dataset(df):
    """Rename columns and tokenize the 'summary' column.

    Columns are renamed through `globals.new_col_names`; every summary string
    is lower-cased and split into its `\\w+` runs, so the 'summary' column of
    the result holds lists of tokens. The returned frame keeps
    ['id', 'document', 'summary'] when an 'id' column exists after renaming,
    otherwise ['document', 'summary'].
    """
    def _tokenize(sentence):
        return re.findall(r'\w+', sentence.lower())

    renamed = df.rename(columns=globals.new_col_names)
    renamed['summary'] = renamed['summary'].apply(_tokenize)

    kept_columns = ['document', 'summary']
    if 'id' in renamed.columns:
        kept_columns = ['id'] + kept_columns
    return renamed[kept_columns]


def process_emot_df(
        df,
        lex_dir_prefix = '/Users/madisonthantu/Desktop/COMS 6998/Final Project/recursive_LLMs/Data/NRC-Emotion-Intensity-Lexicon/OneFilePerEmotion/',
        lex_dir_suffix = '-NRC-Emotion-Intensity-Lexicon-v1.txt', 
        lex_names = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
    ):
    """
    Score every summary in `df` against a set of emotion-intensity lexicons.

    `df` is first passed through `preprocess_dataset`, which tokenizes the
    summaries. For each name in `lex_names` the file
    `lex_dir_prefix + name + lex_dir_suffix` is loaded with `read_lexicon`
    and two columns are produced, one value per summary:

        * `<name>_score_avg` - mean intensity score of the lexicon words that
          occur in the summary (0 when none occur).
        * `<name>_tok_cnt`   - number of lexicon words that occur in the
          summary (each lexicon word counts once, however often it repeats).

    Each lexicon contributes `tok_cnt / num_summary_tokens * score_avg` to the
    per-summary `weighted_avg`, i.e. the sum of matched intensity scores
    divided by the number of tokens in the summary. A summary with zero
    tokens contributes 0 rather than NaN.

    Args:
        df: DataFrame accepted by `preprocess_dataset`.
        lex_dir_prefix: directory prefix of the lexicon files.
            NOTE(review): the default is an absolute path on one machine;
            callers elsewhere should pass their own location.
        lex_dir_suffix: file-name suffix appended after the lexicon name.
        lex_names: lexicon names to evaluate; not modified by this function.

    Returns:
        Tuple `(emot_scores_df, summ_tok_count, weighted_avg)`:
        `emot_scores_df` holds the per-lexicon columns plus
        'num_summary_tokens' and 'weighted_avg', with the column order
        reversed so 'weighted_avg' comes first; `summ_tok_count` and
        `weighted_avg` are the same values as 1-D NumPy arrays.
    """
    text_df = preprocess_dataset(df)
    summaries = text_df['summary']
    n_rows = text_df.shape[0]
    summ_tok_count = summaries.apply(len).to_numpy(dtype=int)
    # Rows with no tokens would otherwise produce 0/0 = NaN below.
    has_tokens = summ_tok_count > 0
    emot_scores_df = pd.DataFrame()
    weighted_avg = np.zeros(n_rows)

    print("Evaluating emotion intensity ...")
    for LEX in tqdm(lex_names):
        lex_df = read_lexicon(lex_dir_prefix + LEX + lex_dir_suffix)
        lex_words = lex_df.index
        intensity = lex_df['intensity_score']

        score_var, cnt_var = f"{LEX}_score_avg", f"{LEX}_tok_cnt"

        mean_scores = np.zeros(n_rows)
        hit_counts = np.zeros(n_rows, dtype=int)
        for row, toks in enumerate(summaries):
            # Tokens are plain `\w+` strings, so exact membership is all that
            # is needed; this avoids compiling and running a large regex
            # alternation over the whole lexicon for every summary.
            hits = lex_words.isin(toks)
            hit_counts[row] = hits.sum()
            if hit_counts[row]:
                mean_scores[row] = intensity[hits].mean()

        # fillna covers the case where every matched intensity score is missing.
        emot_scores_df[score_var] = pd.Series(mean_scores, index=text_df.index).fillna(0)
        emot_scores_df[cnt_var] = hit_counts

        # Fraction of the summary's tokens matched; left at 0 where the
        # summary is empty instead of dividing by zero.
        matched_frac = np.divide(
            hit_counts, summ_tok_count, out=np.zeros(n_rows), where=has_tokens
        )
        weighted_avg = weighted_avg + matched_frac * emot_scores_df[score_var].to_numpy()

    emot_scores_df = emot_scores_df.fillna(0)
    emot_scores_df['num_summary_tokens'] = summ_tok_count
    emot_scores_df["weighted_avg"] = weighted_avg

    return emot_scores_df.iloc[:,::-1], summ_tok_count, weighted_avg

In [37]:
def load_dataset_from_huggingface(model_strs):
    """
    Load every split of a Hugging Face dataset into a single DataFrame.

    Args:
        model_strs: positional arguments forwarded to `load_dataset`
            (for example `["cnn_dailymail", "2.0.0"]`).

    Returns:
        DataFrame whose columns were renamed through `globals.new_col_names`,
        restricted to ['id', 'document', 'summary'] when an 'id' column is
        present and to ['document', 'summary'] otherwise. Rows of all splits
        are stacked in the order the splits are listed, under a fresh
        0..n-1 index.
    """
    data = load_dataset(*model_strs)
    # Convert each split once and concatenate in a single call instead of
    # repeatedly re-copying a growing frame. `ignore_index=True` gives every
    # row a unique label; otherwise each split keeps its own index labels
    # (presumably restarting at 0), which makes label-based lookups ambiguous.
    split_frames = [data[k].to_pandas() for k in data.keys()]
    text_df = pd.concat(split_frames, ignore_index=True) if split_frames else pd.DataFrame()
    text_df = text_df.rename(columns=globals.new_col_names)
    if 'id' in text_df.columns:
        return text_df[['id', 'document', 'summary']]
    return text_df[['document', 'summary']]

In [38]:
# Load each subject's dataset up front, keyed by subject name; the values are
# the argument lists handed to `load_dataset_from_huggingface`.
all_datasets = dict(
    (subject_name, load_dataset_from_huggingface(hf_args))
    for subject_name, hf_args in (
        ('news', ["cnn_dailymail", "2.0.0"]),
        ('dialogue', ['samsum']),
        ('reddit', ["reddit_tifu", 'long']),
    )
)
# Filled in by the per-subject measurement cells.
all_measurements = dict()

In [56]:
subject = 'dialogue'
dataset_specs = {
        'generation':0, 
        'subject':subject
    }

# small_df = all_datasets[subject].iloc[:10]
# measurements = Measurement(small_df, dataset_specs, no_samples=4, DEBUG=True)
# config, measurements = measurements.measure()

measurements = Measurement(all_datasets[subject], dataset_specs, DEBUG=True)
res = measurements.measure()
# Key the entry by the `subject` variable, not the literal string 'subject',
# so the result is retrievable as all_measurements['dialogue'] in the same way
# the 'news' and 'reddit' cells store theirs.
all_measurements[subject] = {
    'config':res[0],
    'metrics':res[1]
}

# small_df

Evaluating formality ...


100%|██████████| 1000/1000 [03:13<00:00,  5.17it/s]


Evaluating toxicity


  6%|▋         | 65/1000 [00:10<03:17,  4.72it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 65


  8%|▊         | 80/1000 [01:21<07:36,  2.01it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 80


 15%|█▍        | 149/1000 [02:40<04:27,  3.18it/s] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 149


 15%|█▌        | 154/1000 [03:50<1:17:51,  5.52s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 155


 16%|█▌        | 156/1000 [04:59<3:58:35, 16.96s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 157


 16%|█▌        | 159/1000 [06:08<4:20:03, 18.55s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 159


 16%|█▌        | 161/1000 [07:17<5:16:58, 22.67s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 161


 18%|█▊        | 176/1000 [08:28<10:19,  1.33it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 176


 31%|███       | 306/1000 [09:57<05:18,  2.18it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 306


 31%|███       | 309/1000 [11:07<1:54:05,  9.91s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 309


 34%|███▍      | 342/1000 [12:20<01:24,  7.76it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 343


 41%|████      | 411/1000 [13:39<03:38,  2.70it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 411


 41%|████▏     | 413/1000 [14:48<2:00:35, 12.33s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 413


 42%|████▏     | 415/1000 [15:58<3:13:27, 19.84s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 415


 42%|████▏     | 420/1000 [17:07<1:37:30, 10.09s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 420


 45%|████▍     | 449/1000 [18:20<01:10,  7.86it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 449


 52%|█████▏    | 520/1000 [19:39<01:42,  4.66it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 520


 52%|█████▏    | 521/1000 [20:48<2:17:10, 17.18s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 521


 52%|█████▏    | 522/1000 [21:58<4:04:31, 30.69s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 522


 53%|█████▎    | 526/1000 [23:07<1:59:27, 15.12s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 526


 53%|█████▎    | 527/1000 [24:16<4:03:35, 30.90s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 527


 53%|█████▎    | 528/1000 [25:25<5:31:35, 42.15s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 528


 54%|█████▎    | 536/1000 [26:35<43:46,  5.66s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 537


 55%|█████▍    | 546/1000 [27:45<14:02,  1.85s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 546


 61%|██████    | 611/1000 [29:04<05:40,  1.14it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 612


 63%|██████▎   | 633/1000 [30:15<01:17,  4.72it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 633


 67%|██████▋   | 674/1000 [31:29<00:35,  9.14it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 674


 80%|████████  | 803/1000 [32:58<02:25,  1.36it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 803


 82%|████████▏ | 820/1000 [34:09<00:49,  3.62it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 820


 89%|████████▉ | 891/1000 [35:29<00:22,  4.77it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 892


 90%|████████▉ | 895/1000 [36:38<16:53,  9.66s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 895


 90%|████████▉ | 896/1000 [37:48<39:20, 22.70s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 896


 90%|████████▉ | 897/1000 [38:57<57:58, 33.77s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 897


 90%|████████▉ | 898/1000 [40:06<1:12:40, 42.75s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 899


 90%|█████████ | 900/1000 [41:15<1:05:30, 39.30s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 901


 90%|█████████ | 905/1000 [42:24<27:07, 17.13s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 905


 91%|█████████ | 908/1000 [43:34<27:01, 17.63s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 908


 92%|█████████▏| 922/1000 [44:44<00:53,  1.46it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 923


 99%|█████████▉| 992/1000 [46:04<00:02,  3.17it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 992


100%|█████████▉| 997/1000 [47:13<00:20,  6.77s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 997


100%|█████████▉| 998/1000 [48:23<00:40, 20.40s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 998


100%|██████████| 1000/1000 [49:32<00:00,  2.97s/it]


Evaluating emotion intensity ...


100%|██████████| 8/8 [01:02<00:00,  7.87s/it]


In [75]:
subject = 'news'
dataset_specs = dict(generation=0, subject=subject)

# Measure this subject's dataset; element 0 of the result is stored as the
# config and element 1 as the metrics.
measurements = Measurement(all_datasets[subject], dataset_specs, DEBUG=True)
res = measurements.measure()
all_measurements[subject] = dict(config=res[0], metrics=res[1])

Evaluating formality ...


  0%|          | 0/1000 [00:00<?, ?it/s]

Formality Eval - Time limit exceeded, sleeping for 10sec, No. samples evaluated = 0


  0%|          | 1/1000 [00:10<2:53:29, 10.42s/it]

Formality Eval - Time limit exceeded, sleeping for 10sec, No. samples evaluated = 1


100%|██████████| 1000/1000 [03:33<00:00,  4.68it/s]


Evaluating toxicity


  8%|▊         | 77/1000 [00:11<01:51,  8.25it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 77


  8%|▊         | 78/1000 [01:20<4:49:57, 18.87s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 78


  8%|▊         | 79/1000 [02:29<8:25:13, 32.91s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 79


  8%|▊         | 84/1000 [03:39<3:15:10, 12.78s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 84


 10%|▉         | 99/1000 [04:50<11:53,  1.26it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 100


 10%|█         | 105/1000 [05:59<1:09:44,  4.68s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 105


 15%|█▌        | 151/1000 [07:13<01:32,  9.17it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 151


 22%|██▏       | 224/1000 [08:33<02:10,  5.94it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 224


 23%|██▎       | 227/1000 [09:43<2:24:36, 11.22s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 227


 23%|██▎       | 229/1000 [10:52<4:02:43, 18.89s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 229


 23%|██▎       | 230/1000 [12:01<6:57:11, 32.51s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 230


 23%|██▎       | 232/1000 [13:10<6:31:14, 30.57s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 232


 23%|██▎       | 233/1000 [14:19<8:53:10, 41.71s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 233


 24%|██▎       | 235/1000 [15:28<7:27:20, 35.09s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 235


 24%|██▎       | 237/1000 [16:37<6:43:48, 31.75s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 237


 25%|██▌       | 251/1000 [17:48<08:49,  1.41it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 251


 26%|██▌       | 255/1000 [18:57<1:39:57,  8.05s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 255


 33%|███▎      | 329/1000 [20:18<01:48,  6.19it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 329


 33%|███▎      | 330/1000 [21:27<3:30:50, 18.88s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 330


 33%|███▎      | 331/1000 [22:36<6:06:57, 32.91s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 331


 33%|███▎      | 333/1000 [23:45<5:41:51, 30.75s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 333


 34%|███▎      | 335/1000 [24:54<5:28:19, 29.62s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 335


 34%|███▎      | 337/1000 [26:03<5:21:23, 29.09s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 338


 34%|███▍      | 342/1000 [27:13<2:21:23, 12.89s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 342


 34%|███▍      | 344/1000 [28:22<3:43:41, 20.46s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 344


 34%|███▍      | 345/1000 [29:31<6:16:54, 34.53s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 345


 35%|███▌      | 351/1000 [30:41<1:44:04,  9.62s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 351


 36%|███▌      | 356/1000 [31:50<1:12:40,  6.77s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 356


 42%|████▏     | 423/1000 [33:10<03:54,  2.46it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 423


 42%|████▏     | 424/1000 [34:19<3:05:37, 19.34s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 424


 42%|████▎     | 425/1000 [35:28<5:19:51, 33.38s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 425


 46%|████▋     | 463/1000 [36:41<01:01,  8.68it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 464


 55%|█████▍    | 548/1000 [38:03<00:56,  8.02it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 548


 59%|█████▉    | 592/1000 [39:17<00:45,  8.89it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 592


 65%|██████▍   | 649/1000 [40:33<00:40,  8.74it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 649


 72%|███████▏  | 719/1000 [41:52<01:17,  3.61it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 719


 72%|███████▏  | 720/1000 [43:02<1:16:53, 16.48s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 720


 72%|███████▏  | 722/1000 [44:11<1:41:25, 21.89s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 722


 72%|███████▎  | 725/1000 [45:20<1:22:02, 17.90s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 725


 73%|███████▎  | 728/1000 [46:29<1:14:14, 16.38s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 729


 74%|███████▎  | 735/1000 [47:39<28:30,  6.46s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 735


 75%|███████▍  | 746/1000 [48:49<05:34,  1.32s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 746


 82%|████████▎ | 825/1000 [50:10<00:20,  8.45it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 826


 83%|████████▎ | 827/1000 [51:19<46:07, 16.00s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 827


 83%|████████▎ | 829/1000 [52:28<1:01:26, 21.56s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 829


 84%|████████▎ | 836/1000 [53:38<16:30,  6.04s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 836


 84%|████████▍ | 840/1000 [54:47<22:14,  8.34s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 840


 85%|████████▌ | 852/1000 [55:58<02:19,  1.06it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 852


 92%|█████████▏| 923/1000 [57:18<00:16,  4.57it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 923


 92%|█████████▏| 924/1000 [58:27<22:24, 17.69s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 924


 93%|█████████▎| 927/1000 [59:36<19:55, 16.37s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 928


100%|█████████▉| 999/1000 [1:00:56<00:00,  4.46it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 999


100%|██████████| 1000/1000 [1:02:05<00:00,  3.73s/it]


Evaluating emotion intensity ...


100%|██████████| 8/8 [35:59<00:00, 269.93s/it]


In [68]:
subject = 'reddit'
dataset_specs = dict(generation=0, subject=subject)

# Measure this subject's dataset; element 0 of the result is stored as the
# config and element 1 as the metrics.
measurements = Measurement(all_datasets[subject], dataset_specs, DEBUG=True)
res = measurements.measure()
all_measurements[subject] = dict(config=res[0], metrics=res[1])

Evaluating formality ...


  0%|          | 0/1000 [00:00<?, ?it/s]

Formality Eval - Time limit exceeded, sleeping for 10sec, No. samples evaluated = 0


  0%|          | 1/1000 [00:10<2:52:37, 10.37s/it]

Formality Eval - Time limit exceeded, sleeping for 10sec, No. samples evaluated = 1


100%|██████████| 1000/1000 [03:30<00:00,  4.76it/s]


Evaluating toxicity


  8%|▊         | 79/1000 [00:12<02:07,  7.23it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 79


  8%|▊         | 80/1000 [01:21<5:03:54, 19.82s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 80


  8%|▊         | 81/1000 [02:30<8:42:19, 34.10s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 81


  9%|▊         | 86/1000 [03:40<2:46:46, 10.95s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 86


  9%|▊         | 87/1000 [04:49<7:10:43, 28.31s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 87


  9%|▉         | 91/1000 [05:58<3:32:05, 14.00s/it] 

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 92


  9%|▉         | 93/1000 [07:07<5:55:04, 23.49s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 93


 10%|▉         | 97/1000 [08:17<3:24:26, 13.58s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 97


 10%|█         | 102/1000 [09:27<1:51:26,  7.45s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 102


 11%|█         | 107/1000 [10:36<1:34:09,  6.33s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 107


 11%|█▏        | 113/1000 [11:46<1:10:08,  4.74s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 113


 12%|█▏        | 115/1000 [12:55<3:45:48, 15.31s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 115


 12%|█▏        | 122/1000 [14:05<1:11:11,  4.86s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 122


 19%|█▊        | 187/1000 [15:24<09:34,  1.42it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 187


 25%|██▍       | 248/1000 [16:40<01:28,  8.52it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 248


 32%|███▏      | 318/1000 [18:00<03:08,  3.63it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 318


 32%|███▏      | 322/1000 [19:10<1:16:03,  6.73s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 322


 32%|███▏      | 324/1000 [20:19<3:16:15, 17.42s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 324


 33%|███▎      | 334/1000 [21:29<21:33,  1.94s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 335


 34%|███▍      | 340/1000 [22:39<1:00:07,  5.47s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 341


 36%|███▌      | 358/1000 [23:50<02:59,  3.58it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 359


 36%|███▌      | 362/1000 [24:59<1:32:36,  8.71s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 362


 43%|████▎     | 430/1000 [26:19<03:53,  2.44it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 430


 44%|████▍     | 441/1000 [27:29<11:09,  1.20s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 441


 45%|████▍     | 447/1000 [28:39<32:57,  3.58s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 447


 45%|████▌     | 450/1000 [29:48<1:43:17, 11.27s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 450


 45%|████▌     | 451/1000 [30:57<4:19:07, 28.32s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 451


 52%|█████▏    | 518/1000 [32:16<04:06,  1.96it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 518


 52%|█████▏    | 521/1000 [33:26<1:22:23, 10.32s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 521


 52%|█████▎    | 525/1000 [34:35<1:23:52, 10.59s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 526


 54%|█████▎    | 535/1000 [35:45<22:49,  2.95s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 535


 61%|██████    | 606/1000 [37:05<01:23,  4.71it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 606


 61%|██████    | 608/1000 [38:14<1:35:12, 14.57s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 609


 61%|██████    | 611/1000 [39:24<1:56:11, 17.92s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 611


 61%|██████    | 612/1000 [40:33<3:22:13, 31.27s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 612


 62%|██████▏   | 617/1000 [41:42<1:09:10, 10.84s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 617


 62%|██████▏   | 620/1000 [42:51<1:28:02, 13.90s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 620


 62%|██████▏   | 622/1000 [44:01<2:14:19, 21.32s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 622


 62%|██████▏   | 624/1000 [45:10<2:36:29, 24.97s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 624


 69%|██████▉   | 694/1000 [46:30<01:24,  3.61it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 694


 70%|██████▉   | 696/1000 [47:39<1:07:09, 13.26s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 696


 70%|██████▉   | 697/1000 [48:48<2:25:26, 28.80s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 697


 70%|██████▉   | 699/1000 [49:57<2:23:47, 28.66s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 699


 70%|███████   | 700/1000 [51:06<3:22:19, 40.47s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 700


 71%|███████   | 711/1000 [52:16<10:02,  2.08s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 711


 76%|███████▌  | 760/1000 [53:31<00:31,  7.60it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 761


 80%|███████▉  | 799/1000 [54:45<00:25,  7.90it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 799


 81%|████████  | 806/1000 [55:55<11:26,  3.54s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 806


 88%|████████▊ | 877/1000 [57:15<00:25,  4.80it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 877


 88%|████████▊ | 878/1000 [58:24<41:51, 20.58s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 878


 88%|████████▊ | 879/1000 [59:33<1:10:33, 34.99s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 879


 88%|████████▊ | 880/1000 [1:00:42<1:30:27, 45.23s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 880


 88%|████████▊ | 882/1000 [1:01:52<1:12:20, 36.78s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 882


 89%|████████▉ | 890/1000 [1:03:01<08:27,  4.62s/it]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 890


 89%|████████▉ | 891/1000 [1:04:10<38:59, 21.46s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 891


 91%|█████████ | 906/1000 [1:05:21<01:25,  1.11it/s]  

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 906


 98%|█████████▊| 976/1000 [1:06:41<00:06,  3.84it/s]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 976


 98%|█████████▊| 977/1000 [1:07:50<06:20, 16.56s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 977


 98%|█████████▊| 979/1000 [1:08:59<07:40, 21.94s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 979


 98%|█████████▊| 984/1000 [1:10:08<02:49, 10.59s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 984


 98%|█████████▊| 985/1000 [1:11:18<06:23, 25.60s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 985


 99%|█████████▊| 986/1000 [1:12:27<08:41, 37.26s/it]

Toxicity Eval - Time limit exceeded, sleeping for 69sec, No. samples evaluated = 986


100%|██████████| 1000/1000 [1:13:37<00:00,  4.42s/it]


Evaluating emotion intensity ...


  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
  weighted_avg = np.add(weighted_avg, w)
100%|██████████| 8/8 [02:39<00:00, 19.96s/it]


In [81]:
# Display the result of the most recently executed `measure()` call; its first
# element is stored as 'config' and its second as 'metrics' by the measurement cells.
# NOTE(review): which subject this shows depends on cell execution order.
res

({'subject': 'news', 'generation': 0, 'no_samples': 1000, 'DEBUG': True},
 {'coverage': 0.13059483552380616,
  'compression_ratio': 0.08673687888536863,
  'summary_token_distribution': FreqDist({'the': 614802, 'to': 401344, 'in': 361262, 'of': 315525, 'a': 308201, 'and': 287636, 's': 175456, 'was': 159075, 'for': 158682, 'on': 155509, ...}),
  'formality_scores': array([0.        , 0.        , 0.65817285, 0.61334544, 0.57323146,
         0.50581884, 0.74573767, 0.77405018, 0.50464153, 0.53044152,
         0.87764269, 0.6860916 , 0.78842229, 0.56247801, 0.65897119,
         0.61234808, 0.59759265, 0.67486769, 0.75104058, 0.79654384,
         0.62166816, 0.6740393 , 0.57174528, 0.68555921, 0.74969351,
         0.51926613, 0.58880001, 0.60249305, 0.53011906, 0.7121467 ,
         0.74967897, 0.74498868, 0.68969727, 0.54718226, 0.66668236,
         0.67949039, 0.72146928, 0.66799682, 0.52042693, 0.76333308,
         0.79141021, 0.62560898, 0.75742179, 0.61250705, 0.58990473,
         0.6662

In [76]:
def generate_dataset_analysis_path(config):
    """Return the relative output directory for a measurement config.

    The path has the form `initial_datasets/<subject>/generation<generation>`,
    built from the config's 'subject' and 'generation' entries.
    """
    path_parts = ("initial_datasets", config['subject'], f"generation{config['generation']}")
    return os.path.join(*path_parts)

# Persist each subject's config and metrics as pickles under the directory
# derived from its config.
for k in all_measurements:
    config = all_measurements[k]['config']
    metrics = all_measurements[k]['metrics']
    path = generate_dataset_analysis_path(config)
    # exist_ok makes directory creation a no-op on re-runs and avoids the
    # check-then-create race.
    os.makedirs(path, exist_ok=True)
    # Context managers flush and close each file even if pickling raises.
    with open(os.path.join(path, "config.p"), "wb") as config_file:
        pickle.dump(config, config_file)
    with open(os.path.join(path, "measurements.p"), "wb") as metrics_file:
        pickle.dump(metrics, metrics_file)

In [80]:
# Display the metrics stored for the 'news' subject by its measurement cell.
all_measurements['news']['metrics']

{'coverage': 0.13059483552380616,
 'compression_ratio': 0.08673687888536863,
 'summary_token_distribution': FreqDist({'the': 614802, 'to': 401344, 'in': 361262, 'of': 315525, 'a': 308201, 'and': 287636, 's': 175456, 'was': 159075, 'for': 158682, 'on': 155509, ...}),
 'formality_scores': array([0.        , 0.        , 0.65817285, 0.61334544, 0.57323146,
        0.50581884, 0.74573767, 0.77405018, 0.50464153, 0.53044152,
        0.87764269, 0.6860916 , 0.78842229, 0.56247801, 0.65897119,
        0.61234808, 0.59759265, 0.67486769, 0.75104058, 0.79654384,
        0.62166816, 0.6740393 , 0.57174528, 0.68555921, 0.74969351,
        0.51926613, 0.58880001, 0.60249305, 0.53011906, 0.7121467 ,
        0.74967897, 0.74498868, 0.68969727, 0.54718226, 0.66668236,
        0.67949039, 0.72146928, 0.66799682, 0.52042693, 0.76333308,
        0.79141021, 0.62560898, 0.75742179, 0.61250705, 0.58990473,
        0.66627121, 0.65536147, 0.56482506, 0.52222747, 0.67292005,
        0.76565748, 0.73170245, 0

In [50]:
# Show the metric at position 0 together with its measured value.
# NOTE(review): this indexes `metrics` by position and `measurements` by metric
# name, but no visible cell defines them that way — presumably leftover state
# from the commented-out small-sample run; verify before re-running.
i = 0
metrics[i], measurements[metrics[i]]

('coverage', 0.2541582632476331)

In [51]:
# Show the metric at position 1 together with its measured value.
# NOTE(review): assumes `metrics` is a sequence of metric names and
# `measurements` a mapping keyed by them — TODO confirm; neither is defined
# that way in the visible cells.
i = 1
metrics[i], measurements[metrics[i]]

('compression_ratio', 0.3800084549362253)

In [52]:
# Show the metric at position 2 together with its measured value.
# NOTE(review): assumes `metrics` is a sequence of metric names and
# `measurements` a mapping keyed by them — TODO confirm; neither is defined
# that way in the visible cells.
i = 2
metrics[i], measurements[metrics[i]]

('summary_token_distribution',
 FreqDist({'to': 15, 'the': 10, 'and': 6, 'is': 6, 'his': 6, 'he': 5, 'for': 4, 'in': 4, 'a': 4, 'sam': 3, ...}))

In [53]:
# Show the metric at position 3 together with its measured value.
# NOTE(review): assumes `metrics` is a sequence of metric names and
# `measurements` a mapping keyed by them — TODO confirm; neither is defined
# that way in the visible cells.
i = 3
metrics[i], measurements[metrics[i]]

('formality_scores', array([0.98254561, 0.99667537, 0.9940719 , 0.96123332]))

In [54]:
# Show the metric at position 4 together with its measured value.
# NOTE(review): assumes `metrics` is a sequence of metric names and
# `measurements` a mapping keyed by them — TODO confirm; neither is defined
# that way in the visible cells.
i = 4
metrics[i], measurements[metrics[i]]

('formality_sample_idxs', array([7, 4, 3, 0]))

In [55]:
# Select, by position, the rows of `small_df` listed under the metric at position 4.
# NOTE(review): `small_df` is only assigned in commented-out code, so this cell
# fails on a fresh kernel — confirm it is still needed.
small_df.iloc[measurements[metrics[4]]]

Unnamed: 0,id,document,summary
7,13730463,Sarah: I found a song on youtube and I think y...,Sarah sends James an instrumental song he migh...
4,13728094,Sam: hey overheard rick say something\r\nSam:...,"Sam is confused, because he overheard Rick com..."
3,13730747,"Edward: Rachel, I think I'm in ove with Bella....",Edward thinks he is in love with Bella. Rachel...
0,13818513,Amanda: I baked cookies. Do you want some?\r\...,Amanda baked cookies and will bring Jerry some...
