In [162]:
import numpy as np
import re
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset_builder, load_dataset, concatenate_datasets
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import plotly.express as px

import torch
import platform
import evaluate

print(platform.platform())

import sys
sys.path.insert(1, '../Data')

macOS-14.1-arm64-arm-64bit


In [159]:
from ratelimiter import RateLimiter
import requests
import time

API_URL = "https://api-inference.huggingface.co/models/s-nlp/roberta-base-formality-ranker"
headers = {"Authorization": "Bearer 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"}

# def query(payload):
# 	response = requests.post(API_URL, headers=headers, json=payload).json()[0][0]
# 	response = {
# 		'formality_score':response['score']
# 	}
# 	return response

rng = np.random.default_rng()

def query(payload):
	response = requests.post(API_URL, headers=headers, json=payload)
	return response.json()


def evaluate_formality(data, no_samples=1000, max_calls=100, period=10, DEBUG=False):
    rate_limiter = RateLimiter(max_calls=max_calls, period=period)
    sample_idxs = rng.integers(0, data.shape[0], size=no_samples)
    formality_scores = np.zeros(no_samples)
    for idx in range(no_samples):
        with rate_limiter:
            response = query({
                "inputs": data[sample_idxs[idx]]['summary']
            })
            try:
                formality_scores[idx] = response[0][0]['score']
            except:
                assert('error' in response.keys())
                print(response['error'])
                time.sleep(100)
                idx -= 1
            # print(formality_scores[:10])
    if DEBUG:
        return formality_scores, sample_idxs
    return formality_scores
    
def compute_mean_formality(data, no_samples=1000, max_calls=100, period=10, DEBUG=False):
    assert(not DEBUG), "DEBUG must be False to compute mean formality scores"
    return evaluate_formality(**locals()).mean()


### Lets compare formality evaluations for the summaries associated with each of the 3 datasets!

In [78]:
output = query({
	"I like you. I love you",
})
output

"inputs": "{'I like you. I love you'}"


In [None]:
def preprocess_dataset(dataset, dataset_type, num_samples):
    pass

In [49]:
news_dataset = load_dataset("cnn_dailymail", '2.0.0')
news_dataset = concatenate_datasets([news_dataset[k] for k in news_dataset.keys()])
news_dataset = news_dataset.rename_column('article', 'document')
news_dataset = news_dataset.rename_column('highlights', 'summary')

In [50]:
reddit_dataset = load_dataset("reddit_tifu", 'long')['train'].remove_columns(['ups', 'num_comments', 'upvote_ratio', 'score', 'title'])
reddit_dataset = reddit_dataset.rename_column('documents', 'document')
reddit_dataset = reddit_dataset.rename_column('tldr', 'summary')

In [51]:
dialogue_dataset = load_dataset('samsum')
dialogue_dataset = concatenate_datasets([dialogue_dataset[k] for k in dialogue_dataset.keys()])
dialogue_dataset = dialogue_dataset.rename_column('dialogue', 'document')

In [99]:
dialogue_dataset

Dataset({
    features: ['id', 'document', 'summary'],
    num_rows: 16369
})

In [198]:
news_dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 311971
})

In [199]:
reddit_dataset

Dataset({
    features: ['document', 'summary'],
    num_rows: 42139
})

In [52]:
all_datasets = {
    'news': news_dataset,
    'reddit': reddit_dataset,
    'dialogue': dialogue_dataset
}


In [160]:
formality_results = {}

t = time.time()
for k, data in all_datasets.items():
    formality_results[k] = evaluate_formality(data, no_samples=1000, DEBUG=True)
    print(f"{k} dataset evaluated - {format(time.time() - t, 'f')}")

Model s-nlp/roberta-base-formality-ranker is currently loading
news dataset evaluated - 294.106759
reddit dataset evaluated - 466.806242
Model s-nlp/roberta-base-formality-ranker is currently loading
dialogue dataset evaluated - 740.924127


## Plot results

In [182]:
all_datasets['news'].select([0,1,2])['summary']

["Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund .",
 'Mentally ill inmates in Miami are housed on the "forgotten floor"\nJudge Steven Leifman says most are there as a result of "avoidable felonies"\nWhile CNN tours facility, patient shouts: "I am the son of the president"\nLeifman says the system is unjust and he\'s fighting for change .',
 'NEW: "I thought I was going to die," driver says .\nMan says pickup truck was folded in half; he just has cut on face .\nDriver: "I probably had a 30-, 35-foot free fall"\nMinnesota bridge collapsed during rush hour Wednesday .']

In [189]:
formality_df = pd.DataFrame(columns=['subject', 'subject_idx', 'text', 'formality_score'])

num_samples = formality_results['news'][0].size

for k in formality_results.keys():
    df = pd.DataFrame({
        'subject':[[k] * num_samples][0],
        'subject_idx':formality_results[k][1],
        'formality_score':formality_results[k][0]
    })
    df['text'] = all_datasets[k].select(df['subject_idx'])['summary']
    formality_df = pd.concat([formality_df, df], ignore_index=True)

formality_df

  formality_df = pd.concat([formality_df, df], ignore_index=True)


Unnamed: 0,subject,subject_idx,text,formality_score
0,news,78410,"McDaniel's lawyer claimed more than 15,000 cas...",0.545489
1,news,160980,Scar tissue on Ben Pierce's eyes are causing h...,0.537626
2,news,142219,Lightweight 'Sagami Originals' are 0.01mm thic...,0.686552
3,news,280691,Stephane Cazenave told he is not allowed to op...,0.600823
4,news,104038,Norway spruce selected before Sandy struck to ...,0.586939
...,...,...,...,...
2995,dialogue,3306,Dorothy doesn't like turtle necks. Marie expla...,0.986586
2996,dialogue,8581,Mia and Ruby are going to meet at Olivia's pla...,0.996046
2997,dialogue,3818,"Lily has lost 10lb but her mom, instead of bei...",0.991831
2998,dialogue,2699,"Walter will be back in 2 weeks. Afterwards, he...",0.866593


In [196]:
fig = px.violin(formality_df, y="formality_score", x="subject", box=True,
          hover_data=formality_df.columns)
fig.show()