# LLM Evaluation with HONEST

## Setup

In [None]:
%%capture

! pip install honest
! pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers

In [None]:
from transformers import pipeline, set_seed, AutoTokenizer, AutoModelForMaskedLM, AutoModelForCausalLM

In [None]:
from honest import honest
import pandas as pd
import time

In [None]:
# Mount Google Drive at /content/drive; the fine-tuned checkpoints are read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load models

In [None]:
# Drive folder that holds the fine-tuned checkpoints. Keep the trailing slash:
# the checkpoint name is appended to this string directly.
model_path = '/content/drive/MyDrive/Projects/fine-tuning/models/'

In [None]:
# Checkpoint selection. Alternatives for each setting:
#   manipulation: 'R' or 'neutral-R'
#   model_name:   'gpt2' or 'FacebookAI/roberta-large'
manipulation = 'neutral'
model_name = 'microsoft/phi-1_5'
epochs = 3

# Checkpoint location: <model_path><model_name>-fine-tuned-<epochs>epoch-<manipulation>
name_model = f'{model_path}{model_name}-fine-tuned-{epochs}epoch-{manipulation}'

In [None]:
# Display the resolved checkpoint path so it can be checked before loading.
name_model

'/content/drive/MyDrive/Projects/fine-tuning/models/microsoft/phi-1_5-fine-tuned-3epoch-neutral'

## Initialize HONEST evaluator

In [None]:
# Settings for the HONEST evaluation.
lang = "en"  # language code passed to HonestEvaluator
modes = ['binary', 'queer_nonqueer']  # template sets, each scored in turn via evaluator.templates(data_set=...)

k = 20 # number of completions to evaluate per template (also used as the beam count for generation)

In [None]:
# Initialize the HONEST evaluator for the selected language.
# Templates are requested separately through evaluator.templates(data_set=...), where data_set can be
# "binary" as in (Nozza et al., 2021), "queer_nonqueer" (Nozza et al., 2022), or "all" for both combined.

evaluator = honest.HonestEvaluator(lang)
# Example: masked_templates = evaluator.templates(data_set="binary")

## Compute HONEST score

In [None]:
# Generate completions with the fine-tuned model and score them with HONEST.
#
# The checkpoint, tokenizer and pipeline do not depend on the template set, so they
# are created once here instead of being reloaded for every entry in `modes`.

# Masked LMs (BERT, RoBERTa, ...) fill the [M] slot directly; every other model is
# treated as a causal LM that continues the prompt.
is_masked_lm = 'bert' in name_model.lower()

tokenizer = AutoTokenizer.from_pretrained(name_model)
if is_masked_lm:
    model = AutoModelForMaskedLM.from_pretrained(name_model)
    # A masked LM cannot be driven by the text-generation pipeline; use fill-mask
    # and ask for the k most likely tokens.
    generator = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=k)
else:
    model = AutoModelForCausalLM.from_pretrained(name_model, trust_remote_code=True)
    generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# mode -> (honest_score, honest_df), so the results of every template set stay available.
honest_results = {}

for mode in modes:
    start_time = time.perf_counter()
    masked_templates = evaluator.templates(data_set=mode)
    set_seed(42)

    # Fill templates: one list of k completions per template, in template order.
    filled_templates = []
    for masked_sentence in masked_templates.keys():
        if is_masked_lm:
            fills = generator(masked_sentence.replace('[M]', tokenizer.mask_token))
            filled_templates.append([fill['token_str'].strip() for fill in fills])
        else:
            prompt = masked_sentence.replace(' [M].', '')
            # Removing only ' [M]' keeps the character that follows it in the template,
            # so this offset is one past the prompt when the template ends in ' [M].'
            # (presumably to skip the space separating prompt and completion).
            completion_start = len(masked_sentence.replace(' [M]', ''))
            fills = generator(
                prompt,
                max_length=len(tokenizer(prompt)['input_ids']) + 10,  # prompt tokens + 10
                num_return_sequences=k,
                pad_token_id=generator.tokenizer.eos_token_id,
                num_beams=k,
            )
            filled_templates.append([fill['generated_text'][completion_start:] for fill in fills])

    # Compute HONEST score
    honest_score, honest_df = evaluator.honest_dataframe(filled_templates, masked_templates)
    honest_results[mode] = (honest_score, honest_df)
    print(name_model, k, honest_score, mode)
    elapsed_seconds = time.perf_counter() - start_time
    print(f'time elapsed: {round(elapsed_seconds/60, 1)} minutes')

# `honest_score` / `honest_df` still refer to the last mode in `modes`, which is what
# the analysis cells below use; other modes can be read from `honest_results`.


  index_col=0, sep='\t').T.to_dict('dict')
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


configuration_phi.py:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/62.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Analyze completion dataframe

In [None]:
# Preview the completion dataframe returned by honest_dataframe for the last evaluated mode.
honest_df.head(5)

### Radar plot by category

In [None]:
import plotly.graph_objects as go

In [None]:
# Initialize selected lexicon categories
hurtlex_cat = ['animals', 'crime and immoral behavior', 'derogatory words', 'female genitalia', 'male genitalia', 'homosexuality', 'potential negative connotations', 'professions and occupations', 'prostitution', 'social and economic disadvantage']
hurtlex_cat_names = ['an', 're', 'cds', 'asf', 'asm', 'om', 'qas', 'pa', 'pr', 'is']

In [None]:
# Percentage of completions flagged in each lexicon category, per value of 'category'.
grouped_by_category = honest_df.groupby('category')[hurtlex_cat_names]
df_identity = grouped_by_category.sum()
df_count = grouped_by_category.count() * k  # non-null rows per group times k completions each
df_perc = df_identity / df_count * 100

# One filled radar trace per group; each row of df_perc supplies the radial values.
plots = [
    go.Scatterpolar(name=group_name, r=percentages, theta=hurtlex_cat, fill='toself')
    for group_name, percentages in df_perc.iterrows()
]

# Only the radial axis is configured; title and legend are left at their defaults.
radar_layout = go.Layout(polar={'radialaxis': {'visible': True}})
fig = go.Figure(data=plots, layout=radar_layout)

fig