In [None]:
pip install evaluate mauve-text bert_score

## Evaluate all Generations

In [None]:
# Import libraries 
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from evaluate import load
from mauve import compute_mauve

# Load the BLEU and BERTScore metrics via `evaluate.load`. Both objects are
# module-level globals that process_dataframe() calls `.compute(...)` on.
bleu = load("bleu")
bert = load("bertscore")

def evaluate_folder(folder_path):
    """Evaluate every CSV file located directly inside ``folder_path``.

    Each regular file whose name ends in ``.csv`` is read into a DataFrame,
    scored with ``process_dataframe`` and written to
    ``<folder_path>/evaluation/<name without .csv>_evaluated.csv``.
    The ``evaluation`` sub-folder is created when it does not exist yet.

    Args:
        folder_path: Directory that contains the CSV files to evaluate.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    csv_suffix = ".csv"

    # Create a new folder for evaluated dataframes
    output_folder = os.path.join(folder_path, "evaluation")
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(folder_path)):
        input_path = os.path.join(folder_path, filename)
        # Skip non-CSV entries, and directories whose name merely ends in ".csv"
        # (pd.read_csv would fail on those).
        if not filename.endswith(csv_suffix) or not os.path.isfile(input_path):
            continue

        print(f"Evaluating DataFrame in file: {filename}")
        df = pd.read_csv(input_path)

        # Perform evaluation
        df_evaluated = process_dataframe(df)

        # Only strip the trailing extension; str.replace would also rewrite any
        # ".csv" that appears earlier in the file name.
        output_filename = filename[:-len(csv_suffix)] + "_evaluated.csv"
        output_path = os.path.join(output_folder, output_filename)
        df_evaluated.to_csv(output_path, index=False)
        print(f"Evaluated dataframe saved to: {output_path}")

def process_dataframe(df, n_responses=5):
    """Score the generated responses of ``df`` against the human reference.

    Every row is expanded into ``n_responses`` (generated, human) pairs, one
    per ``generated_response_<i>`` column. BLEU and MAUVE are each computed
    once over all pairs, so the same value is stored in every row; BERTScore
    F1 is computed per pair and stored in ``bert_f1_r<i>``.

    The DataFrame is modified in place: the leading speaker tag is stripped
    from ``adult_feedback_response`` and the score columns ``bleu_score``,
    ``bleu_brevity_penalty``, ``bleu_length_ratio``, ``mauve_score`` and
    ``bert_f1_r1`` .. ``bert_f1_r<n_responses>`` are added.

    Args:
        df: DataFrame with an ``adult_feedback_response`` column and the
            columns ``generated_response_1`` .. ``generated_response_<n>``.
        n_responses: Number of generated responses per row (default 5).

    Returns:
        The same DataFrame object, with the score columns filled in. For an
        empty DataFrame the score columns are added but no metric is run.
    """
    # Remove speaker tags from reference texts.
    # NOTE: re.sub raises TypeError for non-string cells (e.g. NaN from an
    # empty CSV field), exactly as before.
    df['adult_feedback_response'] = df['adult_feedback_response'].apply(
        lambda x: re.sub(r'^[A-Za-z]:\s*', '', x))

    response_indices = range(1, n_responses + 1)
    bert_columns = [f'bert_f1_r{i}' for i in response_indices]
    corpus_columns = ['bleu_score', 'bleu_brevity_penalty',
                      'bleu_length_ratio', 'mauve_score']

    if df.empty:
        # Nothing to score: still add the score columns so the output schema
        # is the same as for non-empty inputs, but do not call the metrics.
        for column in corpus_columns + bert_columns:
            df[column] = np.nan
        print('Number of pairs: 0')
        return df

    # Construct the (generated, human) pairs in row-major order:
    # row 0 / response 1..n, row 1 / response 1..n, ...
    # Built in one pass instead of growing a DataFrame with pd.concat per pair.
    print('Constructing pairs dataframe...')
    response_columns = [f"generated_response_{i}" for i in response_indices]
    generated = df[response_columns].to_numpy(dtype=object).ravel().tolist()
    human = df['adult_feedback_response'].repeat(n_responses).tolist()
    print(f'Number of pairs: {len(generated)}')

    # compute bleu (a single score over all pairs)
    print('Computing BLEU score...')
    bleu_results = bleu.compute(predictions=generated, references=human)
    print("Done.")
    bleu_score = bleu_results['bleu']
    bleu_brevity_penalty = bleu_results['brevity_penalty']
    bleu_length_ratio = bleu_results['length_ratio']
    print(f"BLEU score: {bleu_score}")
    print(f"Brevity penalty: {bleu_brevity_penalty}")
    print(f"Length ratio: {bleu_length_ratio}")

    # compute mauve (a single score over all pairs)
    print('Computing MAUVE score...')
    mauve_results = compute_mauve(
            p_text=generated,
            q_text=human,
            device_id=0,  # use GPU 0 for featurization
            max_text_length=256,  # truncate each text to at most 256 tokens
            featurize_model_name='gpt2-large',
        )
    mauve_score = mauve_results.mauve
    print(f'MAUVE score: {mauve_score}')
    print('Done.')

    # compute bertscore (one F1 value per pair)
    print('Computing BERT score...')
    bert_results = bert.compute(
            predictions=generated,
            references=human,
            model_type="bert-base-uncased",
            use_fast_tokenizer=True,
            verbose=True)
    bert_f1s = bert_results['f1']
    print(f"Mean BERT F1 score: {np.mean(bert_f1s)}")
    print(f"Number of BERT F1 scores: {len(bert_f1s)}")
    print("Done.")

    # add scores to samples dataframe
    print('Adding scores to samples dataframe...')
    df['bleu_score'] = bleu_score
    df['bleu_brevity_penalty'] = bleu_brevity_penalty
    df['bleu_length_ratio'] = bleu_length_ratio
    df['mauve_score'] = mauve_score
    # Pairs were built row-major, so reshaping gives one row per sample and
    # one column per response. Assignment is positional, which also works
    # when the DataFrame index is not unique.
    f1_matrix = np.asarray(bert_f1s, dtype=float).reshape(len(df), n_responses)
    for position, column in enumerate(bert_columns):
        df[column] = f1_matrix[:, position]
    print('Done.')
    return df

# Folder containing the generated-response CSV files to evaluate. The
# evaluated copies are written to an "evaluation" sub-folder inside it.
# NOTE(review): the path looks like a Google Drive mount in Colab — confirm
# the drive is mounted before running this cell.
folder_path = '/content/drive/MyDrive/generated_responses'
evaluate_folder(folder_path)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Evaluating DataFrame in file: opt_rephrase_generated_responses.csv
Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 247.28it/s]


Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.04918416937721405
Brevity penalty: 1.0
Length ratio: 1.2221574344023323
Computing MAUVE score...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.004380911536444501
Done.
Computing BERT score...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 3814.68 seconds, 0.09 sentences/sec
Mean BERT F1 score: 0.48308553756147193
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 5879.20it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/opt_rephrase_generated_responses_evaluated.csv
Evaluating DataFrame in file: gemma_base_rephrase_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 271.95it/s]


Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.0
Brevity penalty: 1.0
Length ratio: 1.145189504373178
Computing MAUVE score...


Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.006754901839255892
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 3841.48 seconds, 0.09 sentences/sec
Mean BERT F1 score: 0.38038368985272836
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 5126.37it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/gemma_base_rephrase_generated_responses_evaluated.csv
Evaluating DataFrame in file: gemma_base_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 80/80 [00:00<00:00, 287.34it/s]


Number of pairs: 400
Computing BLEU score...
Done.
BLEU score: 0.02296782082969922
Brevity penalty: 1.0
Length ratio: 1.0612641815235009
Computing MAUVE score...


Featurizing p:   0%|          | 0/400 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/400 [00:00<?, ?it/s]

MAUVE score: 0.014045340200052273
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 3816.70 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.38389027278870347
Number of BERT F1 scores: 400
Done.
Adding scores to samples dataframe...


100%|██████████| 400/400 [00:00<00:00, 5967.63it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/gemma_base_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: opt_base_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 80/80 [00:00<00:00, 271.81it/s]


Number of pairs: 400
Computing BLEU score...
Done.
BLEU score: 0.010093496098552765
Brevity penalty: 1.0
Length ratio: 1.0777957860615883
Computing MAUVE score...


Featurizing p:   0%|          | 0/400 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/400 [00:00<?, ?it/s]

MAUVE score: 0.01037047395036719
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 3847.81 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.38805303763598203
Number of BERT F1 scores: 400
Done.
Adding scores to samples dataframe...


100%|██████████| 400/400 [00:00<00:00, 3364.99it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/opt_base_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: opt_base_rephrase_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 265.41it/s]


Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.02283839266424496
Brevity penalty: 1.0
Length ratio: 1.154518950437318
Computing MAUVE score...


Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.006505277444103577
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 3929.11 seconds, 0.09 sentences/sec
Mean BERT F1 score: 0.39077076173346975
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 2953.67it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/opt_base_rephrase_generated_responses_evaluated.csv
Evaluating DataFrame in file: opt_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 80/80 [00:00<00:00, 274.13it/s]


Number of pairs: 400
Computing BLEU score...
Done.
BLEU score: 0.03572425838853758
Brevity penalty: 1.0
Length ratio: 1.1588330632090762
Computing MAUVE score...


Featurizing p:   0%|          | 0/400 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/400 [00:00<?, ?it/s]

MAUVE score: 0.007580454428293312
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 3904.25 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.4543501348048449
Number of BERT F1 scores: 400
Done.
Adding scores to samples dataframe...


100%|██████████| 400/400 [00:00<00:00, 5572.33it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/opt_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: dialogpt_rephrase_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 283.34it/s]


Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.010164263720028589
Brevity penalty: 1.0
Length ratio: 1.2285714285714286
Computing MAUVE score...


Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.004072096261961255
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 3985.54 seconds, 0.09 sentences/sec
Mean BERT F1 score: 0.370957571008931
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 5363.76it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/dialogpt_rephrase_generated_responses_evaluated.csv
Evaluating DataFrame in file: dialogpt_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 80/80 [00:00<00:00, 258.57it/s]

Number of pairs: 400
Computing BLEU score...
Done.
BLEU score: 0.014559376544091288
Brevity penalty: 1.0
Length ratio: 1.1354943273905997
Computing MAUVE score...





Featurizing p:   0%|          | 0/400 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/400 [00:00<?, ?it/s]

MAUVE score: 0.007010278993379658
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 3960.54 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.37022189270704986
Number of BERT F1 scores: 400
Done.
Adding scores to samples dataframe...


100%|██████████| 400/400 [00:00<00:00, 5880.86it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/dialogpt_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: gemma_rephrase_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 254.59it/s]


Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.020110935283748667
Brevity penalty: 1.0
Length ratio: 1.0658892128279884
Computing MAUVE score...


Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.007928071967336393
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 4042.93 seconds, 0.09 sentences/sec
Mean BERT F1 score: 0.45725899511489315
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 5634.78it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/gemma_rephrase_generated_responses_evaluated.csv
Evaluating DataFrame in file: gemma_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 80/80 [00:00<00:00, 242.79it/s]


Number of pairs: 400
Computing BLEU score...
Done.
BLEU score: 0.027712690500083733
Brevity penalty: 1.0
Length ratio: 1.0119935170178282
Computing MAUVE score...


Featurizing p:   0%|          | 0/400 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/400 [00:00<?, ?it/s]

MAUVE score: 0.012922026385549389
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4019.57 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.4204777904599905
Number of BERT F1 scores: 400
Done.
Adding scores to samples dataframe...


100%|██████████| 400/400 [00:00<00:00, 5496.67it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/gemma_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: dialogpt_base_feedback_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 78/78 [00:00<00:00, 278.86it/s]


Number of pairs: 390
Computing BLEU score...
Done.
BLEU score: 0.010986710779797484
Brevity penalty: 0.2166199530955648
Length ratio: 0.39531772575250834
Computing MAUVE score...


Featurizing p:   0%|          | 0/390 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/390 [00:00<?, ?it/s]

MAUVE score: 0.11629374668073485
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/6 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/7 [00:00<?, ?it/s]

done in 4058.29 seconds, 0.10 sentences/sec
Mean BERT F1 score: 0.392064344844757
Number of BERT F1 scores: 390
Done.
Adding scores to samples dataframe...


100%|██████████| 390/390 [00:00<00:00, 5651.84it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/dialogpt_base_feedback_generated_responses_evaluated.csv
Evaluating DataFrame in file: dialogpt_base_rephrase_generated_responses.csv





Constructing pairs dataframe...


100%|██████████| 69/69 [00:00<00:00, 287.88it/s]

Number of pairs: 345
Computing BLEU score...
Done.
BLEU score: 0.03576319377612287
Brevity penalty: 0.7406497444165119
Length ratio: 0.7690962099125365
Computing MAUVE score...





Featurizing p:   0%|          | 0/345 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/345 [00:00<?, ?it/s]

MAUVE score: 0.020725922363702517
Done.
Computing BERT score...
calculating scores...
computing bert embedding.


  0%|          | 0/6 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/6 [00:00<?, ?it/s]

done in 4129.10 seconds, 0.08 sentences/sec
Mean BERT F1 score: 0.4431599823029145
Number of BERT F1 scores: 345
Done.
Adding scores to samples dataframe...


100%|██████████| 345/345 [00:00<00:00, 5799.09it/s]

Done.
Evaluated dataframe saved to: /content/drive/MyDrive/generated_responses/evaluation/dialogpt_base_rephrase_generated_responses_evaluated.csv



