# Load the data

In [7]:
models = ["llama-ft","bart-ft","llama-instruct-few-shot"]

setting_ds_dict = {
    "base": ["c2s","c2sp","c4s","c4sp","cs","c2spo","c4spo"],
    "masked": ["c2s","c2sp","c4s","c4sp"],
    "subject":["c2s","c2sp","c4s","c4sp"],
    "target-phrase":["c2s","c2sp","c4s","c4sp"],
    "target-sent":["c2s","c2sp","c4s","c4sp"],
    "target-sent-target":["c2s","c2sp","c4s","c4sp","cs"],
    "target-sent-subject":["c2s","c2sp","c4s","c4sp","cs"],
}

# for complex source texts
setting_ds_dict = {
    "base": ["cso","c2spo","c4spo"],
    "subject":["c2spo","c4spo"],
    "target-phrase":["c2spo","c4spo"],
    "target-sent":["c2spo","c4spo"],
    "target-sent-target":["cso","c2spo","c4spo"],
    "target-sent-subject":["cso","c2spo","c4spo"],
}

# Load results df

In [2]:
import pandas as pd
bart_ft_res = pd.read_csv("../data/results/bart-ft-results.csv")
llama_ft_res = pd.read_csv("../data/results/llama-ft-results.csv")
llama_instr_res = pd.read_csv("../data/results/llama-instruct-few-shot-results.csv")

# Initialize columns in results dfs

## BERTScore

In [129]:
df_res = pd.read_csv(f"../data/results/{model}-results.csv")
settings = list(dict.fromkeys(["-".join(col.split("-")[:-1]) for col in df_res.columns if "-" in col]))
cols_to_add = [f"{col_name}-bs-prec" for col_name in settings] + \
              [f"{col_name}-bs-rec" for col_name in settings] + \
              [f"{col_name}-bs-f1" for col_name in settings]
for col in cols_to_add:
    df_res[col] = None 

df_res

Unnamed: 0,dataset,base-b1,base-b2,masked-b1,masked-b2,subject-b1,subject-b2,target-phrase-b1,target-phrase-b2,target-sent-b1,...,target-sent-bs-rec,target-sent-target-bs-rec,target-sent-subject-bs-rec,base-bs-f1,masked-bs-f1,subject-bs-f1,target-phrase-bs-f1,target-sent-bs-f1,target-sent-target-bs-f1,target-sent-subject-bs-f1
0,c2s,15.899,3.588,15.258,3.309,22.277,8.939,18.347,5.923,17.417,...,,,,,,,,,,
1,c2sp,15.266,3.515,15.757,3.567,23.58,9.417,18.686,5.726,18.003,...,,,,,,,,,,
2,c4s,15.712,2.912,14.908,4.157,22.771,8.681,17.56,5.348,18.005,...,,,,,,,,,,
3,c4sp,15.233,1.652,16.923,4.93,23.323,9.918,18.652,5.758,16.861,...,,,,,,,,,,
4,cs,19.65,6.692,,,,,,,,...,,,,,,,,,,
5,c2spo,13.904,2.192,,,,,,,,...,,,,,,,,,,
6,c4spo,13.147,1.519,,,,,,,,...,,,,,,,,,,


## BARTScore

In [42]:
model = "llama-ft"#"llama-instruct-few-shot"
df_res = pd.read_csv(f"data/results/{model}-results.csv")

In [34]:
# rename certain columns
df_res.columns = [
    col.replace('-bs-rec', '-bsrec').replace('-bs-prec', '-bsprec').replace('-bs-f1', '-bsf1')
    if col.endswith(('-bs-rec', '-bs-prec', '-bs-f1')) else col
    for col in df_res.columns
]

print(df_res.columns)

Index(['dataset', 'base-b1', 'base-b2', 'masked-b1', 'masked-b2', 'subject-b1',
       'subject-b2', 'target-phrase-b1', 'target-phrase-b2', 'target-sent-b1',
       'target-sent-b2', 'target-sent-target-b1', 'target-sent-target-b2',
       'target-sent-subject-b1', 'target-sent-subject-b2', 'base-bsprec',
       'masked-bsprec', 'subject-bsprec', 'target-phrase-bsprec',
       'target-sent-bsprec', 'target-sent-target-bsprec',
       'target-sent-subject-bsprec', 'base-bsrec', 'masked-bsrec',
       'subject-bsrec', 'target-phrase-bsrec', 'target-sent-bsrec',
       'target-sent-target-bsrec', 'target-sent-subject-bsrec', 'base-bsf1',
       'masked-bsf1', 'subject-bsf1', 'target-phrase-bsf1', 'target-sent-bsf1',
       'target-sent-target-bsf1', 'target-sent-subject-bsf1'],
      dtype='object')


In [35]:
settings = list(dict.fromkeys(["-".join(col.split("-")[:-1]) for col in df_res.columns if "-" in col]))
print(settings, end="\n\n")
cols_to_add = [f"{col_name}-bartscore" for col_name in settings]
for col in cols_to_add:
    df_res[col] = None
print(df_res.columns)

['base', 'masked', 'subject', 'target-phrase', 'target-sent', 'target-sent-target', 'target-sent-subject']

Index(['dataset', 'base-b1', 'base-b2', 'masked-b1', 'masked-b2', 'subject-b1',
       'subject-b2', 'target-phrase-b1', 'target-phrase-b2', 'target-sent-b1',
       'target-sent-b2', 'target-sent-target-b1', 'target-sent-target-b2',
       'target-sent-subject-b1', 'target-sent-subject-b2', 'base-bsprec',
       'masked-bsprec', 'subject-bsprec', 'target-phrase-bsprec',
       'target-sent-bsprec', 'target-sent-target-bsprec',
       'target-sent-subject-bsprec', 'base-bsrec', 'masked-bsrec',
       'subject-bsrec', 'target-phrase-bsrec', 'target-sent-bsrec',
       'target-sent-target-bsrec', 'target-sent-subject-bsrec', 'base-bsf1',
       'masked-bsf1', 'subject-bsf1', 'target-phrase-bsf1', 'target-sent-bsf1',
       'target-sent-target-bsf1', 'target-sent-subject-bsf1', 'base-bartscore',
       'masked-bartscore', 'subject-bartscore', 'target-phrase-bartscore',
       'targe

## New dataset row

In [8]:
import pandas as pd

for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    df_res.loc[len(df_res), 'dataset'] = 'cso'
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

## Map values for comparison

In [9]:
for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    dss = ["cso","cs"]
    settings_map = {"target-sent-subject":"subject","target-sent-target":"target-phrase", "base":"target-sent"}
    for ds in dss:
        idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
        for current_setting_key, target_setting_key in settings_map.items():
            df_res.at[idx, f"{target_setting_key}-b1"] = df_res.loc[idx, f"{current_setting_key}-b1"]
            df_res.at[idx, f"{target_setting_key}-b2"] = df_res.loc[idx, f"{current_setting_key}-b2"]
            df_res.at[idx, f"{target_setting_key}-bsprec"] = df_res.loc[idx, f"{current_setting_key}-bsprec"]
            df_res.at[idx, f"{target_setting_key}-bsrec"] = df_res.loc[idx, f"{current_setting_key}-bsrec"]
            df_res.at[idx, f"{target_setting_key}-bsf1"] = df_res.loc[idx, f"{current_setting_key}-bsf1"]
            df_res.at[idx, f"{target_setting_key}-bartscore"] = df_res.loc[idx, f"{current_setting_key}-bartscore"]
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

## Rename datafiles

In [47]:
# rename files
import os

directory = "data/gen_predictions"

old_part = "predictions_bart-"
new_part = "predictions_bart-ft-"

for filename in os.listdir(directory):

    if old_part in filename:
        # new filename by replacing the old part
        new_filename = filename.replace(old_part, new_part)
        
        # full paths for renaming
        old_file_path = os.path.join(directory, filename)
        new_file_path = os.path.join(directory, new_filename)
        
        # rename the file
        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {filename} -> {new_filename}")

Renamed: predictions_bart-c4s-target-sent-subject.csv -> predictions_bart-ft-c4s-target-sent-subject.csv
Renamed: predictions_bart-c2spo-base.csv -> predictions_bart-ft-c2spo-base.csv
Renamed: predictions_bart-c2sp-subject.csv -> predictions_bart-ft-c2sp-subject.csv
Renamed: predictions_bart-c4sp-target-sent-subject.csv -> predictions_bart-ft-c4sp-target-sent-subject.csv
Renamed: predictions_bart-c4sp-subject.csv -> predictions_bart-ft-c4sp-subject.csv
Renamed: predictions_bart-test.csv -> predictions_bart-ft-test.csv
Renamed: predictions_bart-c2sp-target-sent-target.csv -> predictions_bart-ft-c2sp-target-sent-target.csv
Renamed: predictions_bart-c2sp-target-sent-subject.csv -> predictions_bart-ft-c2sp-target-sent-subject.csv
Renamed: predictions_bart-c2s-target-phrase.csv -> predictions_bart-ft-c2s-target-phrase.csv
Renamed: predictions_bart-c2sp-base.csv -> predictions_bart-ft-c2sp-base.csv
Renamed: predictions_bart-c2sp-masked.csv -> predictions_bart-ft-c2sp-masked.csv
Renamed: pred

# SARI - Hugging Face

In [35]:
from evaluate import load
from tqdm.notebook import tqdm
import pandas as pd

sari_metric = load("sari")
sari_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['text'] 
    s_content = dataset['test'][index] 
    prediction = row['prediction']  
    
    sari_score = sari_metric.compute(
        sources=[r_content],
        predictions=[prediction],
        references=[s_content['simplifications']]
    )
    
    sari_scores.append(sari_score['sari'])

  0%|          | 0/359 [00:00<?, ?it/s]

# SARI - EASSE package

In [13]:
from easse.sari import corpus_sari
from tqdm.notebook import tqdm


sari_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text']  
    prediction = row['prediction']  
    
    sari_score_easse = corpus_sari(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']]
    )
    
    sari_scores_easse.append(sari_score_easse)

  0%|          | 0/116 [00:00<?, ?it/s]

KeyError: 'label_text'

In [106]:
import numpy as np
print("Average SARI score:", np.mean(sari_scores_easse))

Average SARI score: 36.46781896155372


# Operation scores (add, keep, delete)

In [4]:
from easse.sari import get_corpus_sari_operation_scores
from tqdm.notebook import tqdm

add_scores = []
keep_scores = []
del_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text'] #dataset['test'][index]  
    prediction = row['prediction']  
    
    add_score, keep_score, del_score = get_corpus_sari_operation_scores(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']] 
    )
    
    add_scores.append(add_score)
    keep_scores.append(keep_score)
    del_scores.append(del_score)

  0%|          | 0/116 [00:00<?, ?it/s]

# BLEU-4 (EASSE package)

In [25]:
from tqdm.notebook import tqdm
from easse.bleu import corpus_bleu
import numpy as np

bleu_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    s_content = row['elaboration_sentence'] 
    prediction = row['pred_elaboration'] # "prediction" for BART
    
    bleu_score_easse = corpus_bleu(
        sys_sents=[prediction],
        refs_sents=[[s_content]]
    )
    
    bleu_scores_easse.append(bleu_score_easse)

print(f"Average BLEU score: {np.mean(bleu_scores_easse):.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Average BLEU score: 4.965


# BLEU-1 & BLEU-2 (nltk + tokenizer-13A)

## Corpus bleu

In [22]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from tqdm import tqdm

# 13a tokenizer
tokenizer = Tokenizer13a()
smoothing_function = SmoothingFunction().method1

for model in models: 
    df_res = pd.read_csv(f"data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            
            all_refs = []
            all_preds = []
            output_name = f"{ds}-{setting_key}"
            df_gen = pd.read_csv(f"data/gen_predictions/predictions_{model}-{output_name}.csv")
            
            # Tokenize and collect references and predictions
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                ref = row['elaboration_sentence']
                prediction = row['pred_elaboration'] # "prediction" for BART
            
                # Tokenize
                tokenized_ref = tokenizer(ref).split()
                tokenized_pred = tokenizer(prediction).split()
                
                all_refs.append([tokenized_ref]) 
                all_preds.append(tokenized_pred)
            
            bleu1_score = corpus_bleu(all_refs, all_preds, weights=(1.0, 0, 0, 0), smoothing_function=smoothing_function)  # 1-gram
            bleu2_score = corpus_bleu(all_refs, all_preds, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)  # 2-gram
            bleu4_score = corpus_bleu(all_refs, all_preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)  # 4-gram
            
            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            df_res.at[idx, f"{setting_key}-b1"] = round(bleu1_score*100,3)
            df_res.at[idx, f"{setting_key}-b2"] = round(bleu2_score*100,3)
            print(f"{model}-{ds}-{setting_key}: {round(bleu1_score*100,3)}")
            print(f"{model}-{ds}-{setting_key}: {round(bleu2_score*100,3)}")
    
    df_res.to_csv(f"data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")
#print(f"Corpus BLEU-1: {bleu1_score*100:.3f}")
#print(f"Corpus BLEU-2: {bleu2_score*100:.3f}")
#print(f"Corpus BLEU-4: {bleu4_score*100:.3f}")

100%|██████████████████████████████████████| 116/116 [00:00<00:00, 16480.57it/s]


llama-ft-cso-base: 18.528
llama-ft-cso-base: 6.32


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25253.78it/s]


llama-ft-c2spo-base: 16.764
llama-ft-c2spo-base: 5.063


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25875.62it/s]


llama-ft-c4spo-base: 15.761
llama-ft-c4spo-base: 4.453


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 24165.06it/s]


llama-ft-c2spo-subject: 26.962
llama-ft-c2spo-subject: 14.104


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28811.47it/s]


llama-ft-c4spo-subject: 29.417
llama-ft-c4spo-subject: 15.342


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 26865.78it/s]


llama-ft-c2spo-target-phrase: 21.212
llama-ft-c2spo-target-phrase: 8.055


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 29501.53it/s]


llama-ft-c4spo-target-phrase: 20.194
llama-ft-c4spo-target-phrase: 8.831


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28931.39it/s]


llama-ft-c2spo-target-sent: 18.602
llama-ft-c2spo-target-sent: 6.468


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 29357.34it/s]


llama-ft-c4spo-target-sent: 18.674
llama-ft-c4spo-target-sent: 6.216


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 27401.40it/s]


llama-ft-cso-target-sent-target: 20.885
llama-ft-cso-target-sent-target: 8.334


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 33342.88it/s]


llama-ft-c2spo-target-sent-target: 20.317
llama-ft-c2spo-target-sent-target: 7.758


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 32785.66it/s]


llama-ft-c4spo-target-sent-target: 20.799
llama-ft-c4spo-target-sent-target: 8.749


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 27261.68it/s]


llama-ft-cso-target-sent-subject: 28.698
llama-ft-cso-target-sent-subject: 14.577


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28326.69it/s]


llama-ft-c2spo-target-sent-subject: 27.214
llama-ft-c2spo-target-sent-subject: 13.166


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 31035.23it/s]


llama-ft-c4spo-target-sent-subject: 26.375
llama-ft-c4spo-target-sent-subject: 13.601
Results saved for llama-ft


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25442.62it/s]


bart-ft-cso-base: 17.727
bart-ft-cso-base: 5.358


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25054.81it/s]


bart-ft-c2spo-base: 16.943
bart-ft-c2spo-base: 3.266


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25890.77it/s]


bart-ft-c4spo-base: 16.532
bart-ft-c4spo-base: 2.987


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 26668.45it/s]


bart-ft-c2spo-subject: 28.003
bart-ft-c2spo-subject: 17.341


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28510.94it/s]


bart-ft-c4spo-subject: 29.767
bart-ft-c4spo-subject: 18.499


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 27209.85it/s]


bart-ft-c2spo-target-phrase: 19.295
bart-ft-c2spo-target-phrase: 6.959


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28814.88it/s]


bart-ft-c4spo-target-phrase: 19.756
bart-ft-c4spo-target-phrase: 7.179


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28318.45it/s]


bart-ft-c2spo-target-sent: 17.935
bart-ft-c2spo-target-sent: 3.81


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28288.81it/s]


bart-ft-c4spo-target-sent: 18.626
bart-ft-c4spo-target-sent: 6.064


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 26456.73it/s]


bart-ft-cso-target-sent-target: 19.287
bart-ft-cso-target-sent-target: 5.283


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 31683.98it/s]


bart-ft-c2spo-target-sent-target: 20.138
bart-ft-c2spo-target-sent-target: 7.722


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 30909.04it/s]


bart-ft-c4spo-target-sent-target: 19.709
bart-ft-c4spo-target-sent-target: 7.53


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 28554.45it/s]


bart-ft-cso-target-sent-subject: 30.57
bart-ft-cso-target-sent-subject: 18.909


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 30246.13it/s]


bart-ft-c2spo-target-sent-subject: 29.786
bart-ft-c2spo-target-sent-subject: 18.353


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 33176.90it/s]


bart-ft-c4spo-target-sent-subject: 32.098
bart-ft-c4spo-target-sent-subject: 20.242
Results saved for bart-ft


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23917.97it/s]


llama-instruct-few-shot-cso-base: 16.894
llama-instruct-few-shot-cso-base: 3.454


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22497.89it/s]


llama-instruct-few-shot-c2spo-base: 13.904
llama-instruct-few-shot-c2spo-base: 2.192


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23508.86it/s]


llama-instruct-few-shot-c4spo-base: 13.147
llama-instruct-few-shot-c4spo-base: 1.519


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22688.83it/s]


llama-instruct-few-shot-c2spo-subject: 19.164
llama-instruct-few-shot-c2spo-subject: 6.05


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25007.16it/s]


llama-instruct-few-shot-c4spo-subject: 21.659
llama-instruct-few-shot-c4spo-subject: 8.436


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22141.59it/s]


llama-instruct-few-shot-c2spo-target-phrase: 17.8
llama-instruct-few-shot-c2spo-target-phrase: 6.161


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23878.06it/s]


llama-instruct-few-shot-c4spo-target-phrase: 16.099
llama-instruct-few-shot-c4spo-target-phrase: 4.862


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22763.14it/s]


llama-instruct-few-shot-c2spo-target-sent: 14.802
llama-instruct-few-shot-c2spo-target-sent: 2.115


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23888.61it/s]


llama-instruct-few-shot-c4spo-target-sent: 13.832
llama-instruct-few-shot-c4spo-target-sent: 2.062


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22902.43it/s]


llama-instruct-few-shot-cso-target-sent-target: 18.521
llama-instruct-few-shot-cso-target-sent-target: 6.846


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23101.43it/s]


llama-instruct-few-shot-c2spo-target-sent-target: 14.342
llama-instruct-few-shot-c2spo-target-sent-target: 2.681


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 24451.67it/s]


llama-instruct-few-shot-c4spo-target-sent-target: 14.36
llama-instruct-few-shot-c4spo-target-sent-target: 2.841


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 22640.26it/s]


llama-instruct-few-shot-cso-target-sent-subject: 22.549
llama-instruct-few-shot-cso-target-sent-subject: 9.723


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 23145.39it/s]


llama-instruct-few-shot-c2spo-target-sent-subject: 18.823
llama-instruct-few-shot-c2spo-target-sent-subject: 7.435


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 25084.52it/s]


llama-instruct-few-shot-c4spo-target-sent-subject: 20.699
llama-instruct-few-shot-c4spo-target-sent-subject: 7.653
Results saved for llama-instruct-few-shot


## Sentence bleu

In [45]:
from nltk.translate.bleu_score import sentence_bleu
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from transformers import BartTokenizer

bleu_scores_1 = []
bleu_scores_2 = []
bleu_scores_4 = []

# 13a tokenizer
tokenizer = Tokenizer13a()
# bart tokenizer
#tokenizer_b = BartTokenizer.from_pretrained('facebook/bart-base',use_fast=False) 

smoothing_function = SmoothingFunction().method1

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    ref = row['elaboration_sentence']
    prediction = row['pred_elaboration']

    # tokenize
    tokenized_ref = tokenizer(ref).split()
    tokenized_pred = tokenizer(prediction).split()
    #tokenized_ref = tokenizer_b(ref)["input_ids"]
    #tokenized_pred = tokenizer_b(prediction)["input_ids"]
        
    bleu_score_1 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(1, 0, 0, 0),smoothing_function=smoothing_function) # 1-gram
    bleu_score_2 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function) # 2-gram
    bleu_score_4 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function) # 4-gram
    bleu_scores_1.append(bleu_score_1)
    bleu_scores_2.append(bleu_score_2)
    bleu_scores_4.append(bleu_score_4)

import numpy as np
print(f"Average BLEU-1 score: {np.mean(bleu_scores_1)*100:.3f}")
print(f"Average BLEU-2 score: {np.mean(bleu_scores_2)*100:.3f}")
print(f"Average BLEU-4 score: {np.mean(bleu_scores_4)*100:.3f}")

100%|███████████████████████████████████████| 116/116 [00:00<00:00, 4293.61it/s]

Average BLEU-1 score: 16.922
Average BLEU-2 score: 6.008
Average BLEU-4 score: 3.106





In [None]:
df_results = pd.DataFrame({
    'elaboration_sentence': df_gen['elaboration_sentence'],
    'pred_elaboration': df_gen['pred_elaboration'],
    'bleu-1': bleu_scores_1,
    'bleu-2': bleu_scores_2,
})

# BERTScore

In [16]:
from tqdm.notebook import tqdm
from bert_score import BERTScorer
import numpy as np
from transformers import logging

# suppress warnings
#logging.set_verbosity_error()

scorer = BERTScorer(model_type='bert-base-uncased',device='cuda:0')

for model in models: 
    df_res = pd.read_csv(f"data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            bert_scores_precision = []
            bert_scores_recall = []
            bert_scores_f1 = []
            output_name = f"{ds}-{setting_key}"
            df_gen = pd.read_csv(f"data/gen_predictions/predictions_{model}-{output_name}.csv")

            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                elaboration = row['elaboration_sentence']
                prediction = row['pred_elaboration']
                
                #  BERTScore for this pair
                P, R, F1 = scorer.score(
                    cands=[prediction],  
                    refs=[elaboration],              
                )
                
                bert_scores_precision.append(P.mean().item())
                bert_scores_recall.append(R.mean().item())
                bert_scores_f1.append(F1.mean().item())

            # average scores
            avg_precision = np.mean(bert_scores_precision)
            avg_recall = np.mean(bert_scores_recall)
            avg_f1 = np.mean(bert_scores_f1)

            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            df_res.at[idx, f"{setting_key}-bsprec"] = round(avg_precision,3)
            df_res.at[idx, f"{setting_key}-bsrec"] = round(avg_recall,3)
            df_res.at[idx, f"{setting_key}-bsf1"] = round(avg_f1,3)
            print(f"{model}-{ds}-{setting_key}: {round(avg_f1,3)}")

    df_res.to_csv(f"data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")
#print(f"Average BERTScore Precision: {avg_precision:.3f}")
#print(f"Average BERTScore Recall: {avg_recall:.3f}")
#print(f"Average BERTScore F1: {avg_f1:.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-base: 0.453


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-subject: 0.542


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-subject: 0.556


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-phrase: 0.493


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-phrase: 0.5


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent: 0.459


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent: 0.463


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-target-sent-target: 0.494


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent-target: 0.495


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent-target: 0.501


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-target-sent-subject: 0.545


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent-subject: 0.539


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent-subject: 0.553
Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-base: 0.454


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-subject: 0.559


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-subject: 0.561


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-phrase: 0.487


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-phrase: 0.482


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent: 0.445


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent: 0.458


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-target-sent-target: 0.482


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent-target: 0.492


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent-target: 0.488


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-target-sent-subject: 0.563


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent-subject: 0.571


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent-subject: 0.58
Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-base: 0.461


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-subject: 0.491


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-subject: 0.503


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-phrase: 0.486


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-phrase: 0.481


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent: 0.458


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent: 0.453


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-target-sent-target: 0.484


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent-target: 0.463


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent-target: 0.461


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-target-sent-subject: 0.515


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent-subject: 0.492


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent-subject: 0.506
Results saved for llama-instruct-few-shot


# BARTScore

https://github.com/neulab/BARTScore

In [14]:
import sys
import os
parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

from utils.bart_score import BARTScorer
from tqdm.notebook import tqdm
import numpy as np

bart_scorer = BARTScorer(device='cuda:0')

for model in models: 
    df_res = pd.read_csv(f"data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            bart_scores = []
            output_name = f"{ds}-{setting_key}"
            df_gen = pd.read_csv(f"data/gen_predictions/predictions_{model}-{output_name}.csv")
        
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                reference = row['elaboration_sentence']  # reference text (r)
                hypothesis = row['pred_elaboration']    # generated text (h)
                
                # precision (r → h)
                precision_score = bart_scorer.score(
                    srcs=[reference],  # r as source
                    tgts=[hypothesis], # h as target
                    batch_size=1
                )[0]
                
                # recall (h → r)
                recall_score = bart_scorer.score(
                    srcs=[hypothesis],  # h as source
                    tgts=[reference],   # r as target
                    batch_size=1
                )[0]
                
                # f1 score as the average of precision and recall
                f1_score = (precision_score + recall_score) / 2
                bart_scores.append(f1_score)
        
            # average score
            avg_score = np.mean(bart_scores)
            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            df_res.at[idx, f"{setting_key}-bartscore"] = round(avg_score,3)
            print(f"{model}-{ds}-{setting_key}: {round(avg_score,3)}")
    
    df_res.to_csv(f"data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")

  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-base: -3.542


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-subject: -3.033


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-subject: -3.009


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-phrase: -3.36


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-phrase: -3.346


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent: -3.504


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent: -3.49


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-target-sent-target: -3.372


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent-target: -3.367


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent-target: -3.362


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-cso-target-sent-subject: -3.027


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c2spo-target-sent-subject: -3.081


  0%|          | 0/116 [00:00<?, ?it/s]

llama-ft-c4spo-target-sent-subject: -3.032
Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-base: -3.498


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-subject: -3.002


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-subject: -2.977


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-phrase: -3.421


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-phrase: -3.409


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent: -3.59


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent: -3.519


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-target-sent-target: -3.416


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent-target: -3.383


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent-target: -3.408


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-cso-target-sent-subject: -2.967


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c2spo-target-sent-subject: -2.995


  0%|          | 0/116 [00:00<?, ?it/s]

bart-ft-c4spo-target-sent-subject: -2.876
Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-base: -3.692


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-subject: -3.684


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-subject: -3.618


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-phrase: -3.622


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-phrase: -3.657


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent: -3.772


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent: -3.805


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-target-sent-target: -3.539


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent-target: -3.753


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent-target: -3.707


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-cso-target-sent-subject: -3.348


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c2spo-target-sent-subject: -3.533


  0%|          | 0/116 [00:00<?, ?it/s]

llama-instruct-few-shot-c4spo-target-sent-subject: -3.466
Results saved for llama-instruct-few-shot


# Show results

In [20]:
df_res

Unnamed: 0,dataset,base-b1,base-b2,masked-b1,masked-b2,subject-b1,subject-b2,target-phrase-b1,target-phrase-b2,target-sent-b1,...,target-sent-bsf1,target-sent-target-bsf1,target-sent-subject-bsf1,base-bartscore,masked-bartscore,subject-bartscore,target-phrase-bartscore,target-sent-bartscore,target-sent-target-bartscore,target-sent-subject-bartscore
0,c2s,15.899,3.588,15.258,3.309,22.277,8.939,18.347,5.923,17.417,...,0.485,0.49,0.521,-3.778,-3.809,-3.487,-3.552,-3.549,-3.534,-3.293
1,c2sp,15.266,3.515,15.757,3.567,23.58,9.417,18.686,5.726,18.003,...,0.484,0.498,0.515,-3.849,-3.82,-3.475,-3.485,-3.546,-3.504,-3.364
2,c4s,15.712,2.912,14.908,4.157,22.771,8.681,17.56,5.348,18.005,...,0.484,0.487,0.518,-3.884,-3.788,-3.493,-3.577,-3.599,-3.551,-3.362
3,c4sp,15.233,1.652,16.923,4.93,23.323,9.918,18.652,5.758,16.861,...,0.481,0.499,0.517,-3.941,-3.943,-3.44,-3.516,-3.554,-3.464,-3.356
4,cs,19.65,6.692,,,,,,,,...,,0.481,0.518,-3.427,,,,,-3.481,-3.298
5,c2spo,13.904,2.192,,,19.164,6.05,17.8,6.161,14.802,...,0.458,0.463,0.492,-4.088,,-3.684,-3.622,-3.772,-3.753,-3.533
6,c4spo,13.147,1.519,,,21.659,8.436,16.099,4.862,13.832,...,0.453,0.461,0.506,-4.087,,-3.618,-3.657,-3.805,-3.707,-3.466
7,cso,16.894,3.454,,,,,,,,...,,0.484,0.515,-3.692,,,,,-3.539,-3.348


In [38]:
df_res.to_csv(f"data/results/{model}-results.csv",index=False)
print(model)

llama-instruct-few-shot


# Save results

In [8]:
df_results = pd.DataFrame({
    'source_text': df_gen['source_text'],
    'elaboration_sentence': df_gen['elaboration_sentence'],
    'pred_elaboration': df_gen['pred_elaboration'],
    'bert-score-precision': bert_scores_precision,
    'bert-score-recall': bert_scores_recall,
    'bert-score-f1': bert_scores_f1
})

# bleu-scores
#df_results.to_csv("../data/bleu_scores/bleu_scores_bart-ft-c2sp-masked.csv", index=False)
# bert-scores
df_results.to_csv(f"../data/bert_scores/bert_scores_{model}-test_ds-{output_name}.csv", index=False)