# Load the data

In [8]:
models = ["llama-ft","bart-ft","llama-instruct-few-shot"]

setting_ds_dict = {
    "base": ["c2s","c2sp","c4s","c4sp","cs","c2spo","c4spo","cso"],
    "masked": ["c2s","c2sp","c4s","c4sp"],
    "subject":["c2s","c2sp","c4s","c4sp","c2spo","c4spo"],
    "target-phrase":["c2s","c2sp","c4s","c4sp","c2spo","c4spo"],
    "target-sent":["c2s","c2sp","c4s","c4sp","c2spo","c4spo"],
    "target-sent-target":["c2s","c2sp","c4s","c4sp","cs","cso","c2spo","c4spo"],
    "target-sent-subject":["c2s","c2sp","c4s","c4sp","cs","cso","c2spo","c4spo"],
}

# for results inspection
results_setting_ds_dict = {
    "base": ["cs","c4s"],
    "subject":["c4s"],
    "target-phrase":["c4s"],
    "target-sent":["c4s"],
    "target-sent-target":["c4sp"],
    "target-sent-subject":["c2sp","c4spo"],
}

# additional calculation
setting_ds_dict = {
    "base": ["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "masked": ["c2s","c2sp","c4s","c4sp"],
    "target-phrase":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "target-sent":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "target-sent-target":["c2s","c2sp","c4s","c4sp","c2o","c2op"]
}

# Load results df

In [3]:
import pandas as pd
bart_ft_res = pd.read_csv("../data/results/bart-ft-results.csv")
llama_ft_res = pd.read_csv("../data/results/llama-ft-results.csv")
llama_instr_res = pd.read_csv("../data/results/llama-instruct-few-shot-results.csv")
llama_instr_prompt_res = pd.read_csv("../data/results/llama-instruct-few-shot-prompt-results.csv")

In [4]:
llama_instr_res.columns

Index(['dataset', 'base-b1', 'base-b2', 'masked-b1', 'masked-b2', 'subject-b1',
       'subject-b2', 'target-phrase-b1', 'target-phrase-b2', 'target-sent-b1',
       'target-sent-b2', 'target-sent-target-b1', 'target-sent-target-b2',
       'target-sent-subject-b1', 'target-sent-subject-b2', 'base-bsprec',
       'masked-bsprec', 'subject-bsprec', 'target-phrase-bsprec',
       'target-sent-bsprec', 'target-sent-target-bsprec',
       'target-sent-subject-bsprec', 'base-bsrec', 'masked-bsrec',
       'subject-bsrec', 'target-phrase-bsrec', 'target-sent-bsrec',
       'target-sent-target-bsrec', 'target-sent-subject-bsrec', 'base-bsf1',
       'masked-bsf1', 'subject-bsf1', 'target-phrase-bsf1', 'target-sent-bsf1',
       'target-sent-target-bsf1', 'target-sent-subject-bsf1', 'base-bartscore',
       'masked-bartscore', 'subject-bartscore', 'target-phrase-bartscore',
       'target-sent-bartscore', 'target-sent-target-bartscore',
       'target-sent-subject-bartscore'],
      dtype='obj

# Initialize columns in results dfs

## BERTScore

In [129]:
df_res = pd.read_csv(f"../data/results/{model}-results.csv")
settings = list(dict.fromkeys(["-".join(col.split("-")[:-1]) for col in df_res.columns if "-" in col]))
cols_to_add = [f"{col_name}-bs-prec" for col_name in settings] + \
              [f"{col_name}-bs-rec" for col_name in settings] + \
              [f"{col_name}-bs-f1" for col_name in settings]
for col in cols_to_add:
    df_res[col] = None 

df_res

Unnamed: 0,dataset,base-b1,base-b2,masked-b1,masked-b2,subject-b1,subject-b2,target-phrase-b1,target-phrase-b2,target-sent-b1,...,target-sent-bs-rec,target-sent-target-bs-rec,target-sent-subject-bs-rec,base-bs-f1,masked-bs-f1,subject-bs-f1,target-phrase-bs-f1,target-sent-bs-f1,target-sent-target-bs-f1,target-sent-subject-bs-f1
0,c2s,15.899,3.588,15.258,3.309,22.277,8.939,18.347,5.923,17.417,...,,,,,,,,,,
1,c2sp,15.266,3.515,15.757,3.567,23.58,9.417,18.686,5.726,18.003,...,,,,,,,,,,
2,c4s,15.712,2.912,14.908,4.157,22.771,8.681,17.56,5.348,18.005,...,,,,,,,,,,
3,c4sp,15.233,1.652,16.923,4.93,23.323,9.918,18.652,5.758,16.861,...,,,,,,,,,,
4,cs,19.65,6.692,,,,,,,,...,,,,,,,,,,
5,c2spo,13.904,2.192,,,,,,,,...,,,,,,,,,,
6,c4spo,13.147,1.519,,,,,,,,...,,,,,,,,,,


## BARTScore

In [42]:
model = "llama-ft"#"llama-instruct-few-shot"
df_res = pd.read_csv(f"data/results/{model}-results.csv")

In [34]:
# rename certain columns
df_res.columns = [
    col.replace('-bs-rec', '-bsrec').replace('-bs-prec', '-bsprec').replace('-bs-f1', '-bsf1')
    if col.endswith(('-bs-rec', '-bs-prec', '-bs-f1')) else col
    for col in df_res.columns
]

print(df_res.columns)

Index(['dataset', 'base-b1', 'base-b2', 'masked-b1', 'masked-b2', 'subject-b1',
       'subject-b2', 'target-phrase-b1', 'target-phrase-b2', 'target-sent-b1',
       'target-sent-b2', 'target-sent-target-b1', 'target-sent-target-b2',
       'target-sent-subject-b1', 'target-sent-subject-b2', 'base-bsprec',
       'masked-bsprec', 'subject-bsprec', 'target-phrase-bsprec',
       'target-sent-bsprec', 'target-sent-target-bsprec',
       'target-sent-subject-bsprec', 'base-bsrec', 'masked-bsrec',
       'subject-bsrec', 'target-phrase-bsrec', 'target-sent-bsrec',
       'target-sent-target-bsrec', 'target-sent-subject-bsrec', 'base-bsf1',
       'masked-bsf1', 'subject-bsf1', 'target-phrase-bsf1', 'target-sent-bsf1',
       'target-sent-target-bsf1', 'target-sent-subject-bsf1'],
      dtype='object')


In [35]:
settings = list(dict.fromkeys(["-".join(col.split("-")[:-1]) for col in df_res.columns if "-" in col]))
print(settings, end="\n\n")
cols_to_add = [f"{col_name}-bartscore" for col_name in settings]
for col in cols_to_add:
    df_res[col] = None
print(df_res.columns)

['base', 'masked', 'subject', 'target-phrase', 'target-sent', 'target-sent-target', 'target-sent-subject']

Index(['dataset', 'base-b1', 'base-b2', 'masked-b1', 'masked-b2', 'subject-b1',
       'subject-b2', 'target-phrase-b1', 'target-phrase-b2', 'target-sent-b1',
       'target-sent-b2', 'target-sent-target-b1', 'target-sent-target-b2',
       'target-sent-subject-b1', 'target-sent-subject-b2', 'base-bsprec',
       'masked-bsprec', 'subject-bsprec', 'target-phrase-bsprec',
       'target-sent-bsprec', 'target-sent-target-bsprec',
       'target-sent-subject-bsprec', 'base-bsrec', 'masked-bsrec',
       'subject-bsrec', 'target-phrase-bsrec', 'target-sent-bsrec',
       'target-sent-target-bsrec', 'target-sent-subject-bsrec', 'base-bsf1',
       'masked-bsf1', 'subject-bsf1', 'target-phrase-bsf1', 'target-sent-bsf1',
       'target-sent-target-bsf1', 'target-sent-subject-bsf1', 'base-bartscore',
       'masked-bartscore', 'subject-bartscore', 'target-phrase-bartscore',
       'targe

## Add/delete dataset row

### Add 

In [7]:
import pandas as pd

for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    df_res.loc[len(df_res), 'dataset'] = 'c2o'
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

### Del

In [5]:
for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    df_res = df_res[df_res['dataset'] != 'cso']
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

## Delete columns

In [None]:
import pandas as pd

def delete_columns_with_substrings(df, substrings):
    columns_to_keep = [
        col for col in df.columns if not any(substring in col for substring in substrings)
    ]
    return df[columns_to_keep]

model = "llama-instruct-few-shot"
df_res = pd.read_csv(f"../data/results/{model}-results.csv")
filtered_df = delete_columns_with_substrings(df_res, substrings=[""])    
    #df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

## Map values for comparison

In [3]:
for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    dss = ["cso","cs"]
    settings_map = {"target-sent-subject":"subject","target-sent-target":"target-phrase", "base":"target-sent"}
    for ds in dss:
        idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
        for current_setting_key, target_setting_key in settings_map.items():
            df_res.at[idx, f"{target_setting_key}-b1"] = df_res.loc[idx, f"{current_setting_key}-b1"]
            df_res.at[idx, f"{target_setting_key}-b2"] = df_res.loc[idx, f"{current_setting_key}-b2"]
            df_res.at[idx, f"{target_setting_key}-bsprec"] = df_res.loc[idx, f"{current_setting_key}-bsprec"]
            df_res.at[idx, f"{target_setting_key}-bsrec"] = df_res.loc[idx, f"{current_setting_key}-bsrec"]
            df_res.at[idx, f"{target_setting_key}-bsf1"] = df_res.loc[idx, f"{current_setting_key}-bsf1"]
            df_res.at[idx, f"{target_setting_key}-bartscore"] = df_res.loc[idx, f"{current_setting_key}-bartscore"]
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

## Rename datafiles

In [47]:
# rename files
import os

directory = "data/gen_predictions"

old_part = "predictions_bart-"
new_part = "predictions_bart-ft-"

for filename in os.listdir(directory):

    if old_part in filename:
        # new filename by replacing the old part
        new_filename = filename.replace(old_part, new_part)
        
        # full paths for renaming
        old_file_path = os.path.join(directory, filename)
        new_file_path = os.path.join(directory, new_filename)
        
        # rename the file
        os.rename(old_file_path, new_file_path)
        print(f"Renamed: {filename} -> {new_filename}")

Renamed: predictions_bart-c4s-target-sent-subject.csv -> predictions_bart-ft-c4s-target-sent-subject.csv
Renamed: predictions_bart-c2spo-base.csv -> predictions_bart-ft-c2spo-base.csv
Renamed: predictions_bart-c2sp-subject.csv -> predictions_bart-ft-c2sp-subject.csv
Renamed: predictions_bart-c4sp-target-sent-subject.csv -> predictions_bart-ft-c4sp-target-sent-subject.csv
Renamed: predictions_bart-c4sp-subject.csv -> predictions_bart-ft-c4sp-subject.csv
Renamed: predictions_bart-test.csv -> predictions_bart-ft-test.csv
Renamed: predictions_bart-c2sp-target-sent-target.csv -> predictions_bart-ft-c2sp-target-sent-target.csv
Renamed: predictions_bart-c2sp-target-sent-subject.csv -> predictions_bart-ft-c2sp-target-sent-subject.csv
Renamed: predictions_bart-c2s-target-phrase.csv -> predictions_bart-ft-c2s-target-phrase.csv
Renamed: predictions_bart-c2sp-base.csv -> predictions_bart-ft-c2sp-base.csv
Renamed: predictions_bart-c2sp-masked.csv -> predictions_bart-ft-c2sp-masked.csv
Renamed: pred

# SARI - Hugging Face

In [35]:
from evaluate import load
from tqdm.notebook import tqdm
import pandas as pd

sari_metric = load("sari")
sari_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['text'] 
    s_content = dataset['test'][index] 
    prediction = row['prediction']  
    
    sari_score = sari_metric.compute(
        sources=[r_content],
        predictions=[prediction],
        references=[s_content['simplifications']]
    )
    
    sari_scores.append(sari_score['sari'])

  0%|          | 0/359 [00:00<?, ?it/s]

# SARI - EASSE package

In [13]:
from easse.sari import corpus_sari
from tqdm.notebook import tqdm


sari_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text']  
    prediction = row['prediction']  
    
    sari_score_easse = corpus_sari(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']]
    )
    
    sari_scores_easse.append(sari_score_easse)

  0%|          | 0/116 [00:00<?, ?it/s]

KeyError: 'label_text'

In [106]:
import numpy as np
print("Average SARI score:", np.mean(sari_scores_easse))

Average SARI score: 36.46781896155372


# Operation scores (add, keep, delete)

In [4]:
from easse.sari import get_corpus_sari_operation_scores
from tqdm.notebook import tqdm

add_scores = []
keep_scores = []
del_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text'] #dataset['test'][index]  
    prediction = row['prediction']  
    
    add_score, keep_score, del_score = get_corpus_sari_operation_scores(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']] 
    )
    
    add_scores.append(add_score)
    keep_scores.append(keep_score)
    del_scores.append(del_score)

  0%|          | 0/116 [00:00<?, ?it/s]

# BLEU-4 (EASSE package)

In [25]:
from tqdm.notebook import tqdm
from easse.bleu import corpus_bleu
import numpy as np

bleu_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    s_content = row['elaboration_sentence'] 
    prediction = row['pred_elaboration'] # "prediction" for BART
    
    bleu_score_easse = corpus_bleu(
        sys_sents=[prediction],
        refs_sents=[[s_content]]
    )
    
    bleu_scores_easse.append(bleu_score_easse)

print(f"Average BLEU score: {np.mean(bleu_scores_easse):.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Average BLEU score: 4.965


# BLEU-1 & BLEU-2 (nltk + tokenizer-13A)

## Corpus bleu

In [4]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from dataset_utils import create_scores_df
from tqdm import tqdm

# 13a tokenizer
tokenizer = Tokenizer13a()
smoothing_function = SmoothingFunction().method1

for model in models: 
    df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            
            all_refs = []
            all_preds = []
            output_name = f"{ds}-{setting_key}-{num_examples}"
            df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}.csv")
 
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                ref = row['elaboration_sentence']
                prediction = row['pred_elaboration'] # "prediction" for BART
            
                # Tokenize
                tokenized_ref = tokenizer(ref).split()
                tokenized_pred = tokenizer(prediction).split()
                
                all_refs.append([tokenized_ref]) 
                all_preds.append(tokenized_pred)
            
            bleu1_score = corpus_bleu(all_refs, all_preds, weights=(1.0, 0, 0, 0), smoothing_function=smoothing_function)  # 1-gram
            bleu2_score = corpus_bleu(all_refs, all_preds, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)  # 2-gram
            bleu4_score = corpus_bleu(all_refs, all_preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)  # 4-gram

            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            df_res.at[idx, f"{setting_key}-{num_examples}-b1"] = round(bleu1_score*100,3)
            df_res.at[idx, f"{setting_key}-{num_examples}-b2"] = round(bleu2_score*100,3)
            print(f"{model}-{output_name}: {round(bleu1_score*100,3)}")
            print(f"{model}-{output_name}: {round(bleu2_score*100,3)}")
    
    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")

100%|██████████████████████████████████████| 116/116 [00:00<00:00, 10129.48it/s]


llama-instruct-few-shot-c2s-base-n6: 16.284
llama-instruct-few-shot-c2s-base-n6: 5.439


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 16714.39it/s]


llama-instruct-few-shot-c2sp-base-n6: 16.451
llama-instruct-few-shot-c2sp-base-n6: 5.631


100%|███████████████████████████████████████| 116/116 [00:00<00:00, 9579.05it/s]


llama-instruct-few-shot-c4s-base-n6: 15.158
llama-instruct-few-shot-c4s-base-n6: 3.637


100%|██████████████████████████████████████| 116/116 [00:00<00:00, 16469.97it/s]

llama-instruct-few-shot-c4sp-base-n6: 14.426
llama-instruct-few-shot-c4sp-base-n6: 3.505
Results saved for llama-instruct-few-shot





## Sentence bleu

In [5]:
def create_scores_df(df_gen):
    df_scores = pd.DataFrame({
        'source_text': df_gen['source_text'] if 'source_text' in df_gen else None,
        'target_sentence': (
            df_gen['target_sentence_4o'] if 'target_sentence_4o' in df_gen
            else df_gen['target_sentence'] if 'target_sentence' in df_gen
            else None
        ),
        'target_sentence_target': df_gen['target_sentence_target'] if 'target_sentence_target' in df_gen else None,
        'subject': df_gen['subject'] if 'subject' in df_gen else None,
        'target-phrase': df_gen['target-phrase'] if 'target-phrase' in df_gen else None,
        'elaboration_sentence': df_gen['elaboration_sentence'],
        'pred_elaboration': df_gen['pred_elaboration'],
    })
    return df_scores

In [9]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from transformers import BartTokenizer
from tqdm.notebook import tqdm

# 13a tokenizer
tokenizer = Tokenizer13a()
# bart tokenizer
#tokenizer_b = BartTokenizer.from_pretrained('facebook/bart-base',use_fast=False) 

smoothing_function = SmoothingFunction().method1

setting_ds_dict = {
    "base": ["c2sp","c4s","c4sp"],
    "masked": ["c2sp","c4sp","c4s"],
    "target-phrase":["c2sp","c4sp","c4s"],
    #"target-sent":["c2s","c2sp","c4s","c4sp","c2o","c2op"],
    "target-sent-target":["c2sp","c4sp","c4s"]
}

for model in models: 
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:

            bleu_scores_1 = []
            bleu_scores_2 = []
            
            output_name = f"{ds}-{setting_key}"
            df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}.csv")
            df_scores = create_scores_df(df_gen)
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                ref = row['elaboration_sentence']
                prediction = row['pred_elaboration']
            
                # tokenize
                tokenized_ref = tokenizer(ref).split()
                tokenized_pred = tokenizer(prediction).split()
                #tokenized_ref = tokenizer_b(ref)["input_ids"]
                #tokenized_pred = tokenizer_b(prediction)["input_ids"]
                    
                bleu_score_1 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(1, 0, 0, 0),smoothing_function=smoothing_function) # 1-gram
                bleu_score_2 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function) # 2-gram
                bleu_scores_1.append(round(bleu_score_1,3))
                bleu_scores_2.append(round(bleu_score_2,3))

            df_scores["b1"] = bleu_scores_1
            df_scores["b2"] = bleu_scores_2
            df_scores.to_csv(f"../data/bleu_scores/bleu_scores_{model}-{output_name}.csv",index=False)
            print(f"Results saved for {model}")

#import numpy as np
#print(f"Average BLEU-1 score: {np.mean(bleu_scores_1)*100:.3f}")
#print(f"Average BLEU-2 score: {np.mean(bleu_scores_2)*100:.3f}")
#print(f"Average BLEU-4 score: {np.mean(bleu_scores_4)*100:.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot


# BERTScore

In [11]:
from tqdm.notebook import tqdm
from bert_score import BERTScorer
import numpy as np
import pandas as pd
from transformers import logging

# suppress warnings
#logging.set_verbosity_error()

scorer = BERTScorer(model_type='bert-base-uncased',device='cuda:0')

models = ["bart-ft","llama-ft"]
#num_examples = "n6"

for model in models: 
    #df_res = pd.read_csv(f"../data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            
            bert_scores_f1 = []
            
            output_name = f"{ds}-{setting_key}" #-{num_examples}"
            df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}.csv")
            df_scores = create_scores_df(df_gen)
            
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                elaboration = row['elaboration_sentence']
                prediction = row['pred_elaboration']
                
                #  BERTScore for this pair
                P, R, F1 = scorer.score(
                    cands=[prediction],  
                    refs=[elaboration],              
                )
                
                bert_scores_f1.append(F1.mean().item())

            # save scores for each pair
            df_scores["bsf1"] = bert_scores_f1
            df_scores.to_csv(f"../data/bert_scores/bert_scores_{model}-{output_name}.csv",index=False)
            print(f"Results saved for {model}")
            
            """# save average scores to models general results 
            avg_precision = np.mean(bert_scores_precision)
            avg_recall = np.mean(bert_scores_recall)
            avg_f1 = np.mean(bert_scores_f1)

            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            #df_res.at[idx, f"{setting_key}-{num_examples}-bsprec"] = round(avg_precision,3)
            #df_res.at[idx, f"{setting_key}-{num_examples}-bsrec"] = round(avg_recall,3)
            df_res.at[idx, f"{setting_key}-bsf1"] = round(avg_f1,3)
            print(f"{model}-{ds}-{setting_key}: {round(avg_f1,3)}")

    df_res.to_csv(f"../data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")"""
#print(f"Average BERTScore Precision: {avg_precision:.3f}")
#print(f"Average BERTScore Recall: {avg_recall:.3f}")
#print(f"Average BERTScore F1: {avg_f1:.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft


# BARTScore

https://github.com/neulab/BARTScore

In [17]:
import sys
import os
from model_utils import BARTScorer
from tqdm.notebook import tqdm
import numpy as np

bart_scorer = BARTScorer(device='cuda:0')

models = ["llama-ft","bart-ft"]
#num_examples = "n6"

for model in models: 
    #df_res = pd.read_csv(f"data/results/{model}-results.csv")
    for setting_key, ds_values in setting_ds_dict.items():
        for ds in ds_values:
            bart_scores = []
            output_name = f"{ds}-{setting_key}"
            df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}.csv")
            df_scores = create_scores_df(df_gen)
        
            for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                reference = row['elaboration_sentence']  # reference text (r)
                hypothesis = row['pred_elaboration']    # generated text (h)
                
                # precision (r → h)
                precision_score = bart_scorer.score(
                    srcs=[reference],  # r as source
                    tgts=[hypothesis], # h as target
                    batch_size=1
                )[0]
                
                # recall (h → r)
                recall_score = bart_scorer.score(
                    srcs=[hypothesis],  # h as source
                    tgts=[reference],   # r as target
                    batch_size=1
                )[0]
                
                # f1 score as the average of precision and recall
                f1_score = (precision_score + recall_score) / 2
                bart_scores.append(f1_score)
            
            # save result for each pair
            df_scores["bartscore"] = bart_scores
            df_scores.to_csv(f"../data/bart_scores/bart_scores_{model}-{output_name}.csv",index=False)
            print(f"Results saved for {model}-{output_name}")
        
            """# average score
            avg_score = np.mean(bart_scores)
            idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
            df_res.at[idx, f"{setting_key}-{num_examples}-bartscore"] = round(avg_score,3)
            print(f"{model}-{ds}-{setting_key}-{num_examples}: {round(avg_score,3)}")
    
    df_res.to_csv(f"data/results/{model}-results.csv",index=False)
    print(f"Results saved for {model}")"""

  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c2sp-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4s-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4sp-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c2sp-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4sp-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4s-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c2sp-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4sp-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4s-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c2sp-target-sent-target


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4sp-target-sent-target


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-ft-c4s-target-sent-target


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c2sp-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4s-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4sp-base


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c2sp-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4sp-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4s-masked


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c2sp-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4sp-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4s-target-phrase


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c2sp-target-sent-target


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4sp-target-sent-target


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for bart-ft-c4s-target-sent-target


# Prompt evaluation

In [None]:
def create_scores_df(df_gen):
    df_scores = pd.DataFrame({
        'source_text': df_gen['source_text'] if 'source_text' in df_gen else None,
        'target_sentence': (
            df_gen['target_sentence_4o'] if 'target_sentence_4o' in df_gen
            else df_gen['target_sentence'] if 'target_sentence' in df_gen
            else None
        ),
        'target_sentence_target': df_gen['target_sentence_target'] if 'target_sentence_target' in df_gen else None,
        'subject': df_gen['subject'] if 'subject' in df_gen else None,
        'target-phrase': df_gen['target-phrase'] if 'target-phrase' in df_gen else None,
        'elaboration_sentence': df_gen['elaboration_sentence'],
        'pred_elaboration': df_gen['pred_elaboration'],
    })
    return df_scores

## BLEU score

In [4]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
import pandas as pd
from tqdm import tqdm

# 13a tokenizer
tokenizer = Tokenizer13a()
smoothing_function = SmoothingFunction().method1

# prompt evaluation
models = ["llama-instruct-few-shot"]
setting_ds_dict = {
    "short-n3":{"base":["c2s"]}, #{"base":["c2s","c2sp","c4s","c4sp"]},
    "medium-n6":{"base":["c2s"]}, #{"base":["c2s","c2sp","c4s","c4sp"]},
    #"long-n9":{"base":["c2s","c2sp","c4s","c4sp"]},
}

for model in models: 
    #df_res = pd.read_csv(f"../data/results/{model}-prompt-results.csv")
    for prompt_setting_key, setting_keys in setting_ds_dict.items():
        for setting_key, ds_values in setting_keys.items():
            for ds in ds_values:
            
                all_refs = []
                all_preds = []
                output_name = f"{ds}-{setting_key}"
                num_examples = prompt_setting_key.split("-")[-1]
                # read-in right df
                df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}-{num_examples}.csv")
     
                for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                    ref = row['elaboration_sentence']
                    prediction = row['pred_elaboration'] 
                
                    # Tokenize
                    tokenized_ref = tokenizer(ref).split()
                    tokenized_pred = tokenizer(prediction).split()
                    
                    all_refs.append([tokenized_ref]) 
                    all_preds.append(tokenized_pred)
                
                bleu1_score = corpus_bleu(all_refs, all_preds, weights=(1.0, 0, 0, 0), smoothing_function=smoothing_function)  # 1-gram
                bleu2_score = corpus_bleu(all_refs, all_preds, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)  # 2-gram
                bleu4_score = corpus_bleu(all_refs, all_preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)  # 4-gram
    
                """idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
                df_res.at[idx, f"{prompt_setting_key}-b1"] = round(bleu1_score*100,3)
                df_res.at[idx, f"{prompt_setting_key}-b2"] = round(bleu2_score*100,3)
                print(f"{model}-{ds}-{prompt_setting_key} B1: {round(bleu1_score*100,3)}")
                print(f"{model}-{ds}-{prompt_setting_key} B2: {round(bleu2_score*100,3)}")
        
        df_res.to_csv(f"../data/results/{model}-prompt-results.csv",index=False)
        print(f"Results saved for {model}")"""

100%|██████████████████████████████████████| 116/116 [00:00<00:00, 12417.10it/s]
100%|██████████████████████████████████████| 116/116 [00:00<00:00, 19120.46it/s]


## BERTScore

In [6]:
from tqdm.notebook import tqdm
from bert_score import BERTScorer
from dataset_utils import create_scores_df
import numpy as np
from transformers import logging

# suppress warnings
#logging.set_verbosity_error()

scorer = BERTScorer(model_type='bert-base-uncased',device='cuda:0')

for model in models: 
    #df_res = pd.read_csv(f"../data/results/{model}-prompt-results.csv")
    for prompt_setting_key, setting_keys in setting_ds_dict.items():
        for setting_key, ds_values in setting_keys.items():
            for ds in ds_values:
            
                bert_scores_precision = []
                bert_scores_recall = []
                bert_scores_f1 = []
                
                output_name = f"{ds}-{setting_key}"
                # read-in right df
                num_examples = prompt_setting_key.split("-")[-1]
                # read-in right df
                df_gen = pd.read_csv(f"../data/gen_predictions/predictions_{model}-{output_name}-{num_examples}.csv")
                df_scores = create_scores_df(df_gen)

                #df_scores = create_scores_df(df_gen)
                
                for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                    elaboration = row['elaboration_sentence']
                    prediction = row['pred_elaboration']
                    
                    #  BERTScore for this pair
                    P, R, F1 = scorer.score(
                        cands=[prediction],  
                        refs=[elaboration],              
                    )
                    
                    bert_scores_precision.append(P.mean().item())
                    bert_scores_recall.append(R.mean().item())
                    bert_scores_f1.append(F1.mean().item())
    
                # save result for each pair
                df_scores["bsf1"] = bert_scores_f1
                df_scores.to_csv(f"../data/bert_scores/bert_scores_{model}-{output_name}-{prompt_setting_key}.csv",index=False)
                print(f"Results saved for {model}-{output_name}-{prompt_setting_key}")
            
                
                """avg_f1 = np.mean(bert_scores_f1)
                idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
                df_res.at[idx, f"{prompt_setting_key}-bsf1"] = round(avg_f1,3)
                print(f"{model}-{ds}-{prompt_setting_key}: {round(avg_f1,3)}")
    
        df_res.to_csv(f"../data/results/{model}-prompt-results.csv",index=False)
        print(f"Results saved for {model}")"""

  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot-c2s-base-short-n3


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot-c2s-base-medium-n6


## BARTScore

In [8]:
import sys
import os
parent_dir = os.path.abspath('..')
sys.path.append(parent_dir)

from utils.bart_score import BARTScorer
from tqdm.notebook import tqdm
import numpy as np

bart_scorer = BARTScorer(device='cuda:0')

for model in models: 
    #df_res = pd.read_csv(f"data/results/{model}-prompt-results.csv")
    for prompt_setting_key, setting_keys in setting_ds_dict.items():
        for setting_key, ds_values in setting_keys.items():
            for ds in ds_values:

                bart_scores = []
                output_name = f"{ds}-{setting_key}"
                # read-in right df
                num_examples = prompt_setting_key.split("-")[-1]
                # read-in right df
                df_gen = pd.read_csv(f"data/gen_predictions/predictions_{model}-{output_name}-{num_examples}.csv")
                df_scores = create_scores_df(df_gen)
            
                for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
                    reference = row['elaboration_sentence']  # reference text (r)
                    hypothesis = row['pred_elaboration']    # generated text (h)
                    
                    # precision (r → h)
                    precision_score = bart_scorer.score(
                        srcs=[reference],  # r as source
                        tgts=[hypothesis], # h as target
                        batch_size=1
                    )[0]
                    
                    # recall (h → r)
                    recall_score = bart_scorer.score(
                        srcs=[hypothesis],  # h as source
                        tgts=[reference],   # r as target
                        batch_size=1
                    )[0]
                    
                    # f1 score as the average of precision and recall
                    f1_score = (precision_score + recall_score) / 2
                    bart_scores.append(f1_score)
                
                # save score result for each pair
                df_scores["bartscore"] = bart_scores
                df_scores.to_csv(f"data/bart_scores/bart_scores_{model}-{output_name}-{prompt_setting_key}.csv",index=False)
                print(f"Results saved for {model}-{output_name}-{prompt_setting_key}")
    
                # average score
                """avg_score = np.mean(bart_scores)
                idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
                df_res.at[idx, f"{prompt_setting_key}-bartscore"] = round(avg_score,3)
                print(f"{model}-{ds}-{prompt_setting_key}: {round(avg_score,3)}")
    
                df_res.to_csv(f"data/results/{model}-prompt-results.csv",index=False)
                print(f"Results saved for {model}")"""

  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot-c2s-base-short-n3


  0%|          | 0/116 [00:00<?, ?it/s]

Results saved for llama-instruct-few-shot-c2s-base-medium-n6


# Show results

In [2]:
df_res

Unnamed: 0,dataset,short-b1,short-b2,long-b1,long-b2,random-b1,random-b2,short-bsf1,long-bsf1,random-bsf1,short-bartscore,long-bartscore,random-bartscore,random-n2-b1,random-n2-b2,random-n3-b1,random-n3-b2,random-n7-b1,random-n7-b2
0,c2s,15.899,3.588,15.161,4.712,14.945,2.894,0.475,0.446,0.436,-3.778,-3.843,-4.011,15.268,3.804,14.945,2.894,15.516,3.175
1,c2sp,15.266,3.515,14.368,4.223,13.931,2.704,0.456,0.439,0.438,-3.849,-3.922,-4.003,14.364,3.875,13.931,2.704,15.282,3.75
2,c4s,15.712,2.912,14.507,4.266,14.055,3.507,0.461,0.443,0.442,-3.884,-3.9,-4.023,12.869,3.313,14.055,3.507,13.452,3.822
3,c4sp,15.233,1.652,13.268,3.145,13.595,3.055,0.447,0.434,0.435,-3.941,-3.954,-4.044,12.824,2.68,13.595,3.055,14.537,3.098


In [6]:
df_res.to_csv(f"../data/results/{model}-prompt-results.csv",index=False)
print(model)

llama-instruct-few-shot


# Save results

In [8]:
df_results = pd.DataFrame({
    'source_text': df_gen['source_text'],
    'elaboration_sentence': df_gen['elaboration_sentence'],
    'pred_elaboration': df_gen['pred_elaboration'],
    'bert-score-precision': bert_scores_precision,
    'bert-score-recall': bert_scores_recall,
    'bert-score-f1': bert_scores_f1
})

# bleu-scores
#df_results.to_csv("../data/bleu_scores/bleu_scores_bart-ft-c2sp-masked.csv", index=False)
# bert-scores
df_results.to_csv(f"../data/bert_scores/bert_scores_{model}-test_ds-{output_name}.csv", index=False)