# Load the data

In [203]:
import pandas as pd

ds = "cs"
setting = "target-sent-subject"
output_name = f"{ds}-{setting}"
model = "llama-instruct-few-shot"
test_ds_llama_ft_alpaca = pd.read_csv(f"../data/gen_predictions/predictions_llama3.2-ft-alpaca-test_ds_{output_name}.csv")
test_ds_llama_instruct = pd.read_csv(f"../data/gen_predictions/predictions_llama3.2-instruct-few-shot-test_ds_{output_name}.csv")
test_ds_bart = pd.read_csv(f"../data/gen_predictions/predictions_bart-{output_name}.csv")

df_gen = test_ds_llama_instruct
#df_gen = df_gen.fillna("")
df_gen.isnull().sum()

Unnamed: 0              0
elaboration_sentence    0
subject                 0
target_sentence         0
pred_elaboration        0
dtype: int64

# Load results df

In [None]:
bart_ft_res = pd.read_csv("../data/results/bart-ft-results.csv")
llama_ft_res = pd.read_csv("../data/results/llama-ft-results.csv")
llama_instr_res = pd.read_csv("../data/results/llama-instruct-few-shot-results.csv")

# Initialize columns in results dfs

In [129]:
df_res = pd.read_csv(f"../data/results/{model}-results.csv")
settings = list(dict.fromkeys(["-".join(col.split("-")[:-1]) for col in df_res.columns if "-" in col]))
cols_to_add = [f"{col_name}-bs-prec" for col_name in settings] + \
              [f"{col_name}-bs-rec" for col_name in settings] + \
              [f"{col_name}-bs-f1" for col_name in settings]
for col in cols_to_add:
    df_res[col] = None 

df_res

Unnamed: 0,dataset,base-b1,base-b2,masked-b1,masked-b2,subject-b1,subject-b2,target-phrase-b1,target-phrase-b2,target-sent-b1,...,target-sent-bs-rec,target-sent-target-bs-rec,target-sent-subject-bs-rec,base-bs-f1,masked-bs-f1,subject-bs-f1,target-phrase-bs-f1,target-sent-bs-f1,target-sent-target-bs-f1,target-sent-subject-bs-f1
0,c2s,15.899,3.588,15.258,3.309,22.277,8.939,18.347,5.923,17.417,...,,,,,,,,,,
1,c2sp,15.266,3.515,15.757,3.567,23.58,9.417,18.686,5.726,18.003,...,,,,,,,,,,
2,c4s,15.712,2.912,14.908,4.157,22.771,8.681,17.56,5.348,18.005,...,,,,,,,,,,
3,c4sp,15.233,1.652,16.923,4.93,23.323,9.918,18.652,5.758,16.861,...,,,,,,,,,,
4,cs,19.65,6.692,,,,,,,,...,,,,,,,,,,
5,c2spo,13.904,2.192,,,,,,,,...,,,,,,,,,,
6,c4spo,13.147,1.519,,,,,,,,...,,,,,,,,,,


# SARI - Hugging Face

In [35]:
from evaluate import load
from tqdm.notebook import tqdm
import pandas as pd

sari_metric = load("sari")
sari_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['text'] 
    s_content = dataset['test'][index] 
    prediction = row['prediction']  
    
    sari_score = sari_metric.compute(
        sources=[r_content],
        predictions=[prediction],
        references=[s_content['simplifications']]
    )
    
    sari_scores.append(sari_score['sari'])

  0%|          | 0/359 [00:00<?, ?it/s]

# SARI - EASSE package

In [13]:
from easse.sari import corpus_sari
from tqdm.notebook import tqdm


sari_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text']  
    prediction = row['prediction']  
    
    sari_score_easse = corpus_sari(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']]
    )
    
    sari_scores_easse.append(sari_score_easse)

  0%|          | 0/116 [00:00<?, ?it/s]

KeyError: 'label_text'

In [106]:
import numpy as np
print("Average SARI score:", np.mean(sari_scores_easse))

Average SARI score: 36.46781896155372


# Operation scores (add, keep, delete)

In [4]:
from easse.sari import get_corpus_sari_operation_scores
from tqdm.notebook import tqdm

add_scores = []
keep_scores = []
del_scores = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    r_content = row['source_text'] 
    s_content = row['label_text'] #dataset['test'][index]  
    prediction = row['prediction']  
    
    add_score, keep_score, del_score = get_corpus_sari_operation_scores(
        orig_sents=[r_content],
        sys_sents=[prediction],
        refs_sents=[[s_content]]
        #refs_sents=[[simp] for simp in s_content['simplifications']] 
    )
    
    add_scores.append(add_score)
    keep_scores.append(keep_score)
    del_scores.append(del_score)

  0%|          | 0/116 [00:00<?, ?it/s]

# BLEU-4 (EASSE package)

In [25]:
from tqdm.notebook import tqdm
from easse.bleu import corpus_bleu
import numpy as np

bleu_scores_easse = []

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    s_content = row['elaboration_sentence'] 
    prediction = row['pred_elaboration'] # "prediction" for BART
    
    bleu_score_easse = corpus_bleu(
        sys_sents=[prediction],
        refs_sents=[[s_content]]
    )
    
    bleu_scores_easse.append(bleu_score_easse)

print(f"Average BLEU score: {np.mean(bleu_scores_easse):.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Average BLEU score: 4.965


### BLEU-1 & BLEU-2 (nltk + tokenizer-13A)

## Corpus bleu

In [111]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from tqdm import tqdm

# 13a tokenizer
tokenizer = Tokenizer13a()
smoothing_function = SmoothingFunction().method1

all_refs = []
all_preds = []

# Tokenize and collect references and predictions
for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    ref = row['elaboration_sentence']
    prediction = row['pred_elaboration'] # "prediction" for BART

    # Tokenize
    tokenized_ref = tokenizer(ref).split()
    tokenized_pred = tokenizer(prediction).split()
    
    all_refs.append([tokenized_ref]) 
    all_preds.append(tokenized_pred)

bleu1_score = corpus_bleu(all_refs, all_preds, weights=(1.0, 0, 0, 0), smoothing_function=smoothing_function)  # 1-gram
bleu2_score = corpus_bleu(all_refs, all_preds, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function)  # 2-gram
bleu4_score = corpus_bleu(all_refs, all_preds, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function)  # 4-gram

print(f"Corpus BLEU-1: {bleu1_score*100:.3f}")
print(f"Corpus BLEU-2: {bleu2_score*100:.3f}")
print(f"Corpus BLEU-4: {bleu4_score*100:.3f}")

100%|██████████████████████████████████████| 116/116 [00:00<00:00, 16134.08it/s]

Corpus BLEU-1: 13.147
Corpus BLEU-2: 1.519
Corpus BLEU-4: 0.126





## Sentence bleu

In [45]:
from nltk.translate.bleu_score import sentence_bleu
from sacrebleu.tokenizers.tokenizer_13a import Tokenizer13a
from transformers import BartTokenizer

bleu_scores_1 = []
bleu_scores_2 = []
bleu_scores_4 = []

# 13a tokenizer
tokenizer = Tokenizer13a()
# bart tokenizer
#tokenizer_b = BartTokenizer.from_pretrained('facebook/bart-base',use_fast=False) 

smoothing_function = SmoothingFunction().method1

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    ref = row['elaboration_sentence']
    prediction = row['pred_elaboration']

    # tokenize
    tokenized_ref = tokenizer(ref).split()
    tokenized_pred = tokenizer(prediction).split()
    #tokenized_ref = tokenizer_b(ref)["input_ids"]
    #tokenized_pred = tokenizer_b(prediction)["input_ids"]
        
    bleu_score_1 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(1, 0, 0, 0),smoothing_function=smoothing_function) # 1-gram
    bleu_score_2 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(0.5, 0.5, 0, 0), smoothing_function=smoothing_function) # 2-gram
    bleu_score_4 = sentence_bleu([tokenized_ref],tokenized_pred,weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothing_function) # 4-gram
    bleu_scores_1.append(bleu_score_1)
    bleu_scores_2.append(bleu_score_2)
    bleu_scores_4.append(bleu_score_4)

import numpy as np
print(f"Average BLEU-1 score: {np.mean(bleu_scores_1)*100:.3f}")
print(f"Average BLEU-2 score: {np.mean(bleu_scores_2)*100:.3f}")
print(f"Average BLEU-4 score: {np.mean(bleu_scores_4)*100:.3f}")

100%|███████████████████████████████████████| 116/116 [00:00<00:00, 4293.61it/s]

Average BLEU-1 score: 16.922
Average BLEU-2 score: 6.008
Average BLEU-4 score: 3.106





In [None]:
df_results = pd.DataFrame({
    'elaboration_sentence': df_gen['elaboration_sentence'],
    'pred_elaboration': df_gen['pred_elaboration'],
    'bleu-1': bleu_scores_1,
    'bleu-2': bleu_scores_2,
})

# BERTScore

In [204]:
from tqdm.notebook import tqdm
from bert_score import BERTScorer
import numpy as np
from transformers import logging

# suppress warnings
#logging.set_verbosity_error()

bert_scores_precision = []
bert_scores_recall = []
bert_scores_f1 = []

scorer = BERTScorer(model_type='bert-base-uncased')

for index, row in tqdm(df_gen.iterrows(), total=len(df_gen)):
    elaboration = row['elaboration_sentence']
    prediction = row['pred_elaboration']
    
    #  BERTScore for this pair
    P, R, F1 = scorer.score(
        cands=[prediction],  
        refs=[elaboration],              
    )
    
    bert_scores_precision.append(P.mean().item())
    bert_scores_recall.append(R.mean().item())
    bert_scores_f1.append(F1.mean().item())

# average scores
avg_precision = np.mean(bert_scores_precision)
avg_recall = np.mean(bert_scores_recall)
avg_f1 = np.mean(bert_scores_f1)

idx = df_res.index[df_res["dataset"] == ds].tolist()[0]
df_res.at[idx, f"{setting}-bs-prec"] = round(avg_precision,3)
df_res.at[idx, f"{setting}-bs-rec"] = round(avg_recall,3)
df_res.at[idx, f"{setting}-bs-f1"] = round(avg_f1,3)

print(f"Average BERTScore Precision: {avg_precision:.3f}")
print(f"Average BERTScore Recall: {avg_recall:.3f}")
print(f"Average BERTScore F1: {avg_f1:.3f}")

  0%|          | 0/116 [00:00<?, ?it/s]

Average BERTScore Precision: 0.505
Average BERTScore Recall: 0.537
Average BERTScore F1: 0.518


# BARTScore

# Show results

In [207]:
df_res

Unnamed: 0,dataset,base-b1,base-b2,masked-b1,masked-b2,subject-b1,subject-b2,target-phrase-b1,target-phrase-b2,target-sent-b1,...,target-sent-bs-rec,target-sent-target-bs-rec,target-sent-subject-bs-rec,base-bs-f1,masked-bs-f1,subject-bs-f1,target-phrase-bs-f1,target-sent-bs-f1,target-sent-target-bs-f1,target-sent-subject-bs-f1
0,c2s,15.899,3.588,15.258,3.309,22.277,8.939,18.347,5.923,17.417,...,0.493,0.499,0.535,0.475,0.462,0.51,0.488,0.485,0.49,0.521
1,c2sp,15.266,3.515,15.757,3.567,23.58,9.417,18.686,5.726,18.003,...,0.497,0.511,0.532,0.456,0.466,0.515,0.494,0.484,0.498,0.515
2,c4s,15.712,2.912,14.908,4.157,22.771,8.681,17.56,5.348,18.005,...,0.493,0.499,0.533,0.461,0.469,0.513,0.485,0.484,0.487,0.518
3,c4sp,15.233,1.652,16.923,4.93,23.323,9.918,18.652,5.758,16.861,...,0.495,0.512,0.534,0.447,0.456,0.52,0.491,0.481,0.499,0.517
4,cs,19.65,6.692,,,,,,,,...,,0.491,0.537,0.487,,,,,0.481,0.518
5,c2spo,13.904,2.192,,,,,,,,...,,,,0.431,,,,,,
6,c4spo,13.147,1.519,,,,,,,,...,,,,0.43,,,,,,


In [75]:
df_res.to_csv(f"../data/results/{model}-results.csv",index=False)

# Save results

In [8]:
df_results = pd.DataFrame({
    'source_text': df_gen['source_text'],
    'elaboration_sentence': df_gen['elaboration_sentence'],
    'pred_elaboration': df_gen['pred_elaboration'],
    'bert-score-precision': bert_scores_precision,
    'bert-score-recall': bert_scores_recall,
    'bert-score-f1': bert_scores_f1
})

# bleu-scores
#df_results.to_csv("../data/bleu_scores/bleu_scores_bart-ft-c2sp-masked.csv", index=False)
# bert-scores
df_results.to_csv(f"../data/bert_scores/bert_scores_{model}-test_ds-{output_name}.csv", index=False)