# Load the generated predictions

In [1]:
import pandas as pd

# Read the two prediction CSVs; each is expected to provide the `text` and
# `prediction` columns used by the scoring cells below.
# NOTE(review): only the first file is read with index_col=0 — presumably the
# second CSV was written without an index column; confirm.
orig_predictions_csv = "../data/gen_predictions/predictions_bart-large-swipe_asset-test.csv"
ft_predictions_csv = "../data/gen_predictions/predictions_bart-large-swipe-adamw-paged8bit_asset-test.csv"

gen_pred_orig = pd.read_csv(orig_predictions_csv, index_col=0)
gen_pred_ft = pd.read_csv(ft_predictions_csv)

In [12]:
# Preview the first five rows of the predictions frame.
gen_pred_ft.head(n=5)

Unnamed: 0,text,prediction
0,One side of the armed conflicts is composed ma...,One side of the armed conflicts is composed ma...
1,"Jeddah is the principal gateway to Mecca, Isla...","Jeddah is the main gateway to Mecca, Islam's h..."
2,The Great Dark Spot is thought to represent a ...,The Great Dark Spot is thought to represent a ...
3,"His next work, Saturday, follows an especially...","His next work, Saturday, follows an especially..."
4,"The tarantula, the trickster character, spun a...","The tarantula, the trickster character, spun a..."


# Load datasets

In [2]:
from datasets import load_dataset

# Load the "simplification" configuration of the ASSET dataset; the scoring
# cells read its `test` split and the `simplifications` field of each example.
asset_dataset = load_dataset("facebook/asset", name="simplification")

# Bare last expression so the notebook displays the split/feature summary.
asset_dataset

DatasetDict({
    validation: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['original', 'simplifications'],
        num_rows: 359
    })
})

In [10]:
# Select which predictions frame and which dataset the scoring cells operate on.
# Point `df_gen` at a different predictions frame to score another model.
df_gen, dataset = gen_pred_ft, asset_dataset

# SARI - Hugging Face

In [35]:
from evaluate import load
from tqdm.notebook import tqdm
import pandas as pd

# Sentence-level SARI with the Hugging Face `evaluate` implementation.
sari_metric = load("sari")

# Pair predictions with references by POSITION. The DataFrame index label is
# not guaranteed to be a valid position into the dataset (e.g. after filtering,
# sorting, or loading with a custom index), and the results DataFrame built
# later also combines these scores with df_gen positionally.
test_split = dataset['test']
if len(df_gen) > len(test_split):
    raise ValueError(
        f"df_gen has {len(df_gen)} rows but the test split only has "
        f"{len(test_split)} examples; predictions and references are misaligned."
    )
# Read the reference column once instead of indexing the dataset per row.
references_per_row = test_split['simplifications']

sari_scores = []
for source, prediction, references in tqdm(
    zip(df_gen['text'], df_gen['prediction'], references_per_row),
    total=len(df_gen),
):
    # Each call scores a single sentence: one source, one prediction, and the
    # list of reference simplifications for that source.
    result = sari_metric.compute(
        sources=[source],
        predictions=[prediction],
        references=[references],
    )
    sari_scores.append(result['sari'])

  0%|          | 0/359 [00:00<?, ?it/s]

# SARI - EASSE package

In [36]:
from easse.sari import corpus_sari


# Sentence-level SARI with the EASSE implementation.

# Pair predictions with references by POSITION. The DataFrame index label is
# not guaranteed to be a valid position into the dataset (e.g. after filtering,
# sorting, or loading with a custom index), and the results DataFrame built
# later also combines these scores with df_gen positionally.
test_split = dataset['test']
if len(df_gen) > len(test_split):
    raise ValueError(
        f"df_gen has {len(df_gen)} rows but the test split only has "
        f"{len(test_split)} examples; predictions and references are misaligned."
    )
# Read the reference column once instead of indexing the dataset per row.
references_per_row = test_split['simplifications']

sari_scores_easse = []
for source, prediction, references in tqdm(
    zip(df_gen['text'], df_gen['prediction'], references_per_row),
    total=len(df_gen),
):
    # A one-sentence "corpus": every reference simplification becomes its own
    # single-sentence list, matching the single-element orig_sents/sys_sents.
    score = corpus_sari(
        orig_sents=[source],
        sys_sents=[prediction],
        refs_sents=[[ref] for ref in references],
    )
    sari_scores_easse.append(score)

  0%|          | 0/359 [00:00<?, ?it/s]

# Operation scores (add, keep, delete)

In [11]:
from easse.sari import get_corpus_sari_operation_scores
from tqdm.notebook import tqdm

# Per-sentence SARI operation scores (add / keep / delete) from EASSE.

# Pair predictions with references by POSITION. The DataFrame index label is
# not guaranteed to be a valid position into the dataset (e.g. after filtering,
# sorting, or loading with a custom index), and the results DataFrame built
# later also combines these scores with df_gen positionally.
test_split = dataset['test']
if len(df_gen) > len(test_split):
    raise ValueError(
        f"df_gen has {len(df_gen)} rows but the test split only has "
        f"{len(test_split)} examples; predictions and references are misaligned."
    )
# Read the reference column once instead of indexing the dataset per row.
references_per_row = test_split['simplifications']

add_scores = []
keep_scores = []
del_scores = []

for source, prediction, references in tqdm(
    zip(df_gen['text'], df_gen['prediction'], references_per_row),
    total=len(df_gen),
):
    # A one-sentence "corpus": every reference simplification becomes its own
    # single-sentence list, matching the single-element orig_sents/sys_sents.
    add_score, keep_score, del_score = get_corpus_sari_operation_scores(
        orig_sents=[source],
        sys_sents=[prediction],
        refs_sents=[[ref] for ref in references],
    )

    add_scores.append(add_score)
    keep_scores.append(keep_score)
    del_scores.append(del_score)

  0%|          | 0/359 [00:00<?, ?it/s]

In [12]:
# Assemble one row per prediction: the source text, the model output, and the
# three operation scores computed above (lists aligned with df_gen by position).
op_columns = {
    'text': df_gen['text'],
    'prediction': df_gen['prediction'],
}
op_columns.update(
    add_score=add_scores,
    keep_score=keep_scores,
    del_score=del_scores,
)
df_results_op = pd.DataFrame(op_columns)

# Bare last expression so the notebook renders a preview of the table.
df_results_op.head(n=5)

Unnamed: 0,text,prediction,add_score,keep_score,del_score
0,One side of the armed conflicts is composed ma...,One side of the armed conflicts is composed ma...,0.0,72.681475,0.0
1,"Jeddah is the principal gateway to Mecca, Isla...","Jeddah is the main gateway to Mecca, Islam's h...",7.865117,69.458989,29.622357
2,The Great Dark Spot is thought to represent a ...,The Great Dark Spot is thought to represent a ...,2.592593,73.38613,0.0
3,"His next work, Saturday, follows an especially...","His next work, Saturday, follows an especially...",6.785244,63.505912,0.0
4,"The tarantula, the trickster character, spun a...","The tarantula, the trickster character, spun a...",0.0,60.897024,0.0


In [13]:
# Write the operation-score table to disk without the DataFrame index column.
op_scores_csv = "../data/operation_scores/op_scores_bart-large-swipe-adamw-paged8bit_asset-test.csv"
df_results_op.to_csv(op_scores_csv, index=False)

# Save SARI scores

In [37]:
# Assemble one row per prediction: the source text, the model output, and the
# SARI score from each implementation (lists aligned with df_gen by position).
sari_columns = {
    'text': df_gen['text'],
    'prediction': df_gen['prediction'],
}
sari_columns.update(
    sari_score=sari_scores,
    sari_score_easse=sari_scores_easse,
)
df_results = pd.DataFrame(sari_columns)

# Bare last expression so the notebook renders a preview of the table.
df_results.head(n=5)

Unnamed: 0,text,prediction,sari_score,sari_score_easse
0,One side of the armed conflicts is composed ma...,One side of the armed conflicts is composed ma...,57.401034,24.227158
1,"Jeddah is the principal gateway to Mecca, Isla...","Jeddah is the main gateway to Mecca, Islam's h...",55.791904,35.648821
2,The Great Dark Spot is thought to represent a ...,The Great Dark Spot is thought to represent a ...,58.631393,25.326241
3,"His next work, Saturday, follows an especially...","His next work, Saturday, follows an especially...",56.834567,23.430385
4,"The tarantula, the trickster character, spun a...","The tarantula, the trickster character, spun a...",53.979754,20.299008


In [38]:
# Write the SARI-score table to disk without the DataFrame index column.
sari_scores_csv = "../data/sari_scores/sari_scores_bart-large-swipe-adamw-paged8bit_asset-test.csv"
df_results.to_csv(sari_scores_csv, index=False)