In [None]:
!pip install torch
!pip install transformers
!pip install pandas
!pip install Dataset
!pip install peft
!pip install scikit-learn
!pip install evaluate
!pip install textstat
!pip install numpy
!pip install sacrebleu sacremoses
!pip install bert_score

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
import pandas as pd
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
import evaluate
import textstat
import numpy as np
from torch.utils.data import DataLoader


In [None]:
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
#os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Seed PyTorch's random number generator with a fixed value for repeatable runs
torch.manual_seed(42)

In [None]:
# Apply the same fixed seed through the `set_seed` helper imported from transformers
set_seed(42)

In [None]:
# --- Evaluation configuration ---------------------------------------------

# Checkpoint to evaluate, and the label appended to the evaluation run name
PATH_MODEL = "/kaggle/input/lora_60/transformers/default/1/run_lora_60_1024/checkpoint-9600"
NAME_MODEL = "LORA_TRY_60_4"

# CSV file holding the text pairs to evaluate on
PATH_DATASET = "/kaggle/input/test-paper-aied/candidate_full_80.csv"

# Token budget: used to filter rows, to pad/truncate inputs, and as the
# maximum generation length handed to the evaluation routine
MAX_LENGTH = 1024

# Number of beams handed to the evaluation routine for generation
N_BEAM = 4

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

In [None]:
# Read the evaluation CSV; fields are separated by the pilcrow character "¶"
# and the file is parsed with pandas' Python engine
dataset = pd.read_csv(PATH_DATASET, sep="¶", engine='python')

In [None]:
# Keep only the two text columns used downstream: "normal" (model input)
# and "simplified" (reference output)
dataset = dataset[["normal", "simplified"]]

In [None]:
# Discard every row in which either text column is missing
dataset = dataset.dropna()

In [None]:
# Shuffle all rows (frac=1) with a fixed random_state so the order is reproducible
dataset_shuffled = dataset.sample(frac=1, random_state=42)

In [None]:
# Renumber the shuffled rows from 0 and discard the pre-shuffle index
dataset_shuffled = dataset_shuffled.reset_index(drop=True)

In [None]:
# Display the shuffled frame for a visual check
dataset_shuffled

In [None]:
# Load the tokenizer published under the "morenolq/bart-it" identifier.
# NOTE(review): the tokenizer is not loaded from PATH_MODEL — confirm it matches
# the one the evaluated checkpoint was trained with.
tokenizer = AutoTokenizer.from_pretrained("morenolq/bart-it")

In [None]:
def filter_by_token_length(row):
    """Tell whether both texts of a row fit within MAX_LENGTH tokens.

    Args:
        row: record exposing the 'normal' and 'simplified' texts by key.

    Returns:
        True when neither text, tokenized without truncation, is longer
        than MAX_LENGTH tokens; False otherwise.
    """
    # Tokenize both columns without truncation so the real lengths are measured
    token_counts = [
        len(tokenizer(row[column], truncation=False, return_tensors="pt").input_ids[0])
        for column in ('normal', 'simplified')
    ]
    # Both sequences fit exactly when the longer one fits
    return max(token_counts) <= MAX_LENGTH

# Run the length check on every row (axis=1) and keep only the rows for which it returns True
df_filtered_by_token_length = dataset_shuffled[dataset_shuffled.apply(filter_by_token_length, axis=1)]

In [None]:
# Renumber the surviving rows from 0 so the index has no gaps after filtering
df_filtered_by_token_length = df_filtered_by_token_length.reset_index(drop=True)

In [None]:
# Display the length-filtered frame for a visual check
df_filtered_by_token_length

In [None]:
# Hold out 20% of the filtered pairs as the test split; the fixed random_state
# keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(df_filtered_by_token_length["normal"],
                 df_filtered_by_token_length["simplified"],
                 test_size=0.2,
                 random_state = 42)

In [None]:
# Split the remaining 80% again: 20% of it becomes the validation split,
# i.e. roughly 64% train / 16% validation / 20% test of the filtered pairs
X_train, X_eval, y_train, y_eval = train_test_split(X_train,
                 y_train,
                 test_size=0.2,
                 random_state = 42)

In [None]:
# Re-assemble each split into a two-column frame (input text next to its
# reference) and renumber its rows from 0
train_dataset, eval_dataset, test_dataset = (
    pd.concat([source_texts, target_texts], axis=1).reset_index(drop=True)
    for source_texts, target_texts in (
        (X_train, y_train),
        (X_eval, y_eval),
        (X_test, y_test),
    )
)

In [None]:
# Wrap the train and validation frames as Hugging Face datasets.
# NOTE(review): hf_dataset_eval is not referenced again in the cells visible here.
hf_dataset_train, hf_dataset_eval = (
    Dataset.from_pandas(frame) for frame in (train_dataset, eval_dataset)
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of (input text, reference text) pairs for seq2seq evaluation.

    Args:
        examples: batch mapping with a 'normal' entry (texts fed to the model)
            and a 'simplified' entry (reference texts), as supplied by
            `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the inputs, padded/truncated to MAX_LENGTH,
        with an added 'labels' entry holding the reference token ids in which
        every padding position is replaced by -100.
    """
    inputs = examples['normal']
    targets = examples['simplified']

    # Tensors are deliberately left on the CPU: the mapped results are stored in
    # the dataset, so pushing whole preprocessing batches to the accelerator
    # would only consume device memory.
    model_inputs = tokenizer(inputs, max_length=MAX_LENGTH, padding="max_length", truncation=True, return_tensors="pt")

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(text_target=targets, max_length=MAX_LENGTH, padding="max_length", truncation=True, return_tensors="pt")

    # Mask padding in the labels with -100 so the loss ignores those positions;
    # compute_metrics maps -100 back to the pad token before decoding.
    label_ids = labels['input_ids']
    label_ids = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    model_inputs['labels'] = label_ids
    return model_inputs

In [None]:
# Tokenize the training split in batches. In the cells shown here it is only
# handed to the trainer as `train_dataset`; no training call is made.
tokenized_datasets_train = hf_dataset_train.map(preprocess_function, batched=True)

In [None]:
# Wrap the held-out test frame as a Hugging Face dataset and tokenize it in batches;
# this is the split the evaluation runs on
hf_dataset_test = Dataset.from_pandas(test_dataset)
tokenized_datasets_test= hf_dataset_test.map(preprocess_function, batched=True)

In [None]:
# Load the three `evaluate` metrics that compute_metrics relies on
bleu, sari, bertscore = (
    evaluate.load(metric_name) for metric_name in ("bleu", "sari", "bertscore")
)

In [None]:
def compute_metrics(pred):
    """Score generated texts against their references and source inputs.

    Args:
        pred: evaluation output exposing `predictions`, `label_ids` and
            `inputs` as token-id arrays.

    Returns:
        dict with the keys 'bleu', 'flesch_reading' (mean Flesch reading ease
        of the predictions), 'sari' and 'bertscore' (mean BERTScore F1).
    """
    textstat.set_lang("it")

    def _to_text(token_ids):
        # -100 entries are swapped for the pad token id so they can be decoded
        # and then dropped together with the other special tokens
        restored = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(restored, skip_special_tokens=True)

    # Decode the predictions, the references and the model inputs
    pred_str = _to_text(pred.predictions)
    label_str = _to_text(pred.label_ids)
    input_str = _to_text(pred.inputs)

    bleu_result = bleu.compute(predictions=pred_str, references=label_str)

    # Average readability over all generated texts
    readability_total = sum(textstat.flesch_reading_ease(sent) for sent in pred_str)
    mean_readability = readability_total / len(pred_str)

    # SARI takes a list of references per prediction; each gets exactly one here
    sari_result = sari.compute(
        sources=input_str,
        predictions=pred_str,
        references=[[sent] for sent in label_str],
    )

    # The Gulpease index (textstat.gulpease_index) is left disabled.

    bertscore_result = bertscore.compute(
        predictions=pred_str,
        references=label_str,
        model_type="xlm-roberta-base",
    )
    f1_values = bertscore_result['f1']
    mean_f1 = sum(f1_values) / len(f1_values)

    return {
        'bleu': bleu_result['bleu'],
        'flesch_reading': mean_readability,
        'sari': sari_result["sari"],
        'bertscore': mean_f1
    }

In [None]:
def run_eval(tokenizer, tokenized_datasets_train, tokenized_datasets_test, compute_metrics, name_model, model, num_beam, max_len):
    """Evaluate `model` on the tokenized test split with generation enabled.

    Args:
        tokenizer: tokenizer handed to the trainer.
        tokenized_datasets_train: dataset passed as the trainer's train_dataset.
        tokenized_datasets_test: dataset the evaluation runs on.
        compute_metrics: callback handed to the trainer to score the predictions.
        name_model: label appended to "eval_" to form the run name.
        model: model to evaluate.
        num_beam: value for `generation_num_beams`.
        max_len: value for `generation_max_length`.

    Returns:
        Whatever `Seq2SeqTrainer.evaluate` returns for the test split.
    """
    eval_args = Seq2SeqTrainingArguments(
        output_dir="eval_time",
        run_name="eval_" + name_model,
        eval_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=4,
        # Metrics are computed on generated sequences, and compute_metrics
        # also needs the model inputs
        predict_with_generate=True,
        generation_num_beams=num_beam,
        generation_max_length=max_len,
        include_inputs_for_metrics=True,
    )

    evaluator = Seq2SeqTrainer(
        model=model,
        args=eval_args,
        train_dataset=tokenized_datasets_train,
        eval_dataset=tokenized_datasets_test,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    metrics = evaluator.evaluate(tokenized_datasets_test)

    # Drop the trainer objects before asking PyTorch to release cached GPU memory
    del evaluator
    del eval_args
    torch.cuda.empty_cache()

    return metrics

In [None]:
# Load the seq2seq model from the checkpoint at PATH_MODEL and move it to the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(PATH_MODEL).to(device)

In [None]:
# Evaluate the checkpoint on the tokenized test split, generating with N_BEAM beams
# and MAX_LENGTH as the maximum generation length
eval_test=run_eval(tokenizer, tokenized_datasets_train, tokenized_datasets_test, compute_metrics, NAME_MODEL, model, N_BEAM, MAX_LENGTH)

In [None]:
# Show the result returned by the evaluation run
eval_test