In [None]:
!pip install torch

!pip install transfomers

!pip install nltk

!pip install spacy

!pip install pandas



!pip install peft

!pip install textstat

!pip install sacrebleu

!pip install evaluate

!pip install sacremoses

!pip install bert_score



In [None]:
import os
import json

import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed

import pandas as pd

from datasets import Dataset

from sklearn.model_selection import train_test_split

import textstat

import numpy as np

from torch.utils.data import DataLoader

import evaluate

In [None]:
# Path to dataset in 'Final'.
# Read below with pd.read_csv(sep="¶"); it must provide the "normal" and
# "simplified" columns.
PATH_DATASET = ""
# Path to dictionary: a pickle file loaded into `entire_vocab` further down.
PATH_DICTIONARY = ""
# Insert List of sentences to apply to post-processing
# NOTE(review): not referenced in the visible cells; presumably consumed later - confirm.
list_sentence_post_processing = []


In [None]:
# Fix the transformers random seed so that runs are repeatable.
set_seed(42)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

In [None]:
# Load the parallel corpus, keep only the two text columns and drop
# incomplete pairs.
dataset = (
    pd.read_csv(PATH_DATASET, sep="¶", engine='python')[["normal", "simplified"]]
    .dropna()
)

# Shuffle deterministically and renumber the rows from zero.
dataset_shuffled = (
    dataset
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [None]:
# Tokenizer of the Italian BART checkpoint; used by filter_by_token_length
# and preprocess_function below.
# NOTE(review): the name `tokenizer` is rebound to the Phi-3 tokenizer in a
# later cell, so cell execution order matters.
tokenizer = AutoTokenizer.from_pretrained("morenolq/bart-it")


In [None]:
# Maximum number of tokens allowed per text: used both to filter the corpus
# and as the padding/truncation length during preprocessing.
MAX_LENGTH = 1024

In [None]:
def filter_by_token_length(row):
    """Tell whether both texts of a row fit within MAX_LENGTH tokens.

    Args:
        row: mapping (e.g. a DataFrame row) with "normal" and "simplified" texts.

    Returns:
        True when neither tokenised text is longer than MAX_LENGTH, else False.
    """
    for column in ("normal", "simplified"):
        # Tokenise without truncation so the real length is measured.
        encoded = tokenizer(row[column], truncation=False, return_tensors="pt")
        if len(encoded.input_ids[0]) > MAX_LENGTH:
            return False
    return True



# Evaluate the length check on every row, then keep only the rows that pass.
token_length_ok = dataset_shuffled.apply(filter_by_token_length, axis=1)
df_filtered_by_token_length = dataset_shuffled[token_length_ok]

In [None]:
# Renumber the surviving rows from zero (the old index is discarded).
df_filtered_by_token_length.reset_index(drop=True, inplace = True)

In [None]:
# Hold out 20% of the filtered pairs as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_filtered_by_token_length["normal"],
    df_filtered_by_token_length["simplified"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Carve a further 20% out of the training portion for evaluation (fixed seed).
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Re-assemble each split into a two-column frame (source text next to its
# target text) with a fresh zero-based index.
train_dataset = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)

eval_dataset = pd.concat([X_eval, y_eval], axis=1).reset_index(drop=True)

test_dataset = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [None]:
# Wrap the train and eval frames as Hugging Face datasets so they can be
# tokenised with `.map`.
hf_dataset_train = Dataset.from_pandas(train_dataset)

hf_dataset_eval = Dataset.from_pandas(eval_dataset)

In [None]:
def preprocess_function(examples):
    """Tokenise a batch of (normal, simplified) pairs for seq2seq training.

    Changes with respect to the previous version:
      * targets are tokenised through the `text_target` argument instead of
        the deprecated `as_target_tokenizer()` context manager;
      * the encodings are no longer converted to tensors and moved to the
        GPU. `Dataset.map` stores plain lists, so the transfer only consumed
        device memory during preprocessing.

    Args:
        examples: batch mapping with the "normal" (source) and "simplified"
            (target) lists of strings.

    Returns:
        The source encoding, padded/truncated to MAX_LENGTH, with an extra
        "labels" entry holding the target token ids.
    """
    model_inputs = tokenizer(
        examples['normal'],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        text_target=examples['simplified'],
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )

    # NOTE(review): padding positions in the labels keep the pad token id, so
    # they take part in the loss unless something downstream masks them - confirm.
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Tokenise both splits batch-wise with preprocess_function.
tokenized_datasets_train = hf_dataset_train.map(preprocess_function, batched=True)

tokenized_datasets_eval = hf_dataset_eval.map(preprocess_function, batched=True)

In [None]:
# Load the evaluation metrics through the `evaluate` library.
bleu = evaluate.load("bleu")

sari = evaluate.load("sari")

bertscore = evaluate.load("bertscore")

# POST-PROCESSING PART

In [None]:
import nltk
from transformers import AutoTokenizer, BertTokenizerFast, BertForTokenClassification, pipeline, set_seed
import spacy
import string
import pickle
import re

In [None]:
# Download the 'omw-1.4' NLTK resource.
nltk.download('omw-1.4')

In [None]:
# Download the large Italian spaCy pipeline loaded below as `nlp`.
!spacy download it_core_news_lg 

In [None]:
# Italian NER model; its entities are excluded from synonym replacement
# (see remove_entity).
# The device used to be hard-coded to "cuda:0", which crashes on a CPU-only
# host; fall back to the CPU when no GPU is visible.
NER_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer_ner = BertTokenizerFast.from_pretrained("osiria/bert-italian-uncased-ner")
model_ner = BertForTokenClassification.from_pretrained("osiria/bert-italian-uncased-ner").to(NER_DEVICE)
ner_pipe = pipeline("ner", model = model_ner, tokenizer = tokenizer_ner, aggregation_strategy="first", device = NER_DEVICE)

In [None]:
# Italian spaCy pipeline used for tokenisation, POS tags and morphology.
nlp = spacy.load('it_core_news_lg')

In [None]:
# Character whitelist: lowercase ASCII letters, accented vowels and the space.
acceptable_chars = set(
    string.ascii_lowercase + "àèìòùáéíóú" + " "
)
# spaCy coarse POS tags of the words that replace_syn considers for
# synonym replacement.
check_word_in = ("ADV", "VERB", "NOUN", "ADJ")

In [None]:
def word_in_letter_set(word, acceptable_chars):
    """Return True when every character of `word` belongs to `acceptable_chars`.

    Args:
        word: string to validate.
        acceptable_chars: iterable of allowed characters.

    Returns:
        True if no character falls outside the allowed set (an empty word
        therefore passes), False otherwise.
    """
    return set(word).issubset(acceptable_chars)

In [None]:
def find_subarray(arr1, arr2):
    """Locate `arr2` as a contiguous run inside `arr1`.

    The elements of `arr1` are converted to lowercase strings before the
    comparison; `arr2` is compared as given.

    Returns:
        The index of the first match, or -1 when there is none.
    """
    haystack = [str(item).lower() for item in arr1]
    width = len(arr2)
    last_start = len(haystack) - width

    return next(
        (start for start in range(last_start + 1) if haystack[start:start + width] == arr2),
        -1,
    )

In [None]:
# Remove entity
def remove_entity(arr_nlp, ner_result):
    """Drop the tokens that belong to the named entities found by the NER model.

    Fix: when an entity could not be located, `find_subarray` returns -1 and
    the old index test `range(-1, -1 + len)` silently removed the leading
    tokens of the sentence. Entities that are not found are now ignored.

    Args:
        arr_nlp: sequence of tokens (e.g. a spaCy Doc).
        ner_result: iterable of NER results; each item's "word" entry holds
            the entity text, possibly made of several space-separated words.

    Returns:
        `arr_nlp` itself when `ner_result` is empty, otherwise a list with
        the remaining tokens.
    """
    final_arr = arr_nlp
    for result in ner_result:
        entity_words = result["word"].split()
        start = find_subarray(final_arr, entity_words)
        # A negative start means "not found": nothing must be removed.
        doomed = range(start, start + len(entity_words)) if start >= 0 else range(0)
        final_arr = [token for position, token in enumerate(final_arr) if position not in doomed]
    return final_arr

In [None]:
# Vocabulary consulted by find_word_in_vocabs; words not found in it become
# candidates for synonym replacement.
entire_vocab = []

# NOTE(review): pickle.load can execute arbitrary code - only point
# PATH_DICTIONARY at a trusted file.
with open(PATH_DICTIONARY, 'rb') as file1:
    entire_vocab = pickle.load(file1)

In [None]:
def find_word_in_vocabs(word_from_spacy):
    """Return True when the text of `word_from_spacy` is in `entire_vocab`.

    The argument is converted with `str()`, so both spaCy tokens and plain
    strings are accepted.
    """
    return str(word_from_spacy) in entire_vocab

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Instruction-tuned LLM used to propose synonyms.
# The device used to be hard-coded to "cuda:1", which requires two GPUs:
# prefer the second GPU when present, else the first one, else the CPU.
if torch.cuda.device_count() > 1:
    LLM_DEVICE = "cuda:1"
elif torch.cuda.is_available():
    LLM_DEVICE = "cuda:0"
else:
    LLM_DEVICE = "cpu"

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct").to(LLM_DEVICE)
# NOTE(review): this rebinds `tokenizer`, which earlier cells bound to the
# BART tokenizer. Any BART-side code executed after this cell will pick up
# the Phi-3 tokenizer instead - confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

In [None]:
def output_prompt(word, type_word, sen):
    """Ask the LLM for child-friendly Italian synonyms of `word`.

    Changes with respect to the previous version:
      * only the newly generated tokens are decoded. The echoed prompt,
        which contains the reference sentence, used to be parsed too, so a
        sentence holding something like "1990. Poi" produced bogus synonyms;
      * every copy of the original word is filtered out (only the first one
        was removed before de-duplication);
      * duplicates are removed while keeping the order of the answer
        (`list(set(...))` returned an arbitrary order);
      * the prompt is moved to the device the model lives on instead of a
        hard-coded "cuda:1".

    Args:
        word: the word to replace.
        type_word: its spaCy coarse POS tag ("ADV", "VERB", "ADJ" or "NOUN").
        sen: the sentence in which the word appears.

    Returns:
        A list of lowercase synonym candidates parsed from the numbered list
        in the answer; an empty list for an unsupported `type_word`.
    """
    type_labels = {
        "ADV": "dell'avverbio",
        "VERB": "del verbo",
        "ADJ": "dell'aggettivo",
        "NOUN": "del sostantivo",
    }
    type_word_insert = type_labels.get(type_word)
    if type_word_insert is None:
        print("nulla")
        return []

    final = f"Sei un esperto di sinonimi italiani adatti per bambini dalla terza alla quinta elementare. data questa frase di riferimento '{sen}', dimmi 3 sinonimi {type_word_insert} '{word}'."

    messages = [{"role": "user", "content": final}]
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    outputs = model.generate(prompt, max_new_tokens=1200)

    # Skip the prompt tokens: the output starts with a copy of the input.
    text = tokenizer.decode(outputs[0][prompt.shape[-1]:])

    # Release the GPU memory held by this request before returning.
    del prompt
    del outputs
    torch.cuda.empty_cache()

    # Items look like "1. synonym"; stop at a newline, "(", "<" or "-".
    raw_items = re.findall(r"\d\.\s([^\n(<-]+)", text)
    cleaned = (item.replace('*', '').lower().strip() for item in raw_items)

    original = word.lower()
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(c for c in cleaned if c and c != original))

In [None]:
# Stems of the indefinite determiners/pronouns whose ending follows the noun.
_INDEFINITE_STEMS = ("alcun", "nessun", "ciascun", "cert", "altr",
                     "ognun", "qualcun", "poc", "alquant", "vari", "divers",
                     "parecch", "tant", "tropp", "tutt", "un")
# Stems of the possessives whose ending follows the noun.
_POSSESSIVE_STEMS = ("mi", "tu", "su", "nostr", "vostr", "ess")
# Stems of the demonstratives that inflect regularly.
_DEMONSTRATIVE_STEMS = ("quest", "codest")


def _takes_lo(word):
    """True when a masculine word wants the lo/uno/gli family (z, x, y, ps, gn, s + consonant)."""
    return (
        word[0] in ("z", "x", "y")
        or word[0:2] in ("ps", "gn")
        # len() guard: a one-letter word used to raise IndexError here.
        or (word[0] == "s" and len(word) > 1 and word[1] not in "aeiou")
    )


def _has_semivowel_onset(word):
    """True when the word starts with i/u followed by another vowel."""
    return word[0] in "iu" and len(word) > 1 and word[1] in "aeiou"


def _masc_singular(word, elided, before_lo, plain):
    """Pick the masculine singular form: elided before a vowel, lo-type, or plain."""
    if word[0] in "aeiou":
        return elided
    if _takes_lo(word):
        return before_lo
    return plain


def _fem_singular(word, elided, plain):
    """Pick the feminine singular form: elided before a vowel, otherwise plain."""
    return elided if word[0] in "aeiou" else plain


def _masc_plural(word, before_gli, plain):
    """Pick the masculine plural form: gli-type before a vowel or lo-type onset, else plain."""
    if word[0] in "aeiou" or _takes_lo(word):
        return before_gli
    return plain


def _inflect(det_text, gender, masc_ending, fem_ending):
    """Swap the last character of the determiner for the gendered ending (None if gender unknown)."""
    if gender == "M":
        return det_text[:-1] + masc_ending
    if gender == "F":
        return det_text[:-1] + fem_ending
    return None


def _choose_determiner(tag, det_text, word, gender):
    """Return the determiner form that agrees with `word`, or None to leave it alone.

    Args:
        tag: fine-grained tag of the determiner token.
        det_text: determiner text as written in the sentence.
        word: lowercase text of the word that follows it.
        gender: "M", "F" or "Z" (unknown).
    """
    det = det_text.lower()
    stem, last = det[:-1], det[-1]

    # Indefinite articles.
    if tag == "RI":
        if gender == "M" or (gender == "Z" and det in ("uno", "un")):
            return "uno" if (_takes_lo(word) or _has_semivowel_onset(word)) else "un"
        if gender == "F" or (gender == "Z" and det in ("una", "un'")):
            return "una" if (word[0] not in "aeiou" or _has_semivowel_onset(word)) else "un'"
        return None

    # Definite articles.
    if tag == "RD":
        if det in ("il", "lo", "l'", "la"):
            if gender == "M" or (gender == "Z" and det in ("il", "lo", "l'")):
                return _masc_singular(word, "l'", "lo", "il")
            if gender == "F" or (gender == "Z" and det == "la"):
                return _fem_singular(word, "l'", "la")
            return None
        if gender == "M" or (gender == "Z" and det in ("i", "gli")):
            return _masc_plural(word, "gli", "i")
        if gender == "F" or (gender == "Z" and det == "le"):
            return "le"
        return None

    # Articulated prepositions (a, da, de, ne, su + article).
    if tag == "E_RD":
        if det[0:2] in ("da", "de", "ne", "su"):
            base = det[0:2]
        elif det[0] == "a":
            base = "a"
        else:
            return None
        # Fix: the old test compared the last character with "ll'" and the
        # last three characters with "l", so neither could match and forms
        # such as "del", "al", "nel" were never adjusted.
        sing_m = det[-3:] in ("llo", "ll'") or last == "l"
        sing_f = det[-3:] in ("lla", "ll'")
        plur_m = det[-3:] == "gli" or last == "i"
        plur_f = det[-3:] == "lle"
        if sing_m or sing_f:
            if gender == "M" or (gender == "Z" and sing_m):
                return _masc_singular(word, base + "ll'", base + "llo", base + "l")
            if gender == "F" or (gender == "Z" and sing_f):
                return _fem_singular(word, base + "ll'", base + "lla")
            return None
        if plur_m or plur_f:
            if gender == "M" or (gender == "Z" and plur_m):
                return _masc_plural(word, base + "gli", base + "i")
            # Fix: the unknown-gender fallback compared the whole preposition
            # with "le", which is never true for an articulated preposition.
            if gender == "F" or (gender == "Z" and plur_f):
                return base + "lle"
        return None

    # Indefinite determiners / pronouns.
    if tag in ("DI", "PI"):
        if stem not in _INDEFINITE_STEMS:
            return None
        if last not in ("i", "e"):
            return _inflect(det_text, gender, "o", "a")
        if stem == "poc":
            return _inflect(det_text, gender, "hi", "he")
        return _inflect(det_text, gender, "i", "e")

    # Possessives.
    if tag in ("AP", "PP"):
        if stem not in _POSSESSIVE_STEMS:
            return None
        if last in ("o", "a"):
            return _inflect(det_text, gender, "o", "a")
        if stem == "mi":
            masc_plural_ending = "ei"
        elif stem in ("tu", "su"):
            masc_plural_ending = "oi"
        else:
            masc_plural_ending = "i"
        return _inflect(det_text, gender, masc_plural_ending, "e")

    # Demonstratives. The order of the tests matters: "quest'" must be
    # handled by the "que..." rule before the regular plural rule.
    if tag in ("DD", "PD"):
        if stem in _DEMONSTRATIVE_STEMS and last in ("o", "a"):
            return _inflect(det_text, gender, "o", "a")
        if det[0:3] == "que" and last in ("o", "a", "l", "'"):
            if gender == "M":
                return _masc_singular(word, "quell'", "quello", "quel")
            if gender == "F":
                return _fem_singular(word, "quell'", "quella")
            return None
        if stem in _DEMONSTRATIVE_STEMS and last not in ("o", "a"):
            return _inflect(det_text, gender, "i", "e")
        if det[0:3] == "que" and last in ("i", "e"):
            if gender == "M":
                return _masc_plural(word, "quegli", "quei")
            if gender == "F":
                return "quelle"
        return None

    return None


def adjusting(word_to_replace, eval_sentence):
    """Make the determiner in front of `word_to_replace` agree with it.

    The sentence is parsed with the global spaCy pipeline `nlp`. For every
    token (except the first one) whose text equals `word_to_replace`, ignoring
    case, the token immediately before it is rewritten when it is an article,
    an articulated preposition, an indefinite, a possessive or a
    demonstrative, so that it matches the gender and the initial sound of
    the word.

    Fixes with respect to the previous version:
      * the determiner is replaced at its exact character offset
        (`Token.idx`). The old code used `str.replace` on the whole tail of
        the sentence, so replacing "i" rewrote every later "i"; it located
        the word with `str.find`, which may hit an earlier occurrence; and
        it did not account for length changes made by earlier replacements;
      * the slice start can no longer become negative when the determiner is
        at the very beginning of the sentence;
      * the separator is normalised: no space after an elided form
        ("l'amico"), exactly one space otherwise ("il compagno" instead of
        "ilcompagno" or a doubled space);
      * one-letter words no longer raise IndexError;
      * two suffix tests for articulated prepositions that could never be
        true were corrected (see `_choose_determiner`).

    Args:
        word_to_replace: text of the word whose determiner must be adjusted.
        eval_sentence: sentence that already contains the word.

    Returns:
        A tuple `(sentence, count_adj)`: the adjusted sentence and the number
        of matching NOUN tokens for which spaCy reported no gender.
    """
    count_adj = 0
    nlp_eval_sent = nlp(eval_sentence)
    target = word_to_replace.lower()
    # Offsets come from the parsed (original) text; `shift` tracks how much
    # the edited sentence has grown or shrunk so far.
    shift = 0

    for i in range(1, len(nlp_eval_sent)):
        token = nlp_eval_sent[i]
        if token.text.lower() != target:
            continue

        gender_val = token.morph.get("Gender")
        if gender_val:
            gender = gender_val[0][0]  # "Masc" -> "M", "Fem" -> "F"
        else:
            gender = "Z"
            if token.pos_ == "NOUN":
                count_adj = count_adj + 1

        det = nlp_eval_sent[i - 1]
        replacement = _choose_determiner(det.tag_, det.text, target, gender)
        if replacement is None:
            continue

        det_start = det.idx + shift
        word_start = token.idx + shift
        separator = "" if replacement.endswith("'") else " "
        eval_sentence = (
            eval_sentence[:det_start] + replacement + separator + eval_sentence[word_start:]
        )
        shift += len(replacement) + len(separator) - (token.idx - det.idx)

    return eval_sentence, count_adj

In [None]:
def _replace_whole_word(sentence, word, replacement):
    """Replace standalone, literal occurrences of ``word`` in ``sentence``.

    ``word`` is matched literally (regex metacharacters are escaped) and only
    where it is not embedded in a longer word. ``replacement`` is inserted
    verbatim, without backslash/group-reference expansion.
    """
    if not word:
        return sentence
    pattern = re.escape(word)
    # Anchor only the edges that are word characters, so that tokens starting
    # or ending with punctuation (e.g. an elided form ending in an apostrophe)
    # can still be matched right next to the following word.
    if re.match(r"\w", word):
        pattern = r"(?<!\w)" + pattern
    if re.search(r"\w\Z", word):
        pattern = pattern + r"(?!\w)"
    return re.sub(pattern, lambda _match: replacement, sentence)


def replace_syn(frasi_test, perc_vocab=0.3):
    """Replace out-of-vocabulary words in each sentence with a scored synonym.

    For every sentence the tokens returned by ``remove_entity`` are scanned.
    A token is a candidate when its ``pos_`` is in ``check_word_in``, its text
    is not ``"non"`` and ``find_word_in_vocabs`` does not find it. For such a
    token ``output_prompt`` supplies synonyms; each synonym is substituted
    into the current sentence and scored as

        (1 - perc_vocab) * BERTScore-F1(candidate, current sentence)
        + (perc_vocab if the synonym is found in the vocabularies else 0)

    The best-scoring candidate sentence is passed to ``adjusting`` and its
    result becomes the current sentence for the remaining tokens.

    Args:
        frasi_test: iterable of sentences (strings) to post-process.
        perc_vocab: weight of the "synonym is in the vocabulary" bonus; the
            BERTScore weight is ``1 - perc_vocab``. Defaults to 0.3.

    Returns:
        A list with one post-processed sentence per input sentence.
    """
    perc_ai = 1 - perc_vocab
    final_frasi = []

    for sentence in frasi_test:
        print(sentence)
        # Working copy of the sentence; updated after every accepted replacement.
        eval_sentence = sentence

        # POS-tagged version
        arr_nlp = nlp(sentence)
        # NER version
        ner_result = ner_pipe(sentence)
        # Tokens left after removing named entities
        final_arr = remove_entity(arr_nlp, ner_result)

        for word_elaborated in final_arr:
            if word_elaborated.pos_ not in check_word_in or word_elaborated.text == "non":
                continue
            # NOTE(review): find_word_in_vocabs receives a token here and a
            # plain string below -- confirm it supports both.
            if find_word_in_vocabs(word_elaborated):
                continue

            # Word not found in the vocabularies: start synonym replacement.
            synonim_words = output_prompt(word_elaborated.text, word_elaborated.pos_, eval_sentence)
            print(synonim_words)
            if len(synonim_words) == 0:
                continue

            candidate_sentences = []
            vocab_bonus = []
            for synonim in synonim_words:
                cleaned_synonim = synonim.replace("_", " ")
                # Basic score: reward synonyms that are in the vocabularies.
                vocab_bonus.append(perc_vocab if find_word_in_vocabs(cleaned_synonim) else 0.0)
                # Literal, whole-word substitution: the token text must not be
                # interpreted as a regex and must not match inside longer words.
                candidate_sentences.append(
                    _replace_whole_word(eval_sentence, word_elaborated.text, cleaned_synonim.lower())
                )

            # BERTScore analysis of every candidate against the current sentence.
            scores = []
            for candidate, bonus in zip(candidate_sentences, vocab_bonus):
                bert_score_results = bertscore.compute(
                    predictions=[candidate],
                    references=[eval_sentence],
                    model_type="xlm-roberta-large",
                )
                scores.append(perc_ai * bert_score_results["f1"][0] + bonus)

            # Highest combined score wins (first one on ties).
            candidate_index = scores.index(max(scores))

            # NOTE(review): the raw synonym (underscores, original case) is
            # passed on, while the sentence contains its cleaned, lower-cased
            # form -- confirm this is what adjusting expects.
            eval_sentence, _ = adjusting(
                synonim_words[candidate_index], candidate_sentences[candidate_index]
            )

        final_frasi.append(eval_sentence)

    return final_frasi

In [None]:
# Optional alternative input: load the sentences from a pickle file instead of
# using `list_sentence_post_processing`. Only unpickle files from a trusted
# source, because `pickle.load` can execute arbitrary code.
#with open(PATH_TEXT_TO_REPLACE, "rb") as pb:
#    list_sent = pickle.load(pb)

# Run the synonym-replacement post-processing on the configured sentences.
modified_sent = replace_syn(list_sentence_post_processing)

In [None]:
# Display the post-processed sentences as the cell output.
modified_sent