In [1]:
import os

from sensitivity_data_generation import LLMClient, PubMedRetriever, QuestionFactChecking, QuestionReasoning, AnswerGroundTruth, AnswerCorrectSimilar, AnswerCorrectDifferent,AnswerIncorrectSimilar, AnswerIncorrectRelated, GenerateCompletePair, SensitivityDatasetGenerator

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LLM CLIENT

In [2]:
# One client per provider/model pair; both are exercised with the same prompt
# further down so their replies can be compared.
llm_client_meta = LLMClient(
    developer="meta",
    model_name="llama3-70b-8192",
)
llm_client_openai = LLMClient(
    developer="openai",
    model_name="gpt-3.5-turbo-instruct",
)

In [3]:
# Trivial smoke-test prompt sent to both LLM clients.
prompt = "Tell me which is the capital of France"

In [4]:
# Send the same prompt through each client; answer_1 comes from the "meta"
# client and answer_2 from the "openai" client.
answer_1 = llm_client_meta.call_llm(prompt)
answer_2 = llm_client_openai.call_llm(prompt)

In [5]:
# Reply returned by llm_client_meta (llama3-70b-8192).
answer_1

"That's an easy one! The capital of France is Paris!"

In [6]:
# Reply returned by llm_client_openai (gpt-3.5-turbo-instruct).
answer_2

'The capital of France is Paris.'

PUBMED RETRIEVER

In [7]:
# The PubMed API key is read from the PUBMED_API_KEY environment variable so
# the secret is never stored in the notebook or its saved outputs. If the
# variable is not set, os.environ[...] raises KeyError in this cell.
# NOTE(review): a literal key was previously committed in this cell; treat it
# as leaked and revoke/rotate it.
pubmed_apikey = os.environ["PUBMED_API_KEY"]
retriever = PubMedRetriever(pubmed_apikey)
# Search term used for the single retrieval check that follows.
query = "hearts"

In [8]:
# Retrieve abstracts for `query`. The second argument is 1 and the displayed
# result holds a single abstract, so it is presumably the number of abstracts
# requested — TODO confirm against PubMedRetriever.get_abstracts.
abstract_try = retriever.get_abstracts(query,1)

In [9]:
# NOTE(review): the abstract shown for the "hearts" query is about esotropia
# surgery rather than anything cardiac — worth verifying how the retriever
# selects results for a query.
abstract_try

['Preoperative prism adaptation (PPA) simulates postoperative status and possibly can predict postoperative undercorrection before surgery in esotropia. The present study aimed to assess the effect of 4-week PPA in preventing postoperative residual esotropia.']

QUESTION GENERATION

In [10]:
# Fixed sample abstract (on sentence mover's similarity metrics) used as the
# context for every single-item generation example in the cells that follow.
abstract= "For evaluating machine-generated texts, automatic methods hold the promise of avoiding collection of human judgments, which can be expensive and time-consuming. The most common automatic metrics, like BLEU and ROUGE, depend on exact word matching, an inflexible approach for measuring semantic similarity. We introduce methods based on sentence mover’s similarity; our automatic metrics evaluate text in a continuous space using word and sentence embeddings. We find that sentence-based metrics correlate with human judgments significantly better than ROUGE, both on machine-generated summaries (average length of 3.4 sentences) and human-authored essays (length of 7.5). We also show that sentence mover’s similarity can be used as a reward when learning a generation model via reinforcement learning; we present both automatic and human evaluations of summaries learned in this way, finding that our approach outperforms ROUGE."

In [11]:
# Template asking the LLM for one fact-checking question; it carries an
# {abstract} placeholder (presumably filled in by QuestionFactChecking —
# TODO confirm).
prompt_fc_question = "Given the following abstract from a scientific paper, generate a fact-checking questions that can be answered with a specific piece of text from this abstract. The question should be directly answerable with a specific line or statement from the abstract. Abstract: {abstract}. Please generate a fact-checking question and return it as string. PLease output only the question, do not include the answer or anything else."
# Template asking the LLM to label a question as "Fact Checking", "Reasoning"
# or "Unrelated" given the context; it carries {abstract} and {question}
# placeholders. The variable name (sic, "promp") is kept because the
# reasoning-question cell reuses it.
promp_validate_question = "You will be given a context and a question. Your task is to categorize the question according to the types listed below, based solely on the relationship between the question and the provided context. Fact Checking: the answer is a composition based on only one substring of the context, that can be used directly as it is written in the context. Reasoning: answer is a composition based on a more elaborated understanding made from more than one substring of the context.  substrings can not be used exactly as they are written, it requires a deeper understanding of the content. Unrelated: the answer cannot be derived from the context provided. Context: {abstract} Question: {question} Please provide only the category name of the question based on the definitions provided."
# Build the generator from the client and both templates, then produce one
# fact-checking question for the sample abstract.
question_fc = QuestionFactChecking(llm_client_meta, prompt_fc_question, promp_validate_question)
question_fc_1 = question_fc.generate(abstract)

In [12]:
# Generated fact-checking question for the sample abstract.
question_fc_1

'What is the average length of machine-generated summaries in the study?'

In [13]:
# Template asking the LLM for one reasoning question that needs several parts
# of the abstract; it carries an {abstract} placeholder. The indentation inside
# the triple-quoted literal is part of the text sent to the model.
prompt_r_question = """Given the following abstract from a scientific paper, generate a deep reasoning questions that require synthesis of multiple parts of the abstract to form a comprehensive answer. 
                                These question should necessitate an understanding of the overall content, and the answer should include a paraphrase of more than one substring of the abstract. 
                                Abstract: {abstract}
                                Please generate a deep reasoning question and return it as a string. 
                                Please output only the question, do not include the answer or anything else."""
# Reuses the question-validation template defined in the fact-checking cell.
question_r = QuestionReasoning(llm_client_meta, prompt_r_question,promp_validate_question)
question_r_1 = question_r.generate(abstract)

In [14]:
# Generated reasoning question for the sample abstract.
# NOTE(review): the displayed question starts in lower case with "what
# implications does this have" and has no antecedent for "this" — it looks
# like a leading clause was cut off; verify the post-processing of LLM output.
question_r_1

'what implications does this have for the development of machine-generated text evaluation methods that can effectively correlate with human judgments?'

GROUND TRUTH

In [15]:
# Template for the reference ("ground truth") answer: a short, full-sentence
# answer to a question about an abstract. It carries {abstract} and {question}
# placeholders; the indentation inside the literal is part of the prompt text.
prompt_gen_gt = """You will be given an abstract from a scientific paper and a question related to it.
                                            Considering the abstract, generate a short answer for the question.
                                            If the answer is only a name or a number write a full sentence, do not return only one name or number.
                                            Abstract: {abstract}
                                            Question: {question}
                                            Please output only the accurate short answer.
                                            """
# Ground-truth answer generator built from the client and the template above.
gt = AnswerGroundTruth(llm_client_meta, prompt_gen_gt)

In [16]:
# Produce the ground-truth answer to the fact-checking question, using the
# sample abstract as context.
gen_gt = gt.generate(abstract, question_fc_1)

In [17]:
# Ground-truth answer; every answer variant below is derived from it.
gen_gt

'The average length of machine-generated summaries in the study is 3.4 sentences.'

CORRECT SIMILAR

In [18]:
# Template for a "correct similar" answer: a paraphrase that keeps the meaning
# and shares at least 3 content words with the original. It carries a
# {ground_truth} placeholder.
prompt_gen_correct_sim = """You will be given a sentence.
                                                Your task is to write a new sentence that is a paraphrase similar to the original one. 
                                                It should keep the same meaning but using some different words.
                                                Make sure that the original and the new sentence share at least 3 content words.
                                                Sentence: {ground_truth}
                                                Please output only the new sentence. 
                                            """
# Generator built from the client and the template above.
correct_similar = AnswerCorrectSimilar(llm_client_meta, prompt_gen_correct_sim)

In [19]:
# Paraphrase the ground-truth answer while keeping lexical overlap.
corr_sim = correct_similar.generate(gen_gt)

In [20]:
# "Correct similar" variant of the ground-truth answer.
corr_sim

'The study reveals that the typical machine-generated summary consists of approximately 3.4 sentences on average.'

CORRECT DIFFERENT

In [21]:
# Template for a "correct different" answer: same meaning as the original but
# with no content word in common. It includes two worked examples and carries
# a {ground_truth} placeholder.
prompt_correct_different = """You will be given a sentence.
                                                Your task is to write a new sentence that is a paraphrase completely different to the original one. 
                                                It should keep the same meaning but using completely different words.
                                                Make sure that the original and the new sentence do not have any content word in common, not even 1.
                                                
                                                Example 1:
                                                Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                                New sentence: This CNN demonstrated decreased behavior with upside-down items in comparison to those oriented correctly.

                                                Example 2:
                                                Original sentence: This paper tries to demonstrate, first, that the behavioral signatures associated to human face recognition can be explained as a result of the optimization of the task. Also, they show that it is not so “special”, as this behavior can be found in CNNs trained to recognize other objects (like cars). 
                                                New sentence: This article aims to demonstrate that the distinct characteristics linked to recognizing people’s visages may stem from enhancement of the activity. Furthermore, it illustrates that this pattern is not unique, as similar ones occur in neural networks developed to identify different items, such as vehicles.

                                                Your task:
                                                Original sentence: {ground_truth} 
                                                New sentence:
                                                Please output only the new sentence."""

# Generator built from the client and the template above.
correct_different = AnswerCorrectDifferent(llm_client_meta, prompt_correct_different)

In [22]:
# Reword the ground-truth answer with (ideally) no shared content words.
corr_dif = correct_different.generate(gen_gt)

In [23]:
# "Correct different" variant of the ground-truth answer.
corr_dif

'The typical duration of automated abstracts in the research was equivalent to approximately three to four phrases.'

INCORRECT SIMILAR

In [24]:
# Template for an "incorrect similar" answer: a slight edit of the original
# that makes it wrong or nonsensical. It includes two worked examples and
# carries a {ground_truth} placeholder.
prompt_inc_sim = """You will be provided a sentence.
                                    Your objective is to modify this sentence slightly so that it becomes incorrect or nonsensical.

                                    Example 1:
                                    Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                    New sentence: The car-trained network showed an increase in performance for inverted versus upright cars. 
                                    Example 2:
                                    Original sentence: The authores utilize deep convolutional neural networks (CNNs) on their experiments.
                                    New sentence: The authores utilize deep recurrent neural networks (CNNs) on their experiments.         

                                    Sentence: {ground_truth}

                                    Please output only the new sentence.
                                    """
# Template asking the LLM to label a candidate answer as "Incorrect Similar",
# "Incorrect Related", "Incorrect Unrelated" or "Rigth" (sic). It carries
# {question}, {ground_truth} and {answer} placeholders.
# NOTE(review): the last category is spelled "Rigth" in the prompt; confirm
# that whatever parses the returned category name expects the same spelling.
prompt_val_inc = """You will be presented with a question, its correct answer, and a candidate sentence. Your task is to categorize the candidate sentence based on how it relates to the correct answer and the content of the question. Choose from the categories below:

                    Incorrect Similar: The sentence resembles the correct answer but contains incorrect information or misinterpretations.
                    Incorrect Related: The sentence is topically related to the question but does not address or correctly answer it.
                    Incorrect Unrelated: The sentence has no relevance to the topic or context of the question.
                    Rigth: The sentence is a possible rigth answer to the question.

                    Question: {question}
                    Correct Answer: {ground_truth}
                    Candidate Sentence: {answer}

                    Please provide only the category name for the candidate sentence based on the definitions provided."""
# Generator built from the client, the generation template and the validation
# template.
inc_sim = AnswerIncorrectSimilar(llm_client_meta, prompt_inc_sim, prompt_val_inc)

In [25]:
# Derive an "incorrect similar" answer from the ground truth; the question is
# passed as well (presumably for the validation prompt — TODO confirm).
incorr_sim = inc_sim.generate(gen_gt, question_fc_1) 

In [26]:
# "Incorrect similar" variant of the ground-truth answer.
incorr_sim

'The average length of machine-generated sandwiches in the study is 3.4 sentences.'

INCORRECT RELATED

In [27]:
# Template for an "incorrect related" answer: a short sentence on the same
# topic as the original but carrying different information. It includes two
# worked examples and carries a {ground_truth} placeholder.
prompt_gen_inc_rel = """You will be provided a sentence.
                                    Your task is to write a new short sentence on the topic of the original one but with different information.

                                    Example 1:
                                    Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                    New sentence: The inverted versus upright effect is a very popular phenomena investigated on the field of Computer Vision. 
                                    Example 2:
                                    Original sentence: The authores utilize deep convolutional neural networks (CNNs) on their experiments.
                                    New sentence: Deep convolutional neural networks (CNNs) are widely use on scientific research.


                                    Sentence: {ground_truth}
                                    Please output only the new sentence.
                                    """

# Generator built from the client, the template above and the shared
# incorrect-answer validation template (prompt_val_inc).
inc_rela = AnswerIncorrectRelated(llm_client_meta, prompt_gen_inc_rel, prompt_val_inc)

In [28]:
# Derive an on-topic but non-answering sentence from the ground truth; the
# question is passed as well (presumably for the validation prompt — TODO confirm).
inc_related = inc_rela.generate(gen_gt, question_fc_1)

In [29]:
# "Incorrect related" variant of the ground-truth answer.
inc_related

'Researchers have found that human-generated summaries tend to be longer and more detailed than machine-generated ones.'

COMPLETE PAIR

In [30]:
# All prompt templates bundled in one dict, keyed by the names handed to
# GenerateCompletePair and SensitivityDatasetGenerator below. The texts mirror
# the standalone prompt variables defined in the earlier cells, with small
# wording/whitespace differences (e.g. "Please" here vs "PLease" there), so the
# two sets are not guaranteed to behave identically.
# NOTE(review): consider keeping a single copy of each template; the
# indentation inside the triple-quoted literals is part of the prompt text.
prompts = { "prompt_factchecking" : "Given the following abstract from a scientific paper, generate a fact-checking questions that can be answered with a specific piece of text from this abstract. The question should be directly answerable with a specific line or statement from the abstract. Abstract: {abstract}. Please generate a fact-checking question and return it as string. Please output only the question, do not include the answer or anything else.",
            "prompt_reasoning" : """Given the following abstract from a scientific paper, generate a deep reasoning questions that require synthesis of multiple parts of the abstract to form a comprehensive answer. 
                                These question should necessitate an understanding of the overall content, and the answer should include a paraphrase of more than one substring of the abstract. 
                                Abstract: {abstract}
                                Please generate a deep reasoning question and return it as a string. 
                                Please output only the question, do not include the answer or anything else
                                """,

"prompt_ground_truth" : """You will be given an abstract from a scientific paper and a question related to it.
                                            Considering the abstract, generate a short answer for the question.
                                            If the answer is only a name or a number write a full sentence, do not return only one name or number.
                                            Abstract: {abstract}
                                            Question: {question}
                                            Please output only the accurate short answer.
                                            """,

"prompt_correct_similar" : """You will be given a sentence.
                                                Your task is to write a new sentence that is a paraphrase similar to the original one. 
                                                It should keep the same meaning but using some different words.
                                                Make sure that the original and the new sentence share at least 3 content words.
                                                Sentence: {ground_truth}
                                                Please output only the new sentence. 
                                            """,

"prompt_correct_different" : """You will be given a sentence.
                                                Your task is to write a new sentence that is a paraphrase completely different to the original one. 
                                                It should keep the same meaning but using completely different words.
                                                Make sure that the original and the new sentence do not have any content word in common, not even 1.
                                                
                                                Example 1:
                                                Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                                New sentence: This CNN demonstrated decreased behavior with upside-down items in comparison to those oriented correctly.

                                                Example 2:
                                                Original sentence: This paper tries to demonstrate, first, that the behavioral signatures associated to human face recognition can be explained as a result of the optimization of the task. Also, they show that it is not so “special”, as this behavior can be found in CNNs trained to recognize other objects (like cars). 
                                                New sentence: This article aims to demonstrate that the distinct characteristics linked to recognizing people’s visages may stem from enhancement of the activity. Furthermore, it illustrates that this pattern is not unique, as similar ones occur in neural networks developed to identify different items, such as vehicles.

                                                Your task:
                                                Original sentence: {ground_truth} 
                                                New sentence:

                                                Please output only the new sentence.""",

"prompt_incorrect_similar" : """You will be provided a sentence.
                                    Your objective is to modify this sentence slightly so that it becomes incorrect or nonsensical.

                                    Example 1:
                                    Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                    New sentence: The car-trained network showed an increase in performance for inverted versus upright cars. 
                                    Example 2:
                                    Original sentence: The authores utilize deep convolutional neural networks (CNNs) on their experiments.
                                    New sentence: The authores utilize deep recurrent neural networks (CNNs) on their experiments.         

                                    Sentence: {ground_truth}

                                    Please output only the new sentence.
                                    """,

"prompt_incorrect_related" : """You will be provided a sentence.
                                    Your task is to write a new short sentence on the topic of the original one but with different information.

                                    Example 1:
                                    Original sentence: The car-trained network showed a drop in performance for inverted versus upright cars. 
                                    New sentence: The inverted versus upright effect is a very popular phenomena investigated on the field of Computer Vision. 
                                    Example 2:
                                    Original sentence: The authores utilize deep convolutional neural networks (CNNs) on their experiments.
                                    New sentence: Deep convolutional neural networks (CNNs) are widely use on scientific research.


                                    Sentence: {ground_truth}
                                    Please output only the new sentence.
                                    """,
"prompt_validate_question" : """You will be given a context and a question. Your task is to categorize the question according to the types listed below, based solely on the relationship between the question and the provided context.
                Fact Checking: the answer is a composition based on only one substring of the context, that can be used directly as it is written in the context.
                Reasoning: answer is a composition based on a more elaborated understanding made from more than one substring of the context.  substrings can not be used exactly as they are written, it requires a deeper understanding of the content.
                Unrelated: the answer cannot be derived from the context provided.
                Context: {abstract}
                Question: {question}
                Please provide only the category name of the question based on the definitions provided.""",

"prompt_validate_incorrect" : """You will be presented with a question, its correct answer, and a candidate sentence. Your task is to categorize the candidate sentence based on how it relates to the correct answer and the content of the question. Choose from the categories below:

                    Incorrect Similar: The sentence resembles the correct answer but contains incorrect information or misinterpretations.
                    Incorrect Related: The sentence is topically related to the question but does not address or correctly answer it.
                    Incorrect Unrelated: The sentence has no relevance to the topic or context of the question.
                    Rigth: The sentence is a possible rigth answer to the question.

                    Question: {question}
                    Correct Answer: {ground_truth}
                    Candidate Sentence: {answer}

                    Please provide only the category name for the candidate sentence based on the definitions provided.""",}

In [31]:
# Single object that produces a question together with all of its answer
# variants, configured with the client and the full prompt dict.
pair_generator = GenerateCompletePair(llm_client_meta, prompts)

In [32]:
# Build one complete question/answers set for the sample abstract. The second
# argument selects the question type; "fact_checking" is the only value shown
# in this notebook (a reasoning counterpart presumably exists — TODO confirm).
pair = pair_generator.generate_complete_pair(abstract, "fact_checking")

In [33]:
# Show the resulting QAPair. incorrect_unrelated is None in the printed
# output; in the full-dataset run the log reports "Adding Incorrect
# Unrelated." as a final step, so it is presumably filled in there.
print(pair)

QAPair(question='What is the average length of machine-generated summaries in the study?', ground_truth='The average length of machine-generated summaries in the study is 3.4 sentences.', correct_similar='The study reveals that the typical machine-generated summaries consist of 3.4 sentences on average.', correct_different='The research reveals that the typical duration of automated abstracts in the analysis corresponds to approximately four brief paragraphs.', incorrect_similar='The average length of machine-generated summaries in the study is 3.4 kilograms.', incorrect_related='Most machine-generated summaries aim to provide concise information to readers.', incorrect_unrelated=None)


FULL DATA

In [34]:
# End-to-end generator: takes the LLM client, the PubMed API key and the
# prompt dict.
sensitivity_data_generator = SensitivityDatasetGenerator(llm_client_meta, pubmed_apikey, prompts)

In [35]:
# PubMed search terms; the run log prints one "Working with query" line per entry.
querys = ["heart", "neural", "eye"]

In [36]:
# Run the whole pipeline for every query. Returns the dataset table and a
# mapping from abstract id to abstract text (both displayed in later cells).
full_df, abstracts = sensitivity_data_generator.generate_sensitivity_dataset(
    querys,
    abs_per_query=1,
    question_per_abs=2,
)

Working with query: heart


Working with abstract: 0
A fact checking set was created.
A reasoning set was created.
Working with query: neural
Working with abstract: 1
A fact checking set was created.
A reasoning set was created.
Working with query: eye
Working with abstract: 2
A fact checking set was created.
A reasoning set was created.
All the answers' sets were successfully created.
Adding Incorrect Unrelated.


In [38]:
# Generated dataset: one row per (abstract, question type) with the ground
# truth and every answer variant.
# NOTE(review): in the displayed output one incorrect_similar value begins
# with "Here is the modified sentence:\n\n", i.e. LLM preamble leaked into the
# data — consider stripping such prefixes before building the table.
full_df

Unnamed: 0,query,abstract_id,question_type,question,ground_truth,correct_similar,correct_different,incorrect_similar,incorrect_related,incorrect_unrelated
0,heart,0,Fact Checking,What is the duration of preoperative prism ada...,The duration of preoperative prism adaptation ...,The present study examines the 4-week period o...,The length of time examined in this study for ...,The duration of preoperative prism adaptation ...,Researchers have found that the optimal durati...,Accurately recognizing low-prognosis risk pati...
1,heart,0,Reasoning,What potential benefits does the 4-week preope...,The 4-week preoperative prism adaptation (PPA)...,The 4-week pre-surgical prism therapy (PST) of...,"This brief, four-phase preparatory treatment o...",The 4-week postoperative prism adaptation (PPA...,Researchers have identified prism adaptation a...,This study's use of federated learning technol...
2,neural,1,Fact Checking,What is the significance of accurately recogni...,Accurately recognizing low-prognosis risk pati...,Identifying high-risk patients with acute pulm...,Identifying individuals with a high likelihood...,Accurately recognizing high-prognosis risk pat...,Early diagnosis of acute pulmonary thromboembo...,The duration of preoperative prism adaptation ...
3,neural,1,Reasoning,what implications does this have for the devel...,This study's use of federated learning technol...,The application of federated learning technolo...,This research leverages decentralized machine ...,This study's use of federated baking technolog...,The application of federated learning in healt...,The 4-week preoperative prism adaptation (PPA)...
4,eye,2,Fact Checking,What is the purpose of school screenings in re...,The purpose of school screenings is to identif...,The goal of school-based vision tests is to de...,The goal of educational health checks is to de...,The purpose of school screenings is to identif...,Regular eye exams are crucial for early detect...,The duration of preoperative prism adaptation ...
5,eye,2,Reasoning,What might be some potential limitations or co...,Some potential limitations or consequences of ...,Relying exclusively on school-based vision tes...,"This approach may entail drawbacks, including ...",Here is the modified sentence:\n\nSome potenti...,Vision problems in children can have a signifi...,The 4-week preoperative prism adaptation (PPA)...


In [39]:
# Abstract texts keyed by the abstract_id values used in full_df.
abstracts

{0: 'Preoperative prism adaptation (PPA) simulates postoperative status and possibly can predict postoperative undercorrection before surgery in esotropia. The present study aimed to assess the effect of 4-week PPA in preventing postoperative residual esotropia.',
 1: 'Acute pulmonary thromboembolism (PTE) is a common cardiovascular disease and recognizing low prognosis risk patients with PTE accurately is significant for clinical treatment. This study evaluated the value of federated learning (FL) technology in PTE prognosis risk assessment while ensuring the security of clinical data.',
 2: 'Although school screenings identify children with vision problems and issue referrals for medical treatment at an ophthalmic hospital, the effectiveness of this approach remains unverified.'}