# Imports

In [4]:
import os
import json
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.metrics import f1_score
import random
import numpy as np

# to generate csv for fine-tuning
import csv

from genre.trie import MarisaTrie

from datetime import datetime

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
import os
# set available gpu's
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Plotting

In [3]:
import os
import matplotlib.pyplot as plt

In [6]:
# function that creates and saves the plots for the iterations evolution
def plot_and_save_scores(all_scores, max_scores, directory_path, display_only_top_values, iteration_folders, 
                         y_min, y_max, score, keep_list):
    # Prepare the x-axis values (iteration numbers)
    x_values = range(len(all_scores))
    # Custom x-axis labels from the iteration folder names
    x_labels = [folder.replace('Iteration_', '') for folder in iteration_folders]

    # Create the plot
    plt.figure(figsize=(10, 6))

    if display_only_top_values:
        # Plot filename for only top scores
        plot_filename = 'top_scores_plot.png'
        # Plot only the maximum scores for each iteration
        plt.plot(x_values, max_scores, '-o', color='darkblue', label='Top Scores')
    
    else:
        # Plot filename for all scores
        plot_filename = 'all_scores_plot.png'
        # Define a list of base colors for the iterations
        base_colors = plt.cm.get_cmap('tab20', len(all_scores))

        # Plot all scores with lighter color for those not in keep_list
        for i, (scores, keep_indices) in enumerate(zip(all_scores, keep_list)):
            base_color = base_colors(i)  # Get the base color for this iteration
            # Convert base color to RGBA and then lighten the color for non-highlighted points
            lighter_color = to_rgba(base_color, alpha=0.12)  # Adjust alpha to make lighter
            # Plot all scores in lighter color
            plt.scatter([i] * len(scores), scores, color=lighter_color, label='Iteration {}'.format(i) if i == 0 else "")
            # Overlay highlighted scores in original color
            highlighted_scores = [scores[idx] for idx in keep_indices if idx < len(scores)]
            plt.scatter([i] * len(highlighted_scores), highlighted_scores, color=base_color)

        # Plot the line for top scores in a consistent color
        plt.plot(x_values, max_scores, '-o', color='black', label='Top Scores')

    # Labeling the plot
    plt.title('Scores by Iteration')
    plt.xlabel('Iteration')
    plt.ylabel(score)
    plt.xticks(x_values, x_labels, rotation='vertical')  # Set custom x-axis labels
    plt.ylim(y_min, y_max)  # Set the y-axis range
    #plt.legend()
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Save the plot to the specified directory
    plt.savefig(os.path.join(directory_path, plot_filename))
    plt.close()  # Close the plot to free up memory
    
    print("Plots have been saved to:", directory_path)

Plots have been saved to: RUNS/SemEval/Runs_2024-03-08_17-57-50
Plots have been saved to: RUNS/SemEval/Runs_2024-03-08_17-57-50
Plots have been saved to: RUNS/SemEval/Runs_2024-03-08_17-57-50


In [7]:
# function that takes file path to folder with runs, created during the evolution of the model
# also takes y axix scale
# and label for the y axis, which is the score
def create_plots_from_RUNS_folder(directory_path):
    if "SemEval" in directory_path:
        ymin = 0.60
        ymax = 0.80
        score = 'F1-Score'
    elif 'CSQA' in directory_path:
        ymin = 0.50
        ymax = 0.80
        score = 'Accuracy'
    else:
        print(f"Incorrect task name")
        return None

    # List all items in the directory
    items = os.listdir(directory_path)

    # Filter out items that are not directories or are 'Iteration_best'
    iteration_folders = [item for item in items 
                        if os.path.isdir(os.path.join(directory_path, item)) and item != 'Iteration_best']

    # Custom sorting function
    def custom_sort(folder_name):
        if folder_name == 'Iteration_initial':
            return -1  # Ensure 'Iteration_initial' comes first
        else:
            # Extract the iteration number and convert it to an integer for proper numerical sorting
            num_part = folder_name.split('_')[-1]
            return int(num_part) if num_part.isdigit() else float('inf')  # Non-numeric suffixes go at the end

    # Sort the folders using the custom function
    iteration_folders.sort(key=custom_sort)

    # Initialize a list to hold all scores lists and a list for max scores
    all_scores = []
    max_scores = []
    keep_lists = []

    for folder in iteration_folders:
        # Construct the path to the evaluation.txt file
        file_path = os.path.join(directory_path, folder, 'evaluations.txt')
        
        # Initialize a list to hold scores for this iteration
        scores = []
        
        # Open the file and read the scores
        with open(file_path, 'r') as file:
            for line in file:
                # Convert each line to a float and append to the scores list
                scores.append(float(line.strip()))
        
        # Append this iteration's scores to the all_scores list
        all_scores.append(scores)
        
        # Find and append the max score for this iteration to the max_scores list
        max_scores.append(max(scores))

        file_path = os.path.join(directory_path, folder, 'keep_list.txt')
        keep_list = []
        # Open the file and read the scores
        with open(file_path, 'r') as file:
            for line in file:
                # Convert each line to a float and append to the scores list
                keep_list.append(int(line.strip()))

        keep_lists.append(keep_list)

    plot_and_save_scores(all_scores, max_scores, directory_path, False, iteration_folders, ymin, ymax, score, keep_lists)  # For all scores
    plot_and_save_scores(all_scores, max_scores, directory_path, True, iteration_folders, ymin, ymax, score, keep_lists)  # For only top scores

    return None

# Initial prompt population

In [5]:
# extract txt files in folder_path to dict with all the subprompts for task, ctr, statement and answer description
def extract_lines_to_dict(folder_path):
    files_dict = {}
    # Iterate through each file in the folder
    for file_name in os.listdir(folder_path):
        # Check if the file is a .txt file
        if file_name.endswith('.txt'):
            # Construct the full file path
            file_path = os.path.join(folder_path, file_name)
            # Read lines from the file
            with open(file_path, 'r') as file:
                lines = file.readlines()
            # Remove the newline characters from each line
            lines = [line.strip() for line in lines]
            # Remove the '.txt' extension and use the name as a key in the dictionary
            files_dict[file_name[:-4]] = lines
    return files_dict

In [13]:
folder_path = 'SemEval_self_initial_population_prompts'

initial_population_prompts = extract_lines_to_dict(folder_path)
#initial_population_prompts

In [14]:
initial_population_prompts.keys()

dict_keys(['self_A', 'statement_description', 'self_C', 'answer_description', 'task_description', 'ctr_description', 'self_B'])

In [20]:
initial_population_prompts['self_C'][-1] == ''

True

In [15]:
len(initial_population_prompts['self_A'])

5

In [7]:
folder_path = 'SemEval_initial_population_prompts'

initial_population_prompts = extract_lines_to_dict(folder_path)
initial_population_prompts

{'statement_description': ['Consider also the following statement generated by a clinical domain expert, a clinical trial organizer, or a medical researcher.',
  'Contemplate the ensuing statement formulated by a clinical expert or researcher.',
  'Review the subsequent statement provided by an expert in clinical trials, attending to the medical terminology and carefully addressing any ambiguities.',
  'Deliberate upon the subsequent statement formulated by an healthcare practitioner, a coordinator of clinical trials, or a medical researcher.',
  'Reflect upon the ensuing statement crafted by an expert in clinical trials.'],
 'answer_description': ['Answer YES or NO to the question of whether one can conclude the validity of the statement with basis on the clinical trial report information.',
  'Indicate with either YES or NO whether it is possible to determine the validity of the statement based on the Clinical Trial Report (CTR) descriptions. An answer of YES means that the statement

In [30]:
initial_population_prompts.keys()

dict_keys(['statement_description', 'answer_description', 'task_description', 'ctr_description'])

In [31]:
folder_path_self_reason = 'SemEval_initial_population_prompts/self_reasoning'

initial_population_self_reasoning = extract_lines_to_dict(folder_path_self_reason)
initial_population_self_reasoning

{'B': ['Consider also the following reflection upon the reasons as to why one can conclude if the statement is supported by the CTR descriptions or not.',
  'Consider also the following reflection upon the reasons as to why one can conclude if the statement is supported by the CTR descriptions or not.',
  'Consider also the following reflection upon the reasons as to why one can conclude if the statement is supported by the CTR descriptions or not.',
  'Consider also the following reflection upon the reasons as to why one can conclude if the statement is supported by the CTR descriptions or not.',
  ''],
 'A': ['Taking into account the statement and the Clinical Trial Report (CTR) descriptions, summarize the main reasoning steps that would allow one to conclude if the statement is supported by the CTR descriptions or not.',
  'Taking into account the statement and the Clinical Trial Report (CTR) descriptions, summarize the main reasoning steps that would allow one to conclude if the st

# CommonSense Q&A Task

In [12]:
# based on code from https://github.com/jonathanherzig/commonsenseqa/blob/master/esim/reader_csqa.py
# function to extract 
def extract_CSQA_data(file_path = 'CSQA_data', type='dev'):
    if type == 'dev':
        file_path += '/dev_rand_split.jsonl'
    elif type == 'train':
        file_path += '/train_rand_split.jsonl'

    label_dict = {'A':'A', 'B':'B', 'C':'C', 'D':'D', 'E':'E'}

    data_expanded = []
    with open(file_path, "r") as f:
        for line_num, line in enumerate(f):
            #print(f"line-->{line}")
            line = line.strip("\n")
            line = json.loads(line)
            if not line:
                continue
            question = line['question']['stem']
            choice = [c['text'] for c in line['question']['choices']]
            label = label_dict[line['answerKey']] if 'answerKey' in line else None

            temp = {}
            temp['question'] = question
            temp['choice'] = choice
            temp['label'] = label
            data_expanded.append(temp)
    
    return data_expanded

In [13]:
file_path = 'CSQA_data/dev_rand_split.jsonl'
data_csqa = extract_CSQA_data(file_path)
data_expanded = extract_CSQA_data(file_path)

line-->{"answerKey": "A", "id": "1afa02df02c908a558b4036e80242fac", "question": {"question_concept": "revolving door", "choices": [{"label": "A", "text": "bank"}, {"label": "B", "text": "library"}, {"label": "C", "text": "department store"}, {"label": "D", "text": "mall"}, {"label": "E", "text": "new york"}], "stem": "A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?"}}

question-->A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?
choice-->['bank', 'library', 'department store', 'mall', 'new york']
label-->A
line-->{"answerKey": "A", "id": "a7ab086045575bb497933726e4e6ad28", "question": {"question_concept": "people", "choices": [{"label": "A", "text": "complete job"}, {"label": "B", "text": "learn from each other"}, {"label": "C", "text": "kill animals"}, {"label": "D", "text": "wear hats"}, {"label": "E", "text": "talk to each other"}], "stem": "What do people aim to d

In [14]:
folder_path = 'CSQA_initial_population_prompts'

csqa_initial_population_prompts = extract_lines_to_dict(folder_path)
initial_population_prompts = extract_lines_to_dict(folder_path)
csqa_initial_population_prompts

{'answer_description': ['Please provide only the letter of the correct answer.',
  'Answer only with the letter (A, B, C, D or E) corresponding to the correct answer.',
  'Choose only one of the options A, B, C, D or E and return nothing else.',
  'Reply by just indicating the letter of the correct option.',
  'I just want you to answer with one of the letters A through E associated with the right choice.'],
 'task_description': ['In this task, you are presented with a question that requires common sense reasoning to answer correctly. Each question is followed by five possible answers, labeled A through E. Your goal is to choose the most appropriate answer based on general knowledge, everyday experiences, and logical reasoning. The questions are designed to evaluate your ability to understand and apply common sense knowledge to situations that are not explicitly detailed in the text. You must infer, deduce, or extrapolate the information necessary to select the correct option, relying 

In [15]:
print(data_csqa[0]['question'])

A revolving door is convenient for two direction travel, but it also serves as a security measure at a what?


In [16]:
# function to create list of dictionaries with:
# text: text to prompt the LLM, made from the subprompts and the data
# label: true label ('Entailment' or 'Contradiction')
# based on code from https://aclanthology.org/2023.semeval-1.137.pdf
def prompt_creation_csqa(data_expanded, task_description, answer_description):
    samples = []
    letters = ['A', 'B', 'C', 'D', 'E']
    for sample in data_expanded:
        prompt = task_description + '\n' 

        sentence = f"{prompt}\n{sample['question']}"

        option_list = ''
        for i, j in zip(sample['choice'], letters):
            option = f"{j} - {i}\n"
            #print(f"option-->{option}")
            option_list += option

        #print(f"option_list-->{option_list}")

        #answer_description = 'Please provide only the letter of the correct option.'

        sentence = f"[INST]{sentence}\n{option_list}\n{answer_description}[/INST]\n\nANSWER:"
        temp = {"text":sentence, "label":sample['label']}
        samples.append(temp)

    return samples

In [17]:
task = csqa_initial_population_prompts['task_description'][0]

# Extract SemEval data

In [25]:
# function to extract SemEval data to a list of dictionaries with the 
# id's, 'statement', 'primary_evidence', 'label' and  'secondary_evidence' if it existss
# based on code from https://aclanthology.org/2023.semeval-1.137.pdf
def extract_SemEval_data(folder = 'SemEval_data', type = 'dev'):

    type += '.json'
    split = type
    data = json.load(open(f"{folder}/{split}"))
    files = os.listdir(folder + "/CT json/")
    files.remove(".DS_Store")

    files_data = {file[:-5]:json.load(open(f"{folder}/CT json/{file}")) for file in files}

    data_expanded = []
    for _id, value in data.items():
        temp = {}
        temp["id"] = _id
        p_nctid = value["Primary_id"]
        s_nctid = value.get("Secondary_id")
        section_id = value["Section_id"]
        statement = value["Statement"]
        primary_evidence = files_data[p_nctid][section_id]
        temp["statement"] = statement
        temp["primary_evidence"] = primary_evidence
        temp["label"] = value["Label"]
        
        if s_nctid is not None:
            secondary_evidence = files_data[s_nctid][section_id]
            temp["secondary_evidence"] = secondary_evidence
        
        data_expanded.append(temp)

    return data_expanded

In [26]:
data_expanded = extract_SemEval_data()

In [27]:
# function to create list of dictionaries with:
# text: text to prompt the LLM, made from the subprompts and the data
# label: true label ('Entailment' or 'Contradiction')
# based on code from https://aclanthology.org/2023.semeval-1.137.pdf
def prompt_creation_semeval(data_expanded, task_description, ctr_description, statement_description, answer_description):
    samples = []
    for sample in data_expanded:
        prompt = task_description + '\n\n' + ctr_description + '\n\n'
        primary_evidence = "\n".join(sample['primary_evidence'])
        sentence = f"{prompt}Primary Trial\n{primary_evidence}"
        secondary_evidence = sample.get("secondary_evidence")
        if secondary_evidence:
            secondary_evidence = "\n".join(sample['secondary_evidence'])
            sentence = f"{sentence}\n\nSecondary Trial\n{secondary_evidence}"
        #input_text = get_input_text(sentence, sample['statement'])
        stat = "".join(sample['statement'])
        sentence = f"[INST]{sentence}\n\n{statement_description}\n\n{stat}\n\n{answer_description}[/INST]\n\nANSWER:"
        temp = {"text":sentence, "label":sample['label']}
        samples.append(temp)

    return samples

In [35]:
def prompt_creation_semeval_self(data_expanded, task_description, ctr_description, statement_description, answer_description,
                                 self_A, self_B, self_C, model, tokenizer):
    samples = []
    for sample in tqdm(data_expanded, desc='creating self reasoning prompts'):
        prompt = task_description + '\n\n' + ctr_description + '\n\n'
        primary_evidence = "\n".join(sample['primary_evidence'])
        sentence = f"{prompt}Primary Trial\n{primary_evidence}"
        secondary_evidence = sample.get("secondary_evidence")
        if secondary_evidence:
            secondary_evidence = "\n".join(sample['secondary_evidence'])
            sentence = f"{sentence}\n\nSecondary Trial\n{secondary_evidence}"
        #input_text = get_input_text(sentence, sample['statement'])
        stat = "".join(sample['statement'])
        text_self = f"[INST]{sentence}\n\n{statement_description}\n\n{stat}\n\n{self_A}[/INST]\n\nANSWER:"

        # Tokenize input and generate attention mask
        prompt_self = tokenizer.encode(text_self, return_tensors="pt").to('cuda')
        prompt_length = prompt_self[0].shape[0]

        try:
            # to improve efficiency
            with torch.inference_mode():
                output = model.generate(prompt_self, pad_token_id=tokenizer.eos_token_id, max_new_tokens=400, do_sample=True, temperature = 1.0, top_p=0.8)
        except:
            output = ''
            prompt_length = 0

        #print(f"prompt_length-->{prompt_length}")
        new_tokens = output[0, prompt_length:]
        reflection = tokenizer.decode(new_tokens, skip_special_tokens=True)
        print(f"tokenizer.decode(output[0], skip_special_tokens=False)-->\n{tokenizer.decode(output[0], skip_special_tokens=False)}")

        sentence = f"[INST]{sentence}\n\n{statement_description}\n\n{stat}\n\n{self_B}\n\n{reflection}\n\n{self_C}[/INST]\n\nANSWER:"

        temp = {"text":sentence, "label":sample['label']}
        samples.append(temp)

    return samples

# testing self reasoning prompts

In [36]:
task = initial_population_prompts['task_description'][0]
ctr = initial_population_prompts['ctr_description'][0]
statement = initial_population_prompts['statement_description'][0]
ans = initial_population_prompts['answer_description'][0]
self_A = initial_population_prompts['self_A'][0]
self_B = initial_population_prompts['self_B'][0]
self_C = initial_population_prompts['self_C'][0]

In [37]:
samples_self = prompt_creation_semeval_self(data_expanded[0:20], task_description = task, ctr_description = ctr,
                                                 statement_description = statement,
                                                 answer_description = ans, self_A=self_A,
                                                 self_B=self_B,
                                                 self_C=self_C,
                                                 model=model, tokenizer=tokenizer)

creating self reasoning prompts:   0%|          | 0/20 [00:00<?, ?it/s]

tokenizer.decode(output[0], skip_special_tokens=False)-->
<s> [INST]Consider the task of determining semantic entailment relations between individual sections of Clinical Trial Reports (CTRs) and statements made by clinical domain experts. Note that CTRs outline the methodology and findings of a clinical trial, which are conducted to assess the effectiveness and safety of new treatments. Each trial involves 1-2 patient groups, called cohorts or arms, and these groups may receive different treatments, or have different baseline characteristics. The complete CTRs contain 4 sections, corresponding to (1) a list of the ELIGIBILITY CRITERIA corresponding to the conditions for patients to be allowed to take part in the clinical trial, (2) a description for the INTERVENTION that specifies the type, dosage, frequency, and duration of treatments being studied, (3) a summary of the RESULTS, detailing aspects such as the number of participants in the trial, the outcome measures, the units, and th

In [38]:
print(samples_self[-1]['text'])

[INST]Consider the task of determining semantic entailment relations between individual sections of Clinical Trial Reports (CTRs) and statements made by clinical domain experts. Note that CTRs outline the methodology and findings of a clinical trial, which are conducted to assess the effectiveness and safety of new treatments. Each trial involves 1-2 patient groups, called cohorts or arms, and these groups may receive different treatments, or have different baseline characteristics. The complete CTRs contain 4 sections, corresponding to (1) a list of the ELIGIBILITY CRITERIA corresponding to the conditions for patients to be allowed to take part in the clinical trial, (2) a description for the INTERVENTION that specifies the type, dosage, frequency, and duration of treatments being studied, (3) a summary of the RESULTS, detailing aspects such as the number of participants in the trial, the outcome measures, the units, and the conclusions, and (4) a list of ADVERSE EVENTS corresponding 

In [80]:
tam = []
for i in range(200):
    tam.append(len(samples_self[i]['text']))

In [84]:
tam.index(11430)

44

## to test

In [38]:
task = initial_population_prompts['task_description'][0]
ctr = initial_population_prompts['ctr_description'][0]
statement = initial_population_prompts['statement_description'][0]
ans = initial_population_prompts['answer_description'][0]

In [39]:
samples = prompt_creation_semeval(data_expanded, task_description = task, ctr_description = ctr, statement_description = statement, answer_description = ans)

In [40]:
print(samples[0]['text'])

[INST]Consider the task of determining semantic entailment relations between individual sections of Clinical Trial Reports (CTRs) and statements made by clinical domain experts. Note that CTRs outline the methodology and findings of a clinical trial, which are conducted to assess the effectiveness and safety of new treatments. Each trial involves 1-2 patient groups, called cohorts or arms, and these groups may receive different treatments, or have different baseline characteristics. The complete CTRs contain 4 sections, corresponding to (1) a list of the ELIGIBILITY CRITERIA corresponding to the conditions for patients to be allowed to take part in the clinical trial, (2) a description for the INTERVENTION that specifies the type, dosage, frequency, and duration of treatments being studied, (3) a summary of the RESULTS, detailing aspects such as the number of participants in the trial, the outcome measures, the units, and the conclusions, and (4) a list of ADVERSE EVENTS corresponding 

In [33]:
# function to extract yes or no from the generated string
def extract_yes_no_after_answer(s):
    if 'Yes' in s or 'YES' in s:
      return('YES')
    elif 'No' in s or 'NO' in s:
      return('NO')
    else:
      return('Answer not found')
    

# Inferencing on Mistral 7B LLM with 4-bit quantization 

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = 'filipealmeida/Mistral-7B-Instruct-v0.1-sharded'
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

def load_quantized_model(model_name: str):
    """
    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        #load_in_4bit=True,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config,
        device_map = 'cuda'
    )
    return model

def initialize_tokenizer(model_name: str):
    """
    Initialize the tokenizer with the specified model_name.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map = 'cuda')
    tokenizer.bos_token_id = 1  # Set beginning of sentence token id
    return tokenizer


model = load_quantized_model(model_name)

tokenizer = initialize_tokenizer(model_name)

# Define stop token ids
stop_token_ids = [0]

text = "[INST] How AI will replace Engineers [/INST]"

encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
model_input = encoded
generated_ids = model.generate(**model_input, max_new_tokens=200, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] How AI will replace Engineers [/INST] WhileAIcanperformengineeringtasks, Question-answering models like me don't have the ability to replace engineers directly. Engineers design, build, test, and maintain complex systems, which requires a deep understanding of physics, mathematics, and real-world applications. AI and machine learning can assist engineers by automating repetitive tasks, analyzing data, and providing design suggestions. However, the creativity, problem-solving skills, and practical experience required for complex engineering projects are beyond the capabilities of current AI systems. It's more likely that AI will augment, rather than replace, the role of engineers.</s>


In [3]:
# Check if the tokenizer has a pad token
if tokenizer.pad_token is None:
    # Set pad_token_id to eos_token_id if pad_token is not defined
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

# Model loading

In [3]:
torch.cuda.device_count()

1

In [4]:
def load_model(checkpoint = "mistralai/Mistral-7B-Instruct-v0.2" ):

    torch.cuda.empty_cache()
    # choosing the gpu device to be used

    # loading
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, device_map = 'cuda')
    model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map = 'cuda')

    # Check if the tokenizer has a pad token
    if tokenizer.pad_token is None:
        # Set pad_token_id to eos_token_id if pad_token is not defined
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

    model = model.to('cuda')  # Move model to GPU

    return model, tokenizer

In [5]:
model, tokenizer = load_model()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Mutation and Combination prompts

In [35]:
evolutionary_prompts = extract_lines_to_dict("evolutionary_prompts")

In [36]:
evolutionary_prompts

{'mutation_prompts': ['Consider the problem of re-writing a textual instruction, in which the objective is to rephrase the description while keeping the exact same meaning. Assume that the audience for the resulting instruction consists of experts in the field. The re-written instruction can either be shorter, summarizing the main points while keeping consistency with the original intent, or it can be made longer, by adding definitions and further clarifications. You will be penalized for the inclusion of incorrect information. The re-written instruction should be concise and direct, and it should inform the execution of the task in a clearer way than the original instruction. Considering the aforementioned task description, rewrite the textual instruction shown next in quotes, generating only the resulting instruction without any additional explanation.'],
 'combination_prompts': ['Consider the problem of combining two different textual instructions, pertaining to the same task. The o

In [37]:
prompt = initial_population_prompts['task_description'][0]
mutation_prompt = evolutionary_prompts['mutation_prompts'][0]
combination_prompt = evolutionary_prompts['combination_prompts'][0]

prompt_2 = initial_population_prompts['task_description'][-1]

In [41]:
# function to mutate prompts with a given LLM
# takes a mutation prompt (asking to paraphrase) and a subprompt (to be mutated), outputs the NEW mutated subprompt
def mutate_prompt(prompt, mutation_prompt, model, tokenizer, temperature = 1.0, top_p=0.8):
    instruction = '[INST]' + mutation_prompt + "\nINSTRUCTION: " + prompt + '[/INST]' + "\n\nNEW INSTRUCTION: "
    #print(f"instruction-->{instruction}")

    # Tokenize input and generate attention mask
    prompt = tokenizer.encode(instruction, return_tensors="pt").to('cuda')
    prompt_length = prompt[0].shape[0]

    try:
        # to improve efficiency
        with torch.inference_mode():
            output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, max_length=800, do_sample=True, temperature = temperature, top_p=top_p)
    except:
        output = ''

    new_tokens = output[0, prompt_length:]
    mutated = tokenizer.decode(new_tokens, skip_special_tokens=True)
    #print(f"tokenizer.decode(output[0], skip_special_tokens=False)-->{tokenizer.decode(output[0], skip_special_tokens=False)}")

    return mutated

In [42]:
print(f"mutate_prompt(prompt, mutation_prompt, model, tokenizer)-->{mutate_prompt(prompt, mutation_prompt, model, tokenizer)}")

instruction-->[INST]Consider the problem of re-writing a textual instruction, in which the objective is to rephrase the description while keeping the exact same meaning. Assume that the audience for the resulting instruction consists of experts in the field. The re-written instruction can either be shorter, summarizing the main points while keeping consistency with the original intent, or it can be made longer, by adding definitions and further clarifications. You will be penalized for the inclusion of incorrect information. The re-written instruction should be concise and direct, and it should inform the execution of the task in a clearer way than the original instruction. Considering the aforementioned task description, rewrite the textual instruction shown next in quotes, generating only the resulting instruction without any additional explanation.
INSTRUCTION: In this task, you are presented with a question that requires common sense reasoning to answer correctly. Each question is 

In [21]:
# function to combine prompts using an LLM
# takes a combination prompt (asking to join two instructions) and two subprompt (to be combined), outputs the NEW combined subprompt
def combine_prompts(prompt_1, prompt_2, combination_prompt, model, tokenizer, temperature = 1.0, top_p=0.8):
    instruction = '[INST]' + combination_prompt + "\nINSTRUCTION 1: " + prompt_1 + "\nINSTRUCTION 2: " + prompt_2 + '[/INST]' + "\n\nNEW INSTRUCTION: "

    # Tokenize input and generate attention mask
    prompt = tokenizer.encode(instruction, return_tensors="pt").to('cuda')
    prompt_length = prompt[0].shape[0]
    # Tokenize input and generate attention mask

    try:
        # to improve efficiency
        with torch.inference_mode():
            output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, max_length=1600, do_sample=True, temperature = temperature, top_p=top_p)
    except:
        output = ''

    new_tokens = output[0, prompt_length:]
    combined = tokenizer.decode(new_tokens, skip_special_tokens=True)

    return combined

In [33]:
prompt

'In this task, you are presented with a question that requires common sense reasoning to answer correctly. Each question is followed by five possible answers, labeled A through E. Your goal is to choose the most appropriate answer based on general knowledge, everyday experiences, and logical reasoning. The questions are designed to evaluate your ability to understand and apply common sense knowledge to situations that are not explicitly detailed in the text. You must infer, deduce, or extrapolate the information necessary to select the correct option, relying on a broad understanding of how the world works, including societal norms, physical principles, and human behavior. This task tests your capability to navigate ambiguous or incomplete information and to apply common sense logic in a variety of scenarios.'

In [26]:
print(mutate_prompt(prompt, mutation_prompt, model, tokenizer))


1. Identify the sections of Clinical Trial Reports (CTRs) to be evaluated: eligibility criteria, intervention, results, and adverse events.
2. For each CTR, evaluate statements that make claims related to these sections, ensuring that:
   a. The statements are consistent with the CTR information.
   b. The statements do not contradict the descriptions provided.
3. Determine entailment relations between the statements and the corresponding sections of CTRs, considering:
   a. Whether the statements follow logically from the CTR information.
   b. If applicable, whether the statements make valid comparisons between different CTRs.
4. Document the results of your analysis, including the statements and their corresponding CTR sections, as well as your assessment of the entailment relationship.


In [27]:
print(combine_prompts(prompt, prompt_2, combination_prompt, model, tokenizer))


To assess semantic entailment between Clinical Trial Reports (CTRs) and statements made by clinical domain experts, consider the following: Complete CTRs consist of four sections: (1) ELIGIBILITY CRITERIA, detailing patient inclusion conditions, (2) INTERVENTION, specifying treatment details, (3) RESULTS, summarizing statistics, outcome measures, units, and conclusions, and (4) AD


# Evaluate prompt on population with llm - CSQA task

In [116]:
# function to generate predictions for the task for a given prompt
# outputs both the predictions and the true labels
def csqa_predictions(model, tokenizer, samples):

    class MyMarisaTrie(MarisaTrie):
        def __init__(self, data): super().__init__(data)
        def get(self, data, length_to_ignore): return super().get([tokenizer.bos_token_id] + data[length_to_ignore:])

    # limit options for decoding
    trie = MyMarisaTrie([tokenizer.encode("A") + [tokenizer.eos_token_id],
                         tokenizer.encode("B") + [tokenizer.eos_token_id],
                         tokenizer.encode("C") + [tokenizer.eos_token_id],
                         tokenizer.encode("D") + [tokenizer.eos_token_id],
                         tokenizer.encode("E") + [tokenizer.eos_token_id]])

    # get max no. of tokens needed
    token_counts = []
    for sample in samples:
        # Tokenize input and generate attention mask
        encoding = tokenizer(sample["text"])
        token_counts.append(len(encoding['input_ids']))

    max_token_count = max(token_counts) + 6

    labels = []
    preds = []
    with torch.inference_mode():
        for sample in tqdm(samples, desc = f"Generating Predictions with LLM"):
            labels.append(sample["label"])
            # Tokenize input and generate attention mask
            prompt = tokenizer.encode(sample["text"], return_tensors="pt", padding=True, truncation=True, return_attention_mask=True).to('cuda')
            prompt_length = prompt[0].shape[0]

            output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, 
                                    max_length=max_token_count, 
                                    prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist(), prompt_length))
            
            #print(f"tokenizer.decode(output[0], skip_special_tokens=False)-->{tokenizer.decode(output[0], skip_special_tokens=False)}")

            # Decode only the newly generated tokens
            # Skip the input tokens by starting the slice at input_length
            new_tokens = output[0, prompt_length:]

            #print(f"tokenizer.decode(new_tokens)-->{tokenizer.decode(new_tokens)}")
            pred = tokenizer.decode(new_tokens, skip_special_tokens=True)
            preds.append(pred)
            #print(f"preds-->{preds}")

    return labels, preds

In [117]:
samples[0]['label']

'A'

In [118]:
labels, preds = csqa_predictions(model, tokenizer, samples)

Generating Predictions with LLM:   0%|          | 0/1221 [00:00<?, ?it/s]

In [121]:
from sklearn.metrics import accuracy_score

In [122]:
accuracy_score(y_true=labels, y_pred=preds)

0.6756756756756757

# Evaluate prompt on population with LLM - SemEval

Marisa Trie-Limiting possible answers to Yes or No or YES or NO. it's then called in the function below it

In [36]:
# function to generate predictions for the task for a given prompt
# outputs both the predictions and the true labels
def semeval_predictions(model, tokenizer, samples):

    class MyMarisaTrie(MarisaTrie):
        def __init__(self, data): super().__init__(data)
        def get(self, data, length_to_ignore): return super().get([tokenizer.bos_token_id] + data[length_to_ignore:])


    # limit options for decoding
    trie = MyMarisaTrie([tokenizer.encode("NO.") + [tokenizer.eos_token_id],
                         tokenizer.encode("No.") + [tokenizer.eos_token_id],
                         tokenizer.encode("YES.") + [tokenizer.eos_token_id],
                         tokenizer.encode("Yes.") + [tokenizer.eos_token_id]])

    # get max no. of tokens needed
    token_counts = []
    for sample in samples:
        # Tokenize input and generate attention mask
        encoding = tokenizer(sample["text"])
        token_counts.append(len(encoding['input_ids']))

    max_token_count = max(token_counts) + 6

    labels = []
    preds = []
    with torch.inference_mode():
        for sample in tqdm(samples, desc = f"Generating Predictions with LLM"):
            labels.append(sample["label"])
            # Tokenize input and generate attention mask
            prompt = tokenizer.encode(sample["text"], return_tensors="pt", padding=True, truncation=True, return_attention_mask=True).to('cuda')
            prompt_length = prompt[0].shape[0]

            output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, 
                                    max_length=max_token_count, 
                                    prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist(), prompt_length))

            # Decode only the newly generated tokens
            # Skip the input tokens by starting the slice at input_length
            new_tokens = output[0, prompt_length:]

            pred = extract_yes_no_after_answer(tokenizer.decode(new_tokens, skip_special_tokens=True))
            preds.append(pred)
    return labels, preds

In [37]:
labels, preds = semeval_predictions(model, tokenizer, samples)

Generating Predictions with LLM:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# prompt the model com o A

# guardar resposta

# nova prompt com B e C

## test

In [34]:
# Tokenize input and generate attention mask
prompt = tokenizer.encode(samples[0]["text"], return_tensors="pt", padding=True, truncation=True, return_attention_mask=True).to('cuda')
prompt_length = prompt[0].shape[0]

output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, max_length=1000, prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist(), prompt_length))
print(tokenizer.decode(output[0]))

# Calculate the length of the input to isolate new tokens
#input_length = input_ids.shape[1]
input_length = prompt_length

# Decode only the newly generated tokens
# Skip the input tokens by starting the slice at input_length
new_tokens = output[0, input_length:]
print(f"tokenizer.decode(new_tokens)-->{tokenizer.decode(new_tokens)}")
print(f"tokenizer.decode(outputs[0])-->{tokenizer.decode(output[0])}")
pred = extract_yes_no_after_answer(tokenizer.decode(new_tokens))
print(f"pred-->{pred}")

<s> Consider the task of determining semantic entailment relations between individual sections of Clinical Trial Reports (CTRs) and statements made by clinical domain experts. Note that CTRs outline the methodology and findings of a clinical trial, which are conducted to assess the effectiveness and safety of new treatments. Each trial involves 1-2 patient groups, called cohorts or arms, and these groups may receive different treatments, or have different baseline characteristics. The complete CTRs contain 4 sections, corresponding to (1) a list of the ELIGIBILITY CRITERIA corresponding to the conditions for patients to be allowed to take part in the clinical trial, (2) a description for the INTERVENTION that specifies the type, dosage, frequency, and duration of treatments being studied, (3) a summary of the RESULTS, detailing aspects such as the number of participants in the trial, the outcome measures, the units, and the conclusions, and (4) a list of ADVERSE EVENTS corresponding to

Finding the max number of tokens for the padding

In [35]:
token_counts = []
for sample in tqdm(samples):
    # Tokenize input and generate attention mask
    #print(f'sample["text"]-->{sample["text"]}')
    encoding = tokenizer(sample["text"])
    #print(encoding.keys())
    #print(f'enocding-->{encoding}')
    token_counts.append(len(encoding['input_ids']))

max_token_count = max(token_counts) + 6

  0%|          | 0/200 [00:00<?, ?it/s]

In [36]:
labels = []
pred = []
with torch.inference_mode():
    for sample in tqdm(samples[0:50]):
        labels.append(sample["label"])

        prompt = tokenizer.encode(sample["text"], return_tensors="pt").to('cuda')
        prompt_length = prompt[0].shape[0]

        output = model.generate(prompt, pad_token_id=tokenizer.eos_token_id, max_length=max_token_count, prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist(), prompt_length))   

        # Decode only the newly generated tokens
        # Skip the input tokens by starting the slice at input_length
        new_tokens = output[0, prompt_length:]

        print(f"tokenizer.decode(new_tokens)-->{tokenizer.decode(new_tokens)}")
        pred.append(extract_yes_no_after_answer(tokenizer.decode(new_tokens)))
        print(f"pred-->{pred}")

  0%|          | 0/50 [00:00<?, ?it/s]

tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES']
tokenizer.decode(new_tokens)--><s> NO.</s>
pred-->['YES', 'NO']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES']
tokenizer.decode(new_tokens)--><s> NO.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO']
tokenizer.decode(new_tokens)--><s> NO.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO', 'YES']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO', 'YES', 'YES']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO', 'YES', 'YES', 'YES']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO', 'YES', 'YES', 'YES', 'YES']
tokenizer.decode(new_tokens)--><s> YES.</s>
pred-->['YES', 'NO', 'YES', 'YES', 'NO', 'NO', 'YES', 'YES', 'YES', 'YES',

In [37]:
def convert_preds_from_yesno(preds):
    preds_2 = []
    no_of_not_founds = 0
    for i in preds:
        if i == 'YES' or i == 'Yes':
            preds_2.append('Entailment')
        elif i == 'NO' or i == 'No':
            preds_2.append('Contradiction')
        else:
            preds_2.append('Contradiction')
            no_of_not_founds += 1
    return preds_2, no_of_not_founds

In [38]:
preds_2, no_of_not_founds = convert_preds_from_yesno(pred)
f1_score(y_true = labels, y_pred=preds_2, pos_label="Entailment")

0.6875

In [39]:
# function to evaluate prompt population
# outputs a list with the scores for each prompt
# n_samples is the no. of samples where the evaluation will be done
def eval_pop(n_pop, population_prompts, data_expanded, model, tokenizer, n_samples=0, task='SemEval'):
    if task != 'SemEval' and task != 'CSQA':
        return None

    scores = []

    if n_samples == 0 or n_samples > len(data_expanded):
        n_samples = len(data_expanded)

    if task == "SemEval":
        for i in tqdm(range(n_pop), desc = f"Evaluating prompt population"):
            samples = prompt_creation_semeval(data_expanded, 
                                                population_prompts['task_description'][i], 
                                                population_prompts['ctr_description'][i], 
                                                population_prompts['statement_description'][i], 
                                                population_prompts['answer_description'][i])

            labels, predictions = semeval_predictions(model, tokenizer, samples[:n_samples])
            preds, n_not_founds = convert_preds_from_yesno(predictions)
            #print(f"n_not_founds-->{n_not_founds}")
            score = f1_score(y_true=labels, y_pred=preds, pos_label="Entailment")
            scores.append(score)

    if task == "CSQA":
        for i in tqdm(range(n_pop), desc = f"Evaluating prompt population"):
            samples = prompt_creation_csqa(data_expanded, 
                                           population_prompts['task_description'][i],
                                           population_prompts['answer_description'][i])

            labels, predictions = csqa_predictions(model, tokenizer, samples[:n_samples])
            score = accuracy_score(y_true=labels, y_pred=predictions)
            scores.append(score)

    return scores

A alterar
- Sampling para combinations com base na fitness value
- Manter só as 5 melhores da população total

In [40]:
def create_root_folder():
    # Format: Runs_YYYY-MM-DD_HH-MM-SS
    folder_name = datetime.now().strftime("RUNS/Runs_%Y-%m-%d_%H-%M-%S")
    os.makedirs(folder_name, exist_ok=True)
    return folder_name

In [41]:
def save_population(iteration, population_dict, additional_list, root_folder):
    # Create a folder for the current iteration
    iteration_folder = os.path.join(root_folder, f"Iteration_{iteration}")
    os.makedirs(iteration_folder, exist_ok=True)
    
    # Save each key in the population dictionary as a .txt file
    for key, values in population_dict.items():
        file_path = os.path.join(iteration_folder, f"{key}.txt")
        with open(file_path, 'w') as file:
            for value in values:
                file.write(f"{value}\n")
    
    # Save the additional list in a separate .txt file
    additional_file_path = os.path.join(iteration_folder, "evaluations.txt")
    with open(additional_file_path, 'w') as file:
        for item in additional_list:
            file.write(f"{item}\n")

In [15]:
def sort_pop(population, eval):

    sorted_indices = np.argsort(eval)[:][::-1]
    sorted_pop = {key: [value[i] for i in sorted_indices if i < len(value)] for key, value in population.items()}
    sorted_eval = [eval[i] for i in sorted_indices if i < len(eval)]

    return sorted_pop, sorted_eval

In [42]:
# function
def evo_alg(initial_population_prompts, evolutionary_prompts, model, tokenizer, 
            n_pop = 5, # initial population size and the number of elements kepts at each iteration
            n_combinations = 10,
            SemEval_folder_path = '',
            patience = 10,
            max_iter = 50,
            save = True,
            task = 'SemEval'):
    
    # Call the function to create the folder and print its name
    if save == True:
        root_folder = create_root_folder()
        print(f"Root folder created: {root_folder}")

    # extract SemEval data
    data_expanded = extract_SemEval_data()
    
    patience_counter = 0
    iter = 0

    population = {key: [] for key in initial_population_prompts.keys()}
    initial_eval = eval_pop(n_pop = n_pop, population_prompts = initial_population_prompts, data_expanded = data_expanded, model=model, tokenizer=tokenizer, task=task)
    population = initial_population_prompts
    print(f"initial_eval-->{initial_eval}")
    print(f"initial_eval-->{initial_eval}")
    
    if save == True:
        save_population('initial', population, initial_eval, root_folder)
        print(f"Data saved for iteration {iter}.")
    
    while patience_counter <= patience and iter < max_iter:
        
        # mutate population 
        mutated_population = {key: [] for key in initial_population_prompts.keys()}
        combined_population = {key: [] for key in initial_population_prompts.keys()}

        # iterate through each prompt to generate mutations
        for i in tqdm(range(n_pop), desc = f"iteration {iter} - Mutating prompts"):
            # iterate through the subprompts
            for j in initial_population_prompts.keys():

                # mutate each subprompt and add to the mutated population
                #print(f'initial_population_prompts[j][i]-->{initial_population_prompts[j][i]}')
                mutated = mutate_prompt(initial_population_prompts[j][i], evolutionary_prompts['mutation_prompts'][0], model, tokenizer) 

                mutated_population[j].append(mutated)
                population[j].append(mutated)
        
        #new_pop = mutated_population
        mutated_eval = eval_pop(n_pop = n_pop, population_prompts = mutated_population, 
                                data_expanded = data_expanded, model=model, tokenizer=tokenizer, task=task)
        
        eval = initial_eval + mutated_eval
        print(f"eval-->{eval}")

        for i in tqdm(range(n_combinations), desc = f"iteration {iter} - Combining prompts"):
            #sel4comb = random.sample(range(n_initial_population + n_pop), 2)
            #print(f"n_pop-->{n_pop}")
            #print(f"range(n_pop + n_pop)-->{range(n_pop + n_pop)}")
            #print(f"eval (weights)-->{eval}")
            sel4comb = random.choices(range(n_pop + n_pop), weights=eval, k=2)
            #print(f"sel4comb-->{sel4comb}")
            # iterate through the subprompts
            for j in initial_population_prompts.keys():

                # combine each subprompt randomly selected and add to the combined and total population
                combined = combine_prompts(population[j][sel4comb[0]], population[j][sel4comb[1]], 
                                           evolutionary_prompts['combination_prompts'][0], model, tokenizer)
                combined_population[j].append(combined)
                population[j].append(combined)
                #new_pop[j].append(combined)
        
        combined_eval = eval_pop(n_pop = n_combinations, population_prompts = combined_population, 
                                 data_expanded=data_expanded, model=model, tokenizer=tokenizer,
                                 task=task)
        eval += combined_eval
        print(f"evaluation at iteration {iter + 1}-->{eval}")

        # Call the function
        if save == True:
            save_population(iter, population, eval, root_folder)
            print(f"Data saved for iteration {iter}.")

        if max(eval) > max(initial_eval):
            #continue = True
            patience_counter = 0
            iter += 1

            keep_pop_indices = np.argsort(eval)[-n_pop:][::-1]
                    
            # Create a new dictionary with the same keys, but values are lists with only the selected indices
            # keep the top performing prompts
            keep_pop = {key: [value[i] for i in keep_pop_indices if i < len(value)] for key, value in population.items()}
            population = keep_pop
            initial_eval = [eval[i] for i in keep_pop_indices if i < len(eval)]
            print(f"initial_eval-->{initial_eval}")
        
        # difference to the if is that there was no overall improvment so patience counter increases
        else:
            patience_counter += 1
            iter += 1
    
            keep_pop_indices = np.argsort(eval)[-n_pop:][::-1]
                    
            # Create a new dictionary with the same keys, but values are lists with only the selected indices
            # keep the top performing prompts
            keep_pop = {key: [value[i] for i in keep_pop_indices if i < len(value)] for key, value in population.items()}
            population = keep_pop
            initial_eval = [eval[i] for i in keep_pop_indices if i < len(eval)]
            print(f"initial_eval-->{initial_eval}")
    
    best_index = np.argsort(initial_eval)[-1:][::-1]
    # Create a new dictionary with the same keys, but values are lists with only the selected indices
    best_prompt = {key: [value[i] for i in best_index if i < len(value)] for key, value in keep_pop.items()}

    return best_prompt, keep_pop, initial_eval

In [37]:
best_prompt, keep_pop, initial_eval = evo_alg(initial_population_prompts = initial_population_prompts, 
                      evolutionary_prompts = evolutionary_prompts, 
                      model = model, 
                      tokenizer = tokenizer,
                      n_pop = 5)

Root folder created: RUNS/Runs_2024-03-05_02-19-48


Evaluating prompt population:   0%|          | 0/5 [00:00<?, ?it/s]

Generating Predictions with LLM:   0%|          | 0/200 [00:00<?, ?it/s]

In [42]:
best_prompt

{'statement_description': ['1. Clinical domain experts, clinical trial organizers, and medical researchers may generate the following statement. 2. Consult this statement for further guidance.'],
 'answer_description': ["\nBased on the clinical trial report data, determine if the statement's validity can be confirmed."],
 'task_description': ['\n1. Identify semantic entailment relations between individual sections of Clinical Trial Reports (CTRs) and statements from clinical domain experts.\n2. CTRs consist of four sections: (a) Eligibility Criteria, (b) Intervention, (c) Results, and (d) Adverse Events.\n3. Eligibility Criteria outline patient conditions for clinical trial participation.\n4. Intervention specifies treatment type, dosage, frequency, and duration.\n5. Results detail trial participants, outcome measures, units, and conclusions.\n6. Adverse Events list signs and symptoms observed during the trial.\n7. Statements are sentences that claim information related to one CTR or c

# Overview

INPUT: 5 PROMPTS TO BEGIN THE PROCEDURE, COMPOSED OF 4 DIFFERENT PARTS THAT NEED TO BE COMBINED WITH THE CLINICAL TRIAL INFORMATION
INPUT: DATA FROM THE SEMEVAL TASK, COMPOSED OF TRAINING AND TESTING SPLITS

1 - EVALUATE THE INITIAL 5 PROMPTS IN THE TRAINING DATA FROM THE SEMEVAL TASK

2 - REPEAT THE FOLLOWING STEPS CORRESPONDING TO AN EVOLUTIONARY PROCEDURE

2.1 - TAKE THE 5 BEST PROMPTS FROM THE PREVIOUS ITERATION (OR THE 5 INITIAL ONES)

2.2 - FOR EACH OF THE 5 PROMPTS, GENERATE 1 NEW MUTATED VERSION, THUS RESULTING IN A SET OF 10 PROMPTS (INITIAL PLUS NEW). AN LLM IS USED TO GENERATE THE MUTATIONS, USING A FIXED PROMPT TO PROCESS INDIVIDUALLY EACH OF THE 4 PARTS THAT COMPOSE THE PROMPT.

2.3 - WITH THE SET OF 10 PROMPTS FROM THE PREVIOUS STEP, GENERATE 15 NEW PROMPTS THROUGH A COMBINATION OF TWO OF THE PROMPTS SAMPLED RANDOMLY FROM THE SET. THIS RESULTS IN A SET OF 20 PROMPTS (THE 5 FROM STEP 2.2, PLUS THE NEW 15). AN LLM IS USED TO GENERATE THE COMBiNATIONS, USING A FIXED PROMPT TO PROCESS INDIVIDUALLY EACH OF THE 4 PARTS IN EACH INSTANCE FROM THE PAIR OF PROMPTS BEING COMBINED.

2.4 - EVALUATE THE 20 PROMPTS IN THE SET RESULTING FROM THE PREVIOUS STEP, USING THE TRAINING DATA FROM THE SEMEVAL TASK. TAKE THE 5 BEST PROMPTS FROM THE SET.

2.5 - IF ONE OF THE 5 NEW BEST PROMPTS HAS A BETTER EVALUATION SCORE THAN THE BEST PROMPT FROM THE INITIAL SET OF 5, THEN CONTINUE THE PROCEDURE AGAIN FROM STEP 2.

2.6 - CONSIDERING A PATIENCE OF ONE, BUILT A NEW DATASET OF 20 PROMPTS CONSIDERING THE FIVE INITIAL PROMPTS, AND REPEATING THE PROCEDURES FROM STEPS 2.2 AND 2.3.

2.7 - EVALUATE THE NEW SET OF 20 PROMPTS RESULTING FROM THE PREVIOUS STEP, USING THE TRAINING DATA FROM THE SEMEVAL TASK. TAKE THE 5 BEST PROMPTS FROM THE SET.

2.8 - IF ONE OF THE 5 NEW BEST PROMPTS HAS A BETTER EVALUATION SCORE THAN THE BEST PROMPT FROM THE INITIAL SET OF 5, THEN CONTINUE THE PROCEDURE AGAIN FROM STEP 2. OTHERWISE, STOP THE ITERATION AND KEEP THE BEST RESULT/PROMPT.

3 - WITH BASIS ON THE BEST PROPMT RESULTING FROM THE EVOLUTIONARY PROCEDURE, EVALUATE THE RESULTS ON THE TESTING SPLIT FROM THE SEMEVAL TASK. REPORT THIS BEST RESULT, TOGETHER WITH THE NUMBER OF ITERATIONS THAT WERE REQUIRED, AND TOGETHER WITH THE EVALUATION SCORE ALSO ON THE TEST SPLIT.

4 - TAKING THE BEST PROMPT RESULTING FROM THE PREVIOUS PROCEDURE, FINE-TUNE AN LLM TO THE SEMEVAL TASK, USING THE TRAINING DATA.

5 - EVALUATE THE PERFORMANCE OF THE FINE-TUNED MODEL ON THE TESTING SPLIT FROM THE SEMEVAL TASK, COMPARING WITH THE RESULTS OBTAINED FOR STEP 3.

# Fine-Tuning

In [57]:
# function that takes data in the samples format (list of dictionarys with keys: text and label)
# converts to csv format, used in training
def convert_data_fine_tuning(samples, filename = "training_data.csv"):

    # Open the file in write mode
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        # Create a csv writer
        writer = csv.writer(file)
        
        # Write the header
        writer.writerow(["prompt", "output"])
        
        # Write the data rows
        for item in samples:
            if item['label'] == 'Entailment':
                lab = 'YES'
            else:
                lab = 'NO'
            writer.writerow([item['text'], lab])

In [58]:
convert_data_fine_tuning(samples, filename = "training_data.csv")

In [59]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('csv', data_files='training_data.csv')

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'output'],
        num_rows: 1700
    })
})


  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [60]:
for sample in samples:
    # Tokenize input and generate attention mask
    encoding = tokenizer(sample["text"])
    token_counts.append(len(encoding['input_ids']))

max_token_count = max(token_counts) + 6

In [61]:
max_token_count

3535

In [62]:
def tokenize_function(examples):
    # Tokenize the inputs and labels. Adjust `max_length` as needed.
    model_inputs = tokenizer(examples["prompt"], padding="max_length", truncation=True, max_length=max_token_count)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], padding="max_length", truncation=True, max_length=max_token_count)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = load_dataset('csv', data_files='training_data.csv')
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1700 [00:00<?, ? examples/s]



In [65]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Ensure mlm is set to False for causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

trainer.train()