# Testing data quality function

In [3]:
import os
# set available gpu's
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [4]:
def generate_string_for_semeval_data_quality(datapoint: dict):

    task_description = "Natural Language Inference task between Clinical Trial Reports' (CTRs) Sections and a Statement made by a specialist. The Statement can be made about a single CTR or two CTRs. There are two possible relations between the statement and the CTRs: Entailment or Contradiction"
    primary_evidence = "\n".join(datapoint['primary_evidence'])
    statement = datapoint['statement']
    label = datapoint['label']

    semeval_text = f"""Task description: {task_description}\n\nPrimary Trial:\n{primary_evidence}\n\n"""

    if 'secondary_evidence' in datapoint.keys():
        secondary_evidence = "\n".join(datapoint['secondary_evidence'])
        semeval_text = f"""{semeval_text}Secondary Trial:\n{secondary_evidence}\n\n"""

    semeval_text = f"""{semeval_text} Statement: {statement}\n\nRelation: {label}"""

    return semeval_text

In [None]:
def data_quality_assessment_save(task):
    # extract data acording to selected task
    # combine the validation and the training data
    # expect same format for all TRUE
    # create prompt part between ### and ###, one for each task
        # semeval
        # contract nli
        # mediqa sum
        # lex sum
    # append to common part
    # loop through and performance the inferences
        # in loop save the score
    # save new variation with the scores (in the same format they were extracted from but in reverse order of the scores)
    # so that in the implementation of extract from scored variation we just select the top k and o ordering is done there


    return None

In [12]:
import torch
import torch.nn.functional as F

def data_quality_inference(data_quality_prompt, model, tokenizer):

    encoded_inputs = tokenizer(data_quality_prompt, return_tensors="pt", return_attention_mask=True, padding=True).to('cuda')
    input_len = encoded_inputs['input_ids'][0].shape[0]

    with torch.inference_mode():
        output = model.generate(encoded_inputs['input_ids'],
                                attention_mask=encoded_inputs['attention_mask'], 
                                max_new_tokens=1,
                                return_dict_in_generate=True,
                                output_scores=True,
                                )
        

    # Decode the generated sequence (excluding input)
    generated_ids = output.sequences[0, encoded_inputs['input_ids'].shape[-1]:]  # Exclude the input part
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    print(f"generated_text-->{generated_text}")

    logits = output.scores[-1]  # logits of the last token

    # Calculate probabilities
    probabilities = F.softmax(logits, dim=-1)

    yes_token_id = tokenizer.convert_tokens_to_ids("yes")

    # Check if the token id is valid and present
    if yes_token_id != tokenizer.pad_token_id and yes_token_id < len(probabilities[0]):
        yes_token_prob = probabilities[0, yes_token_id].item()
    else:
        yes_token_prob = 0.0

    print(f"yes_token_prob-->{yes_token_prob}")
        
    return yes_token_prob

In [13]:
from evo_functions import load_model, extract_SemEval_data
import json

def data_quality_assessment_and_save(task: str):

    base_data_quality_prompt = f"""The previous paragraph demarcated within ### and ### is a datapoint from a dataset. Your task is to determine whether this datapoint is a well-structured and informative example that would be valuable for evaluating the performance of a large-language model. An informative datapoint should be well-formatted, contain some usable knowledge of the task, and serve as a strong representation of the overall dataset.\n\nOPTIONS:\n- yes\n- no """
    
    # Step 1: Determine and execute the appropriate data extraction function based on `task`
    if task == "SemEval":
        train_data = extract_SemEval_data(type = 'train')
        validation_data = extract_SemEval_data(type = 'dev')
        full_data = train_data + validation_data
        full_data = full_data[:10]
    elif task == "task2":
        pass
    elif task == "task3":
        pass
    elif task == "task4":
        pass
    else:
        raise ValueError("Invalid task provided.")
    
    # Step 2: Generate a specific string for each data point
    for i in range(len(full_data)):
        if task == "SemEval":
            datapoint_string = generate_string_for_semeval_data_quality(full_data[i])
        elif task == "task2":
            pass
        elif task == "task3":
            pass
        elif task == "task4":
            pass
        
        # Combine with base_string
        data_quality_prompt = f"<s><|user|>\n###\n{datapoint_string}\n###\n\n{base_data_quality_prompt}<|end|>\n<|assistant|>"
        
        # Add to data_list
        full_data[i]['data_quality_prompt'] = data_quality_prompt
        #processed_data.append({"original_data": item, "combined_string": combined_string, "score": None})
    
    # Step 3: Perform inferences in a loop

    model, tokenizer = load_model(checkpoint = "microsoft/Phi-3-mini-4k-instruct", quantized = True)

    for i in range(len(full_data)):
        data_quality_score = data_quality_inference(data_quality_prompt = full_data[i]["data_quality_prompt"], 
                                                    model = model, 
                                                    tokenizer = tokenizer)
        full_data[i]["score"] = data_quality_score
    
    # Step 4: Sort the processed data by the `score` in descending order
    full_data.sort(key=lambda x: x['score'], reverse=True)

    filtered_data = [{k: v for k, v in item.items() if k != 'data_quality_prompt'} for item in full_data]

    # Save the data to a JSON file
    file_name = 'DATASETS_data_quality/' + "semeval_data_quality.json"
    with open(file_name, 'w') as json_file:
        json.dump(filtered_data, json_file, indent=4)
    
    print(f"Data with data quality assessment saved to {file_name}!")

    return None

In [14]:
data_quality_assessment_and_save(task = 'SemEval')

Used data with already retrieved examples from DATASETS/SemEval_data/train_w_retrieved.json
Used data with already retrieved examples from DATASETS/SemEval_data/dev_w_retrieved.json


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.01s/it]


Phi-3 selecionado
generated_text-->no
yes_token_prob-->1.0730951544246636e-05
generated_text-->no
yes_token_prob-->2.451200771247386e-06
generated_text-->yes
yes_token_prob-->1.1769451703003142e-05
generated_text-->no
yes_token_prob-->3.345698132761754e-06
generated_text-->no
yes_token_prob-->3.581266810215311e-06
generated_text-->no
yes_token_prob-->3.4976460483449046e-06
generated_text-->no
yes_token_prob-->5.558356406254461e-06
generated_text-->no
yes_token_prob-->7.388226549664978e-06
generated_text-->no
yes_token_prob-->8.138037628668826e-06
generated_text-->no
yes_token_prob-->1.031119427352678e-05
Data with data quality assessment saved to DATASETS_data_quality/semeval_data_quality.json!


In [24]:
test = generate_string_for_semeval_data_quality(full_data[0])
print(test)

###
Task description: Natural Language Inference between Clinical Trial Reports Sections and a Statement made by a specialist. The Statement can be made about one or two Clinical Trial Reports. There are two possible relations: Entailment and Contradiction 
Primary Trial:
INTERVENTION 1: 
  Diagnostic (FLT PET)
  Patients with early stage, ER positive primary breast cancer undergo FLT PET scan at baseline and 1-6 weeks after the start of standard endocrine treatment. The surgery follows 1-7 days after the second FLT PET scan.
  Tracer used in the FLT PET (positron emission tomography) scanning procedure: [F18] fluorothymidine.
  Positron Emission Tomography: Undergo FLT PET
  Laboratory Biomarker Analysis: Correlative studies - Ki67 staining of the tumor tissue in the biopsy and surgical specimen.

 Secondary Trial:
INTERVENTION 1: 
  Arm A
  Patients receive oral capecitabine twice daily on days 1-14 and oral lapatinib ditosylate once daily on days 1-21. Courses repeat every 21 days i

In [25]:
test = generate_string_for_semeval_data_quality(full_data[1])
print(test)

###
Task description: Natural Language Inference between Clinical Trial Reports Sections and a Statement made by a specialist. The Statement can be made about one or two Clinical Trial Reports. There are two possible relations: Entailment and Contradiction 
Primary Trial:
DISEASE CHARACTERISTICS:
  Histologically or cytologically confirmed infiltrating breast cancer
  Clinical evidence of metastatic disease
  Measurable disease, defined as at least one measurable lesion per RECIST criteria
  No non-measurable disease only, defined as all other lesions, including small lesions (longest diameter < 2 cm) and truly non-measurable lesions, including any of the following:
  Bone lesions
  Leptomeningeal disease
  Ascites
  Pleural/pericardial effusion
  Inflammatory breast disease
  Lymphangitis cutis/pulmonis
  Abdominal masses that are not confirmed and followed by imaging techniques
  Cystic lesions
  Patients with HER-2/neu positive tumors, must have received prior treatment with trastuz

In [None]:
#extract_LEXSUM_data(used_retrieved_file = False)

import os
# set available gpu's
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


import torch 
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 
from evo_functions import convert_text_mistral_phi3

torch.random.manual_seed(0) 
model = AutoModelForCausalLM.from_pretrained( 
    "microsoft/Phi-3-mini-4k-instruct",  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
    attn_implementation="flash_attention_2"
) 

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") 


sentence = """<s><|user|>Below will be a question to a test with two possible answers: yer or no. Output only one of them

Is Lisbon the capital of Portugal?

OPTIONS:
- yes
- no<|end|>
<|assistant|> 
"""

#sentence = convert_text_mistral_phi3(sentence)

encoded_inputs = tokenizer(sentence,return_tensors="pt", return_attention_mask=True, padding=True).to('cuda')
input_len = encoded_inputs['input_ids'][0].shape[0]

with torch.inference_mode():
    output = model.generate(encoded_inputs['input_ids'],
                            attention_mask=encoded_inputs['attention_mask'], 
                            max_new_tokens=1,
                            return_dict_in_generate=True,
                            output_scores=True,
                            )
    

# Decode the generated sequence (excluding input)
generated_ids = output.sequences[0, encoded_inputs['input_ids'].shape[-1]:]  # Exclude the input part
generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)


print(f"output.keys()-->{output.keys()}")
logits = output.scores[-1]  # logits of the last token

# Calculate probabilities
probabilities = F.softmax(logits, dim=-1)

yes_token_id = tokenizer.convert_tokens_to_ids("yes")
no_token_id = tokenizer.convert_tokens_to_ids("no")


# Check if the token id is valid and present
if yes_token_id != tokenizer.pad_token_id and yes_token_id < len(probabilities[0]):
    yes_token_prob = probabilities[0, yes_token_id].item()
else:
    yes_token_prob = 0.0

# Check if the token id is valid and present
if no_token_id != tokenizer.pad_token_id and no_token_id < len(probabilities[0]):
    no_token_prob = probabilities[0, no_token_id].item()
else:
    no_token_prob = 0.0


print(f"Generated text: {generated_text}")
print(f"Probability of 'yes': {yes_token_prob:.8f}")
print(f"Probability of 'no': {no_token_prob:.8f}")




from evo_functions import extract_SemEval_data

