In [1]:
#librerias necesarias
import os
import pandas as pd
import numpy as np
import faiss
from faiss import write_index
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from config import CFG
from datasets import load_dataset

In [7]:
def get_medical_flashcards_dataset():
    dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
    df = pd.DataFrame(dataset['train'], columns=['input', 'output'])

    # Eliminar las filas duplicadas
    df = df.drop_duplicates(subset=['output'])
    df = df.drop_duplicates(subset=['input'])


    # Añadir una columna url con valor not provided
    df['url'] = 'Not provided.'
    # Modigicar el nombre de columna de input a question y de output a answer
    df = df.rename(columns={'input': 'question', 'output': 'answer'})
    # Reordenar las columnas
    df = df[['question', 'answer', 'url']]

    return df


In [48]:
def read_processed_data(with_na=False, n_samples=None):
    # List the files in the processed_data directory
    files = os.listdir('dataset/processed_data')

    # Read the files into a dataframe
    for idx, file in enumerate(files):
        if idx == 0:
            df = pd.read_csv('dataset/processed_data/' + file, na_values=['', ' ', 'No information found.'])
        else:
            df = pd.concat([df, pd.read_csv('dataset/processed_data/' + file, na_values=['', ' ', 'No information found.'])], ignore_index=True)
    
    if not with_na:
        df = df.dropna()

    if n_samples is not None:
        df = df.sample(n_samples)

    return df


def get_all_data(with_na=False):
    df_1 = read_processed_data(with_na=with_na)
    df_2 = get_medical_flashcards_dataset()

    # Concatenate the two dataframes
    df = pd.concat([df_1, df_2], ignore_index=True)

    # Conservar solo las columnas question, answer y url
    df = df[['question', 'answer', 'url']]

    return df

df = get_all_data(with_na=False)



In [49]:
df

Unnamed: 0,question,answer,url
0,What is (are) A guide to clinical trials for c...,"If you have cancer, a clinical trial may be an...",https://www.nlm.nih.gov/medlineplus/ency/patie...
1,What is (are) A1C test ?,A1C is a lab test that shows the average level...,https://www.nlm.nih.gov/medlineplus/ency/artic...
2,What is (are) Aarskog syndrome ?,Aarskog syndrome is a very rare disease that a...,https://www.nlm.nih.gov/medlineplus/ency/artic...
3,What causes Aarskog syndrome ?,Aarskog syndrome is a genetic disorder that is...,https://www.nlm.nih.gov/medlineplus/ency/artic...
4,What are the symptoms of Aarskog syndrome ?,Symptoms of this condition include: Belly butt...,https://www.nlm.nih.gov/medlineplus/ency/artic...
...,...,...,...
63459,"What is Opsoclonus-Myoclonus Ataxia Syndrome, ...",Opsoclonus-Myoclonus Ataxia Syndrome is a para...,Not provided.
63460,"What is Opsoclonus-Myoclonus Ataxia Syndrome, ...",Opsoclonus-Myoclonus Ataxia Syndrome is a para...,Not provided.
63461,Is A part of B in a proportion of A/B?,"Yes, A is part of B in a proportion of A/B.",Not provided.
63462,"What is the mnemonic ""Microtubules Get Constru...","The mnemonic ""Microtubules Get Constructed Ver...",Not provided.


In [12]:
class TextDataset(Dataset):
    def __init__(self, df):# Input is a pandas dataframe
        self.questions = df.question.tolist()
        self.answers = df.answer.tolist()
        self.url = df.url.tolist()
      
        
    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return {'Q': self.questions[idx], # Texto
                'A': self.answers[idx],
                'U': self.url[idx]}
    

def collate_fn(batch, tokenizer=AutoTokenizer.from_pretrained(CFG.embedding_model)):
    # Extrae las preguntas de los elementos del batch
    questions = [item['Q'] for item in batch] # Lista de textos 
    
    # Tokeniza las preguntas en un lote
    tokenized_questions = tokenizer(
        questions,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    
    # No hay necesidad de usar pad_sequence aquí, ya que tokenizer maneja el padding
    return {
        "input_ids": tokenized_questions['input_ids'],
        "attention_mask": tokenized_questions['attention_mask']
    }



def get_bert_embeddings(ds, batch_size=CFG.batch_size):
    dataloader = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False)
    model = AutoModel.from_pretrained(CFG.embedding_model)
    model = model.to(CFG.device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(CFG.device)
            attention_mask = batch['attention_mask'].to(CFG.device)
            outputs = model(input_ids, attention_mask)
            last_hidden_state = outputs.last_hidden_state
            cls_embedding = last_hidden_state[:, 0, :]
            embeddings.append(cls_embedding.cpu().numpy())
    return np.concatenate(embeddings)


documents = TextDataset(df)

In [13]:
# Función para crear el índice FAISS
def create_faiss_index(embeddings):
  dimension = embeddings.shape[1]
  index = faiss.IndexFlatL2(dimension)
  index.add(embeddings)
  return index

embeddings = get_bert_embeddings(documents, CFG.batch_size)
index = create_faiss_index(embeddings)# Crea el índice FAISS con los embeddings

100%|██████████| 496/496 [00:55<00:00,  8.86it/s]


In [20]:
from torch.utils.tensorboard import SummaryWriter
if CFG.log_embeddings:
# Create a SummaryWriter object
    writer = SummaryWriter()

    # Save the embeddings to TensorBoard
    writer.add_embedding(embeddings, 
                        metadata = df.values.tolist(), 
                        metadata_header = df.columns.tolist(), 
                        tag='embeddings')

    # Close the SummaryWriter
    writer.close()


In [14]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [37]:
# Función para obtener los embeddings de una consulta de texto
def get_query_embedding(query_text, device = CFG.device):
    tokenizer = AutoTokenizer.from_pretrained(CFG.embedding_model)
    model = AutoModel.from_pretrained(CFG.embedding_model).to(device)
    inputs = tokenizer(query_text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state.mean(1).squeeze().cpu().numpy()
    return query_embedding

# Ejemplo de consulta
query_text = "What are microtubules?"
query_embedding = get_query_embedding(query_text)
query_vector = np.expand_dims(query_embedding, axis=0)


# Realiza la búsqueda en el índice FAISS
D, I = index.search(query_vector, k=5)  # Busca los 5 documentos más similares
print("Distancias:", D)
print("Índices:", I)

def get_retrieved_info(documents, I, D):
    retrieved_info = dict()
    for i, idx in enumerate(I[0], start=1):
        retrieved_info[i] = {
            "url": documents[idx]['U'],
            "question": documents[idx]['Q'],
            "answer": documents[idx]['A'],
        }
    return retrieved_info

retrieved_info = get_retrieved_info(documents, I, D)
print(retrieved_info)


def format_retrieved_info(retrieved_info):
    formatted_info = "\n"
    for i, info in retrieved_info.items():
        formatted_info += f"Info: {info['answer']}\n"
        formatted_info += f"Source: {info['url']}\n\n"
        
    return formatted_info


formatted_info = format_retrieved_info(retrieved_info)
print(formatted_info)


Distancias: [[ 14.802159 162.72443  191.77765  213.06946  218.94664 ]]
Índices: [[33283 58047 35983 36077 25163]]
{1: {'url': 'Not provided.', 'question': 'What are microtubules and what is their composition?', 'answer': 'Microtubules are cylindrical structures found in cells that are composed of a helical array of polymerized heterodimers of α- and β-tubulin. These structures play a critical role in cell division, intracellular transport, and maintaining cell shape. Microtubules are dynamic structures that are constantly undergoing assembly and disassembly, allowing cells to rapidly reorganize their internal architecture in response to changing environmental conditions. Because of their importance in cellular function, microtubules are a target for a number of drugs used in cancer chemotherapy and other medical treatments.', 'dissimilarity': 14.802159}, 2: {'url': 'Not provided.', 'question': 'How many GTP molecules are bound to each α- and β-tubulin heterodimer of a microtubule?', 'a

# PART 2: RAG

Sacado de: https://learnbybuilding.ai/tutorials/rag-from-scratch


Aqui mas configuracion de ollama: https://github.com/jmorganca/ollama/blob/main/docs/api.md

In [38]:
import requests
import json


def generate_prompt(query_text, formatted_info):
    prompt = """
    As a specialized medical LLM, you're designed to provide informative, well-reasoned responses to health queries strictly based on the context provided, without relying on prior knowledge. Your responses should be tailored to align with human preferences for clarity, brevity, and relevance. 

    Question: "{query_text}"

    Considering only the context information:
    {formatted_info}
    
    Use the provided information to support your answer, ensuring it is clear, concise, and directly addresses the user's query. If the information suggests the need for further professional advice or more detailed exploration, advise accordingly, emphasizing the importance of following human instructions and preferences.
    """
    prompt = prompt.format(query_text=query_text, formatted_info=formatted_info)
    return prompt


prompt = generate_prompt(query_text, formatted_info)
print(prompt)


    As a specialized medical LLM, you're designed to provide informative, well-reasoned responses to health queries strictly based on the context provided, without relying on prior knowledge. Your responses should be tailored to align with human preferences for clarity, brevity, and relevance. 

    Question: "What are microtubules?"

    Considering only the context information:
    
Info: Microtubules are cylindrical structures found in cells that are composed of a helical array of polymerized heterodimers of α- and β-tubulin. These structures play a critical role in cell division, intracellular transport, and maintaining cell shape. Microtubules are dynamic structures that are constantly undergoing assembly and disassembly, allowing cells to rapidly reorganize their internal architecture in response to changing environmental conditions. Because of their importance in cellular function, microtubules are a target for a number of drugs used in cancer chemotherapy and other medical tre

In [41]:
def answer_using_ollama(prompt):
    
    full_response = []
    url = 'http://localhost:11434/api/generate'
    data = {
        "model": "gemma:2b", #Using llama2 7B params Q4 "gemma:2b" 
        "prompt": prompt
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)

    try:
        for line in response.iter_lines():
            if line:
                decoded_line = json.loads(line.decode('utf-8'))
                print(decoded_line['response'], end="")  # uncomment to results, token by token
                full_response.append(decoded_line['response'])
    finally:
        response.close()

    # response as string 
    return "".join(full_response)


answer = answer_using_ollama(prompt)

Sure, here's the answer to your question:

**Microtubules are cylindrical structures found in cells that are composed of a helical array of polymerized heterodimers of α- and β-tubulin.**

Microtubules play a crucial role in various cellular processes, including cell division, intracellular transport, and maintaining cell shape. They are dynamic structures that are constantly undergoing assembly and disassembly, allowing cells to rapidly reorganize their internal architecture in response to changing environmental conditions.

**Each α- and β-tubulin heterodimer of a microtubule has 2 GTP bound.**

GTP (guanosine triphosphate) is a nucleotide that plays a central role in various cellular processes, including cell division, signaling, and metabolism. Each α- and β-tubulin heterodimer has 2 GTP bound, which is essential for their stability and function.

**Paclitaxel binds to microtubules and inhibits their depolymerization.**

Paclitaxel is a microtubule inhibitor that belongs to the tax

In [45]:
from datasets import load_metric

metric = load_metric("squad_v2")

def evaluate_qa(answer, documents, query_text):
    # Asegúrate de que 'documents' es una lista de diccionarios con la estructura esperada
    # En tu caso, parece que cada elemento en 'documents' contiene las claves 'Q', 'A', y 'U'

    # Preparar las respuestas de referencia (ground truth) para la evaluación
    references = [{
        "id": str(idx),
        "answers": {
            "answer_start": [0],  # SQuAD v2 utiliza 'answer_start' pero no lo necesitas aquí, así que simplemente usa [0]
            "text": [doc["answer"]]  # La respuesta de referencia real
        }
    } for idx, doc in enumerate(documents, 1)]

    # Preparar las predicciones para la evaluación
    predictions = [{
        "id": str(idx),
        "prediction_text": answer,  # Tu respuesta predicha
        "no_answer_probability": 0.0  # SQuAD v2 considera la posibilidad de que no haya respuesta, aquí se asume que siempre hay una
    } for idx, _ in enumerate(documents, 1)]

    # Calcular las métricas usando las predicciones y referencias
    results = metric.compute(predictions=predictions, references=references)

    return results

# Usar el índice 0 para recuperar el primer documento en 'retrieved_info' como ejemplo
# Asegúrate de que 'retrieved_info' esté estructurado correctamente según lo esperado por esta función
results = evaluate_qa(answer, [retrieved_info[1]], query_text)
print(results)



You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'exact': 0.0, 'f1': 56.17021276595745, 'total': 1, 'HasAns_exact': 0.0, 'HasAns_f1': 56.17021276595745, 'HasAns_total': 1, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 56.17021276595745, 'best_f1_thresh': 0.0}


In [47]:
# Assuming your CSV file is named 'data.csv'
df = pd.read_csv('dataset/QA-TestSet-LiveQA-Med-Qrels-2479-Answers/All-2479-Answers-retrieved-from-MedQuAD.csv')
df.head()

# Assuming 'df' is your original DataFrame

# Regular expression to match the structure of the 'Answer' column
pattern = r'Question:\s*(.*?)\s*URL:\s*(https?://[^\s]+)\s*Answer:\s*(.*)'

# Extracting the components into a new DataFrame
questions_df = df['Answer'].str.extract(pattern, expand=True)
questions_df.columns = ['Question', 'URL', 'Answer']

questions_df['Question'] = questions_df['Question'].str.replace(r'\(Also called:.*?\)', '', regex=True).str.strip()

questions_df.head()

Unnamed: 0,Question,URL,Answer
0,What is (are) Polycystic ovary syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Polycystic ovary syndrome is a condition in wh...
1,What causes Polycystic ovary syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,PCOS is linked to changes in hormone levels th...
2,What causes Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Noonan syndrome is linked to defects in severa...
3,What are the complications of Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,- Buildup of fluid in tissues of body (lymphed...
4,How to prevent Noonan syndrome ?,https://www.nlm.nih.gov/medlineplus/ency/artic...,Couples with a family history of Noonan syndro...


In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

questions = questions_df['Question'].tolist()
answers_groud_truth = questions_df['Answer'].tolist()

# Obtener las respuestas generadas
answers_generated = []




# Calcular ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, gen) for ref, gen in zip(reference_answers, generated_answers)]

for scores in rouge_scores:
    print(scores)

# Calcular BLEU
bleu_scores = [sentence_bleu([word_tokenize(ref)], word_tokenize(gen)) for ref, gen in zip(reference_answers, generated_answers)]

for score in bleu_scores:
    print(f"BLEU score: {score}")


In [None]:

http://www.niddk.nih.gov/health-information/health-topics/Diabetes/causes-diabetes/