In [1]:
#librerias necesarias
import os
import pandas as pd
import numpy as np
import faiss
from faiss import write_index
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from config import CFG

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_processed_data(with_na=False, n_samples=None):
    # List the files in the processed_data directory
    files = os.listdir('dataset/processed_data_2')

    # Read the files into a dataframe
    for idx, file in enumerate(files):
        if idx == 0:
            df = pd.read_csv('dataset/processed_data_2/' + file, na_values=['', ' ', 'No information found.'])
        else:
            df = pd.concat([df, pd.read_csv('dataset/processed_data_2/' + file, na_values=['', ' ', 'No information found.'])], ignore_index=True)
    
    if not with_na:
        df = df.dropna()

    if n_samples is not None:
        df = df.sample(n_samples)

    return df

df = read_processed_data(with_na = CFG.with_na, n_samples=CFG.n_samples)

In [3]:
df

Unnamed: 0,question,question_id,question_type,answer,focus,id,source,url,cui,semanticType,semanticGroup
0,What is (are) A guide to clinical trials for c...,0000001-1,information,"If you have cancer, a clinical trial may be an...",A guide to clinical trials for cancer,1,ADAM,https://www.nlm.nih.gov/medlineplus/ency/patie...,C0006826,T191,Disorders
1,what research (or clinical trials) is being do...,0000001-2,research,,A guide to clinical trials for cancer,1,ADAM,https://www.nlm.nih.gov/medlineplus/ency/patie...,C0006826,T191,Disorders
2,what research (or clinical trials) is being do...,0000001-3,research,,A guide to clinical trials for cancer,1,ADAM,https://www.nlm.nih.gov/medlineplus/ency/patie...,C0006826,T191,Disorders
3,what research (or clinical trials) is being do...,0000001-6,research,,A guide to clinical trials for cancer,1,ADAM,https://www.nlm.nih.gov/medlineplus/ency/patie...,C0006826,T191,Disorders
4,what research (or clinical trials) is being do...,0000001-7,research,,A guide to clinical trials for cancer,1,ADAM,https://www.nlm.nih.gov/medlineplus/ency/patie...,C0006826,T191,Disorders
...,...,...,...,...,...,...,...,...,...,...,...
47242,What is (are) Parasites - Zoonotic Hookworm ?,0000440-1,information,"There are many different species of hookworms,...",Parasites - Zoonotic Hookworm,440.0,CDC,http://www.cdc.gov/parasites/zoonotichookworm/,,,
47243,Who is at risk for Parasites - Zoonotic Hookwo...,0000440-2,susceptibility,Dog and cat hookworms are found throughout the...,Parasites - Zoonotic Hookworm,440.0,CDC,http://www.cdc.gov/parasites/zoonotichookworm/,,,
47244,How to diagnose Parasites - Zoonotic Hookworm ?,0000440-5,exams and tests,Cutaneous larva migrans (CLM) is a clinical di...,Parasites - Zoonotic Hookworm,440.0,CDC,http://www.cdc.gov/parasites/zoonotichookworm/,,,
47245,What are the treatments for Parasites - Zoonot...,0000440-6,treatment,The zoonotic hookworm larvae that cause cutane...,Parasites - Zoonotic Hookworm,440.0,CDC,http://www.cdc.gov/parasites/zoonotichookworm/,,,


In [4]:
# Ver valores unicos de focus
print([el for el in list(df['focus'].unique())])
print([el for el in list(df['semanticType'].unique())])

['T191', nan, 'T033', 'T019', 'T047', 'T184', 'T037', 'T046', 'T005', 'T048', 'T007', 'T020', 'T190', 'T049', 'T028']


In [5]:
df.keys()

Index(['question', 'question_id', 'question_type', 'answer', 'focus', 'id',
       'source', 'url', 'cui', 'semanticType', 'semanticGroup'],
      dtype='object')

In [7]:
class TextDataset(Dataset):
    def __init__(self, df):# Input is a pandas dataframe
        self.questions = df.question.tolist()
        self.question_ids = df.question_id.tolist()
        self.question_types = df.question_type.tolist()
        self.answers = df.answer.tolist()
        self.focus = df.focus.tolist()
        self.doc_id = df.id.tolist()
        self.source = df.source.tolist()
        self.url = df.url.tolist()
        self.cui = df.cui.tolist()
        self.semantic_type = df.semanticType.tolist()
        self.semantic_group = df.semanticGroup.tolist()
        
    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return {'Q': self.questions[idx], # Texto
                'Q_id': self.question_ids[idx], 
                'Q_T': self.question_types[idx], 
                'A': self.answers[idx],
                'F': self.focus[idx],
                'D_id': self.doc_id[idx],
                'S': self.source[idx],
                'U': self.url[idx],
                'C': self.cui[idx],
                'S_T': self.semantic_type[idx],
                'S_G': self.semantic_group[idx]}
    

def collate_fn(batch, tokenizer=AutoTokenizer.from_pretrained(CFG.embedding_model)):
    # Extrae las preguntas de los elementos del batch
    questions = [item['Q'] for item in batch] # Lista de textos 
    
    # Tokeniza las preguntas en un lote
    tokenized_questions = tokenizer(
        questions,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512
    )
    
    # No hay necesidad de usar pad_sequence aquí, ya que tokenizer maneja el padding
    return {
        "input_ids": tokenized_questions['input_ids'],
        "attention_mask": tokenized_questions['attention_mask']
    }



def get_bert_embeddings(ds, batch_size=CFG.batch_size):
    dataloader = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False)
    model = AutoModel.from_pretrained(CFG.embedding_model)
    model = model.to(CFG.device)
    model.eval()
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(CFG.device)
            attention_mask = batch['attention_mask'].to(CFG.device)
            outputs = model(input_ids, attention_mask)
            last_hidden_state = outputs.last_hidden_state
            cls_embedding = last_hidden_state[:, 0, :]
            embeddings.append(cls_embedding.cpu().numpy())
    return np.concatenate(embeddings)


documents = TextDataset(df)

In [8]:
# Función para crear el índice FAISS
def create_faiss_index(embeddings):
  dimension = embeddings.shape[1]
  index = faiss.IndexFlatL2(dimension)
  index.add(embeddings)
  return index

embeddings = get_bert_embeddings(documents, CFG.batch_size)
index = create_faiss_index(embeddings)# Crea el índice FAISS con los embeddings

100%|██████████| 370/370 [01:06<00:00,  5.55it/s]


In [9]:
from torch.utils.tensorboard import SummaryWriter
if CFG.log_embeddings:
# Create a SummaryWriter object
    writer = SummaryWriter()

    # Save the embeddings to TensorBoard
    writer.add_embedding(embeddings, 
                        metadata = df.values.tolist(), 
                        metadata_header = df.columns.tolist(), 
                        tag='embeddings')

    # Close the SummaryWriter
    writer.close()


In [10]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [11]:
# Función para obtener los embeddings de una consulta de texto
def get_query_embedding(query_text, device = CFG.device):
    tokenizer = AutoTokenizer.from_pretrained(CFG.embedding_model)
    model = AutoModel.from_pretrained(CFG.embedding_model).to(device)
    inputs = tokenizer(query_text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    query_embedding = outputs.last_hidden_state.mean(1).squeeze().cpu().numpy()
    return query_embedding

# Ejemplo de consulta
query_text = "What is the cause of diabetes?"
query_embedding = get_query_embedding(query_text)
query_vector = np.expand_dims(query_embedding, axis=0)


# Realiza la búsqueda en el índice FAISS
D, I = index.search(query_vector, k=5)  # Busca los 5 documentos más similares
print("Distancias:", D)
print("Índices:", I)

def get_retrieved_info(documents, I, D):
    retrieved_info = dict()
    for i, idx in enumerate(I[0], start=1):
        retrieved_info[i] = {

            "url": documents[idx]['U'],
            "question": documents[idx]['Q'],
            "answer": documents[idx]['A'],
            "dissimilarity": D[0][i-1]
        }
    return retrieved_info

retrieved_info = get_retrieved_info(documents, I, D)
print(retrieved_info)


def format_retrieved_info(retrieved_info):
    formatted_info = "\n"
    for i, info in retrieved_info.items():
        
        formatted_info += f"Question: {info['question']}\n"
        formatted_info += f"Answer: {info['answer']}\n"
        formatted_info += f"Source: {info['url']}\n\n"
        # formatted_info += f"Dissimilarity: {info['dissimilarity']}\n\n"
    return formatted_info


formatted_info = format_retrieved_info(retrieved_info)
print(formatted_info)


Distancias: [[4.021392 4.021392 4.021392 4.021392 4.021392]]
Índices: [[43542 43543 43544 43545 43546]]
{1: {'url': 'http://www.niddk.nih.gov/health-information/health-topics/Diabetes/causes-diabetes/Pages/index.aspx#gestational', 'question': 'What causes Causes of Diabetes ?', 'answer': 'Type 1 diabetes is caused by a lack of insulin due to the destruction of insulin-producing beta cells in the pancreas. In type 1 diabetesan autoimmune diseasethe bodys immune system attacks and destroys the beta cells. Normally, the immune system protects the body from infection by identifying and destroying bacteria, viruses, and other potentially harmful foreign substances. But in autoimmune diseases, the immune system attacks the bodys own cells. In type 1 diabetes, beta cell destruction may take place over several years, but symptoms of the disease usually develop over a short period of time.                Type 1 diabetes typically occurs in children and young adults, though it can appear at any 

# PART 2: RAG

Sacado de: https://learnbybuilding.ai/tutorials/rag-from-scratch


Aqui mas configuracion de ollama: https://github.com/jmorganca/ollama/blob/main/docs/api.md

In [12]:
import requests
import json


def generate_prompt(query_text, formatted_info):
    prompt = """ 
    You are a medical sciences bot tailored for precision and succinctness. 
    Your programming dictates responding directly to the user's query with utmost brevity. 
    Your key task is to evaluate the user's question against your vast database of documents. 
    The lower the dissimilarity between the query and the document, the more emphasis you should place on that information in your response. 
    Your recommendation should be concise, backed by a URL to the most pertinent document for user reference, serving as proof of the recommendation's validity. 
    Swift and relevant information retrieval is your principal function.

    Given the user's question: {query_text}

    And taking into account the pertinent information: 
    {formatted_info}

    Formulate a targeted recommendation for the user. 
    The recommendation should be aligned closely with their query, and provide the source (url) of the selected info that has been provided.  
    """
    prompt = prompt.format(query_text=query_text, formatted_info=formatted_info)

    return prompt


prompt = generate_prompt(query_text, formatted_info)
print(prompt)

 
    You are a medical sciences bot tailored for precision and succinctness. 
    Your programming dictates responding directly to the user's query with utmost brevity. 
    Your key task is to evaluate the user's question against your vast database of documents. 
    The lower the dissimilarity between the query and the document, the more emphasis you should place on that information in your response. 
    Your recommendation should be concise, backed by a URL to the most pertinent document for user reference, serving as proof of the recommendation's validity. 
    Swift and relevant information retrieval is your principal function.

    Given the user's question: What is the cause of diabetes?

    And taking into account the pertinent information: 
    
Question: What causes Causes of Diabetes ?
Answer: Type 1 diabetes is caused by a lack of insulin due to the destruction of insulin-producing beta cells in the pancreas. In type 1 diabetesan autoimmune diseasethe bodys immune system

In [15]:
def answer_using_ollama(prompt):
    
    full_response = []
    url = 'http://localhost:11434/api/generate'
    data = {
        "model": "llama2", #Using llama2 7B params Q4
        "prompt": prompt
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=json.dumps(data), headers=headers, stream=True)

    try:
        for line in response.iter_lines():
            if line:
                decoded_line = json.loads(line.decode('utf-8'))
                print(decoded_line['response'], end="")  # uncomment to results, token by token
                full_response.append(decoded_line['response'])
    finally:
        response.close()

    # response as string 
    return "".join(full_response)



answer = answer_using_ollama(prompt)

     Based on the users query, I would recommend the following targeted information:

"It is important to understand the causes of diabetes in order to effectively manage the condition. There are several possible causes of diabetes, including genetic mutations affecting beta cells, insulin, and insulin action; damage or removal of the pancreas; endocrine diseases; autoimmune disorders; medications and chemical toxins; and lipodystrophy. Understanding the specific cause of diabetes can help individuals develop a personalized treatment plan and make informed lifestyle choices to manage their condition."

Source: http://www.niddk.nih.gov/health-information/health-topics/Diabetes/causes-diabetes/Pages/index.aspx#gestational