## Librerías a Usar


In [3]:
!pip install -q \
  torch==2.2.1 \
  pyodbc \
  langdetect \
  datasets \
  transformers \
  accelerate \
  einops \
  langchain \
  xformers bitsandbytes \
  faiss-gpu \
  sentence_transformers \
  dill==0.3.7 \
  multiprocess==0.70.15 \
  googletrans==3.1.0a0 \
  torchaudio \
  torchtext \
  torchvision

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.3 requires httpx>=0.27.0, but you have httpx 0.13.3 which is incompatible.
fastapi 0.111.0 requires httpx>=0.23.0, but you have httpx 0.13.3 which is incompatible.[0m[31m
[0m

In [4]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [5]:
!pip show bitsandbytes

Name: bitsandbytes
Version: 0.43.1
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/TimDettmers/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, torch
Required-by: 


In [6]:
!pip show transformers

Name: transformers
Version: 4.41.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: sentence-transformers


In [7]:
!pip install langchain_community



In [8]:
import os
import warnings
from datetime import datetime

import re
import pyodbc
import spacy

import numpy as np
import pandas as pd

import nltk
import openpyxl

import wordcloud
from wordcloud import WordCloud

import matplotlib.pyplot as plt

from langdetect import detect

import unicodedata

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

from string import digits

import torch
from torch import cuda, bfloat16

import transformers
from transformers import pipeline, StoppingCriteria, StoppingCriteriaList

from langchain import HuggingFacePipeline, PromptTemplate, LLMChain

from datasets import Dataset

from tqdm import tqdm
from google.colab import drive

from googletrans import Translator, constants
from pprint import pprint

In [9]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('spanish'))
def preprocess_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# MODELO Mistral

## Parámetros y funciones

Define Tokenizador

Define stop_token_ids y stopping_criteria

Crea el modelo: modelo = cargar_modelo()

Crea el pipeline

In [10]:
MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.2'
HF_AUTH = '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'
STOP_LIST = ['\n```\n']
DEVICE = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
print(DEVICE)

cuda:0


In [11]:
# Definir tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(
    MODEL_ID,
    token=HF_AUTH
)

In [12]:
# Function to define stop tokens
def define_stop_tokens(stop_list: list[str],
                       tokenizer: transformers.AutoTokenizer,
                       device: str) -> list[torch.LongTensor]:
    stop_token_ids = [tokenizer(stop_word)['input_ids'] for stop_word in stop_list]
    stop_token_tensors = [torch.LongTensor(token_ids).to(device) for token_ids in stop_token_ids]
    return stop_token_tensors

# Define custom stopping criteria object
class StopOnTokens(StoppingCriteria):

  def __init__(self, stop_token_ids):
    self.stop_token_ids = stop_token_ids

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
      for stop_ids in self.stop_token_ids:
          if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
              return True
      return False

stop_token_ids = define_stop_tokens(STOP_LIST, tokenizer, DEVICE)
stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_token_ids)])

In [13]:
!pip install accelerate



In [14]:
def cargar_modelo() -> transformers.AutoModelForCausalLM:
  # Set quantization configuration to load large model with less GPU memory
  # This requires the `bitsandbytes` library
  bnb_config = transformers.BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=bfloat16
  )

  # Begin initializing HF items, you need an access token
  modelo_config = transformers.AutoConfig.from_pretrained(
      MODEL_ID,
      token=HF_AUTH
  )

  # Cargar transformer
  modelo = transformers.AutoModelForCausalLM.from_pretrained(
      MODEL_ID,
      trust_remote_code=True,
      config=modelo_config,
      quantization_config=bnb_config,
      device_map='auto',
      token=HF_AUTH
  )

  # Enable evaluation mode to allow model inference
  modelo.eval()

  return modelo

modelo = cargar_modelo()



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
def crear_pipeline(
    modelo: transformers.AutoModelForCausalLM,
    tokenizer: transformers.AutoTokenizer,
    stopping_criteria: StoppingCriteriaList) -> HuggingFacePipeline:

  # Definir pipeline
  pipe = transformers.pipeline(
    model=modelo,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    stopping_criteria=stopping_criteria, # without this model rambles during chat
    temperature=0.05,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=300,  # max number of tokens to generate in the output
    repetition_penalty=1.1,  # without this output begins repeating
    do_sample=True, # Añadido para el mistral
    eos_token_id=tokenizer.eos_token_id, # Añadido para el mistral
    pad_token_id=tokenizer.eos_token_id # Añadido para el mistral
  )

  # Definir LLM
  llm = HuggingFacePipeline(pipeline = pipe)

  return llm

llm = crear_pipeline(modelo=modelo, tokenizer=tokenizer, stopping_criteria=stopping_criteria)

  warn_deprecated(


## RAG

### RAG LangChain

En esta seccion se generan los chunks y se guardan en la base de datos vectorial. Se volverá a usar este contenido al llamar al retriever de la base de datos vectorial (retriever from the vector store. The retriever is used to search for similar chunks based on queries).

#### DocumentLoader: docs; Procesamiento de la base de datos de Códigos oficiales.

- Carga de base de datos de codigos oficiales ICD10 y generación de documentos para preparar la VDB.

In [16]:
!pip install langchain



In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
csv_path = '/content/drive/MyDrive/Data/codigos_cie_format.csv'

In [19]:
import re
import pandas as pd


In [20]:
df_officialcodes = pd.read_csv(csv_path)
df_officialcodes['Code'] = df_officialcodes['Code'].str.strip()
df_officialcodes['Description'] = df_officialcodes['Description'].str.strip()
df_officialcodes = df_officialcodes.dropna(subset=['Code', 'Description'])
df_officialcodes = df_officialcodes.reset_index(drop=True)
df_officialcodes.head()

Unnamed: 0,Code,Description
0,A00.0,"cholera due to vibrio cholerae 01, biovar chol..."
1,A00.1,"cholera due to vibrio cholerae 01, biovar eltor"
2,A00.9,"cholera, unspecified"
3,A01.00,"typhoid fever, unspecified"
4,A01.01,typhoid meningitis


In [21]:
df_officialcodes.shape

(22903, 2)

In [22]:
code = df_officialcodes["Code"].tolist()
description = df_officialcodes["Description"].tolist()

new_unique_id = [f"{x}_{y}" for x, y in zip(code, description)]

In [23]:
from langchain.docstore.document import Document
docs = [
        Document(page_content=x, metadata={"ids": id}) for x, id in zip(description, new_unique_id)
    ]

In [24]:
docs

[Document(page_content='cholera due to vibrio cholerae 01, biovar cholerae', metadata={'ids': 'A00.0_cholera due to vibrio cholerae 01, biovar cholerae'}),
 Document(page_content='cholera due to vibrio cholerae 01, biovar eltor', metadata={'ids': 'A00.1_cholera due to vibrio cholerae 01, biovar eltor'}),
 Document(page_content='cholera, unspecified', metadata={'ids': 'A00.9_cholera, unspecified'}),
 Document(page_content='typhoid fever, unspecified', metadata={'ids': 'A01.00_typhoid fever, unspecified'}),
 Document(page_content='typhoid meningitis', metadata={'ids': 'A01.01_typhoid meningitis'}),
 Document(page_content='typhoid fever with heart involvement', metadata={'ids': 'A01.02_typhoid fever with heart involvement'}),
 Document(page_content='typhoid pneumonia', metadata={'ids': 'A01.03_typhoid pneumonia'}),
 Document(page_content='typhoid arthritis', metadata={'ids': 'A01.04_typhoid arthritis'}),
 Document(page_content='typhoid osteomyelitis', metadata={'ids': 'A01.05_typhoid oste

In [25]:
len(docs)

22903

#### Text Splitter y VDB Chroma

- Carga de modelo de embeddings
- Procesamiento y generación de chunks: Chroma VDB
- `vectorstore`y `retriever`

In [26]:
!pip install chromadb

Collecting httpx>=0.27.0 (from chromadb)
  Using cached httpx-0.27.0-py3-none-any.whl (75 kB)
Collecting httpcore==1.* (from httpx>=0.27.0->chromadb)
  Using cached httpcore-1.0.5-py3-none-any.whl (77 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx>=0.27.0->chromadb)
  Using cached h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: h11, httpcore, httpx
  Attempting uninstall: h11
    Found existing installation: h11 0.9.0
    Uninstalling h11-0.9.0:
      Successfully uninstalled h11-0.9.0
  Attempting uninstall: httpcore
    Found existing installation: httpcore 0.9.1
    Uninstalling httpcore-0.9.1:
      Successfully uninstalled httpcore-0.9.1
  Attempting uninstall: httpx
    Found existing installation: httpx 0.13.3
    Uninstalling httpx-0.13.3:
      Successfully uninstalled httpx-0.13.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency

In [27]:
!pip install sentence-transformers



In [28]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
embedder = SentenceTransformerEmbeddings(
        # model_name='pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
        # model_name = 'medicalai/ClinicalBERT'
        model_name = 'bert-base-nli-mean-tokens'
    )



In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain_chroma import Chroma
from langchain_community.vectorstores.chroma import Chroma

#Text splitter: Determinar el mejor tamaño de chunks (con este salen 14 chunks, mismo num de registros)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

In [30]:
len(splits)

23474

In [31]:
splits

[Document(page_content='cholera due to vibrio cholerae 01, biovar cholerae', metadata={'ids': 'A00.0_cholera due to vibrio cholerae 01, biovar cholerae'}),
 Document(page_content='cholera due to vibrio cholerae 01, biovar eltor', metadata={'ids': 'A00.1_cholera due to vibrio cholerae 01, biovar eltor'}),
 Document(page_content='cholera, unspecified', metadata={'ids': 'A00.9_cholera, unspecified'}),
 Document(page_content='typhoid fever, unspecified', metadata={'ids': 'A01.00_typhoid fever, unspecified'}),
 Document(page_content='typhoid meningitis', metadata={'ids': 'A01.01_typhoid meningitis'}),
 Document(page_content='typhoid fever with heart involvement', metadata={'ids': 'A01.02_typhoid fever with heart involvement'}),
 Document(page_content='typhoid pneumonia', metadata={'ids': 'A01.03_typhoid pneumonia'}),
 Document(page_content='typhoid arthritis', metadata={'ids': 'A01.04_typhoid arthritis'}),
 Document(page_content='typhoid osteomyelitis', metadata={'ids': 'A01.05_typhoid oste

In [32]:
print(len(splits[0].page_content))
print(splits[0].metadata)

50
{'ids': 'A00.0_cholera due to vibrio cholerae 01, biovar cholerae'}


In [33]:
len(splits)

23474

In [34]:
# VDB
vector_db_path = '/content/drive/My Drive/Data/vdb/'
vectorstore = Chroma.from_documents(documents=splits, embedding=embedder, persist_directory=vector_db_path)

In [35]:
# retriever: buscar chunks relevantes basandose en una query
retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) #ver params de búsqueda ('similarity')

### Prompt template

- template para RAG. input_variables: 'note' y 'context'

In [36]:
template = '''[INST] <<SYS>>

## CONTEXT

- The following chunks, extracted from the vectorial base, contain relevant information about symptoms and their corresponding ICD-10 codes:
{context}

## Task Description

Your task is to create a small realistic clinical note for a patient from a medical write-up given to you.
This note could include various clinical elements such as diagnostics, medications, and observations. It doesn't have to contain all types of concepts, maybe the note is only about a diagnostic.
The note can contain zero, one or multiple concepts that are codifiable by the ICD-10 codes.

By realistic, I mean that the note can contain few spelling mistakes, can lack overall structure and contain abbreviations and medical jargons.
The note should be quite short, one or two paragraphs maximum.

Your task consist of generating a table accompanying the note. Each row in the table should represent a concept, featuring a text excerpt from the note in one column and the corresponding ICD-10 code in the other column.
Columns names are "Text" and "ICD10". This table should only include ICD-10 codes mentioned in the note. If there are no relevant rows (no relevant concepts in the note), the table should be empty.
This table should be returned in CSV format with quotations mark around the text except.
Don't invent any ICD-10 codes.

To help you here is an example of what you should return. Please return something in the same format.
However the content should be completely different and should satisfy the above requirements.
Remember this is just an example to give you the format, pay very close attention to the requirements.


## EXAMPLES OF ANSWERS:

### EXAMPLE 1:
#### Clinical note:
'The patient presents progressive weakness in the muscles of the extremities, especially in the arms and legs.'
#### ICD-10 Coding Table:
'Text', 'ICD10'
'progressive weakness in the muscles of the extremities', 'M62.81'

### EXAMPLE 2:
#### Clinical Note:
Patient presented with persistent muscle weakness and generalized fatigue, noted over the last few months. Complains of frequent muscle cramps and has experienced increasing difficulty in speaking, leading to anarthria. No signs of involuntary movements or facial weakness. Reports dysphagia, predominantly with solids, which has progressively worsened. The patient's motor coordination is also affected, with noted ataxia. Currently on Riluzole and Nuedexta to manage symptoms.
#### ICD-10 Coding Table:
"Text","ICD10"
"muscle weakness","M62.81"
"generalized fatigue","R53.83"
"muscle cramps","R25.2"
"difficulty in speaking, anarthria","R47.1"
"dysphagia","R13.1"
"motor coordination...ataxia","R27.0"


<</SYS>>
## NOTE:
```
{nota}
```
[/INST]'''

### Lectura y preprocesamiento de las notas

Carga de base de datos con notas y datos de validación: df_0

Crea dataframe de notas e índices: df_notes

Crea dataframe de prueba (con 3 notas): df_prueba



In [37]:
# from google.colab import drive
# drive.mount('/content/drive')

In [38]:
# csv_path = '/content/drive/MyDrive/Data/ntt_synthetic_data_150.csv'
# csv_path = '/content/drive/MyDrive/Data/Databases/NTT_synthetic_data/ntt_synthetic_data.csv'
# csv_path = '/content/drive/MyDrive/Data/Databases/DS_v1/DS_v1.csv'
csv_path = '/content/drive/MyDrive/Data/Databases/NTT_synthetic_data/150_notes/ntt_synthetic_data_150.csv'

In [39]:
def detect_separator(filepath):
  with open(filepath, 'r') as file:
    first_line = file.readline()
    if ';' in first_line:
      return ';'
    elif ',' in first_line:
      return ','
    else:
      raise ValueError("Unknown delimiter")

def load_csv(filepath):
  separator = detect_separator(filepath)
  df = pd.read_csv(filepath, sep=separator, encoding='utf-8-sig', header=0)
  return df

In [40]:
df_0 = load_csv(csv_path)

In [41]:
df_0.head()

Unnamed: 0,File,Text,ICD10,ICD10Description,Notes
0,1,difficulty in breathing and shortness of breath,R06.02,Breathing difficulty,Pt reports ongoing difficulty in breathing and...
1,1,chronic respiratory failure,J96.1,Chronic respiratory failure,Pt reports ongoing difficulty in breathing and...
2,1,dysarthria,R47.1,Dysarthria and anarthria,Pt reports ongoing difficulty in breathing and...
3,1,anxiety and panic attacks,F41.0,Panic disorder,Pt reports ongoing difficulty in breathing and...
4,1,dizziness,R42,Dizziness and syncope,Pt reports ongoing difficulty in breathing and...


In [42]:
df_notes = df_0.copy()

In [43]:
df_notes.rename(columns={'File': 'id_note', 'Notes': 'Note'}, inplace=True)

In [44]:
df_notes = df_notes[['id_note', 'Note']]
df_notes.shape

(1197, 2)

In [45]:
df_notes.drop_duplicates(inplace=True)
df_notes.shape

(150, 2)

In [46]:
df_notes.columns = df_notes.columns.str.strip()  # Eliminar espacios en blanco alrededor de los nombres de las columnas
df_notes['Note'] = df_notes['Note'].str.strip().str.lower()

In [47]:
df_notes.head()
df_notes.reset_index(inplace=True)
df_notes = df_notes[['id_note', 'Note']]

In [48]:
df_notes

Unnamed: 0,id_note,Note
0,1,pt reports ongoing difficulty in breathing and...
1,2,patient reports experiencing significant diffi...
2,3,pt. reports persistent muscl weakness and has ...
3,4,patient is experiencing persistent muscle weak...
4,5,pt. has reported severe muscle cramps and spas...
...,...,...
145,146,the pt has been experiencing severe muscle wkn...
146,147,pt presented today c/o severe difficulty swall...
147,148,pt. reports new onset of insomnia and persiste...
148,149,the pt. has been feeling increased muscle weak...


In [49]:
df_prueba = df_notes.loc[:2]
df_prueba

Unnamed: 0,id_note,Note
0,1,pt reports ongoing difficulty in breathing and...
1,2,patient reports experiencing significant diffi...
2,3,pt. reports persistent muscl weakness and has ...


### DEMO RAG LangChain

- Funciones auxiliares:
  - parse_docs: formatear texto de los docs de la VDB
  - crear_contexto: obtener los 15 docs más relevantes de la VDB con retriever
  - crear_prompt
  - identificar_codigoscie: ejecutar llm

- Demo para un ejemplo de prompt

In [50]:
# para configurar el contexto que se va a pasar al prompt (una vez obtenido el contenido relevante)
def parse_docs(docs):
  texts = ""
  for doc in docs:
      merged_cie10_id = doc.metadata.get("ids")
      cie10_id = merged_cie10_id.split("_")[1].capitalize()
      texts += f"\n{doc.page_content} ({cie10_id})"# string que contiene todas las descriptions separadas por \n
  return texts

def crear_contexto(nota:str)-> str:
  retrieved_context = retriever.get_relevant_documents(nota, top_k=5) #busca los 5 mejores chunks de la VDB antes creada
  context_text = parse_docs(retrieved_context)
  return context_text

def crear_prompt(template: str) -> PromptTemplate:
  return PromptTemplate(template=template, input_variables=["nota", 'context'])

def identificar_codigoscie(nota: str, llm_chain: HuggingFacePipeline) -> str:
  respuesta = llm_chain.run(nota).strip()
  return respuesta

In [51]:
# DEMO
nota = df_prueba['Note'][0]
print(nota)

pt reports ongoing difficulty in breathing and shortness of breath during physical activities. there's been a notable increase in muscle weakness, especially in the limbs. also complains of frequent spasms and involuntary movements. during the examination, dysarthria was observed, affecting pt's ability to communicate clearly. pt also mentioned episodes of dizziness and orthostatic hypotension. additionally, the pt has experienced chronic respiratory failure and requires assistance for mobility. pt's emotional state is impacted, with signs of anxiety and panic attacks.


In [52]:
context_text = crear_contexto(nota)
print('5 chunks más relevantes :',context_text)

5 chunks más relevantes : 
pre-existing hypertensive heart and chronic kidney disease complicating the puerperium (Pre-existing hypertensive heart and chronic kidney disease complicating the puerperium)
neonatal jaundice due to other specified excessive hemolysis (Neonatal jaundice due to other specified excessive hemolysis)
chronic venous hypertension (idiopathic) with other complications of right lower extremity (Chronic venous hypertension (idiopathic) with other complications of right lower extremity)
pre-existing hypertensive heart and chronic kidney disease complicating childbirth (Pre-existing hypertensive heart and chronic kidney disease complicating childbirth)
pre-existing hypertensive heart disease complicating the puerperium (Pre-existing hypertensive heart disease complicating the puerperium)


  warn_deprecated(


In [53]:
prompt = crear_prompt(template=template)

In [54]:
# para ver cómo queda el prompt
prompt_formatted_str: str = prompt.format(
    context=context_text,
    nota = nota)
print('Prompt final: \n', prompt_formatted_str)

Prompt final: 
 [INST] <<SYS>>

## CONTEXT

- The following chunks, extracted from the vectorial base, contain relevant information about symptoms and their corresponding ICD-10 codes:

pre-existing hypertensive heart and chronic kidney disease complicating the puerperium (Pre-existing hypertensive heart and chronic kidney disease complicating the puerperium)
neonatal jaundice due to other specified excessive hemolysis (Neonatal jaundice due to other specified excessive hemolysis)
chronic venous hypertension (idiopathic) with other complications of right lower extremity (Chronic venous hypertension (idiopathic) with other complications of right lower extremity)
pre-existing hypertensive heart and chronic kidney disease complicating childbirth (Pre-existing hypertensive heart and chronic kidney disease complicating childbirth)
pre-existing hypertensive heart disease complicating the puerperium (Pre-existing hypertensive heart disease complicating the puerperium)

## Task Description

Yo

In [55]:
llm_chain = LLMChain(prompt=prompt, llm=llm)

  warn_deprecated(


In [56]:
from langchain.schema.runnable import RunnablePassthrough
rag_chain = (
 {"context": retriever, "nota": RunnablePassthrough()}
    | llm_chain
)

In [57]:
respuesta = rag_chain.invoke(nota)

In [58]:
final_response=respuesta['text']

In [59]:
note_match = re.search(r'```(.*?)```', final_response, re.DOTALL)
note = note_match.group(1).strip() if note_match else ""
note

"pt reports ongoing difficulty in breathing and shortness of breath during physical activities. there's been a notable increase in muscle weakness, especially in the limbs. also complains of frequent spasms and involuntary movements. during the examination, dysarthria was observed, affecting pt's ability to communicate clearly. pt also mentioned episodes of dizziness and orthostatic hypotension. additionally, the pt has experienced chronic respiratory failure and requires assistance for mobility. pt's emotional state is impacted, with signs of anxiety and panic attacks."

In [60]:
concepts_icd10_part = final_response.split('```\n[/INST]')[-1].strip()
concepts_icd10_part


'"Text","ICD10"\n"difficulty in breathing, shortness of breath during physical activities","J46.0"\n"increased muscle weakness, especially in limbs","M62.81"\n"frequent spasms","R29.2"\n"involuntary movements","G26.0"\n"dysarthria, impaired ability to communicate clearly","R47.1"\n"episodes of dizziness","R52.1"\n"orthostatic hypotension","R54.8"\n"chronic respiratory failure","J96.0"\n"assistance for mobility","Z92.2"\n"anxiety, panic attacks","F41.1"'

In [61]:
lines = [line.strip().strip('"') for line in concepts_icd10_part.split('\n')[1:] if line.strip()]
lines

['difficulty in breathing, shortness of breath during physical activities","J46.0',
 'increased muscle weakness, especially in limbs","M62.81',
 'frequent spasms","R29.2',
 'involuntary movements","G26.0',
 'dysarthria, impaired ability to communicate clearly","R47.1',
 'episodes of dizziness","R52.1',
 'orthostatic hypotension","R54.8',
 'chronic respiratory failure","J96.0',
 'assistance for mobility","Z92.2',
 'anxiety, panic attacks","F41.1']

In [62]:
concepts = []
icd10_codes = []
for line in lines:
    parts = line.split('","')
    if len(parts) == 2:
        concept = parts[0].strip('"')
        icd10_code = parts[1].strip('"')
        concepts.append(concept)
        icd10_codes.append(icd10_code)

In [63]:
concepts

['difficulty in breathing, shortness of breath during physical activities',
 'increased muscle weakness, especially in limbs',
 'frequent spasms',
 'involuntary movements',
 'dysarthria, impaired ability to communicate clearly',
 'episodes of dizziness',
 'orthostatic hypotension',
 'chronic respiratory failure',
 'assistance for mobility',
 'anxiety, panic attacks']

In [64]:
icd10_codes

['J46.0',
 'M62.81',
 'R29.2',
 'G26.0',
 'R47.1',
 'R52.1',
 'R54.8',
 'J96.0',
 'Z92.2',
 'F41.1']

In [65]:
from ast import literal_eval
context_match = re.search(r'\[Document\(page_content=.*\)\]', final_response, re.DOTALL)
context_str = context_match.group(0) if context_match else ""
relevant_docs = re.findall(r'Document\(page_content=.*?metadata=\{.*?\}\)', context_str)

In [66]:
relevant_docs

["Document(page_content='pre-existing hypertensive heart and chronic kidney disease complicating the puerperium', metadata={'ids': 'O10.33_pre-existing hypertensive heart and chronic kidney disease complicating the puerperium'})",
 "Document(page_content='neonatal jaundice due to other specified excessive hemolysis', metadata={'ids': 'P58.8_neonatal jaundice due to other specified excessive hemolysis'})",
 "Document(page_content='chronic venous hypertension (idiopathic) with other complications of right lower extremity', metadata={'ids': 'I87.391_chronic venous hypertension (idiopathic) with other complications of right lower extremity'})",
 "Document(page_content='pre-existing hypertensive heart and chronic kidney disease complicating childbirth', metadata={'ids': 'O10.32_pre-existing hypertensive heart and chronic kidney disease complicating childbirth'})",
 "Document(page_content='pre-existing hypertensive heart disease complicating the puerperium', metadata={'ids': 'O10.13_pre-exis

### RAG Obtencion de resultados

- Funciones auxiliares:
  - parse_docs
  - crear_context
  - crear_prompt
  - parse_concepts_icd10: sacar de la respuesta del LLM los 'concepts' y los 'ICD10'
- prompt_notes: lista de prompts
- df_relevant_chunks: guarda los chunks relevantes de cada nota
- result_json: json con los resultados del LLM

In [67]:
# para configurar el contexto que se va a pasar al prompt (una vez obtenido el contenido relevante)
def parse_docs(docs):
  texts = ""
  for doc in docs:
      merged_cie10_id = doc.metadata.get("ids")
      cie10_id = merged_cie10_id.split("_")[1].capitalize()
      texts += f"\n{doc.page_content} ({cie10_id})"# string que contiene todas las descriptions separadas por \n
  return texts

def crear_contexto(nota:str)-> str:
  retrieved_context = retriever.get_relevant_documents(nota, top_k=5) #busca los 5 mejores chunks de la VDB antes creada
  context_text = parse_docs(retrieved_context)
  return context_text

def crear_prompt(template: str) -> PromptTemplate:
  return PromptTemplate(template=template, input_variables=["nota", 'context'])

def parse_concepts_icd10(concept_icd10_str):
    lines = concept_icd10_str.strip().split('\n')[1:]  # Omitir la primera línea con los encabezados
    concepts = []
    icd10_codes = []
    for line in lines:
        parts = line.split('", "')
        if len(parts) == 2:
            concepts.append(parts[0].strip('"'))
            icd10_codes.append(parts[1].strip('"'))
    return concepts, icd10_codes


In [68]:
import json

prompt_notes = df_notes['Note'].tolist()

df_relevant_chunks = pd.DataFrame(columns= ['Note', 'docs'])

notes = []
relevant_chunks = []
concepts_icd10codes = []

result_dict = {}

for idx, fila in enumerate(prompt_notes):
  print(f'Obtaining {idx} ICD10 codes...')

  # crear un PromptTemplate
  prompt = crear_prompt(template=template)

  # crear una llm chain: sequence of operations that allows us to invoke a query
  llm_chain = LLMChain(prompt=prompt, llm=llm)

  # prompt específico
  nota = fila
  context_text = crear_contexto(nota)

  # obtener respuesta de Mistral
  rag_chain = (
    {"context": retriever, "nota": RunnablePassthrough()}
        | llm_chain
    )
  respuesta = rag_chain.invoke(nota)
  print('\nPrompt y Respuesta final: ', respuesta['text'])
  print('*'*15)

  # Procesamiento de respuesta
  final_response = respuesta['text']
  note_match = re.search(r'```(.*?)```', final_response, re.DOTALL)
  note = note_match.group(1).strip() if note_match else ""
  notes.append(note)

  concepts_icd10_part = final_response.split('```\n[/INST]')[-1].strip()
  concepts_icd10codes.append(concepts_icd10_part)

  # Obtener chunks seleccionados
  context_match = re.search(r'\[Document\(page_content=.*\)\]', final_response, re.DOTALL)
  context_str = context_match.group(0) if context_match else ""
  relevant_docs = re.findall(r'Document\(page_content=.*?metadata=\{.*?\}\)', context_str)
  relevant_chunks.append(relevant_docs)

  # Agregar al diccionario result_dict
  id_note = df_notes.loc[df_notes['Note'] == fila, 'id_note'].values[0]
  concepts, icd10_codes = parse_concepts_icd10(concepts_icd10_part)
  result_dict[str(id_note)] = {
        'Note': note,
        'Concepts': [{'Concept': concept, 'ICD10': icd10} for concept, icd10 in zip(concepts, icd10_codes)]
    }

result_json = json.dumps(result_dict, indent=4)
print(result_json)

df_relevant_chunks = pd.DataFrame({'id_note': df_notes['id_note'], 'relevant_chunks': relevant_chunks})


Obtaining 0 ICD10 codes...

Prompt y Respuesta final:  [INST] <<SYS>>

## CONTEXT

- The following chunks, extracted from the vectorial base, contain relevant information about symptoms and their corresponding ICD-10 codes:
[Document(page_content='pre-existing hypertensive heart and chronic kidney disease complicating the puerperium', metadata={'ids': 'O10.33_pre-existing hypertensive heart and chronic kidney disease complicating the puerperium'}), Document(page_content='neonatal jaundice due to other specified excessive hemolysis', metadata={'ids': 'P58.8_neonatal jaundice due to other specified excessive hemolysis'}), Document(page_content='chronic venous hypertension (idiopathic) with other complications of right lower extremity', metadata={'ids': 'I87.391_chronic venous hypertension (idiopathic) with other complications of right lower extremity'}), Document(page_content='pre-existing hypertensive heart and chronic kidney disease complicating childbirth', metadata={'ids': 'O10.32_pr

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
'The patient presents progressive weakness in the muscles of the extremities, especially in the arms and legs.'
#### ICD-10 Coding Table:
'Text', 'ICD10'
'progressive weakness in the muscles of the extremities', 'M62.81'

### EXAMPLE 2:
#### Clinical Note:
Patient presented with persistent muscle weakness and generalized fatigue, noted over the last few months. Complains of frequent muscle cramps and has experienced increasing difficulty in speaking, leading to anarthria. No signs of involuntary movements or facial weakness. Reports dysphagia, predominantly with solids, which has progressively worsened. The patient's motor coordination is also affected, with noted ataxia. Currently on Riluzole and Nuedexta to manage symptoms.
#### ICD-10 Coding Table:
"Text","ICD10"
"muscle weakness","M62.81"
"generalized fatigue","R53.83"
"muscle cramps","R25.2"
"difficulty in speaking, anarthria","R47.1"
"dysphagia","R13.1"
"m

In [69]:
df_relevant_chunks

Unnamed: 0,id_note,relevant_chunks
0,1,[Document(page_content='pre-existing hypertens...
1,2,[Document(page_content='pre-existing hypertens...
2,3,[Document(page_content='whooping cough due to ...
3,4,[Document(page_content='diabetes mellitus due ...
4,5,[Document(page_content='diabetes mellitus due ...
...,...,...
145,146,[Document(page_content='chronic venous hyperte...
146,147,[Document(page_content='chronic venous hyperte...
147,148,[Document(page_content='diabetes mellitus due ...
148,149,[Document(page_content='postprocedural cardiac...


In [70]:
id_notes = []
notes = []
concepts = []
icd10_codes = []

for id_note, content in result_dict.items():
    note = content['Note']
    for concept in content['Concepts']:
        id_notes.append(id_note)
        notes.append(note)
        concepts.append(concept['Concept'])
        icd10_codes.append(concept['ICD10'])

# Crear el DataFrame
df_response = pd.DataFrame({
    'id_note': id_notes,
    'Note': notes,
    'Concept': concepts,
    'ICD10': icd10_codes
})

df_response.head(2)

Unnamed: 0,id_note,Note,Concept,ICD10
0,2,patient reports experiencing significant diffi...,Text,ICD10
1,2,patient reports experiencing significant diffi...,"significant difficulty with coordination, part...",R27.0


### Descarga de resultados

In [71]:
from google.colab import drive

# Montar Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [72]:
import os
from datetime import datetime
# Obtener la fecha y hora actual
now = datetime.now()

# Formatear la fecha y hora actual según el formato deseado
date_time = now.strftime("%d%m%y_%H%M")
dataset = 'NTT_synthetic_data_150'
typeExe = 'BERT'

# json_path = f'/content/drive/My Drive/Data/Resultados_Mistral/response_{dataset}_Mistral_CIE10_{typeExe}_RAG_{date_time}.json'
json_path = f'/content/drive/My Drive/Data/response_{dataset}_Mistral_CIE10_{typeExe}_RAG_{date_time}.json'
with open(json_path, 'w') as f:
    f.write(result_json)

print(f'JSON guardado en {json_path}')

JSON guardado en /content/drive/My Drive/Data/response_NTT_synthetic_data_150_Mistral_CIE10_Mistral_RAG_250624_1142.json


In [73]:
# csv_path = f'/content/drive/My Drive/Data/Resultados_Mistral/response_{dataset}_Mistral_CIE10_{typeExe}_RAG_{date_time}.json'
csv_path = f'/content/drive/My Drive/Data/response_{dataset}_Mistral_CIE10_{typeExe}_RAG_{date_time}.csv'

# Guardar el DataFrame como un archivo CSV
df_response.to_csv(csv_path, index=False)

In [74]:
# csv_path = f'/content/drive/My Drive/Data/Resultados_Mistral/response_{dataset}_Mistral_CIE10_{typeExe}_RAG_{date_time}.json'
csv_reldocs_path = f'/content/drive/My Drive/Data/{dataset}_{typeExe}_relevantdocs.csv'

# Guardar el DataFrame como un archivo CSV
df_relevant_chunks.to_csv(csv_reldocs_path, index=False)