In [36]:
import chromadb
import json
import numpy as np
from typing import List, Dict, Any

# Initialize ChromaDB client
chroma_client = chromadb.Client()
print("ChromaDB client initialized")

ChromaDB client initialized


In [37]:
# Create or get entity collection without default embedding function
try:
    # Try to delete existing collection if it exists
    chroma_client.delete_collection(name="entity_collection")
    print("Deleted existing entity_collection")
except:
    print("No existing entity_collection to delete")

# Create new collection WITHOUT embedding function (we'll use pre-computed embeddings)
# For L2-normalized vectors, cosine similarity is more appropriate than Euclidean distance
entity_collection = chroma_client.create_collection(
    name="entity_collection",
    metadata={"description": "Collection of medical entities with L2-normalized embeddings and metadata",
              "hnsw:space": "cosine"},  # Use cosine similarity for normalized vectors
    embedding_function=None  # Important: Don't use default embedding function
)
print("Created new entity_collection with cosine similarity for L2-normalized embeddings")

Deleted existing entity_collection
Created new entity_collection with cosine similarity for L2-normalized embeddings


In [41]:
# Load and extract entity data from entity_processing_results.json
with open("entity_processing_results_unmc_complete.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"Loaded data with {len(data.get('chunks', {}))} chunks")

# Extract entities with their embeddings and metadata
entities_data = []
entity_count = 0

for chunk_number, chunk_data in data.get("chunks", {}).items():
    for paragraph_idx, paragraph in enumerate(chunk_data.get("paragraphs", [])):
        paragraph_content = paragraph.get("content", "")
        
        for entity in paragraph.get("entities", []):
            entity_name = entity.get("entity_name", "")
            entity_embedding = entity.get("content_embedding", [])
            entity_description = entity.get("entity_description", "")
            entity_type = entity.get("entity_type", "Unknown")
            
            # Skip entities without embeddings
            if not entity_embedding or len(entity_embedding) == 0:
                continue
                
            # Create unique ID for each entity
            entity_id = f"chunk_{chunk_number}_para_{paragraph_idx}_entity_{entity_count}"
            
            entities_data.append({
                "id": entity_id,
                "entity_name": entity_name,
                "embedding": entity_embedding,
                "metadata": {
                    "chunk_number": chunk_number,
                    "paragraph_index": paragraph_idx,
                    "paragraph_content": paragraph_content,
                    "entity_description": entity_description,
                    "entity_type": entity_type
                }
            })
            entity_count += 1

print(f"Extracted {len(entities_data)} entities with embeddings")
print(f"Sample entity: {entities_data[0]['entity_name'] if entities_data else 'None'}")

Loaded data with 161 chunks
Extracted 2516 entities with embeddings
Sample entity: surgery


In [42]:
# Populate ChromaDB collection with entity data
if entities_data:
    # Prepare data for ChromaDB
    ids = [entity["id"] for entity in entities_data]
    embeddings = [entity["embedding"] for entity in entities_data]
    documents = [entity["entity_name"] for entity in entities_data]  # Use entity name as document text
    metadatas = [entity["metadata"] for entity in entities_data]
    
    # Add to ChromaDB collection in batches (ChromaDB has limits on batch size)
    batch_size = 1000
    total_entities = len(entities_data)
    
    print(f"Adding {total_entities} entities to ChromaDB in batches of {batch_size}")
    
    for i in range(0, total_entities, batch_size):
        end_idx = min(i + batch_size, total_entities)
        batch_ids = ids[i:end_idx]
        batch_embeddings = embeddings[i:end_idx]
        batch_documents = documents[i:end_idx]
        batch_metadatas = metadatas[i:end_idx]
        entity_collection.add(
            ids=batch_ids,
            embeddings=batch_embeddings,
            documents=batch_documents,
            metadatas=batch_metadatas
        )
        
        print(f"Added batch {i//batch_size + 1}: entities {i+1}-{end_idx}")
    
    print(f"✓ Successfully added {total_entities} entities to ChromaDB")
    print(f"Collection count: {entity_collection.count()}")
else:
    print("No entities found to add to ChromaDB")

Adding 2516 entities to ChromaDB in batches of 1000
Added batch 1: entities 1-1000
Added batch 2: entities 1001-2000
Added batch 3: entities 2001-2516
✓ Successfully added 2516 entities to ChromaDB
Collection count: 2516


In [43]:
try:
    from transformers import AutoTokenizer, AutoModel
    import torch

    # Load BioMedBERT model for query embeddings
    model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    def generate_query_embedding(text: str, entity_type: str = "Medical Entity", description: str = "") -> List[float]:
        """Generate embedding for query text using BioMedBERT with the same format as entity embeddings."""
        with torch.no_grad():
            # Format the query text to match entity embedding format
            # This matches the format used in embedding_pipeline.py's encode_entity method
            if description:
                formatted_text = f"entity_name: {text} entity_type: {entity_type} entity_description: {description}"
            else:
                # If no description provided, create a simple one
                formatted_text = f"entity_name: {text} entity_type: {entity_type} entity_description: A medical concept related to {text}"

            inputs = tokenizer(formatted_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)

            # Mean pooling over token embeddings (matching embedding_pipeline.py)
            token_embeds = outputs.last_hidden_state
            attention_mask = inputs['attention_mask']

            # Perform mean pooling
            mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
            sum_embeds = (token_embeds * mask_expanded).sum(dim=1)
            sum_mask = mask_expanded.sum(dim=1).clamp(min=1e-9)
            mean_pooled = sum_embeds / sum_mask

            # L2 normalize (matching embedding_pipeline.py's normalize=True default)
            embeddings = torch.nn.functional.normalize(mean_pooled, p=2, dim=1)

            return embeddings[0].tolist()

    print("✓ BioMedBERT model loaded for query embeddings")
    print("✓ Query embeddings will use the same format as entity embeddings")

except ImportError:
    print("Warning: transformers library not available. Using simple embedding search fallback.")

    # Fallback: search by entity name similarity without embeddings
    def generate_query_embedding(text: str, entity_type: str = "Medical Entity", description: str = "") -> None:
        return None

✓ BioMedBERT model loaded for query embeddings
✓ Query embeddings will use the same format as entity embeddings


In [68]:
# Query functionality to find similar entities
def query_similar_entities(query_text: str = None, query_embedding: List[float] = None, n_results: int = 5, entity_type: str = None, description: str = None):
    """
    Query the entity collection for similar entities based on embedding       
similarity.

    Args:
        query_text: Text to search for similar entities (optional if
query_embedding provided)
        query_embedding: Pre-computed embedding vector to search with
(optional if query_text provided)
        n_results: Number of similar entities to return
        entity_type: Optional entity type to provide context (e.g.,
"Disease or Syndrome", "Pharmacologic Substance")
        description: Optional description to provide more context about       
what you're searching for

    Returns:
        Query results with entities and their metadata

    Examples:
        # Search by text
        query_similar_entities("infection", n_results=5)

        # Search by embedding from entity_processing_results.json
        import json
        with open('entity_processing_results.json', 'r') as f:
            data = json.load(f)
        embedding =
data['chunks']['4']['paragraphs'][0]['entities'][0]['content_embedding']      
        query_similar_entities(query_embedding=embedding, n_results=5)        
    """
    try:
        # Validate inputs
        if query_text is None and query_embedding is None:
            raise ValueError("Either query_text or query_embedding must be provided")

        if query_text is not None and query_embedding is not None:
            print("Warning: Both query_text and query_embedding provided. Using query_embedding.")

        # If query_embedding is provided directly, use it
        if query_embedding is not None:
            # Ensure it's a list and has the right dimension
            if not isinstance(query_embedding, list):
                query_embedding = list(query_embedding)

            # Normalize the embedding if needed (check if already normalized)
            import numpy as np
            embedding_array = np.array(query_embedding)
            norm = np.linalg.norm(embedding_array)

            if abs(norm - 1.0) > 0.001:  # Not normalized
                print(f"Normalizing provided embedding (original norm: {norm:.4f})")
                embedding_array = embedding_array / norm
                query_embedding = embedding_array.tolist()

            # Use the provided embedding for search
            results = entity_collection.query(
                query_embeddings=[query_embedding],
                n_results=n_results
            )

            print(f"Query: Using provided embedding (dim={len(query_embedding)})")

        else:
            # Generate embedding from text
            if entity_type is None:
                # Try to infer entity type from query text
                if any(word in query_text.lower() for word in ['infection', 'disease', 'syndrome', 'disorder']):
                    entity_type = "Disease or Syndrome"
                elif any(word in query_text.lower() for word in ['drug','medication', 'medicine']):
                    entity_type = "Pharmacologic Substance"
                elif any(word in query_text.lower() for word in ['surgery', 'procedure', 'treatment']):
                    entity_type = "Therapeutic or Preventive Procedure"       
                else:
                    entity_type = "Medical Entity"

            if description is None:
                description = f"A medical concept related to {query_text}"    

            generated_embedding = generate_query_embedding(query_text, entity_type, description)

            if generated_embedding is None:
                # Fallback: search by document text if embeddings not available
                print("Using text-based search (embeddings not available)")
                results = entity_collection.query(
                    query_texts=[query_text],
                    n_results=n_results
                )
            else:
                # Query embedding is already normalized by generate_query_embedding
                # Use embedding-based search
                results = entity_collection.query(
                    query_embeddings=[generated_embedding],
                    n_results=n_results
                )

            print(f"Query: '{query_text}' (type: {entity_type})")

        print(f"Found {len(results['ids'][0])} similar entities:\n")

        for i in range(len(results['ids'][0])):
            entity_id = results['ids'][0][i]
            entity_name = results['documents'][0][i]
            distance = results['distances'][0][i] if 'distances' in results else None
            metadata = results['metadatas'][0][i]

            print(f"{i+1}. Entity: {entity_name}")
            if distance is not None:
                # Convert distance to similarity score for better interpretation
                similarity = 1 - (distance / 2)  # Cosine distance to similarity
                print(f"   Distance: {distance:.4f} (Similarity: {similarity:.2%})")
            print(f"   Type: {metadata.get('entity_type', 'Unknown')}")       
            print(f"   Description: {metadata.get('entity_description','N/A')}")
            print(f"   Chunk: {metadata.get('chunk_number')}, Paragraph: {metadata.get('paragraph_index')}")
            print(f"   Context: {metadata.get('paragraph_content', 'N/A')[:100]}...")
            print()

        return results

    except Exception as e:
        print(f"Error querying entities: {e}")
        import traceback
        traceback.print_exc()

        # Try fallback approach - search by entity names directly
        if query_text:
            try:
                print("\nAttempting fallback search by entity name...")       
                # Get all entities and filter by name similarity
                all_results = entity_collection.get(limit=1000)

                # Simple text matching
                matching_entities = []
                query_lower = query_text.lower()

                for i, doc in enumerate(all_results['documents']):
                    if query_lower in doc.lower():
                        matching_entities.append({
                            'name': doc,
                            'metadata': all_results['metadatas'][i],
                            'id': all_results['ids'][i]
                        })

                print(f"Found {len(matching_entities)} entities containing '{query_text}':")
                for i, entity in enumerate(matching_entities[:n_results]):    
                    print(f"{i+1}. Entity: {entity['name']}")
                    print(f"   Type:{entity['metadata'].get('entity_type', 'Unknown')}")
                    print(f"   Chunk: {entity['metadata'].get('chunk_number')}")
                    print()

            except Exception as e2:
                print(f"Fallback search also failed: {e2}")

        return None




In [70]:
# Example: Search by content embedding directly
print("=== Demonstrating Direct Embedding Search ===\n")

# Method 1: Using pandas to access the JSON data
import pandas as pd
import json

# Load the JSON file
df = pd.read_json('entity_processing_results_mimic_complete.json')

# Access a specific entity's embedding
# For example, get the first entity from chunk 4, paragraph 0
chunk = 6
paragraph = 0
print("entity_name_from_mimic:" , df.chunks[str(chunk)]['paragraphs'][paragraph]['entities'][0]['entity_name']+"\nparagraph_content:", df.chunks[str(chunk)]['paragraphs'][paragraph]['content'])
example_embedding = df.chunks[str(chunk)]['paragraphs'][paragraph]['entities'][0]['content_embedding']
print("entity_embedding_from_mimic:", df.chunks[str(chunk)]['paragraphs'][paragraph]['entities'][0]['content_embedding'])


=== Demonstrating Direct Embedding Search ===

entity_name_from_mimic: SKIN
paragraph_content: SKIN: Warm. Cap refill <2s. No rashes. NEUROLOGIC: AOx3. CN2-12 intact. cogwheel UE b/l. Increased tone in LEs, ___ strength b/l ___. Normal sensation.
entity_embedding_from_mimic: [0.004791374318301, -0.0030316247139120004, 0.007956776767969001, -0.0039320266805580005, -0.007750302087515001, -0.00133522436954, 0.010447307489812001, 0.004236557520925, 0.0014395231846710001, -6.563484203070402e-05, -0.0017709594685580002, 0.0008952572243280001, 0.006933584343641001, 0.002469127532094, -0.009277086704969, -0.009595424868166, 0.005558195523917, -0.00013514510646900002, -0.001364765805192, -0.003730770433321, -0.007731267251074001, 0.00047628371976300005, -0.0029457588680080003, 0.01057672034949, -0.031202448531985002, 0.0016329463105640002, -0.00359556870535, 0.004896939266473, 0.001264162128791, -0.010365252383053001, 1.514448376838117e-05, 0.005842664744704001, 0.002936255186796, 0.06831751018

In [67]:
query_similar_entities(query_embedding=example_embedding, n_results=3)

Query: Using provided embedding (dim=768)
Found 3 similar entities:

1. Entity: skin
   Distance: 0.0046 (Similarity: 99.77%)
   Type: Body System
   Description: The outermost layer of the body that provides protection against external factors. In this context, bacteria are also present on the skin and may lead to infections after transplant.
   Chunk: 62, Paragraph: 0
   Context: Bacterial infections frequently occur after transplant. Bacteria are normally found throughout the b...

2. Entity: skin
   Distance: 0.0046 (Similarity: 99.77%)
   Type: Body System
   Description: The outermost layer of the body that provides protection against external factors. In this context, bacteria are also present on the skin and may lead to infections after transplant.
   Chunk: 211, Paragraph: 0
   Context: Perineal Care - special care of the skin and tissue in the genital and rectal areas....

3. Entity: skin
   Distance: 0.0046 (Similarity: 99.77%)
   Type: Body System
   Description: The outerm

{'ids': [['chunk_62_para_0_entity_671',
   'chunk_211_para_0_entity_2393',
   'chunk_211_para_2_entity_2415']],
 'embeddings': None,
 'documents': [['skin', 'skin', 'skin']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'paragraph_content': 'Bacterial infections frequently occur after transplant. Bacteria are normally found throughout the body and on the skin. Normally, these bacteria do not typically cause a problem, however, they may lead to infections after transplant because of the immunosuppressive medications you are',
    'entity_type': 'Body System',
    'entity_description': 'The outermost layer of the body that provides protection against external factors. In this context, bacteria are also present on the skin and may lead to infections after transplant.',
    'paragraph_index': 0,
    'chunk_number': '62'},
   {'paragraph_index': 0,
    'paragraph_content': 'Perineal Care - special care of the skin and tissue in the gen

In [65]:
import pandas as pd
df = pd.read_json('entity_processing_results.json')

df.chunks['4']['paragraphs'][0]['entities'][0]['content_embedding']


FileNotFoundError: File entity_processing_results.json does not exist

In [None]:
# Additional query examples and analysis functions

def query_by_entity_type(entity_type: str, n_results: int = 10):
    """
    Find entities of a specific type.
    """
    try:
        # Get entities directly without embedding query
        all_results = entity_collection.get(
            limit=1000,  # Get a large sample
            where={"entity_type": entity_type}
        )
        
        print(f"Entities of type '{entity_type}':")
        print(f"Found {len(all_results['ids'])} entities:\n")
        
        for i in range(min(n_results, len(all_results['ids']))):
            entity_name = all_results['documents'][i]
            metadata = all_results['metadatas'][i]
            
            print(f"{i+1}. {entity_name}")
            print(f"   Description: {metadata.get('entity_description', 'N/A')}")
            print(f"   Chunk {metadata.get('chunk_number')}: {metadata.get('paragraph_content', 'N/A')[:100]}...")
            print()
            
    except Exception as e:
        print(f"Error querying by type: {e}")

def get_collection_stats():
    """
    Get statistics about the entity collection.
    """
    try:
        total_count = entity_collection.count()
        print(f"Total entities in collection: {total_count}")
        
        # Get a sample of entities to show entity types
        sample_results = entity_collection.get(limit=min(100, total_count))
        
        entity_types = {}
        chunks = set()
        
        for metadata in sample_results['metadatas']:
            etype = metadata.get('entity_type', 'Unknown')
            entity_types[etype] = entity_types.get(etype, 0) + 1
            chunks.add(metadata.get('chunk_number'))
        
        print(f"\nEntity types in sample:")
        for etype, count in sorted(entity_types.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"  {etype}: {count}")
            
        print(f"\nChunks represented: {len(chunks)}")
        
    except Exception as e:
        print(f"Error getting stats: {e}")

# Run analysis
print("=== Collection Statistics ===")
get_collection_stats()

print("\n=== Example Queries ===")

# Example queries using the updated query function
queries = ['infection'
]

for query in queries:
    print(f"\n--- Query: {query} ---")
    query_similar_entities(query, n_results=5)

=== Collection Statistics ===
Total entities in collection: 378

Entity types in sample:
  Intellectual Product: 8
  Body Part, Organ, or Organ Component: 6
  Temporal Concept: 5
  Activity: 4
  Tissue: 4
  Health Care Activity: 4
  Social Behavior: 4
  Population Group: 4
  Idea or Concept: 3
  Family Group: 3

Chunks represented: 4

=== Example Queries ===

--- Query: infection ---
Query: 'infection'
Found 5 similar entities:

1. Entity: immunosuppression
   Distance: 0.0654
   Type: Therapeutic or Preventive Procedure
   Description: found in blood to protect from threats
   Chunk: 6, Paragraph: 1
   Context: Antibodies are found in your blood and they try to protect you from anything they don't recognize as...

2. Entity: change
   Distance: 0.0667
   Type: Therapeutic or Preventive Procedure
   Description: drugs taken after transplant
   Chunk: 16, Paragraph: 0
   Context: Medications play an important role after transplant. Some of them will be taken for the rest of your...

3. 

In [None]:
# Example: Search by content embedding directly
print("=== Demonstrating Direct Embedding Search ===\n")

# Method 1: Using pandas to access the JSON data
import pandas as pd
import json

# Load the JSON file
with open('entity_processing_results.json', 'r') as f:
    data = json.load(f)

# Access a specific entity's embedding
# For example, get the first entity from chunk 4, paragraph 0
if '4' in data['chunks']:
    chunk_4 = data['chunks']['4']
    if chunk_4['paragraphs'] and chunk_4['paragraphs'][0]['entities']:
        first_entity = chunk_4['paragraphs'][0]['entities'][0]
        entity_name = first_entity['entity_name']
        entity_embedding = first_entity['content_embedding']
        
        print(f"Using embedding from entity: '{entity_name}'")
        print(f"Embedding dimension: {len(entity_embedding)}")
        print(f"\nSearching for similar entities...")
        
        # Search using the embedding
        results = query_similar_entities(query_embedding=entity_embedding, n_results=5)
        
        # The first result should be the entity itself (or very similar)
        if results and results['documents'][0][0] == entity_name:
            print(f"\n✓ Successfully found the same entity as top result!")
    else:
        print("No entities found in chunk 4, paragraph 0")
else:
    print("Chunk 4 not found in the data")

print("\n" + "="*50 + "\n")

# Method 2: Direct access using dictionary navigation
print("Alternative method - Direct dictionary access:\n")

# You can also access embeddings like this:
example_embedding = data['chunks']['2']['paragraphs'][0]['entities'][0]['content_embedding']
example_name = data['chunks']['2']['paragraphs'][0]['entities'][0]['entity_name']

print(f"Using embedding from: '{example_name}'")
query_similar_entities(query_embedding=example_embedding, n_results=3)

In [None]:
import pandas as pd


In [None]:
df = pd.read_csv('note/discharge.csv.gz')
print(df.head())

          note_id  subject_id   hadm_id note_type  note_seq  \
0  10000032-DS-21    10000032  22595853        DS        21   
1  10000032-DS-22    10000032  22841357        DS        22   
2  10000032-DS-23    10000032  29079034        DS        23   
3  10000032-DS-24    10000032  25742920        DS        24   
4  10000084-DS-17    10000084  23052089        DS        17   

             charttime            storetime  \
0  2180-05-07 00:00:00  2180-05-09 15:26:00   
1  2180-06-27 00:00:00  2180-07-01 10:15:00   
2  2180-07-25 00:00:00  2180-07-25 21:42:00   
3  2180-08-07 00:00:00  2180-08-10 05:43:00   
4  2160-11-25 00:00:00  2160-11-25 15:09:00   

                                                text  
0   \nName:  ___                     Unit No:   _...  
1   \nName:  ___                     Unit No:   _...  
2   \nName:  ___                     Unit No:   _...  
3   \nName:  ___                     Unit No:   _...  
4   \nName:  ___                    Unit No:   __...  


In [None]:
df.text[1]

" \nName:  ___                     Unit No:   ___\n \nAdmission Date:  ___              Discharge Date:   ___\n \nDate of Birth:  ___             Sex:   F\n \nService: MEDICINE\n \nAllergies: \nPercocet\n \nAttending: ___.\n \nChief Complaint:\nabdominal fullness and discomfort\n \nMajor Surgical or Invasive Procedure:\n___ diagnostic paracentesis\n___ therapeutic paracentesis\n\n \nHistory of Present Illness:\n___ with HIV on HAART, COPD, HCV cirrhosis complicated by \nascites and HE admitted with abdominal distention and pain. She \nwas admitted to ___ for the same symptoms \nrecently and had 3L fluid removed (no SBP) three days ago and \nfelt better. Since discharge, her abdomen has become \nincreasingly distended with pain. This feels similar to prior \nepisodes of ascites.  \nHer diuretics were recently decreased on ___ due to worsening \nhyponatremia 128 and hyperkalemia 5.1. Patient states she has \nbeen compliant with her HIV and diuretic medications but never \nfilled out the 