In [7]:
"""Load reports and embed them into the Vector store so they can be searched."""

# Imports & Basic Setup

import re
import sys
import random
from typing import Protocol, List, Optional, Type
import os

import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

from sentence_transformers import SentenceTransformer
from tokenizers import Tokenizer
from transformers import AutoTokenizer, AutoModel
import torch
import pyarrow as pa
import pandas as pd

from pydantic import BaseModel, Field


In [8]:
# Define data models

class Report(BaseModel):
    """Represents the entire text of a radiology report."""
    id: str
    text: str

#NOTE: maybe refactor this? to include a "fragments" field?

class Fragment(BaseModel):
    """Represents a chunk of text (one section or smaller) from a report."""
    report_id: str
    section: Optional[str]
    sequence_number: int
    text: str
    vector: Optional[List[float]] = None

In [9]:
class SectionSplitter:
    """
    Splits text into sections based on naive headings:
    e.g. "Header:", "Findings:", "Impression:".
    """
    def __init__(self):
        self.known_sections = ["Header:", "Findings:", "Impression:"]

    def split_into_sections(self, report_text: str):
        """
        Returns a list of tuples (section_label, section_text).
        If no headings are found, returns one tuple with (None, entire_text).
        """
        pattern = r"(" + "|".join(map(re.escape, self.known_sections)) + r")"
        parts = re.split(pattern, report_text)
        results = []
        current_section_label = None
        current_text_chunks = []

        for part in parts:
            part_stripped = part.strip()
            if not part_stripped:
                continue

            if part in self.known_sections:
                # Save the previous chunk
                if current_section_label and current_text_chunks:
                    combined_text = " ".join(current_text_chunks).strip()
                    results.append((current_section_label, combined_text))
                # Update the label
                current_section_label = part_stripped
                current_text_chunks = []
            else:
                current_text_chunks.append(part_stripped)

        # Final chunk
        if current_section_label and current_text_chunks:
            combined_text = " ".join(current_text_chunks).strip()
            results.append((current_section_label, combined_text))

        if not results and report_text.strip():
            # No recognized sections, return entire text
            results.append((None, report_text.strip()))

        return results

    def create_smaller_fragments(self, section_fragments):
        smaller_fragments = []
        for label, text in section_fragments:
            if ':' in text:
                fragments = text.split(':')
                for i in range(1, len(fragments)):
                    fragment_text = fragments[i-1].split()[-1] + ': ' + fragments[i].strip()
                    smaller_fragments.append((label, fragment_text.strip()))
            else:
                smaller_fragments.append((label, text.strip()))
        return smaller_fragments

# NOTE: Maybe refactor this? DRY violation.??
# Function to create fragments from a SINGLE report
def create_fragments_from_report(
    report: Report,
    section_splitter: SectionSplitter
) -> List[Fragment]:
    """
    1) Split the entire report text into sections.
    2) Chunk each section by tokens.
    3) Return a list of Fragment objects.
    """
    fragments = []
    sections = section_splitter.split_into_sections(report.text)
    smaller_fragments = section_splitter.create_smaller_fragments(sections)
    seq_num = 0

    for section_label, section_text in smaller_fragments:
        fragments.append(
            Fragment(
                report_id=report.id,
                section=section_label,
                sequence_number=seq_num,
                text=section_text,
                vector=None
            )
        )
        seq_num += 1

    return fragments
# Function to create fragments from a folder of reports
def create_fragments_from_reports(
    reports: List[Report],
    section_splitter: SectionSplitter
) -> List[Fragment]:
    """
    Process a list of Report objects and return a list of Fragment objects.
    """
    all_fragments = []
    for report in reports:
        fragments = create_fragments_from_report(report, section_splitter)
        all_fragments.extend(fragments)
    return all_fragments



In [10]:
# Connect to LanceDB
db = lancedb.connect("./data/lancedb")

# Define the embedding function
embed_fcn = get_registry().get("huggingface").create(name="BAAI/bge-en-icl")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
# Define the schema
class FragmentSchema(LanceModel):
    report_id: str
    section: Optional[str]
    sequence_number: int
    text: str = embed_fcn.SourceField()
    vector: Vector(embed_fcn.ndims()) = embed_fcn.VectorField() # type: ignore

In [12]:

table = db.create_table("table", schema=FragmentSchema, mode="overwrite")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
# ------------------------------
# 5. Read local .txt files from 'reports' folder
# ------------------------------
def read_reports_from_folder(folder_path: str) -> List[Report]:
    reports = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                text = file.read()
                # The "id" of the report is just the filename in this example
                reports.append(Report(id=filename, text=text))
    return reports

reports_dir = "./reports"  # Folder containing .txt files
reports_list = read_reports_from_folder(reports_dir)

In [14]:
# ------------------------------
# 6. Create all fragments
# ------------------------------
splitter = SectionSplitter()
all_fragments = create_fragments_from_reports(reports_list, splitter)

In [15]:
# ------------------------------
# 8. (Option B) Insert into "auto" table for auto-embedding
# ------------------------------
auto_inserts = []
for frag in all_fragments:
    # We do NOT embed here because LanceDB will do it automatically
    doc = {
        "report_id": frag.report_id,
        "section": frag.section,
        "sequence_number": frag.sequence_number,
        "text": frag.text
    }
    auto_inserts.append(doc)

table.add(auto_inserts)
print(f"Inserted {len(auto_inserts)} fragments into {table}.")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Inserted 30 fragments into LanceTable(connection=LanceDBConnection(/Users/yz/dev/embed_reports_lancedb/data/lancedb), name="table").


In [16]:
print(table.head(10))

pyarrow.Table
report_id: string not null
section: string
sequence_number: int64 not null
text: string not null
vector: fixed_size_list<item: float>[4096]
  child 0, item: float
----
report_id: [["3.txt","3.txt","3.txt","3.txt","3.txt","3.txt","3.txt","3.txt","3.txt","3.txt"]]
section: [["Header:","Findings:","Findings:","Findings:","Findings:","Findings:","Findings:","Findings:","Findings:","Impression:"]]
sequence_number: [[0,1,2,3,4,5,6,7,8,9]]
text: [["Chest CT without contrast, conducted to assess lung findings.","Devices/Tubes/Lines: None.
Lungs","Lungs: Persistent 5 mm left lower lobe nodule with no significant growth. Scattered subcentimeter calcified granulomas. No new consolidation or masses. Mild bronchiectasis in the right middle lobe. Clear central airways.
Pleura","Pleura: Trace right pleural effusion. No pneumothorax.
Mediastinum","Mediastinum: Normal size and configuration of the heart. Minimal calcifications in the aortic root. No lymphadenopathy.
Lymph Nodes","Nodes: N

From here can either do a flat search or an ANN search.

Flat search is faster but less accurate and is just brute forcing via kNN. 

ANN search is slower but more accurate. Search algorithms usually need to be trained on SUFFICIENT amount of data.

In order of priority:
1. Flat Search
2. Full text search 
3. ANN search

since we dont have much data, we can do a full text search. 


**Remember that HYBRID Search is a two-pronged approach**

In [17]:
""" Lets implement Basic Search """

## First we need to createa a FTS index

table.create_fts_index("text", use_tantivy=False)


In [18]:
## Now we can do a full text search

query = "fatty liver"
results = table.search(query).limit(10).to_list()
print(results)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'report_id': '3.txt', 'section': 'Findings:', 'sequence_number': 6, 'text': 'Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.\nChest Wall', 'vector': [0.8172467350959778, -0.672540009021759, 2.6749424934387207, 4.577466011047363, -2.5102005004882812, -2.1569230556488037, -2.1359920501708984, 1.9548733234405518, -1.2229790687561035, 1.1690298318862915, -4.831377983093262, -3.2345237731933594, 0.8181387782096863, -4.354695796966553, 4.890837669372559, -1.195616364479065, 5.136759281158447, 0.9618955850601196, -0.9802966117858887, -4.861990451812744, 4.069133281707764, -3.561309337615967, -4.124400615692139, 1.8647927045822144, -3.203742504119873, -3.3564558029174805, 3.920886993408203, -0.7454123497009277, 2.260406017303467, -4.424810409545898, 3.9626879692077637, 3.9060218334198, -3.007263660430908, -3.7420878410339355, 1.3284369707107544, 0.5294495820999146, 1.3193391561508179, 2.9235997200012207, -2.3054957389831543, -1.8884539604187012

In [19]:
# Print all rows containing "fatty liver" (case insensitive)
print("All fragments containing 'fatty liver':")
all_rows = table.to_pandas()
fatty_liver_rows = all_rows[all_rows['text'].str.contains('fatty liver', case=False)]
print(fatty_liver_rows[['text', 'section']])

All fragments containing 'fatty liver':
                                                 text      section
6   Abdomen: Fatty liver with unchanged 3 cm adren...    Findings:
9   1.\tStable 5 mm left lower lobe nodule and cal...  Impression:
16  Abdomen: Fatty liver. Stable 3 cm adrenal lesi...    Findings:
19  1.\tNo evidence of new pulmonary pathology. St...  Impression:
29  1.\tStable subpleural pulmonary nodules withou...  Impression:


In [27]:
# Approach 1a: Basic search with replace=True for index
table.create_fts_index("text", use_tantivy=False, replace=True)
results1 = table.search("fatty liver").limit(5).select(["text", "section"]).to_list()
print("\nBasic search results:")
for r in results1:
    print(f"Text: {r['text']}\nSection: {r['section']}\n")

### Basic search appears to be failing 


Basic search results:
Text: Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.
Chest Wall
Section: Findings:

Text: Abdomen: Fatty liver. Stable 3 cm adrenal lesion and 2 cm renal angiomyolipoma. No pancreatic or splenic abnormalities.
Chest Wall
Section: Findings:

Text: Wall: No abnormality detected.
Bones
Section: Findings:

Text: Wall: No abnormalities detected.
Bones
Section: Findings:

Text: Bones: No lytic or sclerotic lesions. Moderate degenerative changes in the thoracic spine.
Section: Findings:



In [None]:
# Let's try basic search with Tokenization and Filtering

# Tokenization:



# Filtering:



In [28]:
# Approach 2: Try hybrid search (combines semantic and keyword search)
results2 = table.search("fatty liver", query_type="hybrid").limit(5).select(["text", "section"]).to_list()
print("\nHybrid search results:")
for r in results2:
    print(f"Text: {r['text']}\nSection: {r['section']}\n")

""" Hybrid Search appears to work better than basic search """


Hybrid search results:
Text: Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.
Chest Wall
Section: Findings:

Text: Abdomen: Fatty liver. Stable 3 cm adrenal lesion and 2 cm renal angiomyolipoma. No pancreatic or splenic abnormalities.
Chest Wall
Section: Findings:

Text: Wall: No abnormality detected.
Bones
Section: Findings:

Text: 1.	Stable subpleural pulmonary nodules without concerning features for malignancy.
	2.	Minimal left pleural effusion, likely reactive.
	3.	Fatty liver with stable adrenal and renal lesions.
Section: Impression:

Text: Wall: No abnormalities detected.
Bones
Section: Findings:



In [33]:
# Vector search with scores
results3 = table.search("fatty liver", query_type="vector").limit(10).select(["text", "section"]).to_list()
print("\nVector search results with scores:")
print(results3)

for r in results3:
    print(f"Text: {r['text']}\nSection: {r['section']}\n")



Vector search results with scores:
[{'text': 'Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.\nChest Wall', 'section': 'Findings:', '_distance': 24026.384765625}, {'text': 'Abdomen: Fatty liver. Stable 3 cm adrenal lesion and 2 cm renal angiomyolipoma. No pancreatic or splenic abnormalities.\nChest Wall', 'section': 'Findings:', '_distance': 25898.681640625}, {'text': 'Wall: No abnormality detected.\nBones', 'section': 'Findings:', '_distance': 25963.111328125}, {'text': 'Wall: No abnormalities detected.\nBones', 'section': 'Findings:', '_distance': 26068.212890625}, {'text': 'Bones: No lytic or sclerotic lesions. Moderate degenerative changes in the thoracic spine.', 'section': 'Findings:', '_distance': 27249.94140625}, {'text': 'Nodes: No axillary or hilar lymphadenopathy.\nUpper Abdomen', 'section': 'Findings:', '_distance': 27348.37109375}, {'text': 'Bones: No concerning osseous lesions. Minimal degenerative changes in the cervical s

In [45]:
print(type(results3[0]['text']))

<class 'str'>


In [46]:
# Extract just the text strings from the results
context_documents = [r['text'] for r in results3]

In [47]:
print(context_documents)

['Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.\nChest Wall', 'Abdomen: Fatty liver. Stable 3 cm adrenal lesion and 2 cm renal angiomyolipoma. No pancreatic or splenic abnormalities.\nChest Wall', 'Wall: No abnormality detected.\nBones', 'Wall: No abnormalities detected.\nBones', 'Bones: No lytic or sclerotic lesions. Moderate degenerative changes in the thoracic spine.', 'Nodes: No axillary or hilar lymphadenopathy.\nUpper Abdomen', 'Bones: No concerning osseous lesions. Minimal degenerative changes in the cervical spine.', 'Nodes: No pathologically enlarged lymph nodes in the mediastinum or hilar regions.\nUpper Abdomen', 'Pleura: Trace right pleural effusion. No pneumothorax.\nMediastinum', 'Wall: Unremarkable.\nBones']


In [48]:
# First we need to define a prompt

def build_query(query: str, context: List[str]) -> str:
    """Combine the question and the documents into a single prompt string.""" 
    prompt = f"""
You are a helpful assistant. Use the following context to answer the question.

Context:
{context}

Question: {query}

Answer:
"""
    print(prompt)
    return prompt

In [49]:
# USING Ollama via PYTHON

from ollama import chat, ChatResponse

def query_LLM(prompt):
    response: ChatResponse = chat(model='llama3.3', messages=[
    {
        'role': 'user',
        'content': query,
    },
    ])
    return response.message.content

In [50]:
query = build_query("From the following results, are fatty livers typically associated with any masses? And if so, what are they and what sizes are they? ", results3)
response = query_LLM(query)
print(response)


You are a helpful assistant. Use the following context to answer the question.

Context:
[{'text': 'Abdomen: Fatty liver with unchanged 3 cm adrenal adenoma. Normal kidneys, pancreas, and spleen.\nChest Wall', 'section': 'Findings:', '_distance': 24026.384765625}, {'text': 'Abdomen: Fatty liver. Stable 3 cm adrenal lesion and 2 cm renal angiomyolipoma. No pancreatic or splenic abnormalities.\nChest Wall', 'section': 'Findings:', '_distance': 25898.681640625}, {'text': 'Wall: No abnormality detected.\nBones', 'section': 'Findings:', '_distance': 25963.111328125}, {'text': 'Wall: No abnormalities detected.\nBones', 'section': 'Findings:', '_distance': 26068.212890625}, {'text': 'Bones: No lytic or sclerotic lesions. Moderate degenerative changes in the thoracic spine.', 'section': 'Findings:', '_distance': 27249.94140625}, {'text': 'Nodes: No axillary or hilar lymphadenopathy.\nUpper Abdomen', 'section': 'Findings:', '_distance': 27348.37109375}, {'text': 'Bones: No concerning osseous l

In [None]:
# Approach 2: Try case-insensitive search
results2 = table.search("FATTY LIVER").limit(10).to_list()
print("\nCase-insensitive search results:")
for r in results2:
    print(f"Score: {r['_score']}, Text: {r['text']}")

# Approach 3: Try hybrid search (combines semantic and keyword search)
results3 = table.search("fatty liver", query_type="hybrid").limit(10).to_list()
print("\nHybrid search results:")
for r in results3:
    print(f"Score: {r['_score']}, Text: {r['text']}")