In [4]:
import os
from openai import OpenAI
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter, MarkdownHeaderTextSplitter
from dotenv import load_dotenv, find_dotenv
# Load environment variables via python-dotenv before anything reads them.
load_dotenv()

# Indexing os.environ raises KeyError right here when the key is missing.
# NOTE(review): the value is not used again in the visible cells — the client
# below is constructed without arguments — so this line only acts as a
# fail-fast check.
open_api_key  = os.environ['OPENAI_API_KEY']

# Module-level client reused by later cells.
client = OpenAI()

# Normalized markdown of the guideline; the path is relative to the working directory.
md_path = Path("data/t2d_guideline_ee/20_normalized_md/2-tuubi-diabeedi-diagnostika-ravi.md")

# Whole document as one string, decoded as UTF-8.
text = md_path.read_text(encoding="utf-8")

In [5]:
# Context aware splitting
# Each pair is (markdown heading prefix, metadata key). The keys "Header 1..3"
# are the ones build_breadcrumbs() later reads back from each chunk's metadata.
headers_to_split_on  = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [6]:
# Split the document on the heading levels configured in headers_to_split_on.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
# Resulting chunks expose .page_content and .metadata (used in the cells below).
md_header_splits = markdown_splitter.split_text(text)

In [7]:
# This should be added to metadata or something
# NOTE(review): a notebook cell only displays its last expression, so the
# .metadata line below produces no visible output; only .page_content is shown.
md_header_splits[5].metadata
md_header_splits[5].page_content

'1. **Prediabeediga patsient suunake eluviisisekkumise intensiivprogrammi.**\nTugev positiivne soovitus, mõõdukas tõendatuse aste\n2. **Prediabeediga patsiendil ärge metformiinravi pigem kasutage.**\nNõrk negatiivne soovitus, mõõdukas tõendatuse aste'

In [56]:
# Manual grounding experiment: one hard-coded question is sent together with the
# text of a single hand-picked chunk (index 3), which stands in for a
# "vector database result". No retrieval step runs in this cell.
response = client.responses.create(
    model = "gpt-5.2",
    input = [
        {
            "role": "system",
            "content": "You answer questions based on the user query and result from vector database. Answer without any formatting. Do not add any data that is not in the vector database result."
        },
        {
            "role": "user",
            "content": f"""Keda peaks s√µeluma diabeedi osas?
            
            Vector database result: {md_header_splits[3].page_content}"""
        }
    ]
)

In [57]:
# TODO: assess whether the splits are short enough or should be split further. Should their length be measured by running them through a tokenizer?

'Diabeedi sõelumist tuleks teha või kaaluda järgmistel juhtudel:\n\n1) Täiskasvanud enne 45. eluaastat, kui esineb ülekaal ja/või rasvumine (KMI ≥ 25 kg/m²) ning lisaks üks või mitu riskitegurit:\n- esimese astme sugulasel on 2. tüüpi diabeet\n- suure riskiga rass / etniline taust\n- anamneesis SVH\n- hüpertensioon (vererõhk ≥ 140/90 mmHg või tarvitab vererõhuravimit)\n- HDL-Chol < 0,90 mmol/l või triglütseriidid > 2,82 mmol/l\n- polütsüstiliste munasarjade sündroom\n- vähene füüsiline aktiivsus\n- viited insuliiniresistentsusele\n\n2) Prediabeediga patsiendid (HbA1c 6,0%–6,4% ehk 42–47 mmol/mol ja/või IFG või IGT): glükoosi mõõtmine vähemalt kord aastas\n\n3) Gestatsioonidiabeedi diagnoosiga naised: vähemalt iga 3 aasta tagant\n\n4) HIV-iga patsiendid\n\n5) Kõik teised täiskasvanud alates 45. eluaastast; kui glükoos on normis, korrata sõeluuringut vähemalt iga 3 aasta tagant'

In [84]:
import hashlib
from langchain_core.documents import Document

def sha256_text(s: str) -> str:
    """Return the lowercase hex SHA-256 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

def build_breadcrumbs(md: dict) -> str:
    """
    Join the heading titles found in ``md`` into one " > "-separated trail.

    For each of the three levels the long key ("Header N") is read first and
    the short key ("hN") is used only when the long one is missing or falsy.
    Values that are not strings, or that are empty after stripping whitespace,
    are left out. Returns "" when no level contributes a title.
    """
    trail = []
    for long_key, short_key in (("Header 1", "h1"), ("Header 2", "h2"), ("Header 3", "h3")):
        title = md.get(long_key) or md.get(short_key)
        if not isinstance(title, str):
            continue
        title = title.strip()
        if title:
            trail.append(title)

    return " > ".join(trail)

# NOTE(review): this cell already defines sha256_text a few lines above; this
# second definition is behaviorally the same and simply rebinds the name.
def sha256_text(s: str) -> str:
    """Hash ``s`` (UTF-8 encoded) with SHA-256 and return the hex digest."""
    return hashlib.new("sha256", s.encode("utf-8")).hexdigest()

def enrich_splits(
    splits: list[Document],
    *,
    doc_id: str,
    version_id: str,
    published_year: int,
    language: str,
    source_path: str,
) -> list[Document]:
    """
    Return new Documents whose metadata is replaced by a fixed provenance schema.

    The splitter's header metadata is not copied over; it is only used to build
    the ``breadcrumbs`` string. Every output document keeps the original
    ``page_content`` and carries: doc_id, version_id, published_year, language,
    source_path, chunk_index (position in ``splits``), chunk_id
    ("<doc_id>::<version_id>::chunk_NNNN"), text_hash (SHA-256 of the text)
    and breadcrumbs. The input documents are not modified.
    """

    def _with_provenance(position, chunk):
        # Header keys are read here and then dropped from the output metadata.
        trail = build_breadcrumbs(dict(chunk.metadata or {}))
        return Document(
            page_content=chunk.page_content,
            metadata={
                "doc_id": doc_id,
                "version_id": version_id,
                "published_year": published_year,
                "language": language,
                "source_path": source_path,
                "chunk_index": position,
                "chunk_id": f"{doc_id}::{version_id}::chunk_{position:04d}",
                "text_hash": sha256_text(chunk.page_content),
                "breadcrumbs": trail,
            },
        )

    return [_with_provenance(position, chunk) for position, chunk in enumerate(splits)]



### Aim: each chunk should eventually be rendered like this
[Source: 2. tüüpi diabeedi diagnostika ja ravi (RJ-E/51.1-2021, 2021)]
[Section: Ravijuhendi soovituste loetelu > Ravi eesmärkväärtused]
[Chunk: recommendation #23 | Strength: tugev | Evidence: madal]
<chunk text here>

In [85]:
# Attach provenance metadata to every header-based chunk produced earlier.
# NOTE(review): source_path here differs from the md_path the text was actually
# read from in the first cell — confirm which location is the intended one.
docs = enrich_splits(
    md_header_splits,
    doc_id="t2d_guideline_ee",
    version_id="RJ-E_51.1-2021",
    published_year=2021,
    language="et",
    source_path="docs/t2d_guideline_ee/v2021_RJ-E-51.1-2021/10_canonical_md/canonical.md",
)


In [87]:
# Spot-check one enriched chunk (index 3, the same chunk that was sent to the
# model in the earlier prompt cell).
docs[3]

Document(metadata={'doc_id': 't2d_guideline_ee', 'version_id': 'RJ-E_51.1-2021', 'published_year': 2021, 'language': 'et', 'source_path': 'docs/t2d_guideline_ee/v2021_RJ-E-51.1-2021/10_canonical_md/canonical.md', 'chunk_index': 3, 'chunk_id': 't2d_guideline_ee::RJ-E_51.1-2021::chunk_0003', 'text_hash': 'e8a2d4b6b3fa0756898f79b6b9dbebc37230c555da9382d4a8ecb8738d2830b2', 'breadcrumbs': 'Prediabeet ja 2. tüüpi diabeedi diagnoosimine > Prediabeedi ja diabeedi sõeluuring sümptomiteta täiskasvanutel (2)'}, page_content='1) Sõelumist tuleks kaaluda täiskasvanutel enne 45. eluaastat, kui esineb ülekaal ja/või rasvumine (kehamassiindeks ehk KMI ≥ 25 kg/m²) ning lisaks sellele üks või mitu järgmist riskitegurit:\n- esimese astme sugulasel on diagnoositud 2. tüüpi diabeet\n- suure riskiga rass / etniline taust\n- anamneesis SVH\n- hüpertensioon (vererõhk ≥ 140/90 mmHg või tarvitab vererõhuravimit)\n- HDL-Chol < 0,90 mmol/l või triglütseriidid > 2,82 mmol/l\n- polütsüsti

In [92]:
from openai import OpenAI

# Rebinds the module-level `client` that the first cell already created with the
# same no-argument constructor; embed_and_report_lengths() below reads this global.
client = OpenAI()  # expects OPENAI_API_KEY in env

def embed_and_report_lengths(
    splits,
    model: str = "text-embedding-3-large",
    batch_size: int = 64,
    dimensions: int | None = None,   # None => model default
):
    """
    For each split (LangChain Document-like object with .page_content and .metadata),
    call OpenAI embeddings and print len(embedding) for each split.

    Returns: list of dicts with chunk_id (if present), chunk_index (if present), and embedding_length.
    """
    results = []

    # Prepare texts
    texts = [d.page_content for d in splits]

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Build request payload
        req = {"model": model, "input": batch_texts, "encoding_format": "float"}
        if dimensions is not None:
            req["dimensions"] = dimensions  # supported for text-embedding-3* models :contentReference[oaicite:2]{index=2}

        resp = client.embeddings.create(**req)

        # resp.data is aligned with inputs order
        for j, item in enumerate(resp.data):
            doc = splits[start + j]
            emb = item.embedding
            emb_len = len(emb)

            chunk_id = (doc.metadata or {}).get("chunk_id")
            chunk_index = (doc.metadata or {}).get("chunk_index", start + j)

            results.append({
                "chunk_index": chunk_index,
                "chunk_id": chunk_id,
                "embedding_length": emb_len,
            })

    return results

# Example usage:
# NOTE(review): this call shares a cell with the function definition above, so
# re-running the cell re-sends every chunk to the embeddings API.
lengths = embed_and_report_lengths(
    docs,
    model="text-embedding-3-large",
    batch_size=64,
    dimensions=None,  # omit to use the model default size
)

# Print a quick report:
# NOTE(review): 26 is hard-coded; it matches the chunk count reported for this
# document by the export cells further down — presumably meant to print every
# row. Confirm, or derive it from len(lengths).
for r in lengths[:26]:
    print(r["chunk_index"], r["chunk_id"], r["embedding_length"])

# If you just want to verify they're consistent:
# A single value in this list means every chunk got a vector of the same size.
unique_lengths = sorted({r["embedding_length"] for r in lengths})
print("Unique embedding lengths:", unique_lengths)


0 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0000 3072
1 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0001 3072
2 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0002 3072
3 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0003 3072
4 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0004 3072
5 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0005 3072
6 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0006 3072
7 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0007 3072
8 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0008 3072
9 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0009 3072
10 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0010 3072
11 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0011 3072
12 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0012 3072
13 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0013 3072
14 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0014 3072
15 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0015 3072
16 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0016 3072
17 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0017 3072
18 t2d_guideline_ee::RJ-E_51.1-2021::chunk_0018 3072
19 

### Function to put this all together

In [115]:
import json
import os
import hashlib
from pathlib import Path
from typing import Optional, List, Dict, Any

from dotenv import load_dotenv
from openai import OpenAI
from langchain_text_splitters import MarkdownHeaderTextSplitter


def sha256_text(s: str) -> str:
    """Return the SHA-256 of ``s`` (encoded as UTF-8) as a 64-character hex string."""
    raw_digest = hashlib.sha256(s.encode("utf-8")).digest()
    return raw_digest.hex()


def build_breadcrumbs(md: dict) -> str:
    """
    Build a "Header 1 > Header 2 > Header 3" trail from splitter metadata.

    Each level is looked up as "Header N", falling back to "hN" when that value
    is missing or falsy. Non-string and whitespace-only values are skipped.
    """

    def _title(level: int) -> str:
        value = md.get(f"Header {level}") or md.get(f"h{level}")
        return value.strip() if isinstance(value, str) else ""

    # filter(None, ...) drops the empty strings produced for skipped levels.
    return " > ".join(filter(None, map(_title, (1, 2, 3))))


def process_markdown_to_embedded_jsonl(
    markdown_path: str,
    output_jsonl_path: str,
    *,
    doc_id: str,
    version_id: str,
    published_year: int,
    language: str = "et",
    embedding_model: str = "text-embedding-3-large",
    batch_size: int = 64,
    dimensions: Optional[int] = None,
) -> None:
    """
    Split a markdown file by headings, embed every chunk and write JSONL.

    Each output line is one JSON object with the chunk's provenance fields,
    its text, and the embedding returned for that text.

    The file is first written next to the target as "<name>.tmp" and only moved
    over ``output_jsonl_path`` after every batch succeeded, so a failed API
    call neither truncates an existing output nor leaves a partial one behind.

    Args:
        markdown_path: Markdown file to read (UTF-8).
        output_jsonl_path: Destination file; parent directories are created.
        doc_id: Document identifier stored on every record and used in chunk ids.
        version_id: Version identifier stored on every record and used in chunk ids.
        published_year: Stored on every record as-is.
        language: Stored on every record as-is.
        embedding_model: Model name sent to the embeddings API.
        batch_size: Number of chunk texts per API request; must be >= 1.
        dimensions: Optional embedding size; omitted from the request when None.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        RuntimeError: if OPENAI_API_KEY is not set, or a response does not
            contain one embedding per input text.
    """
    # A negative step would skip the embedding loop entirely and still report success.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment")

    client = OpenAI(api_key=api_key)

    md_path = Path(markdown_path)
    out_path = Path(output_jsonl_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    source_file = md_path.name
    source_path = str(md_path)

    markdown_text = md_path.read_text(encoding="utf-8")

    # 1) Context-aware splitting
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    splits = splitter.split_text(markdown_text)

    # 2) Build flat records
    records: List[Dict[str, Any]] = []
    for i, d in enumerate(splits):
        md_in = dict(d.metadata or {})
        breadcrumbs = build_breadcrumbs(md_in)

        text = d.page_content
        text_hash = sha256_text(text)
        chunk_id = f"{doc_id}::{version_id}::chunk_{i:04d}"

        # Typesense requires a top-level "id"; hashing chunk id + text hash
        # gives a new id whenever either the position or the content changes.
        ts_id = sha256_text(f"{chunk_id}::{text_hash}")

        records.append({
            "id": ts_id,
            "doc_id": doc_id,
            "version_id": version_id,
            "published_year": published_year,
            "language": language,
            "source_file": source_file,
            "source_path": source_path,
            "chunk_index": i,
            "chunk_id": chunk_id,
            "breadcrumbs": breadcrumbs,
            "text_hash": text_hash,
            "text": text,
        })

    # 3) Embed + write JSONL (to a temporary sibling first, then swap in)
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            for start in range(0, len(records), batch_size):
                batch = records[start:start + batch_size]
                inputs = [r["text"] for r in batch]

                req: Dict[str, Any] = {
                    "model": embedding_model,
                    "input": inputs,
                    "encoding_format": "float",
                }
                if dimensions is not None:
                    req["dimensions"] = dimensions

                resp = client.embeddings.create(**req)
                # Records and embeddings are paired by position below.
                if len(resp.data) != len(batch):
                    raise RuntimeError("Embedding count mismatch")

                for record, item in zip(batch, resp.data):
                    emb = item.embedding
                    record_out = {
                        **record,
                        "embedding_model": embedding_model,
                        "embedding_dimensions": len(emb),
                        "embedding": emb,
                    }
                    f.write(json.dumps(record_out, ensure_ascii=False) + "\n")

        # Reached only when every batch was written.
        tmp_path.replace(out_path)
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    print(f"‚úÖ Wrote {len(records)} Typesense-ready documents to: {out_path}")


In [116]:
# Embed the normalized guideline and write one JSON line per chunk.
# `dimensions` is not passed, so the function's default (None) applies.
process_markdown_to_embedded_jsonl(
    markdown_path="data/t2d_guideline_ee/20_normalized_md/2-tuubi-diabeedi-diagnostika-ravi.md",
    output_jsonl_path="data/t2d_guideline_ee/30_embeddings/2-tuubi-diabeedi-diagnostika-ravi.jsonl",
    doc_id="t2d_guideline_ee",
    version_id="RJ-E_51.1-2021",
    published_year=2021,
    language="et",
    embedding_model="text-embedding-3-large",
    batch_size=64,
)


‚úÖ Wrote 26 Typesense-ready documents to: data/t2d_guideline_ee/30_embeddings/2-tuubi-diabeedi-diagnostika-ravi.jsonl


In [117]:
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Any

from langchain_text_splitters import MarkdownHeaderTextSplitter


def sha256_text(s: str) -> str:
    """Hex-encoded SHA-256 digest of the UTF-8 bytes of ``s``."""
    payload = s.encode("utf-8")
    return hashlib.sha256(payload).hexdigest()


def build_breadcrumbs(md: dict) -> str:
    """
    Collapse up to three heading levels from ``md`` into "A > B > C".

    Per level, "Header N" wins over "hN" unless it is missing or falsy; values
    that are not strings or are blank after stripping are omitted.
    """
    key_pairs = (("Header 1", "h1"), ("Header 2", "h2"), ("Header 3", "h3"))
    raw_titles = (md.get(primary) or md.get(alias) for primary, alias in key_pairs)
    stripped = (title.strip() for title in raw_titles if isinstance(title, str))
    return " > ".join(title for title in stripped if title)


def process_markdown_to_jsonl(
    markdown_path: str,
    output_jsonl_path: str,
    *,
    doc_id: str,
    version_id: str,
    published_year: int,
    language: str = "et",
) -> None:
    """
    Split a markdown file by headings and write one flat JSON record per chunk.

    No embeddings are computed. Parent directories of ``output_jsonl_path`` are
    created, the file is overwritten, and each line carries: id, doc_id,
    version_id, published_year, language, source_file, source_path,
    chunk_index, chunk_id, breadcrumbs, text_hash and text.
    """
    source = Path(markdown_path)
    target = Path(output_jsonl_path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # 1) Context-aware splitting on the first three heading levels
    chunks = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    ).split_text(source.read_text(encoding="utf-8"))

    # Identical for every record of this file.
    shared_fields = {
        "doc_id": doc_id,
        "version_id": version_id,
        "published_year": published_year,
        "language": language,
        "source_file": source.name,
        "source_path": str(source),
    }

    # 2) Build flat records and write JSONL
    with target.open("w", encoding="utf-8") as sink:
        for position, chunk in enumerate(chunks):
            trail = build_breadcrumbs(dict(chunk.metadata or {}))
            body = chunk.page_content
            digest = sha256_text(body)
            chunk_key = f"{doc_id}::{version_id}::chunk_{position:04d}"

            # "id" leads the record; it is derived from chunk id + text hash.
            row = {
                "id": sha256_text(f"{chunk_key}::{digest}"),
                **shared_fields,
                "chunk_index": position,
                "chunk_id": chunk_key,
                "breadcrumbs": trail,
                "text_hash": digest,
                "text": body,
            }
            sink.write(json.dumps(row, ensure_ascii=False) + "\n")

    print(f"‚úÖ Wrote {len(chunks)} JSONL chunks (no embeddings) to: {target}")


In [118]:
# Export the chunks without embeddings (flat records with a top-level "id").
process_markdown_to_jsonl(
    markdown_path="data/t2d_guideline_ee/20_normalized_md/2-tuubi-diabeedi-diagnostika-ravi.md",
    output_jsonl_path="data/t2d_guideline_ee/30_chunks/2-tuubi-diabeedi-diagnostika-ravi.jsonl",
    doc_id="t2d_guideline_ee",
    version_id="RJ-E_51.1-2021",
    published_year=2021,
    language="et",
    
)


‚úÖ Wrote 26 JSONL chunks (no embeddings) to: data/t2d_guideline_ee/30_chunks/2-tuubi-diabeedi-diagnostika-ravi.jsonl


In [11]:
import json
from pathlib import Path
from langchain_text_splitters import MarkdownHeaderTextSplitter

def process_markdown_to_jsonl(
    file_path: str,
    output_file_path: str,
    guideline_name: str,
    # Manual Configuration Fields
    class_name: str = "t2dm_guideline_ee",
    version_id: str = "1.0",
    published_year: int = 2021,
    language: str = "et"
) -> None:
    """
    Parses a markdown file into Weaviate-ready JSONL with integer IDs
    and combined search text for context-aware retrieval.

    NOTE(review): this rebinds the name of the earlier keyword-only
    ``process_markdown_to_jsonl`` (flat records with a top-level "id") and has
    a different signature and output shape.

    Args:
        file_path: Markdown file to read (UTF-8).
        output_file_path: JSONL file to write; it is overwritten, and missing
            parent directories are created.
        guideline_name: Stored as ``source`` on every record.
        class_name: Stored as the top-level ``class`` of every record.
        version_id: Stored on every record as-is.
        published_year: Stored as ``year`` on every record.
        language: Stored on every record as-is.

    Each output line has the shape ``{"class": ..., "properties": {...}}``
    where ``properties`` holds chunk_id (the chunk's 0-based position), source,
    version_id, year, language, breadcrumbs, text and search_text.
    """
    md_path = Path(file_path)

    # 1. Read Markdown File
    markdown_text = md_path.read_text(encoding="utf-8")

    # 2. Configure Splitter (Context-Aware)
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    # 3. Split Text
    splits = splitter.split_text(markdown_text)

    # open() does not create directories; without this a fresh checkout fails
    # with FileNotFoundError (the other two export functions create it too).
    Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

    # 4. Write to JSONL
    # We use 'w' to overwrite or 'a' to append. 'w' is safer for fresh runs.
    with open(output_file_path, "w", encoding="utf-8") as f:
        for i, doc in enumerate(splits):

            # A. Build Breadcrumbs (Hierarchy)
            # MarkdownHeaderTextSplitter stores headers in metadata
            md = doc.metadata
            breadcrumbs_list = [
                md.get("Header 1"),
                md.get("Header 2"),
                md.get("Header 3")
            ]
            # Filter out None values and join with " > "
            breadcrumbs = " > ".join([h for h in breadcrumbs_list if h])

            # B. Prepare Content Fields
            clean_text = doc.page_content

            # The "Context-Enriched" field for the Embedding Model
            # Combines hierarchy + content
            combined_search_text = f"{breadcrumbs}\n{clean_text}" if breadcrumbs else clean_text

            # C. Construct the Object
            record = {
                "class": class_name,
                "properties": {
                    "chunk_id": i,  # Simple Integer (0, 1, 2...)
                    "source": guideline_name,
                    "version_id": version_id,
                    "year": published_year,
                    "language": language,
                    "breadcrumbs": breadcrumbs,
                    "text": clean_text,          # Clean text for LLM/Reading
                    "search_text": combined_search_text # Enriched text for Vectorizing
                }
            }

            f.write(json.dumps(record, ensure_ascii=False) + "\n")

    print(f"‚úÖ Successfully converted {len(splits)} chunks.")
    print(f"üìÇ Output saved to: {output_file_path}")

In [15]:
# Export the chunks in the {"class", "properties"} record shape.
# NOTE(review): output_file_path is the same file the earlier flat-record
# export wrote to, so running this cell overwrites that output.
process_markdown_to_jsonl(
    file_path="data/t2d_guideline_ee/20_normalized_md/2-tuubi-diabeedi-diagnostika-ravi.md",
    output_file_path="data/t2d_guideline_ee/30_chunks/2-tuubi-diabeedi-diagnostika-ravi.jsonl",
    guideline_name="2. t√º√ºbi diabeedi diagnostika ja ravi",
    class_name= "t2dm_guideline_ee",
    version_id= "RJ-E_51.1-2021",
    published_year= 2021,
    language= "et"
    
)


‚úÖ Successfully converted 26 chunks.
üìÇ Output saved to: data/t2d_guideline_ee/30_chunks/2-tuubi-diabeedi-diagnostika-ravi.jsonl
