In [None]:
# Install the notebook's dependencies into the kernel's environment.
# Only onnxruntime is pinned to an exact version; the remaining packages are
# unpinned, so the versions resolved depend on when this cell is run.
%pip install onnxruntime==1.19.2
%pip install fastembed
# -q keeps pip's output quiet for the larger installs below.
%pip -q install docling quackling llama-index llama-index-llms-openllm pydantic-yaml
%pip -q install semantic-router semantic-chunkers

In [None]:
from __future__ import annotations

import json
import logging
import os
from typing import Annotated, List

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from llama_index.llms.openllm import OpenLLM
from pydantic import BaseModel, Field
from pydantic import ValidationError
from pydantic_core import from_json
from pydantic_yaml import to_yaml_str
from semantic_chunkers import StatisticalChunker
from semantic_router.encoders.fastembed import FastEmbedEncoder

# Logger used throughout the notebook; the root logger is configured at INFO.
_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Load settings from a .env file (if present) so that the later os.getenv()
# lookups (MODEL_NAME, LLM_URL, API_KEY) can be satisfied without exporting
# them in the shell.
load_dotenv()

In [None]:
# Path of the PDF to convert. It can be overridden with the SOURCE_PDF
# environment variable (or a .env entry) so the notebook is not tied to one
# developer's home directory; the original location remains the default.
source = os.getenv(
    "SOURCE_PDF",
    "/home/noelo/dev/instruct-injest/data/CELEX_32021R1173_EN_TXT.pdf",
)

# OCR and table-structure recognition are both switched off for the conversion.
# NOTE(review): `pipeline_options=`, `convert_single` and `result.output` look
# like the docling 1.x API — confirm against the installed docling version,
# since the install cell does not pin it.
converter = DocumentConverter(
    pipeline_options=PipelineOptions(do_ocr=False, do_table_structure=False)
)
result = converter.convert_single(source)
_log.info(len(result.pages))

# Markdown rendering of the converted document; this is the text that gets chunked.
raw_text = result.output.export_to_markdown()

This area needs work. It's not clear from the docs whether the context passed to InstructLab can be a summarization of the actual text from the knowledge document.
Depending on the answer, the max_split_tokens value may need to change.


In [None]:
# Upper bound handed to the chunker as max_split_tokens.
CONTEXT_MAX_SPLIT_TOKENS = 500

# Embedding encoder (library defaults) that the chunker uses.
encoder = FastEmbedEncoder()

# Statistical chunker with a 200-token minimum split size, capped at
# CONTEXT_MAX_SPLIT_TOKENS, with statistics and chunk plotting enabled.
chunker = StatisticalChunker(
    encoder=encoder,
    min_split_tokens=200,
    max_split_tokens=CONTEXT_MAX_SPLIT_TOKENS,
    enable_statistics=True,
    plot_chunks=True,
)
%pip show onnxruntime

In [None]:
# Chunk the converted markdown. A single document is passed in, and the
# generation loop later reads that document's chunks from chunks[0].
chunks = chunker(docs=[raw_text])

In [None]:
# Keep the notebook logger at INFO (switch to DEBUG to see the full prompts).
_log.setLevel(logging.INFO)

# LLM client; model name, endpoint and key are all read from the environment.
llm_base = OpenLLM(
    api_key=os.getenv("API_KEY"),
    api_base=os.getenv("LLM_URL"),
    model=os.getenv("MODEL_NAME"),
)

Design Notes

1. Do the answers to the questions have to come from the actual context in the file, or can the context be a summarization of the information in the knowledge markdown files?
Every fact should be supported by the context, but the answers do not need to be verbatim.

2. The docs say that "Each qna.yaml file needs at least three question and answer pairs per context chunk with a maximum token count of 250 tokens.". Is that 250 tokens per context or per question and answer pair?
The 250 is an approximate number based on the maximum total size for SDG. The total tokens of Context + 3 Q&A must be less than 750 tokens. To have enough data in a context to answer the questions, approximately 500 tokens are recommended for the context, and the remaining 250 for the 3 Q&A.
In the end, the Q&A length is not a problem as long as the context + 3 Q&As remain < 750 tokens.

3. Also from the docs, "Each qna.yaml needs five context blocks and has a maximum token count of 500 tokens." Is that per context or for all contexts?
This is per context, and the recommended 500 is to ensure there is enough data in the context to answer the questions. It can be less or it can be more, as long as the final length of Context + 3 Q&A < 750 tokens.

In [None]:
# Token budgets taken from the design notes: roughly 500 tokens for a context
# and 250 for its three question/answer pairs (750 in total).
# MAX_TOKENS_QNA is the max_tokens limit passed to the LLM completion call.
MAX_TOKENS_CONTEXT=500
MAX_TOKENS_QNA=250

In [None]:
# NOTE: the models below are documented with comments rather than docstrings on
# purpose. pydantic copies a model's docstring into the "description" of its
# JSON schema, and the SeedExample schema is embedded in the LLM prompt, so a
# docstring would change what the model is sent.


# A single question with its answer.
class QuestionAndAnswer(BaseModel):
    question: str
    answer: str


# One context passage plus the question/answer pairs written against it.
class SeedExample(BaseModel):
    # max_length on a str field limits characters, not tokens.
    context: Annotated[str, Field(None, max_length=500)]
    # At least three pairs when a value is supplied. `min_length` replaces the
    # deprecated `min_items`, and the former extra keyword `set=True` is carried
    # in `json_schema_extra` so the generated schema stays the same without
    # triggering pydantic deprecation warnings.
    questions_and_answers: List[QuestionAndAnswer] = Field(
        None, min_length=3, json_schema_extra={"set": True}
    )


# Top-level qna document: metadata plus the list of seed examples.
class QNAModel(BaseModel):
    version: Annotated[int, Field(3)]
    created_by: Annotated[str, Field(None)]
    domain: Annotated[str, Field(None)]
    # At least five seed examples when a value is supplied.
    seed_examples: Annotated[
        List[SeedExample],
        Field(None, min_length=5, json_schema_extra={"set": True}),
    ]


print(QNAModel.model_json_schema())

In [None]:
# Prompt prefix: asks the model for one seed example with three Q&A pairs, as JSON.
gen_prompt = "You are a helpful question and answer writing assistant. Given the following information generate 1 seed examples containing 3 question and answer pairs. Ensure that the questions can be answered by the information given. Do not number the pairs.  All output MUST be in valid JSON format.\n\nInformation:"

# Prompt suffix: the SeedExample schema the output must follow. It is serialised
# with json.dumps so the model is shown real JSON (double quotes, true/null)
# instead of the Python dict repr.
json_prompt = (
    "\n\nHere's a JSON schema to follow: "
    + json.dumps(SeedExample.model_json_schema())
    + ".\n\nOutput a valid JSON object but do not repeat the schema."
)

# Generate one seed example per chunk of the (single) input document.
for ch in chunks[0]:

    llm_msg = gen_prompt + ch.content + json_prompt
    _log.debug(llm_msg)

    it = llm_base.complete(llm_msg, max_tokens=MAX_TOKENS_QNA, timeout=120.0)

    # Keep only the text from the first '{' onwards; the model sometimes emits
    # some rubbish before the JSON object.
    json_start = it.text.find("{")
    if json_start == -1:
        # No JSON object at all: slicing from -1 would silently keep just the
        # last character, so report the response and move on to the next chunk.
        _log.error("No JSON object found in LLM response: %r", it.text)
        continue
    extracted_json = it.text[json_start:]

    try:
        # allow_partial lets a response that was cut off by max_tokens still parse.
        res = SeedExample.model_validate(
            from_json(extracted_json, allow_partial=True, cache_strings="keys")
        )
        # Use the chunk's own text as the context rather than whatever the model returned.
        res.context = ch.content
        yml = to_yaml_str(res)
        print(yml)
    except (ValidationError, ValueError) as e:
        # Lazy %-style arguments: the original passed the JSON as a format
        # argument to a message with no placeholder, which breaks log formatting.
        _log.error("Failed to parse or validate LLM output: %s\n%s", e, extracted_json)