In [None]:
# Install the notebook's third-party dependencies with the %pip magic.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# whose APIs differ from the ones the cells below were written against.
%pip -q install docling quackling llama-index llama-index-llms-openllm
%pip -q install semantic-router semantic-chunkers

In [None]:
# `from __future__` imports must be the first statement of the cell,
# otherwise Python raises a SyntaxError.
from __future__ import annotations

import json
import logging
import os
from typing import Annotated, List, Optional

from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter
from dotenv import load_dotenv
from llama_index.llms.openllm import OpenLLM
from pydantic import BaseModel, Field, constr
from semantic_chunkers import StatisticalChunker
from semantic_router.encoders.fastembed import FastEmbedEncoder

# Root logging is configured at INFO; the notebook logs through its own
# named logger.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# Load settings from a .env file; later cells read MODEL_NAME, LLM_URL and
# API_KEY through os.getenv.
load_dotenv()

In [None]:
# Path of the PDF to convert. It can be overridden with the SOURCE_PDF
# environment variable (or a .env entry) so the notebook is not tied to one
# developer's machine; the default is the original hard-coded location.
source = os.getenv(
    "SOURCE_PDF",
    "/home/noelo/dev/instruct-injest/data/CELEX_32021R1173_EN_TXT.pdf",
)

# Convert the document with default pipeline options.
converter = DocumentConverter(pipeline_options=PipelineOptions())
result = converter.convert_single(source)

# Logs the number of entries in result.pages.
_log.info(len(result.pages))

# Markdown export of the converted document; this is the text that gets chunked.
raw_text = result.output.export_to_markdown()

In [4]:
class QuestionsAndAnswer(BaseModel):
    # A single question/answer pair; both fields must be non-empty strings.
    # Documented with `#` comments on purpose: pydantic emits a class
    # docstring as the model "description" in its generated JSON schema.
    question: constr(min_length=1)
    answer: constr(min_length=1)


class SeedExample(BaseModel):
    # One context string together with the question/answer pairs for it.
    # Documented with `#` comments on purpose: pydantic emits a class
    # docstring as the model "description" in its generated JSON schema.

    # Must be a non-empty string.
    context: constr(min_length=1)

    # Required, with at least three pairs. `min_length` is the pydantic v2
    # name for the deprecated v1 `min_items`.
    # NOTE(review): `set` is not a pydantic constraint. It was previously
    # passed as an unknown Field kwarg, which pydantic v2 (with a deprecation
    # warning) forwards to the JSON schema as an extra `"set": true` key.
    # `json_schema_extra` keeps that key without the warning. If uniqueness
    # of the pairs was intended, it needs an explicit validator.
    questions_and_answers: List[QuestionsAndAnswer] = Field(
        ..., min_length=3, json_schema_extra={"set": True}
    )

class QNAModel(BaseModel):
    # Top-level model: author, domain and a list of seed examples.
    # Documented with `#` comments on purpose: pydantic emits a class
    # docstring as the model "description" in its generated JSON schema.

    # NOTE(review): all three fields default to None although their declared
    # types do not include None (`Optional` is imported but unused) — confirm
    # whether they should be Optional or required.
    created_by: Annotated[str, Field(None)]
    domain: Annotated[str, Field(None)]

    # When provided, at least five seed examples are required. `min_length`
    # is the pydantic v2 name for the deprecated v1 `min_items`.
    # NOTE(review): `set` is not a pydantic constraint; it only appears as an
    # extra `"set": true` key in the JSON schema. `json_schema_extra` keeps
    # that key without the v2 "extra keyword arguments" deprecation warning.
    seed_examples: Annotated[
        List[SeedExample],
        Field(None, min_length=5, json_schema_extra={"set": True}),
    ]

In [None]:
# Encoder handed to the chunker below.
encoder = FastEmbedEncoder()

# Statistical chunker configured with split sizes between 300 and 2000
# tokens; statistics are enabled and plotting is off.
chunker = StatisticalChunker(
    encoder=encoder,
    enable_statistics=True,
    plot_chunks=False,
    min_split_tokens=300,
    max_split_tokens=2000,
)

# A single document is passed in.
# Presumably the result holds one list of chunks per input document, so the
# chunks for raw_text would be chunks[0] — TODO confirm.
chunks = chunker(docs=[raw_text])

In [6]:
# Instruction prefix for every LLM request; the chunk text is appended after
# "Context:". The schema is serialised with json.dumps so the prompt carries
# valid JSON (double quotes, true/null) rather than the repr of a Python dict.
gen_prompt2 = (
    "You are a helpful question and answer writing assistant. "
    "Given the following context generate 3 question and answer pairs. "
    "Ensure that the questions can be answered by the context given. "
    "Do not number the pairs. "
    "Also generate a description of the contexts. "
    "You MUST only answer using the following json schema: "
    f"{json.dumps(QNAModel.model_json_schema())} \n Context:"
)

In [None]:
# The notebook logger stays at INFO, so the full prompt logged with
# _log.debug() below only appears if this is lowered to logging.DEBUG.
_log.setLevel(level=logging.INFO)

# Model name, endpoint and API key are read from the environment.
llm_base = OpenLLM(
    model=os.getenv("MODEL_NAME"),
    api_base=os.getenv("LLM_URL"),
    api_key=os.getenv("API_KEY"),
)

# Structured-output wrapper around the same LLM. It is not used by the loop
# below, which calls llm_base directly.
llm2 = llm_base.as_structured_llm(output_cls=QNAModel)

# Not used in this cell.
comp_resp = ''

for ch in chunks[0]:
    # Prompt = shared instruction prefix + the text of this chunk.
    llm_msg = gen_prompt2 + ch.content
    _log.debug(llm_msg)

    # Reset per chunk: if the stream yields nothing, `it` must not keep the
    # previous chunk's response (or be undefined on the first chunk).
    it = None

    # Drain the stream; only the last yielded item is kept.
    # Presumably that item carries the complete response text — TODO confirm.
    for it in llm_base.stream_complete(llm_msg, max_tokens=4000, timeout=120.0):
        pass

    if it is None:
        _log.warning("LLM stream returned no response for this chunk; skipping")
        continue

    print(it)