In [1]:
%pip install docling quackling llama-index llama-index-llms-openllm
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import PipelineOptions
import logging
import os
from dotenv import load_dotenv

# Send INFO-and-above records to the default handler, then grab a logger
# named after the current module for use in this notebook.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

Collecting docling
  Downloading docling-2.4.0-py3-none-any.whl.metadata (6.1 kB)
Collecting quackling
  Downloading quackling-0.4.1-py3-none-any.whl.metadata (8.4 kB)
Collecting llama-index
  Downloading llama_index-0.11.22-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-llms-openllm
  Downloading llama_index_llms_openllm-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting beautifulsoup4<5.0.0,>=4.12.3 (from docling)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting certifi>=2024.7.4 (from docling)
  Using cached certifi-2024.8.30-py3-none-any.whl.metadata (2.2 kB)
Collecting deepsearch-glm<0.27.0,>=0.26.1 (from docling)
  Downloading deepsearch_glm-0.26.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.3.0 (from docling)
  Downloading docling_core-2.3.1-py3-none-any.whl.metadata (5.4 kB)
Collecting docling-ibm-models<3.0.0,>=2.0.3 (from docling)
  Downloading docling_ibm_models-2.0

In [2]:
# Location of the PDF to convert. Set the SOURCE_PDF environment variable to point the
# notebook at another file; the original author's local path stays as the fallback so
# existing setups behave exactly as before.
# Note: this is read from the process environment as it is when this cell runs
# (the .env file is only loaded by a later cell).
source = os.getenv(
    "SOURCE_PDF",
    "/home/noelo/dev/instruct-injest/data/CELEX_32021R1173_EN_TXT.pdf",
)

# Convert the document with default pipeline options and report how many pages came back.
converter = DocumentConverter(pipeline_options=PipelineOptions())
result = converter.convert_single(source)
print(len(result.pages))

  from .autonotebook import tqdm as notebook_tqdm
Fetching 10 files: 100%|██████████| 10/10 [00:05<00:00,  1.88it/s]
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.document_converter:Processing document CELEX_32021R1173_EN_TXT.pdf
INFO:docling.document_converter:Finished converting page batch time=4.884
INFO:docling.document_converter:Finished converting page batch time=4.637
INFO:docling.document_converter:Finished converting page batch time=4.532
INFO:docling.document_converter:Finished converting page batch time=4.305
INFO:docling.document_converter:Finished converting page batch time=4.126
INFO:docling.document_converter:Finished converting page batch time=4.202
INFO:docling.document_converter:Finished converting page batch time=4.211
INFO:docling.document_converter:Finished converting page batch time=4.087
INFO:docling.document_converter:Finished converting page batch time=3.805
INFO:docling.document_converter:Finished converting page batch time=4.

49


In [None]:
# Load settings from a .env file into the process environment.
load_dotenv()

# Report only whether the secret is present. Cell outputs are stored inside the
# notebook file, so printing the key itself would leak it to anyone the notebook
# is shared with or committed to.
print("API_KEY is set" if os.getenv("API_KEY") else "API_KEY is NOT set")
print(os.getenv("LLM_URL"))

In [None]:
from quackling.llama_index.node_parsers import HierarchicalJSONNodeParser
from quackling.llama_index.readers import DoclingPDFReader

# Node parser that is later handed to the ingestion pipeline as its transformation.
node_parser = HierarchicalJSONNodeParser()

# PDF reader configured with the JSON parse type.
reader = DoclingPDFReader(
    parse_type=DoclingPDFReader.ParseType.JSON,
)

In [None]:
# Load the PDF at `source` through the Docling reader; `docs` is what the
# ingestion pipeline consumes in a later cell.
docs = reader.load_data(file_path=source)

In [None]:
from rich.pretty import pprint
# Pretty-print the loaded documents with rich, passing limits on sequence length,
# string length and nesting depth so the output stays short.
pprint(docs, max_length=2, max_string=250, max_depth=4)

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Ingestion pipeline whose only transformation is the node parser.
pipeline = IngestionPipeline(transformations=[node_parser])

# Run the loaded documents through it and report how many nodes were produced.
nodes = pipeline.run(documents=docs)
print(len(nodes))
# Tip: to inspect the content, loop over `nodes` and print each node's `.text`.

In [None]:
# Concatenate the text of every node, in order and without a separator, into one string.
# `str.join` builds the result in a single pass instead of re-copying the accumulated
# string on every `+=`.
raw_text = "".join(node.text for node in nodes)

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer published with the nomic embedding model.
tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')

# Tokenize only the first 10,000 characters of the concatenated node text.
sample_text = raw_text[:10000]
encoded_output = tokenizer(sample_text)

In [None]:
# Number of token ids the tokenizer produced for the sampled text.
print(len(encoded_output.input_ids))

In [None]:
from llama_index.llms.openllm import OpenLLM  

In [None]:
from __future__ import annotations

from typing import List, Optional

from pydantic import BaseModel, Field, constr


# One question/answer pair; both strings are constrained to at least one character.
# Documented with comments rather than a docstring on purpose: this model is part of
# the JSON schema embedded in the generation prompt, and a class docstring would
# presumably be copied into that schema — confirm before adding one.
class QuestionsAndAnswer(BaseModel):
    question: constr(min_length=1)
    answer: constr(min_length=1)


# A context string together with the question/answer pairs that belong to it.
# (Comments only, no docstring: this model feeds the JSON schema used in the prompt.)
class SeedExample(BaseModel):
    context: constr(min_length=1)
    # Required field (`...`), declared with min_items=3.
    # NOTE(review): `set=True` is not a Field argument I can confirm — presumably it was
    # meant to mark the list as unique items; verify that it has the intended effect.
    questions_and_answers: List[QuestionsAndAnswer] = Field(
        ..., min_items=3, set=True
    )


# Top-level model whose JSON schema is inserted into the generation prompt.
# Every field is optional and defaults to None.
# (Comments only, no docstring: a docstring would presumably alter the emitted schema.)
class QNAModel(BaseModel):
    created_by: Optional[constr(min_length=1)] = None
    # Free-text domain label; the examples are passed to Field as schema metadata.
    domain: Optional[constr(min_length=1)] = Field(
        None, examples=['Chemistry', 'History', 'Pop culture']
    )
    # When present, declared with min_items=5.
    # NOTE(review): `set=True` is not a Field argument I can confirm — verify its effect.
    seed_examples: Optional[List[SeedExample]] = Field(
        None, min_items=5, set=True
    )


In [None]:
# Instruction prefix for the LLM: asks for 30 grounded question/answer pairs grouped by
# context, and appends the JSON schema of QNAModel that the answer must follow.
# The document text is concatenated after this prefix when the LLM is called.
gen_prompt=f"you are a helpful writing assistant. Given the following context generate 30 question and answer pairs. Ensure that the questions can be answered by the context given.  Group the questions into different semantic contexts. Also output a description of the contexts. You MUST answer using the following json schema: {QNAModel.schema_json()}"

In [None]:
# Client for the LLM endpoint; base URL and API key are read from the environment.
llm = OpenLLM(
    model="mistral-7b-instruct", api_base=os.getenv("LLM_URL"),
    api_key=os.getenv("API_KEY")
)

# Stream the completion for the prompt plus the first 5,000 characters of document text
# and rebuild the full answer from the incremental pieces.
# Each streamed response carries the text generated so far in `.text` and only the
# newest fragment in `.delta`; appending `.text` on every iteration would repeat all
# earlier output, so the fragments are accumulated instead.
# (A non-streaming alternative is `llm.complete(...)` with the same prompt.)
# The loop variable keeps the name `it` because the next cell prints it.
comp_resp = ''
for it in llm.stream_complete(gen_prompt+raw_text[:5000],max_tokens=4000,timeout=120.0):
    comp_resp += it.delta or ''


In [None]:
# `it` is the loop variable left over from the streaming loop in the previous cell,
# i.e. the last response object that was yielded.
# NOTE(review): presumably its string form is the complete generated text — confirm.
print(it)

In [None]:

%pip -q install semantic-router semantic-chunkers

from semantic_router.encoders.fastembed import FastEmbedEncoder

# Encoder built with its default settings; it is passed as `encoder=` to both the
# rolling-window splitter and the statistical chunker in later cells.
encoder = FastEmbedEncoder()

In [None]:


from semantic_router.splitters import RollingWindowSplitter
from semantic_router.utils.logger import logger

# Quieten the splitter: only warnings and above are logged.
logger.setLevel("WARNING")

# Rolling-window splitter driven by the shared encoder.
# (window_size is not passed; the author had experimented with a value of 2.)
splitter = RollingWindowSplitter(
    encoder=encoder,
    dynamic_threshold=True,
    min_split_tokens=100,
    max_split_tokens=4000,
    plot_splits=True,        # visualise the chunking
    enable_statistics=True,  # print chunking statistics
)

# Split the Markdown export of the converted document (passed as a one-element list).
markdown_text = result.output.export_to_markdown()
splits = splitter([markdown_text])

In [None]:
from semantic_chunkers import StatisticalChunker

# Statistical chunker on the shared encoder: statistics enabled, plotting disabled,
# chunk size bounded to 300-2000 tokens.
chunker = StatisticalChunker(
    encoder=encoder,
    enable_statistics=True,
    plot_chunks=False,
    min_split_tokens=300,
    max_split_tokens=2000,
)

# Chunk the Markdown export of the converted document (passed as a one-element list).
document_markdown = result.output.export_to_markdown()
chunks = chunker(docs=[document_markdown])


In [None]:
# Show the first entry of `chunks` using the chunker's own print helper.
# NOTE(review): presumably chunks[0] holds the chunks of the single input document — confirm.
chunker.print(chunks[0])