In [1]:
# remove warnings
import warnings
warnings.filterwarnings("ignore")



In [None]:
!pip install tenacity loguru huggingface_hub semantic-text-splitter openai
!pip install indoxarcg torch docling torchvision

# install the required libraries

# Checking version of indoxArcg

In [9]:
import indoxArcg 
indoxArcg.__version__

'0.1.1'

## Adding your Api Key to the environment variables

In [10]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
HUGGINGFACE_API_KEY = os.environ['HUGGINGFACE_API_KEY']


In [None]:
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual Hugging Face token
login(token=HUGGINGFACE_API_KEY)

In [12]:
from indoxArcg.llms import OpenAi
from indoxArcg.embeddings import OpenAiEmbedding
from indoxArcg.data_loaders import Txt,DoclingReader
from indoxArcg.splitter import RecursiveCharacterTextSplitter,SemanticTextSplitter
from indoxArcg.pipelines.cag import CAG, KVCache
from pprint import pprint

In [16]:
llm = OpenAi(model="gpt-4o-mini",api_key=OPENAI_API_KEY)
embed_model = OpenAiEmbedding(api_key=OPENAI_API_KEY, model="text-embedding-3-small")


[32mINFO[0m: [1mInitializing OpenAi with model: gpt-4o-mini[0m
[32mINFO[0m: [1mOpenAi initialized successfully[0m
[32mINFO[0m: [1mInitialized OpenAiEmbedding with model: text-embedding-3-small[0m


In [17]:
# check if the sample.txt file exists

import os

if os.path.exists("sample.txt"):
    print("sample.txt exists")
else:
    !wget https://raw.githubusercontent.com/osllmai/inDox/refs/heads/master/cookbook/indoxArcg/sample.txt
    print("sample.txt does not exist")



sample.txt exists


In [19]:
# Load the text file using Txt loader
txt_loader = Txt(txt_path="sample.txt")

# Initialize a RecursiveCharacterTextSplitter to split the text into smaller chunks
splitter = RecursiveCharacterTextSplitter()

# Load the content of the text file
docs = txt_loader.load()

# Split the loaded text into smaller chunks using the splitter
split_docs = splitter.split_text(text=docs)

# Print the number of chunks created
print(len(split_docs ))

# Print the first chunk of the split text
print(split_docs[0])


38
The wife of a rich man fell sick, and as she felt that her end

was drawing near, she called her only daughter to her bedside and

said, dear child, be good and pious, and then the

good God will always protect you, and I will look down on you

from heaven and be near you.  Thereupon she closed her eyes and

departed.  Every day the maiden went out to her mother's grave,


In [20]:
# check if the sample.txt file exists

import os

if os.path.exists("LLM output verification.pdf"):
    print("LLM output verification.pdf  exists")
else:
    !wget "https://raw.githubusercontent.com/osllmai/inDox/refs/heads/master/cookbook/indoxArcg/LLM output verification.pdf"
    print("sample.txt does not exist")



LLM output verification.pdf  exists


In [21]:
pdf_file_path = "LLM output verification.pdf"
docling_reader = DoclingReader(file_path=pdf_file_path)

In [22]:
pdf_doc = docling_reader.load()

2025-03-22 19:28:08,486 - docling.document_converter - INFO - Going to convert document batch...
2025-03-22 19:28:08,512 - docling.models.factories.base_factory - INFO - Loading plugin 'docling_defaults'
2025-03-22 19:28:08,512 - docling.models.factories - INFO - Registered ocr engines: ['easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-03-22 19:28:08,801 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-03-22 19:28:11,591 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-03-22 19:28:39,101 - docling.utils.accelerator_utils - INFO - Accelerator device: 'mps'
2025-03-22 19:28:39,853 - docling.models.factories.base_factory - INFO - Loading plugin 'docling_defaults'
2025-03-22 19:28:39,853 - docling.models.factories - INFO - Registered picture descriptions: ['vlm', 'api']
2025-03-22 19:28:39,855 - docling.pipeline.base_pipeline - INFO - Processing document LLM output verification.pdf
2025-03-22 19:28:52,869 - docling.document_c

In [23]:
text_docs = pdf_doc.document.export_to_text()



In [37]:
# Initialize a SemanticTextSplitter instance to split the text into meaningful semantic chunks
semantic_splitter = SemanticTextSplitter()

# Use the semantic splitter to divide the extracted text from the PDF into smaller, coherent sections
pdf_doc_split = semantic_splitter.split_text(text_docs)

# Print the second chunk of the split text to verify the output
pprint(pdf_doc_split[1])

('zkLLM is a cryptographic framework designed to ensure verifiable execution '
 'of large language models (LLMs) using Zero-Knowledge Proofs (ZKPs) . The key '
 'idea is that a third-party executor can prove they used the specified model '
 'to compute the given input and produce the output without revealing the '
 'underlying model parameters or the input data. zkLLM achieves this through '
 'efficient protocols tailored for LLM operations, such as transformer '
 'attention mechanisms, enabling secure and scalable verification.\n'
 '\n'
 'This innovative approach is detailed in the paper, zkLLM: Zero Knowledge '
 'Proofs for Large Language Models , which introduces core components like '
 'tlookup for non-arithmetic operations and zkAttn for attention mechanisms. '
 'The official implementation is available on: '
 'https://github.com/jvhs0706/zkllm-ccs2024\n'
 '\n'
 '## Detailed Explanation of zkLLM\n'
 '\n'
 'The zkLLM framework introduces an innovative method for verifiable '
 'comp

In [24]:
cache_no_embed_key = "no_embed_cache"
cag_without_embedding = CAG(llm=llm, cache=KVCache())
cag_without_embedding.preload_documents(split_docs, cache_no_embed_key)

[32mINFO[0m: [1mPrecomputing KV cache for 38 document chunks...[0m
[32mINFO[0m: [1mKV cache saved: kv_cache/no_embed_cache.pkl[0m
[32mINFO[0m: [1mPreloaded 38 document chunks into KV cache[0m


In [25]:
cache_no_embed_key_pdf = "no_embed_cache_pdf"
cag_without_embedding = CAG(llm=llm, cache=KVCache())
cag_without_embedding.preload_documents(pdf_doc_split, cache_no_embed_key_pdf)

NameError: name 'pdf_doc_split' is not defined

In [None]:
query_cinderella = "How Cinderella reach her happy ending?"
query_pdf = "how users could earn tokens?"

In [10]:
# pprint(cag_with_embedding.infer(query,cache_key=cache_embed_key))

In [None]:
response_tfidf = cag_without_embedding.infer(query_pdf,cache_key=cache_no_embed_key_pdf,similarity_search_type="tfidf",similarity_threshold=0.3)
pprint(response_tfidf)

[32mINFO[0m: [1mSelected 0 relevant chunks from cache[0m
<generator object OpenAi._generate_response at 0x368cc5640>


In [51]:
response_bm25 = cag_without_embedding.infer(query_pdf,cache_key=cache_no_embed_key,similarity_search_type="bm25",similarity_threshold=0.7)
pprint(response_bm25)

[32mINFO[0m: [1mSelected 5 relevant chunks from cache[0m
<generator object OpenAi._generate_response at 0x368cc6140>


In [52]:
response_jaccard = cag_without_embedding.infer(query_pdf,cache_key=cache_no_embed_key,similarity_search_type="jaccard",similarity_threshold=0.1)
pprint(response_jaccard)

[32mINFO[0m: [1mSelected 0 relevant chunks from cache[0m
<generator object OpenAi._generate_response at 0x368cc5fe0>


In [53]:
response_jaccard = cag_without_embedding.infer(query_pdf,cache_key=cache_no_embed_key,similarity_search_type="jaccard",similarity_threshold=0.1,smart_retrieval=True)
pprint(response_jaccard)

[32mINFO[0m: [1mUsing smart retrieval with validation[0m
[32mINFO[0m: [1mUsing smart retrieval[0m
[32mINFO[0m: [1mSelected 0 relevant chunks from cache[0m
[32mINFO[0m: [1mPerforming web search for additional context[0m
[32mERROR[0m: [31m[1mWeb fallback failed: No module named 'duckduckgo_search'[0m
[31mERROR[0m: [31m[1mWeb fallback failed: No module named 'duckduckgo_search'[0m
<generator object OpenAi._generate_response at 0x368cc5e80>
