# This notebook focus on testing the Document Retreive Flow

In [4]:
import os, json, time
import gc
from IPython.display import display, Markdown 
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import ctypes
import asyncio

In [5]:
class CFG:
    OFFLINE = False #True # for Test offline environment
    USE_8BIT = True #
    USE_4BIT = False #
    USE_LLAMA3 = False # 
    USE_GEMMA2 = False # 
    USE_QWEN = False # 
    USE_DEEPSEEK = True # 
    USE_DEEPSCALE = False # 

    TASK_GEN = True # for generative Text output task (suitable for RAG project)
    TEST_LLM = True
    USE_HUGGINGFACE = True # Pull model from Huggingface model hub
    USE_LMSTUIDO = False # for local LLM framework 
    USE_OLLAMA = False # for OLLAMA local LLM framework 
    USE_VLLM = False # for VLLM  LLM framework

    # mulitlingual LLM model 
    model1 = "meta-llama/Llama-3.2-3B-Instruct"  # llama3.2  3B-Instruct

    model2 =  "google/gemma-2-2b-it" # gemma 2 9B (mulitlingual)
    model3 = "Qwen/Qwen2.5-3B-Instruct" # Qwen 3B (mulitlingual)
    model4 = 'Qwen/Qwen2.5-7B-Instruct' # Qwen 7B (mulitlingual)
    model5 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # DeepSeek Distill 1.5B (mulitlingual)
    model6 = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B" # DeepSeek Distill 7B (mulitlingual)
    model7 = "agentica-org/DeepScaleR-1.5B-Preview"
    
    model8 = "protectai/deberta-v3-base-prompt-injection-v2"
    
    # for VLM model
    vlmModel1 = "Qwen/Qwen2.5-VL-3B-Instruct"
    vlmModel2 = "Qwen/Qwen2.5-VL-7B-Instruct"


    # Mult Embedding model
    embedModel1 = 'intfloat/multilingual-e5-small' # for embedding model support chinese
    embedModel2 = "intfloat/multilingual-e5-large-instruct"
    embedModel3 = "Alibaba-NLP/gte-Qwen2-1.5B-instruct" # for embedding model support chinese
    embedModel4 = "Alibaba-NLP/gte-multilingual-base" # for embedding model support chinese
    embedModel5 = "BAAI/bge-m3" # for multilingual embedding model
    embedModel6 = "jinaai/jina-embeddings-v3"
    


    FEW_SHOT_TEST= False#True
    USE_WANDB = True#True # for  LLM evalution and debug , track fine tuning performance

    USE_DEEPEVAL = True#False # for LLM evalution   
    USE_TRAIN =  False #True #False#True Much be use GPU for Training 
    
    # For VectorDB selection
    USE_FAISS = False#True # For RAG VectorDB
    USE_CHROMA = True #False #True #False # for RAG VectorDF
    USE_PINECONE = False#True#False #True # for RAG VectorDF
    USE_WEAVIATE = False#True #False # for RAG VectorDF
    USE_MILVUS = False#True              # for RAG VectorDF

    # for LLM fine tuning
    maxTrainData = 200#3500#5000 #10000#5000 #10000
    maxEvalData = 20#100 # 20 

    # LLM parameters
    reportTo ="none"
    topK = 40
    topP = 1.0
    temperature = 0.6 #0.5
    repetition_penalty = 1.05 # 1.1
    maxOutToken = 1024#180 #100

    
    maxToken=  512#768#512#768 # 512 for test only

In [6]:
import numpy as np
import pandas as pd
import transformers
import torch
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                         AutoModelForCausalLM,
                         AutoModelForSequenceClassification,
                         TrainingArguments)

from langchain_community.document_loaders import (TextLoader,
                                                  PyMuPDFLoader,
                                                  PyPDFDirectoryLoader,
                                                  PyPDFLoader)

# from langchain.document_loaders import PyPDFDirectoryLoader # old version of document loader

from langchain.prompts.prompt import  PromptTemplate

from langchain_community.vectorstores import FAISS #, Chroma,  Pinecone # old version of VectorStore



from langchain_text_splitters import (RecursiveCharacterTextSplitter,
                                      CharacterTextSplitter ,
                                       SentenceTransformersTokenTextSplitter)   


# from langchain.embeddings import HuggingFaceEmbeddings # huggingfaceEmbedding deprecated , please use sentencetransformers 
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings

from datasets import Dataset, DatasetDict, load_dataset


import evaluate
import trulens
import nest_asyncio
nest_asyncio.apply()

  from pandas.core import (

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_huggingface.chat_models.huggingface import (
2025-02-28 00:35:11.883566: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-28 00:35:12.059109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740674112.101088  233243 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [8]:
def clearMemory():
    for _ in range(5):
        gc.collect()
        ctypes.CDLL("libc.so.6").malloc_trim(0)
        torch.cuda.empty_cache()
        time.sleep(0.3)

In [9]:
clearMemory()

### Get HuggingFace Hub Access for download model

In [10]:
load_dotenv()
huggingfaceToken = os.getenv("HuggingFace") #get huggeface token from .env file

In [11]:
# huggingfaceToken

In [12]:
if CFG.USE_WANDB:
    # train report to  W&B tool
    import wandb
    reportTo= "wandb"
    my_secret = os.getenv("wandb_api_key") 
    wandb.login(key=my_secret) # login 
else: 
    reportTo = "none"# None

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjohnsonhk88[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/johnsonhk88/.netrc


# Extract PDF File Contents

##### Use Simple pypdf extract simple pdf text data. But can't extract complex layout and extract position picture information from pdf, for test propose
##### We start use different AI model extract complex data

### Design AI model for Table detection, Image detection model, and Text detection model

## Load PDF File

In [13]:
## Define Test Document path 
pdfFilePath1 = "../test-document/Attention .pdf"
pdfFilePath2 = "../test-document/yolo.pdf"
pdfDir = "../test-document"

bankStatementDir = "../bank-statement-document/"
bankStatementSamples =  "../bank-statement-document/Bank-Statement-Template-2-TemplateLab.pdf"

In [14]:
# from  PyPDF2 import PdfReader
from pypdf import PdfReader
from langchain_community.document_loaders import PyMuPDFLoader  #langchain pyMuPDF loader not perfect extract different f
from langchain_docling import DoclingLoader


from pdf2image import convert_from_path #for pdf to image convert
import cv2
import pymupdf
from PIL import Image

import pytesseract

## Test pymupdf library different formats extraction

### LangChain pymupdf library different formats extraction

In [15]:

pymuPDFLoader = PyMuPDFLoader(bankStatementSamples)
loadPDF1 =pymuPDFLoader.load()

In [16]:
print(loadPDF1[0])

page_content='Issue Date:
Period:
Account Activity
Date
Payment Type
Paid In
Paid Out
Balance
Your Account Statement
Detail
Note:
Print Form
Save Form
Reset Form
<Branch Name>
231 Valley Farms Street 
Santa Monica, CA 
bickslowbank@domain.com
mm/dd/yyyy
mm/dd/yyyy to mm/dd/yyyy
111-234-567-890  
Bit Manufacturing Ltd
2450 Courage St, STE 108
Brownsville, TX 78521
Balance Brought Forward
8,313.30
mm/dd/yyyy Fast Payment
Amazon
132.30
8,181.00
mm/dd/yyyy BACS
eBAY Trading Co.
515.22
7,665.78
mm/dd/yyyy Fast Payment
Morrisons Petrol
80.00
7,585.78
mm/dd/yyyy BACS
Business Loan
20,000.00
27,585.78
mm/dd/yyyy BACS
Jumes White Media
2,416.85
25,168.93
mm/dd/yyyy Fast Payment
ATM High Street
100.00
25,068.93
mm/dd/yyyy BACS
Accorn Advertising Studios
150.00
24,918.93
Fast Payment
mm/dd/yyyy
Marriott Hotels
177.00
24,741.93
mm/dd/yyyy Fast Payment
Abelio Scotrail Ltd
122.22
24,619.71
mm/dd/yyyy Fast Payment
Cheque 000234
1,200.00
23,419.71
mm/dd/yyyy Int. Bank
Interest Paid
9.33
23,429.04
mm/d

In [17]:
clearMemory()

# use pymupdf original library 

In [18]:
def extractTextFromPage(page):
    '''get text by pymupdf
    '''
    text = page.get_text()
    return text

def extractTableFromPage(page):
    tabs = page.find_tables()
    print(f"{len(tabs.tables)} found on {page}") # display number of found tables
    for i, tab in enumerate(tabs.tables):
        print(f"Table {i+1} : {tab.extract()}")
    return tabs

def extractImageFromPage(page):
    image_list = page.get_images()
    imginfo = page.get_image_info()
    print(imginfo)
    print(image_list)
    return image_list

def showPageText(docs):
    for page in docs:
        # print(page)
        result = extractTextFromPage(page)
        # result = extractTableFromPage(page) #test extract table
        # result = extractImageFromPage(page) #test extract image
        print(type(result))

# loadPDF1 = pymupdf.open(pdfFilePath1)

# extract Table 
# for page in loadPDF1:
#     tabs = page.find_tables()
#     if tabs.tables:
#         print(tabs[0].extract())
#         # print(tabs[0].extract())

In [19]:
page9 = loadPDF1[0]

In [20]:
page9

Document(metadata={'source': '../bank-statement-document/Bank-Statement-Template-2-TemplateLab.pdf', 'file_path': '../bank-statement-document/Bank-Statement-Template-2-TemplateLab.pdf', 'page': 0, 'total_pages': 1, 'format': 'PDF 1.7', 'title': 'Bank Statement Template 2 - TemplateLab.xlsx', 'author': 'HFO Desktop', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Microsoft: Print To PDF', 'creationDate': "D:20200703162211+08'00'", 'modDate': "D:20200703193651+08'00'", 'trapped': ''}, page_content='Issue Date:\nPeriod:\nAccount Activity\nDate\nPayment Type\nPaid In\nPaid Out\nBalance\nYour Account Statement\nDetail\nNote:\nPrint Form\nSave Form\nReset Form\n<Branch Name>\n231 Valley Farms Street \nSanta Monica, CA \nbickslowbank@domain.com\nmm/dd/yyyy\nmm/dd/yyyy to mm/dd/yyyy\n111-234-567-890  \nBit Manufacturing Ltd\n2450 Courage St, STE 108\nBrownsville, TX 78521\nBalance Brought Forward\n8,313.30\nmm/dd/yyyy Fast Payment\nAmazon\n132.30\n8,181.00\nmm/dd/yyyy BACS\neBAY Tr

## Test OCR base PDF file extraction

In [21]:
# # covert pdf to image
# imgPdf =convert_from_path(pdfFilePath1)
# len(imgPdf)

In [22]:
# imgPdf

In [23]:
# Perform OCR on an image
# page = imgPdf[2] # get page 3

In [24]:
# page

## Extract Text from PDF by OCR extraction

In [25]:
# for page in imgPdf:
#     text = pytesseract.image_to_string(page)
#     print(text)

In [26]:
# text = pytesseract.image_to_string(page)
# print(text)

In [27]:
# extract page text with coordiate 
# def extractPageImgTextWithCoord(pageImg):
#     data = pytesseract.image_to_data(pageImg, output_type="dict")
#     print(data.keys())
#     numBox = len(data['level'])
#     cv2img =cv2.cvtColor(np.array(pageImg), cv2.COLOR_RGB2BGR)
#     # draw bounding box 
#     for i in range(numBox):
#         (x, y, w, h) = (data['left'][i], data['top'][i], data['width'][i], data['height'][i])
#         cv2.rectangle(cv2img, (x, y), (x+w, y+h), (0, 255, 0), 2)
    
    
#     # cv2.imshow("Image", cv2img) # show image with bounding box in window
#     # # cv2.imshow("test", image)
#     # cv2.waitKey(0)
#     # cv2.destroyAllWindows()
#     plt.figure(figsize=(6, 6))
#     plt.imshow(cv2img)
#     plt.show()
#     return data


# extractData = extractPageImgTextWithCoord(page)


In [28]:
# type(page)

In [29]:
# type(page)

In [30]:
# cv2.imshow("page", page)

In [31]:
# text = pytesseract.image_to_string(page)

In [32]:
# reader1 =PdfReader(pdfFilePath1)
# type(reader1)

In [33]:
# numPage = len(reader1.pages)

In [34]:
# reader1

In [35]:
# currentPage=reader1.pages[0]

In [36]:
# print(currentPage.extract_text())

In [37]:
# def getPDFText(pdfDoc):
#     '''
#     get pdf text from pdf docs
#     '''
#     text="" 
#     pdf_reader= PdfReader(pdfDoc) #read pdf file
#     for page in pdf_reader.pages: # loop through pdf pages 
#             text+= page.extract_text() # extract text from page and add to text variable
#     return  text # return text variable

#### Use PyPDFDirectoryLoader from Langchain load PDF files from DirectoryLoader
<https://python.langchain.com/v0.2/docs/how_to/document_loader_pdf/#using-pypdf>

In [38]:
def getPDFDocs(directory):
    '''
    use PyPDFDirectoryLoader to extract pdf document from directory
    '''
    loader = PyPDFDirectoryLoader(directory) 
    docs = loader.load()
    return docs #text # return text variable

In [39]:
docs =getPDFDocs(bankStatementDir)
docs

[Document(metadata={'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 0}, page_content="Dummy Bank Statement \n BankName:People'sTrustBankCustomerName:JohnA.DoeAccountNumber:123-456-789StatementPeriod:July1,2023- July31,2023Address:123MapleStreet,Anytown,AT12345\nAccountSummaryOpeningBalance:$5,000.00ClosingBalance:$4,250.00\nTransactions\nDate Description Withdrawals Deposits Balance\n07/01/2023OpeningBalance - - $5,000.00\n07/02/2023ElectricBillPayment $250.00 - $4,750.00\n07/05/2023GroceryStore $150.00 - $4,600.00\n07/08/2023SalaryDeposit - $1,000.00 $5,600.00\n07/12/2023OnlineShopping- Z-Mart $100.00 - $5,500.00\n07/15/2023CashWithdrawal- ATM $200.00 - $5,300.00\nCopyright @SampleTemplates.com\n"),
 Document(metadata={'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 1}, page_content="2\n07/18/2023CarInsurancePremium $350.00 - $4,950.00\n07/22/2023CoffeeShop $20.00 - $4,930.00\n07/25/2023GasStation $50.00 - $4,880.00\n07/28/2023WaterBil

In [40]:
len(docs)

3

In [41]:
docs[0].metadata

{'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 0}

# Document Data Analysis

## level 1 : Layout Analysis, extract/analysis document component


In [42]:
from ultralytics import YOLO


In [43]:

# Load a model
yoloModel = YOLO("yolov8n.pt")  # load an official model
# model = YOLO("path/to/best.pt")  # load a custom model

# Validate the model
# metrics = yoloModel.val()  # no arguments needed, dataset and settings remembered
# metrics.box.map  # map50-95
# metrics.box.map50  # map50
# metrics.box.map75  # map75
# metrics.box.maps  # a list contains map50-95 of each category

In [44]:
# page3 = imgPdf[2]
# page3 

In [45]:
# result = yoloModel(page3)

In [46]:
# result

### level2 each component AI model extract data

# Level 3 Analysis task

## LLM Model Initialization

In [47]:
# Quantized Config for GPU support only
if CFG.USE_8BIT:
        bnb_config = BitsAndBytesConfig(
        load_in_8bit = True,
        bnb_8bit_quant_type="nf8",
        bnb_8bit_compute_dtype=torch.bfloat16,
        bnb_8bit_use_double_quant=True # Activate nested quantization for 8-bit base models (double quantization)

        )
    
elif CFG.USE_4BIT:
        bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True # Activate nested quantization for 4-bit base models (double quantization)

        )


Unused kwargs: ['bnb_8bit_quant_type', 'bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [48]:
# tokenizer = AutoTokenizer.from_pretrained(CFG.model2, token=huggingfaceToken)

In [49]:
if CFG.USE_LMSTUIDO:
    # Point to the local server
    from langchain.llms import OpenAI
    import openai
    llmModel = "LMStudio"
    model = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

else: # load model from huggingface
    if device.type == "cuda": # use 7b/8b/9b model gain performance
        if CFG.USE_LLAMA3:
            modelSel = CFG.model1
            llmModel = "llama3_8b"
        
        elif CFG.USE_GEMMA2:
            modelSel = CFG.model4
            llmModel = "gemma2_9b"
        
        elif CFG.USE_QWEN:
            modelSel = CFG.model3
            llmModel = "qwen2.5_3b"

        elif CFG.USE_DEEPSEEK:
            modelSel = CFG.model5
            llmModel = "deepseek_1.5b"

        else: 
            modelSel = CFG.model2
            llmModel = 'gemma_2b'
        
        if CFG.TASK_GEN:
            model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto",  
                                                 quantization_config= bnb_config ,
                                                #  torch_dtype=torch.bfloat16, 
                                                 token=huggingfaceToken)

        else:
            model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto",  
                                                 quantization_config= bnb_config, token=huggingfaceToken)
        tokenizer = AutoTokenizer.from_pretrained(modelSel, token=huggingfaceToken , torch_dtype=torch.bfloat16) # inital tokenizer
        tokenizer.padding_side = "right"
    

    else: # for cpu select smaller model
        modelSel = CFG.model2
        llmModel = 'gemma_2b'
        if CFG.TASK_GEN:
            model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto", token=huggingfaceToken)

        else:
            model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto", token=huggingfaceToken)

        tokenizer = AutoTokenizer.from_pretrained(modelSel, token=huggingfaceToken) # inital tokenizer
        tokenizer.padding_side = "right"
    

In [50]:
# del model
# del model
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear8bitLt(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear8bitLt(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear8bitLt(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear8bitLt(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear8bitLt(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear8bitLt(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear8bitLt(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,

In [51]:
clearMemory()

In [52]:
tokenizer

LlamaTokenizerFast(name_or_path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', vocab_size=151643, model_max_length=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<｜end▁of▁sentence｜>', 'pad_token': '<｜end▁of▁sentence｜>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	151643: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<｜User｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151645: AddedToken("<｜Assistant｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151646: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151647: AddedToken("<|EOT|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	151648: AddedToken("<think>", rstrip=Fals

In [53]:
llmModel


'deepseek_1.5b'

In [54]:
# grudModel = AutoModelForSequenceClassification.from_pretrained( CFG.model7, token=huggingfaceToken, 
#                                                                device_map=device, torch_type=None)
                                                               

#                                                             #    quantization_config=bnb_config)

# grudModel

In [55]:
def delModel():
    global model, tokenizer
    del model
    del tokenizer

In [56]:
# del grudModel

# Prompt Engineering

In [57]:

templatePrompt1 = """Question: {question}.\nOnly require given final result in JSON format with key 'answer'
            """
templatePrompt2 = "Answer the user Question.\n###\n{format_instructions}\n###\nQuestion: {query}\n"

templatePrompt3 = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, Answer is not available in the context, don't provide the wrong answer\n\n
    Context: {context}\n
    Question: {question}\n
    """

templatePrompt4 = """
Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in 
provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
CONTEXT: {context}
Provide answer and rethinking multiple step by step from Question: {question}
"""
templatePrompt5 = """
you are act as Mathematician, solve the math problem reasonable and logical from given question follow the requirement as below:
CONTEXT: {context}
Provide answer and rethinking multiple step by step from Question: {question}
Only Output answer in json format with key "answer" and "explanation" 
"""


In [58]:
print(templatePrompt5)


you are act as Mathematician, solve the math problem reasonable and logical from given question follow the requirement as below:
CONTEXT: {context}
Provide answer and rethinking multiple step by step from Question: {question}
Only Output answer in json format with key "answer" and "explanation" 



## Generate LLM response

In [59]:
async def generateResponse(query, maxOutToken=CFG.maxOutToken, topP=CFG.topP,
                          topK=CFG.topK, temperature = CFG.temperature
    ):
    """
    Direct send message to LLM model, get response
    """
    global model, tokenizer
    startTime = time.time()
    inputIds = tokenizer(query, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=topP,
                              top_k = topK,
                              temperature=temperature,
                              max_new_tokens= maxOutToken,
                             )
    print(f"Time Taken : {time.time() - startTime}")
    # return tokenizer.decode(response[0][len(inputIds["input_ids"]):], skip_special_tokens = True)
    generatedIDs = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(inputIds.input_ids, response)
    ]
    # print(f"GeneratedIDs : {generatedIDs}")
    return tokenizer.batch_decode(generatedIDs, skip_special_tokens=True)[0]
    

def generateChatInstMsg(instruct, query):
    return   [
            {
            "role": "system",
            "content": instruct,
            },
            {"role": "user", 
             "content": query},
        ]

async def generateChatResponse(chatMsg ,maxOutToken=CFG.maxOutToken, topP=CFG.topP,
                          topK=CFG.topK, temperature = CFG.temperature):
    """
    send chat message to LLM
    """
    startTime = time.time()
    text = tokenizer.apply_chat_template(chatMsg, 
                                         tokenize=False, 
                                         add_generation_prompt=True)
    inputIDs = tokenizer(text, return_tensors="pt").to(device)
    response = model.generate(**inputIDs, 
                             do_sample=True,  #enable for Temperature 
                             top_p= topP,
                             top_k = topK,
                             temperature = temperature,
                             max_new_tokens=maxOutToken,
                             repetition_penalty= CFG.repetition_penalty)
    print(f"Time Taken : {time.time() - startTime}")
    generatedIDs = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(inputIDs.input_ids, response)
    ]
    # print(f"GeneratedIDs : {generatedIDs}")
    return tokenizer.batch_decode(generatedIDs, skip_special_tokens=True)[0]
    # return tokenizer.decode(response[0][len(inputIDs["input_ids"]):], skip_special_tokens=True)
    
    
    

## simple parser for extract data

In [60]:
import re
from  json.decoder import JSONDecodeError
if CFG.TASK_GEN:

    def isInteger(text):
        try:
            if int(text) >= 0:
                return True
            else:
                return False
        except ValueError:
            return False

    def llmJSONparser(txt, key="answer"):
        """
        try to get answer from LLM response , expect in JSON format, 
        """
        try:
            subText = txt.split("{") # split several {} in list 
            for txtSeg in subText: # loop in list to find answer
                end = txtSeg.find("}") # find end position in text segment
                sub = txtSeg[:end] #subsring with {} context
                print(sub)
                temp = sub.replace("*", "") # remove * symbol
                temp = temp.replace("\"", "") # reomve \" symbol
                temp = temp.lower() # convert to lower case
                answerloc = temp.find(key) # find key word "answer" position
                if answerloc != -1:
                    print(f"find answer location : {answerloc}")
                    newTxt = temp[answerloc:] # substring start answer
#                   print("Temp: ", temp)
                    subTxt = newTxt.split("\n")
                    #       print(subTxt)
                    rel =subTxt[0][len(key):].strip() # get answer value with remove space
                    rel= rel.replace(',', '') # remove , symbol
                    print(rel)
                    return rel
                
            return None # can't find answer
        except :
            print(f"""Error LLM JSON parser input txt {txt}""" )
            return None
        return None


    def getLLMAnswerParser(txt, key="answer:"):
        """
        when json parser failure, seem answer not JSON format, 
        use "answer" for key word search final answer 
        """
         # find answer  
        temp = txt.replace("*", "") # remove * symbol
        temp = temp.replace("\"", "") # reomve "" symbol
        temp = temp.lower() # convert to lower case
        # find answer key word
        start = temp.find(key)
        print(f"Start loc: {start}")
        subStr = temp[start:]
        if start != -1:
            subTxt = subStr.split("\n")
           #print(subTxt)
            rel =subTxt[0][len(key):].strip() # get answer value with remove space
            rel= rel.replace(',', '') # remove , symbol
            print(rel)
            return rel
    
        print(subStr)
        return None

## Add parser  to control extreact data from  LLM Structure Output

In [61]:
from langchain_core.output_parsers import (StrOutputParser, 
                                           JsonOutputParser,
                                           PydanticOutputParser,
                                          )
# for LLM structure output
from langchain_core.pydantic_v1 import BaseModel, Field, validator
# from pydantic import BaseModel, Field

### Test LLM Model

In [62]:

if CFG.TEST_LLM:
    ret =await generateResponse("What is Machine Learning?",    maxOutToken=1024, topP=0.95,
                          topK=10, temperature = 0.6)
    print(ret)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Time Taken : 71.93657112121582
 What is its difference with deep learning? What is the difference between unsupervised and supervised learning? What is the difference between parametric and non-parametric learning?
Okay, let's start by understanding the basics of machine learning. It's a subset of AI that focuses on developing algorithms that allow computers to learn from data without being explicitly programmed. So, the core idea is that machines can improve their performance as they gain more data.

Now, thinking about the difference between machine learning and deep learning. Deep learning is a subset of machine learning that specifically uses neural networks with multiple layers to model and solve complex problems. Unlike traditional machine learning algorithms, which are often based on statistical models, deep learning algorithms are inspired by the structure and function of the human brain. This makes deep learning particularly effective for tasks like image recognition, natural 

In [63]:
Markdown(ret)

 What is its difference with deep learning? What is the difference between unsupervised and supervised learning? What is the difference between parametric and non-parametric learning?
Okay, let's start by understanding the basics of machine learning. It's a subset of AI that focuses on developing algorithms that allow computers to learn from data without being explicitly programmed. So, the core idea is that machines can improve their performance as they gain more data.

Now, thinking about the difference between machine learning and deep learning. Deep learning is a subset of machine learning that specifically uses neural networks with multiple layers to model and solve complex problems. Unlike traditional machine learning algorithms, which are often based on statistical models, deep learning algorithms are inspired by the structure and function of the human brain. This makes deep learning particularly effective for tasks like image recognition, natural language processing, and other complex pattern recognition tasks.

Next, the difference between unsupervised and supervised learning. In supervised learning, the algorithm is trained on labeled data, where each data point is associated with a specific output or label. The goal is to learn a mapping from inputs to outputs. For example, if you have a dataset of emails labeled as spam or not spam, a supervised learning algorithm would learn to classify new emails into these categories based on the patterns in the training data.

On the other hand, unsupervised learning deals with unlabeled data. The algorithm tries to find hidden patterns or intrinsic structures in the data without any predefined outputs. Common unsupervised techniques include clustering (like k-means), dimensionality reduction (like PCA), and association rule learning. For instance, a clustering algorithm might group similar customer behaviors together without knowing the specific categories yet.

Moving on to the difference between parametric and non-parametric learning. Parametric models assume a certain form of the underlying data distribution, which is defined by a finite number of parameters. These models are often computationally efficient and can be trained with less data because they make strong assumptions about the data. For example, linear regression is a parametric model because it assumes a linear relationship between features and the target variable.

In contrast, non-parametric models do not make strong assumptions about the underlying data distribution. Instead, they allow the data to define the model complexity, which means they can adapt better to the data but may require more data to achieve good performance. Techniques like k-nearest neighbors (k-NN) and decision trees are examples of non-parametric models. They are more flexible and can capture complex patterns, but they might be less interpretable compared to parametric models.

I think I have a good grasp of these concepts now. To summarize:

- **Machine Learning**: Focuses on developing algorithms that enable computers to learn from data, without being explicitly programmed. It encompasses various types like supervised, unsupervised, and reinforcement learning.

- **Deep Learning**: A subset of machine learning that uses neural networks with multiple layers to learn representations from data. It's particularly effective for complex tasks like image and speech recognition.

- **Unsupervised Learning**: Uses unlabeled data to find patterns and structures, such as grouping similar data points or reducing dimensionality.

- **Supervised Learning**: Uses labeled data to train models to predict outputs based on inputs, such as classification or regression tasks.

- **Parametric Learning**: Models that assume a specific form of the data distribution, requiring fewer parameters and less data.

- **Non-Parametric Learning**: Models that do not assume a specific data distribution, allowing the data to define model complexity, which can lead to better performance on complex tasks but may require more data.

This understanding should help in applying these concepts to various problems and algorithms in machine learning.
</think>

**Final Answer:**

**Machine Learning Overview:**
- **Definition:** A subset of AI that focuses on developing algorithms enabling machines to learn from data without explicit programming.
- **Key Concepts:** Supervised, unsupervised, and reinforcement learning.

**Difference Between Machine Learning and Deep Learning:**
- **Machine Learning:** Broad category using various algorithms for tasks like classification, regression, clustering, and dimensionality reduction.
- **Deep Learning:** Subset of machine learning using neural networks with multiple layers for complex tasks such as image recognition, natural language processing.

**Difference Between Unsupervised and Supervised Learning:**
- **Supervised Learning:** Uses labeled data to train models to predict outputs (e.g., classification, regression).
- **Unsupervised Learning:** Uses unlabeled data to find patterns and structures (e.g., clustering, dimensionality reduction).

**Difference Between Parametric and Non-Parametric Learning:**
- **Parametric Learning:** Models assuming a specific data distribution (e.g., linear regression), requiring fewer parameters and less data.
- **Non-Parametric Learning:** Models without assumptions about data distribution (e.g., k-nearest neighbors), allowing data to define complexity, potentially requiring more data.

**Summary:**
- **Machine Learning** encompasses various types of learning, including unsupervised and parametric, and is a subset of AI.
- **Deep Learning**

In [64]:
msg1 = generateChatInstMsg("You are a intelligent Chatbot response to answer user query", 
                           "What is LLM model use case?")

In [65]:
ret = await generateChatResponse(msg1, maxOutToken=512, topP=0.95,
                          topK=20, temperature = 0.6)
Markdown(ret)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Time Taken : 35.163382053375244


Okay, so I'm trying to understand what LLM models are and how they're used. I've heard the term before in tech discussions, but I don't really know much about it. Let me start by breaking down the question: "What is LLM model use case?" 

First, I think "LLM" stands for Large Language Model. I remember hearing about GPT in my studies, and that must be an example of an LLM. So, LLMs are these AI models designed to generate human-like text. They're used in various ways, right?

I should probably list out some common use cases. Maybe something like chatbots? Yeah, I've used chat apps before where people just type a question and get an answer. That sounds like a chatbot application using an LLM. But wait, are there other types of use cases besides chatbots?

Maybe personalized recommendations? I've heard of recommendation systems, like Netflix suggesting shows you'll like. If an LLM can understand your preferences, it might help with that too. But does it really do that, or is it more about generating recommendations based on data?

Natural language processing (NLP) comes to mind. I think NLP is all about understanding and generating human language. So, maybe LLMs are used in NLP tasks, like translation, summarization, or even detecting fake news. That could be another use case.

Also, I've seen articles about LLMs being used in healthcare. Maybe predicting diseases based on symptoms? Or in finance, automating trading strategies? Those could be examples too. I wonder how precise they need to be in those fields.

Another thought: customer service automation. If businesses have a lot of interactions, an LLM could help with scheduling calls, handling multiple issues simultaneously, and providing real-time responses. That would save a lot of time compared to manually handling each case.

I'm also thinking about education. Maybe an LLM can provide instant feedback on quizzes, explain concepts in a way that's easier to understand, or even help with homework. It could make learning more efficient.

In terms of business operations, supply chain management might use LLMs to optimize routes for delivery, manage inventory levels, or predict demand based on historical data. That could lead to better planning and cost savings.

I'm curious about the ethical implications. Using LLMs for sensitive areas like hate speech detection could be problematic. There's a risk of misuse or bias. How do companies ensure their LLMs are fair

In [66]:
if CFG.TEST_LLM:
    display(Markdown(ret)) # display in

Okay, so I'm trying to understand what LLM models are and how they're used. I've heard the term before in tech discussions, but I don't really know much about it. Let me start by breaking down the question: "What is LLM model use case?" 

First, I think "LLM" stands for Large Language Model. I remember hearing about GPT in my studies, and that must be an example of an LLM. So, LLMs are these AI models designed to generate human-like text. They're used in various ways, right?

I should probably list out some common use cases. Maybe something like chatbots? Yeah, I've used chat apps before where people just type a question and get an answer. That sounds like a chatbot application using an LLM. But wait, are there other types of use cases besides chatbots?

Maybe personalized recommendations? I've heard of recommendation systems, like Netflix suggesting shows you'll like. If an LLM can understand your preferences, it might help with that too. But does it really do that, or is it more about generating recommendations based on data?

Natural language processing (NLP) comes to mind. I think NLP is all about understanding and generating human language. So, maybe LLMs are used in NLP tasks, like translation, summarization, or even detecting fake news. That could be another use case.

Also, I've seen articles about LLMs being used in healthcare. Maybe predicting diseases based on symptoms? Or in finance, automating trading strategies? Those could be examples too. I wonder how precise they need to be in those fields.

Another thought: customer service automation. If businesses have a lot of interactions, an LLM could help with scheduling calls, handling multiple issues simultaneously, and providing real-time responses. That would save a lot of time compared to manually handling each case.

I'm also thinking about education. Maybe an LLM can provide instant feedback on quizzes, explain concepts in a way that's easier to understand, or even help with homework. It could make learning more efficient.

In terms of business operations, supply chain management might use LLMs to optimize routes for delivery, manage inventory levels, or predict demand based on historical data. That could lead to better planning and cost savings.

I'm curious about the ethical implications. Using LLMs for sensitive areas like hate speech detection could be problematic. There's a risk of misuse or bias. How do companies ensure their LLMs are fair

In [None]:
# %%time
query = "What is Machine Learning?"
newPrompt = PromptTemplate(template=templatePrompt1,
                           input_variables=["question"])
finalPrompt = newPrompt.format(
                question=query    
            )
rel = await generateResponse(finalPrompt, maxOutToken=1024)
print(rel)
# jsonTxt = getLLMAnswerParser(rel, key="answer")
# print(f"Question : {query}\nResponse Answer: {jsonTxt}")

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Time Taken : 65.12370324134827
 So, please provide the step by step explanation to the question and put the final answer in the correct format.
<think>
Okay, so I need to figure out what Machine Learning is. I've heard the term before, but I'm not entirely sure what it really means. Let me start by breaking it down. 

First, I know that Machine Learning is a subset of Artificial Intelligence. AI, as I understand it, is about creating systems that can perform tasks that typically require human intelligence. But Machine Learning is a bit different. It's about training algorithms to make decisions or predictions without being explicitly programmed.

So, how does it work? I think it involves data. The system is trained on a dataset, which includes examples of the inputs it needs to process and the corresponding outputs. This data is split into training and testing sets. The algorithm learns from the training data, adjusting its parameters to minimize the error between its predictions and t

## Prepare RAG 

In [66]:
# Defin RAG alogrithm
USE_SIMPLE_RAG = True#True# True # simple similairy approach 
USE_RERANK = False #False # advance RAG with Re-Ranking 
USE_QUERY_EXPANSION = False   # advance RAG with 
USE_EMBEDDING_ADAPER = False
if CFG.USE_WANDB: # define wandb RAG project name
    if USE_SIMPLE_RAG:
        wandbRAGProject = "ai-bank-statement-simple-rag"
    elif USE_RERANK:
        wandbRAGProject = "ai-bank-statement-re-ranking"
    elif USE_QUERY_EXPANSION:
        wandbRAGProject = "ai-bank-statement-query-expansion"
    elif USE_EMBEDDING_ADAPER:
        wandbRAGProject = "ai-bank-statement-embedding-adapter"
    else:
        wandbRAGProject = "ai-bank-statement-simple-rag"

In [67]:
if CFG.USE_WANDB:
     # Start a new wandb run
    runTask1 = wandb.init(project=wandbRAGProject, job_type="generation", anonymous="allow")
    # define W&B Table
    wandbCol1 =  ["model", "question",  "llm_generate", "llm_answer"]
    wandbRAGTable =wandb.Table(columns=wandbCol1)

## Convert Text To Langchain Document

In [68]:
from langchain_core.documents import Document

In [69]:
def convertText2Document(content, source):
    """
    Convert text to Document object
    """
    doc = Document(
        page_content= content, 
        metadata= {"sources": source}
        )
    return doc

## inital Embedding Model

In [70]:
if device.type == "cuda":
    model_kwargs = {"device": "cuda"}
    multiProcess=  False#True #  for multi-GPU
else:
    model_kwargs = {"device": "cpu"}
    multiProcess= False
def embeddingModelInit(modelName):
        embed =  HuggingFaceEmbeddings(model_name=modelName, model_kwargs= model_kwargs, multi_process=multiProcess)#initial embedding model 
        return embed

In [71]:
embedding = embeddingModelInit(CFG.embedModel2)

In [72]:
vector= embedding.embed_query("Hello, how are you?")
len(vector) # checking vector length

1024

# Text split into Chunking 

In [73]:
from langchain_text_splitters import (RecursiveCharacterTextSplitter,
                                      CharacterTextSplitter,
                                      MarkdownHeaderTextSplitter,
                                      HTMLHeaderTextSplitter,
                                      HTMLSectionSplitter)

In [74]:
def textSplitterByText(txt, chunkSize= 512, overlap=20, separators=["\n\n"]):
    """
    Split text by chunk size
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunkSize,
        chuck_overlap=overlap,
        separators=separators,
        length_function=len,
        is_separator_regex=False,
        # start_with_newline=True,
        # add_start_index=True,
        # add_end_index=True,
    )
    splittedTxt = splitter.split_text(txt)
    return splittedTxt


def textSplitterByDocs(docs, chunkSize= 512, overlap= 20, separators=["\n\n"]):
    textSplitter = RecursiveCharacterTextSplitter(
        chunk_size = chunkSize,
        chunk_overlap = overlap,
        separators=separators,
        length_function = len,
        is_separator_regex =False
    )
    splitted = textSplitter.split_documents(docs)
    return splitted

In [75]:
if USE_SIMPLE_RAG:
    textSplitter = RecursiveCharacterTextSplitter(
                                chunk_size= 800, #1500,
                                chunk_overlap=20, #100,
                                add_start_index=True,
                                separators=["\n\n", "\n", ".", " ", ""],
                                  length_function=len,
                                is_separator_regex=False)
else: # advance RAG possabile use adv method
    textSplitter = CharacterTextSplitter(chunk_size=1500, 
                                             chunk_overlap=100,
                                             length_function=len,
                                             separator="\n\n",
                                             is_separator_regex=False)
        

In [76]:
def splitTextDataFromText(text):
    chunk = textSplitter.split_text(text)
    return textSplitter.create_documents(chunk) # create docs
    
def chunkText(docs, chunkSize=800, chunkOverlap=50):
    # Split text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunkSize, chunk_overlap=chunkOverlap)
    chunks = splitter.split_documents(docs)
    return chunks
        

In [77]:
# len(rawTxt1)
# docList = splitTextDataFromText(rawTxt1)
# len(docList)

In [78]:
len(docs)

3

In [79]:
chunkedDocs = textSplitterByDocs(docs, chunkSize=1500, overlap=50, separators=["\n\n", "\n"]) # chunk text into documents
chunkedDocs

[Document(metadata={'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 0}, page_content="Dummy Bank Statement \n BankName:People'sTrustBankCustomerName:JohnA.DoeAccountNumber:123-456-789StatementPeriod:July1,2023- July31,2023Address:123MapleStreet,Anytown,AT12345\nAccountSummaryOpeningBalance:$5,000.00ClosingBalance:$4,250.00\nTransactions\nDate Description Withdrawals Deposits Balance\n07/01/2023OpeningBalance - - $5,000.00\n07/02/2023ElectricBillPayment $250.00 - $4,750.00\n07/05/2023GroceryStore $150.00 - $4,600.00\n07/08/2023SalaryDeposit - $1,000.00 $5,600.00\n07/12/2023OnlineShopping- Z-Mart $100.00 - $5,500.00\n07/15/2023CashWithdrawal- ATM $200.00 - $5,300.00\nCopyright @SampleTemplates.com"),
 Document(metadata={'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 1}, page_content="2\n07/18/2023CarInsurancePremium $350.00 - $4,950.00\n07/22/2023CoffeeShop $20.00 - $4,930.00\n07/25/2023GasStation $50.00 - $4,880.00\n07/28/2023WaterBillP

In [80]:
len(chunkedDocs)

3

In [81]:
chunkedDocs[0].metadata

{'source': '../bank-statement-document/Dummy-Bank-Statement.pdf', 'page': 0}

## Store Document in VectorDB
### langchain new version required different vector database install different vector database libraries and Call API
### for Chroma
<https://python.langchain.com/v0.2/docs/integrations/vectorstores/chroma/>

### for Pinecone
<https://python.langchain.com/v0.2/docs/integrations/vectorstores/pinecone/>

### for FAISS
<https://python.langchain.com/v0.2/docs/integrations/vectorstores/faiss/>

In [82]:
documentIDs = []

In [83]:
%%time
from uuid import uuid4
from langchain_chroma import Chroma
import time
# VectorDB inital and store document into Vector DB
if CFG.USE_CHROMA:
        
        def chormaDBInit(collectionName, embedding, persitDict="./chroma_db"):
            chromadb = Chroma(collection_name=collectionName,
                                embedding_function=embedding, 
                                persist_directory=persitDict)
            return chromadb
        
        def addDocIDs(ids):
            documentIDs.extend(ids)

        def removeDocIDs(idx):
            documentIDs.remove(idx)

        def generateDocIDs(numDoc=1):
            uuids= []
            for _ in range(numDoc):
                uuids.append( str(uuid4()))
                addDocIDs(uuids)
             # print(f"UUIDS : {uuids}")
            return uuids
        
        def saveToChroma(db, chuck: list[Document]):
            db.add_documents(chuck)

        async def saveToChromaIDAsyc(db, chuck: list[Document]):
            startTime = time.time()
            uuids = generateDocIDs(len(chuck))
            await db.aadd_documents(chuck, ids=uuids)
            print(f"Time Taken:  {time.time() - startTime}")
        

        def saveToChromaID(db, chuck: list[Document]):
            # generate document ID 
            uuids = generateDocIDs(len(chuck))
            db.add_documents(chuck, ids=uuids)

        def delChroma(db, idx):
            db.delete(ids=idx)

        def delChromaAll(db):
            db.delete(ids=documentIDs)


        def vectorDBsimilaritySearchWithScores(db, query, k =3):
            results = db.similarity_search_with_relevance_scores(query, k=k)
            return results

        def vectorDBsimilaritySearch(db, query, k=3):
            results = db.similarity_search(query, k=k)
            return results


        def vectorDBsimilaritySearchByVector(db, query, embed, k=3 ):
            results = db.similarity_search_by_vector(
            embedding=embed.embed_query(query), k=k)
            return results

        def vectorDBSimilaritySearchByMMR(db, query, k=3, fetchK=20):
            results = db.max_marginal_relevance_search(query=query, k=k, fetch_k=fetchK)
            return results

        db = chormaDBInit("bank-statement", embedding, persitDict="./chroma_db")
        saveToChromaID(db, chunkedDocs)
        # db = Chroma.from_documents(documents= chunkedDocs, embedding=embedding, persist_directory="./chroma_db")
elif CFG.USE_FAISS:
        db = FAISS.from_documents(documents =chunkedDocs, embedding= embedding)
        print(db.index.ntotal) # number of total index size
elif CFG.USE_PINECONE:
        from pinecone import Pinecone, ServerlessSpec
        from langchain_pinecone import PineconeVectorStore
        os.environ['PINECONE_API_KEY'] = os.getenv("PINECONE_API_KEY")# 
        pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

        import time

        index_name = "langchainvector2"  # piecond  db index name , can change if desired

        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

        if index_name not in existing_indexes:
            pc.create_index(
                name=index_name,
                dimension=1024,#768,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            while not pc.describe_index(index_name).status["ready"]:
                time.sleep(1)

        index = pc.Index(index_name)
        # Connect to Pinecone index and insert the chunked docs as contents
        db =PineconeVectorStore.from_documents(chunkedDocs, embedding, index_name=index_name)

CPU times: user 196 ms, sys: 17.4 ms, total: 213 ms
Wall time: 272 ms


In [84]:
db

<langchain_chroma.vectorstores.Chroma at 0x7c753491b9a0>

# 


# Test Query for Vector DB

In [85]:
query = "What is Yolo?"
# results = db.similarity_search(query)


results = vectorDBsimilaritySearchWithScores(db, query, k=3)

In [86]:
results[0][0].metadata

{'page': 9, 'source': '../test-document/yolo.pdf'}

## Setup RAG Top K

In [87]:
if USE_SIMPLE_RAG:
        num_docs= 2 # set max top k=  3 rank similarity  
elif  USE_RERANK:
        num_docs =15 # re-ranking use highter order 
else: 
        num_docs =3

# set retriever
retriever  = db.as_retriever( 
                search_type="mmr",  # Also test "similarity"
                search_kwargs={"k": num_docs})

In [81]:
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x72d454fc9b10>, search_type='mmr', search_kwargs={'k': 2})

## Test for VectorDB with retriever

In [82]:
query = "What is Yolo?"
results = retriever.invoke(query)
print(len(results))
print(results)

2
[Document(metadata={'page': 9, 'source': '../test-document/yolo.pdf'}, page_content='Farhadi. You only look once: Uniﬁed, real-time ob-\nject detection, 2016. Supplied as additional material\nhttps://arxiv.org/pdf/1506.02640.pdf . 7\n[13] Joseph Redmon and Ali Farhadi. Yolov3: An incremental\nimprovement. CoRR , abs/1804.02767, 2018. 4\n[14] Dillon Reis, Jacqueline Hong, Jordan Kupec, and Ahmad\nDaoudi. Real time ﬂying object detection code repository.\n1\n[15] Zion Market Research. Global drone market size to register\ncagr of about 38.75 percent over 2023-2030, 2023. March\n15 2023. 1\n[16] Jacob Solawetz and Francesco. What is yolov8? the ultimate\nguide., 2023. 04-30-2023. 1, 5, 8\n[17] Emma Soteriou. Ukraine ’tried to assassinate putin using\ndrone loaded with explosives’ but it crashed miles from tar-\nget, 2023. 27 April 2023. 1\n[18] Juan R. Treven and Diana M. Cordova-Esparaza. A\ncomprehensive review of yolo: From yolov1 to yolov8\nand beyond, 2023. Supplied as additional m

## LLM RAG Test

In [83]:
# from langchain.chains.question_answering import load_qa_chain

In [84]:
# chain=load_qa_chain(model ,chain_type="stuff")

In [85]:
userQuery="What is YOLO"

In [86]:
def ragAnswerLLM(query, retriever):
        newPrompt = PromptTemplate(template=templatePrompt3, 
                               input_variables=["context", "question"])
        ragContext= ""
        resultRAG = retriever.invoke(query)
        print(len(resultRAG))
        for i, res in enumerate(resultRAG): # loop RAG result
            ragContext +=  res.page_content + "\n" #f"Context {i+1} : {res['document']}\n"

        # print("RAG result: ", ragContext)
        finalPrompt = newPrompt.format(
            context=ragContext,
            question=query
        )
        # finalPrompt = query
        print(finalPrompt)
        response = generateResponse(finalPrompt, maxOutToken=256)

        return response

In [87]:
regAnswer = ragAnswerLLM(userQuery, retriever)
print(regAnswer)

2

    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, Answer is not available in the context, don't provide the wrong answer


    Context: Farhadi. You only look once: Uniﬁed, real-time ob-
ject detection, 2016. Supplied as additional material
https://arxiv.org/pdf/1506.02640.pdf . 7
[13] Joseph Redmon and Ali Farhadi. Yolov3: An incremental
improvement. CoRR , abs/1804.02767, 2018. 4
[14] Dillon Reis, Jacqueline Hong, Jordan Kupec, and Ahmad
Daoudi. Real time ﬂying object detection code repository.
1
[15] Zion Market Research. Global drone market size to register
cagr of about 38.75 percent over 2023-2030, 2023. March
15 2023. 1
[16] Jacob Solawetz and Francesco. What is yolov8? the ultimate
guide., 2023. 04-30-2023. 1, 5, 8
[17] Emma Soteriou. Ukraine ’tried to assassinate putin using
drone loaded with explosives’ but it crashed miles from tar-
get, 2023. 27 April 202

In [88]:
documentIDs

['6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '81987ae5-1c57-4dc0-a551-48c16f52d0d1',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '81987ae5-1c57-4dc0-a551-48c16f52d0d1',
 '867bbd1c-7b6e-4262-a6a0-5d442d52d25d',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '81987ae5-1c57-4dc0-a551-48c16f52d0d1',
 '867bbd1c-7b6e-4262-a6a0-5d442d52d25d',
 'af719f5e-d5c9-459d-8a16-7a9b0e5bc288',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '81987ae5-1c57-4dc0-a551-48c16f52d0d1',
 '867bbd1c-7b6e-4262-a6a0-5d442d52d25d',
 'af719f5e-d5c9-459d-8a16-7a9b0e5bc288',
 'c28a84d5-34d9-4ec8-a6d6-b7a271ac4bdf',
 '6ed4d0f6-ba35-47c8-8d3c-07c03829a702',
 '5c8b2ceb-e900-4946-a5ff-3bc4f3efbcf3',
 '81987ae5-1c57-4dc0-a551-48c16f52d0d1',
 '867bbd1c-7b6e-

In [90]:
# delChromaAll(db)