### Load Data and Construct VectorDatabase

In [1]:
from Utils import *
import os

# Tune the PyTorch CUDA caching allocator (cap on the size of splittable blocks,
# in MB). This is set *before* the database object is constructed so the
# setting is already in the environment when any model first touches the GPU.
# NOTE(review): presumably VectorDatabase loads GPU-backed models in its
# constructor -- confirm against Utils.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:4000"

# Create the database wrapper.
db = VectorDatabase()

# (Re)build the collections with 256-unit chunks overlapping by 200 units.
# Per the recorded output, this drops and reloads the faq, insurance and
# finance collections in ChromaDB.
db.initialize_process(chunk_size=256, chunk_overlap=200)

  from .autonotebook import tqdm as notebook_tqdm


< VectorDatabase initialized >
  - PaddleOCR initialized successfully
  - loading data into ChromaDB 
     - loading [faq] ...
        ... collection [faq] deleted.
     - loading [insurance] ...
        ... collection [insurance] deleted.
     - loading [finance] ...
        ... collection [finance] deleted.


In [2]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'final' strategy. The recorded output reports
# the answers being saved to output.json.
retriever.process_questions(
    method='final',
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 87.33%
     - Category: [insurance], Accuracy: 92.00%
     - Category: [finance], Accuracy: 72.00%
     - Category: [faq], Accuracy: 98.00%


In [3]:
from Utils import *

# Build a retriever that reads the preliminary-round question file instead of
# the class default.
retriever = Retriever(
    question_path='./dataset/preliminary/questions_preliminary.json',
)

# Answer the questions with the 'final' strategy. No evaluation step follows
# in this cell; the recorded output only reports answers saved to output.json.
retriever.process_questions(
    method='final',
)

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 
  - Answers saved to output.json 


### Vector Retriever

In [4]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'Vector' strategy (note the capital V in the
# method name).
retriever.process_questions(
    method='Vector',
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 


  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 87.33%
     - Category: [insurance], Accuracy: 90.00%
     - Category: [finance], Accuracy: 74.00%
     - Category: [faq], Accuracy: 98.00%


### Original BM25 Retriever

In [6]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'original' strategy (the baseline BM25
# retriever, per this section's heading).
retriever.process_questions(
    method='original',
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 74.67%
     - Category: [insurance], Accuracy: 82.00%
     - Category: [finance], Accuracy: 50.00%
     - Category: [faq], Accuracy: 92.00%


### LlamaIndex BM25

In [1]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'BM25' strategy.
# NOTE(review): the meaning of text_type='c' is not visible from this
# notebook -- confirm against Retriever.process_questions in Utils.
retriever.process_questions(
    method='BM25',
    text_type='c',
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

  from .autonotebook import tqdm as notebook_tqdm


< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 84.00%
     - Category: [insurance], Accuracy: 96.00%
     - Category: [finance], Accuracy: 62.00%
     - Category: [faq], Accuracy: 94.00%


### BM25 + Vector Fusion Retriever

In [2]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'BM25_Vector_rrf' fusion strategy at k=60.
# Author's note from earlier runs: "100:88% 20:88%" -- presumably the overall
# accuracy observed with k=100 and k=20; verify before relying on it.
retriever.process_questions(
    method='BM25_Vector_rrf',
    k=60,
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 
  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 88.67%
     - Category: [insurance], Accuracy: 94.00%
     - Category: [finance], Accuracy: 76.00%
     - Category: [faq], Accuracy: 96.00%


### Relative fusion

In [3]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'relative_fusion' strategy.
# NOTE(review): which retriever the 0.8 weight favours is not visible from
# this notebook -- confirm against Retriever.process_questions in Utils.
retriever.process_questions(
    method='relative_fusion',
    weight=0.8,
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 


[2024-11-18 22:51:43,641] [   ERROR] _common.py:120 - Giving up send_request(...) after 4 tries (posthog.request.APIError: [PostHog] upstream connect error or disconnect/reset before headers. reset reason: overflow (503))


  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 88.00%
     - Category: [insurance], Accuracy: 92.00%
     - Category: [finance], Accuracy: 74.00%
     - Category: [faq], Accuracy: 98.00%


### Distribution fusion

In [4]:
from Utils import *

# Build a retriever; no question_path is passed, so the class default is used.
retriever = Retriever()

# Answer the questions with the 'distribution_fusion' strategy, using its
# default parameters (none are overridden here).
retriever.process_questions(
    method='distribution_fusion',
)

# Report overall and per-category retrieval accuracy against the ground truths.
evaluator = Evaluation()
evaluator.output_evaluation()

< VectorDatabase initialized >
  - PaddleOCR initialized successfully
< Retriever initialized > 


  - Answers saved to output.json 
< Evaluation by Ground Truths > 
  - Retrieval accuracy: 85.33%
     - Category: [insurance], Accuracy: 94.00%
     - Category: [finance], Accuracy: 66.00%
     - Category: [faq], Accuracy: 96.00%
