In [1]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import PreProcessor, PDFToTextConverter
from haystack.nodes.retriever.dense import DensePassageRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import FARMReader
from haystack.utils import convert_files_to_docs
from tqdm.auto import tqdm
from haystack.utils import print_answers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize the document store.
# It is created with its default settings. Later cells write the preprocessed
# documents into it, pass it to the retriever, and update its embeddings.
document_store = InMemoryDocumentStore()

In [3]:
# Extract the text of the paper from its PDF. The path is relative to the
# directory the notebook is run from.
pdf_converter = PDFToTextConverter(
    valid_languages=["en"],
    remove_numeric_tables=True,
)

# `doc_pdf` holds the converted document(s) used by the preprocessing step.
# Inspect it (e.g. `doc_pdf[0]`) to confirm that the extraction worked.
doc_pdf = pdf_converter.convert(meta=None, file_path="docs/deephashing.pdf")

In [4]:
# Split the converted document(s) into non-overlapping chunks of at most 100
# units (words, going by the "word count" warning the preprocessor emits),
# respecting sentence boundaries.
preprocessor = PreProcessor(split_length=100, split_overlap=0, split_respect_sentence_boundary=True)

# `process` accepts the whole list of documents and reports progress with its
# own "Preprocessing" bar, so a manual per-document loop (which only added a
# second, interleaved progress bar) is unnecessary.
processed_docs = preprocessor.process(doc_pdf)

# Store the chunks so the retriever can embed and search them.
document_store.write_documents(processed_docs)


Processing documents:   0%|          | 0/1 [00:00<?, ?it/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 18.18docs/s]
Processing documents: 100%|██████████| 1/1 [00:00<00:00, 15.15it/s]


In [5]:
# Build the dense passage retriever (DPR) on top of the document store, with a
# separate encoder model for queries and for passages. GPU use is requested.
retriever = DensePassageRetriever(
    use_gpu=True,
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    document_store=document_store,
)

  return self.fget.__get__(instance, owner)()


In [6]:
# Embed the documents held in the document store using the retriever and keep
# the resulting embeddings in the store.
document_store.update_embeddings(retriever=retriever)

Documents Processed: 10000 docs [00:40, 247.43 docs/s]         


In [7]:
# Load the reader model that extracts answers; GPU use is requested.
model_name_or_path = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path=model_name_or_path, use_gpu=True)

In [8]:
# Combine the retriever and the reader into a single extractive QA pipeline.
pipe = ExtractiveQAPipeline(retriever=retriever, reader=reader)

In [9]:
# Questions to put to the QA pipeline, one pipeline run per entry.
queries = [
    "What is deep hashing?",
    "How does LSH work?",
    "What are hash functions?",
]

In [10]:
# Run every question through the pipeline (top_k=10 for the retriever,
# top_k=5 for the reader) and print the answers with all details, followed by
# an 80-dash separator line.
for query in queries:
    results = pipe.run(
        query=query,
        params={
            "Retriever": {"top_k": 10},
            "Reader": {"top_k": 5},
        },
    )
    print(f"Query: {query}")
    print_answers(results, details="all")
    print("\n", "-" * 80, "\n", sep="")

Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.88s/ Batches]


Query: What is deep hashing?
'Query: What is deep hashing?'
'Answers:'
[   <Answer {'answer': 'a single cosine similarity based learning objective', 'type': 'extractive', 'score': 0.6755504012107849, 'context': 'nd Tao Xiang. One\nloss for all: Deep hashing with a single cosine similarity based learning objective. Advances in Neural\nInformation Processing Syste', 'offsets_in_document': [{'start': 319, 'end': 370}], 'offsets_in_context': [{'start': 50, 'end': 101}], 'document_ids': ['5523ea9adaf6d87c7b8cceac2e6b2e39'], 'meta': {'_split_id': 74}}>,
    <Answer {'answer': 'state-of-the-art methods', 'type': 'extractive', 'score': 0.6514706611633301, 'context': 'e significant advances brought about by deep\nlearning in image tasks, deep hashing methods [45,50] have become state-of-the-art methods in the field.\n', 'offsets_in_document': [{'start': 604, 'end': 628}], 'offsets_in_context': [{'start': 111, 'end': 135}], 'document_ids': ['cd866affb03996661a329c236903c0d5'], 'meta': {'_split_i

Inferencing Samples: 100%|██████████| 1/1 [00:02<00:00,  2.77s/ Batches]


Query: How does LSH work?
'Query: How does LSH work?'
'Answers:'
[   <Answer {'answer': 'assigns\ncompact binary hash codes to images', 'type': 'extractive', 'score': 0.2829137146472931, 'context': 'r example, Locality-Sensitive Hashing (LSH) [2,17,21] assigns\ncompact binary hash codes to images such that similar items receive similar hash codes, ', 'offsets_in_document': [{'start': 192, 'end': 235}], 'offsets_in_context': [{'start': 54, 'end': 97}], 'document_ids': ['72a06bf473f74c0387c0990cd6307679'], 'meta': {'_split_id': 3}}>,
    <Answer {'answer': 'By using pre-trained architectures', 'type': 'extractive', 'score': 0.05957616865634918, 'context': 'th a sequence of fully connected layers to\nbe fine-tuned. By using pre-trained architectures, they exploit the enriched features and start the trainin', 'offsets_in_document': [{'start': 208, 'end': 242}], 'offsets_in_context': [{'start': 58, 'end': 92}], 'document_ids': ['87343af840ed03af1590b98905b5e3cb'], 'meta': {'_split_id': 14}}

Inferencing Samples: 100%|██████████| 1/1 [00:03<00:00,  3.58s/ Batches]

Query: What are hash functions?
'Query: What are hash functions?'
'Answers:'
[   <Answer {'answer': 'two-termed loss function', 'type': 'extractive', 'score': 0.15098091959953308, 'context': 'he similarity learning and\nquantization strategies in a single two-termed loss function where one term accounts for similarity learning\nand the other ', 'offsets_in_document': [{'start': 380, 'end': 404}], 'offsets_in_context': [{'start': 63, 'end': 87}], 'document_ids': ['d0f0e6e661561d6e21d9b11bcd2d4afc'], 'meta': {'_split_id': 7}}>,
    <Answer {'answer': 'discrete nature', 'type': 'extractive', 'score': 0.12224160134792328, 'context': 'een proposed (e.g., , [6,7,37,45,51,53,58,63]).\nHowever, due to the discrete nature of hash functions, the objective function in (3.1) is not differen', 'offsets_in_document': [{'start': 370, 'end': 385}], 'offsets_in_context': [{'start': 68, 'end': 83}], 'document_ids': ['b733a44dadcf150e6f21f6e3e45ca9be'], 'meta': {'_split_id': 25}}>,
    <Answer {'answer':




In [11]:

# # Output the results
# if results['answers']:
#     for idx, answer_obj in enumerate(results['answers']):
#         answer = answer_obj.__dict__  # Convert the Answer object to a dictionary if necessary
#         print(f"\nAnswer {idx+1}:")
#         print(f"  Text: {answer['answer']}")
#         print(f"  Score: {answer['score']:.4f}")
#         context = answer['context'] if 'context' in answer else "Not provided"
#         print(f"  Context: {context}")
#         doc_id = answer['document_ids'][0] if 'document_ids' in answer else "Not provided"
#         print(f"  Document ID: {doc_id}")
#         # Handle Span object or dictionary for offsets
#         start_pos = answer['offsets_in_document'][0].start if hasattr(answer['offsets_in_document'][0], 'start') else answer['offsets_in_document'][0]['start']
#         end_pos = answer['offsets_in_document'][0].end if hasattr(answer['offsets_in_document'][0], 'end') else answer['offsets_in_document'][0]['end']
#         print(f"  Start: {start_pos}, End: {end_pos}")
# else:
#     print("\nNo answers found.")


In [12]:
# print_answers(results, details="all")