In [1]:
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)

In [13]:
doc_dir = "print_pack"

In [14]:
import os

from haystack import Pipeline
from haystack.nodes import PDFToTextConverter

indexing_pipeline = Pipeline()
pdf_converter = PDFToTextConverter()

indexing_pipeline.add_node(component=pdf_converter, name="PDFtoTextConverter", inputs=["File"])
indexing_pipeline.add_node(component=document_store, name="DocumentStore", inputs=["PDFtoTextConverter"])

files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir) if f.endswith(".pdf")]
files_to_index

['print_pack/TM351_Part 07 Notebooks_logo.pdf',
 'print_pack/TM351_Part 14 Notebooks_logo.pdf',
 'print_pack/TM351_Part 22 Notebooks_logo.pdf',
 'print_pack/TM351_Part 12 Notebooks_logo.pdf',
 'print_pack/TM351_Part 02 Notebooks_logo.pdf',
 'print_pack/TM351_Part 01 Notebooks_logo.pdf',
 'print_pack/TM351_Part 21 Notebooks_logo.pdf',
 'print_pack/TM351_Part 20 Notebooks_logo.pdf',
 'print_pack/TM351_Part 23 Notebooks_logo.pdf',
 'print_pack/TM351_Part 03 Notebooks_logo.pdf',
 'print_pack/TM351_Part 05 Notebooks_logo.pdf',
 'print_pack/TM351_Part 15 Notebooks_logo.pdf',
 'print_pack/TM351_Part 25 Notebooks_logo.pdf',
 'print_pack/TM351_Part 26 Notebooks_logo.pdf',
 'print_pack/TM351_Part 16 Notebooks_logo.pdf']

In [15]:
_ = indexing_pipeline.run_batch(file_paths=files_to_index)

Converting files:   0%|          | 0/15 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Updating BM25 representation...:   0%|          | 0/15 [00:00<?, ? docs/s]

In [16]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store=document_store)

In [17]:
from haystack.nodes import FARMReader

#deepset/roberta-base-squad2
reader = FARMReader(model_name_or_path="deepset/tinyroberta-squad2", use_gpu=True)

In [18]:
from haystack.pipelines import ExtractiveQAPipeline

pipe = ExtractiveQAPipeline(reader, retriever)

In [23]:
prediction = pipe.run(
    query="How do you use a pipeline in MongoDB?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

Inferencing Samples:   0%|          | 0/52 [00:00<?, ? Batches/s]

In [24]:
from haystack.utils import print_answers

print_answers(
    prediction,
    details="all" ## Choose from `minimum`, `medium`, and `all`
)

'Query: How do you use a pipeline in MongoDB?'
'Answers:'
[   <Answer {'answer': 'group the accidents by number of vehicles and number of casualties', 'type': 'extractive', 'score': 0.9107933044433594, 'context': 'ctivity 2¶\nUsing an aggregation pipeline, group the accidents by number of vehicles and number of casualties. From\neach group, find the number of acci', 'offsets_in_document': [{'start': 89836, 'end': 89902}], 'offsets_in_context': [{'start': 42, 'end': 108}], 'document_ids': ['a8c732e5b7aab29d04ebc255c6aa40ce'], 'meta': {}}>,
    <Answer {'answer': 'unwinds the $Casualties lists', 'type': 'extractive', 'score': 0.8909108638763428, 'context': '-hand side\nof the cell.\nStart of by defining a pipeline that unwinds the $Casualties lists and then group the result by casualty age\nband (Age_Band_of', 'offsets_in_document': [{'start': 111408, 'end': 111437}], 'offsets_in_context': [{'start': 61, 'end': 90}], 'document_ids': ['a8c732e5b7aab29d04ebc255c6aa40ce'], 'meta': {}}>,
   

In [12]:
prediction = pipe.run(
    query="What are the properties of Anscombe's quartet?",
    params={
        "Retriever": {"top_k": 10},
        "Reader": {"top_k": 5}
    }
)

print_answers(
    prediction,
    details="all" ## Choose from `minimum`, `medium`, and `all`
)

Inferencing Samples:   0%|          | 0/5 [00:00<?, ? Batches/s]

"Query: What are the properties of Anscombe's quartet?"
'Answers:'
[   <Answer {'answer': 'Graphical output such as described above is readily available to anyone who does his own program-\nming', 'type': 'extractive', 'score': 0.5456240177154541, 'context': 'd his paper as follows:\nGraphical output such as described above is readily available to anyone who does his own program-\nming. I myself habitually ge', 'offsets_in_document': [{'start': 17809, 'end': 17911}], 'offsets_in_context': [{'start': 24, 'end': 126}], 'document_ids': ['ebd0822493cde768584fae4de6ce81bd'], 'meta': {}}>,
    <Answer {'answer': 'The values in the latitude and longitude cells look sensible', 'type': 'extractive', 'score': 0.47285276651382446, 'context': '67\n-0.820093\n4\n1569\nMK13\n52.056028\n-0.796945\nThe values in the latitude and longitude cells look sensible, but the outcodes are in a different format ', 'offsets_in_document': [{'start': 44906, 'end': 44966}], 'offsets_in_context': [{'start': 45, 'end