In [None]:
import pathlib

from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import Crawler, PreProcessor, BM25Retriever, FARMReader
from haystack.pipelines import Pipeline


In [None]:
# The data folder sits next to the directory the notebook is run from
# (one level up from the current working directory).
DATA_DIR = pathlib.Path.cwd().parent.joinpath("data")
# Destination the Crawler is pointed at for the pages it downloads.
CRAWLED_FILES_DIR = DATA_DIR.joinpath("crawled_files")

# Prerequisites
- The Chrome browser must be installed manually

This is because the Haystack Crawler drives the browser through Selenium, and it only supports ChromeDriver.

Now, we can install ChromeDriver like this:

In [None]:
from webdriver_manager.chrome import ChromeDriverManager

# Install ChromeDriver through webdriver_manager so Selenium can drive Chrome.
# The install result is left as the cell's last expression so it is displayed.
chrome_driver_install_result = ChromeDriverManager().install()
chrome_driver_install_result

# Step 1: Get the data, clean it, and store it

In [None]:
crawler = Crawler(
    # Websites to start crawling from.
    urls=["https://haystack.deepset.ai"],
    # Link depth to follow from the start URLs (1 = also visit the pages they link to).
    crawler_depth=1,
    # Where the crawled files are written. Not very important here:
    # this example never reads the files back.
    output_dir=CRAWLED_FILES_DIR,
)

In [None]:
# BM25 is switched on because the query pipeline later retrieves
# from this store with a BM25Retriever.
document_store = InMemoryDocumentStore(
    use_bm25=True,
)

In [None]:
# Cleaning and splitting settings applied to the crawled text.
preprocessor = PreProcessor(
    # Cleaning options.
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    # Splitting options: word-based chunks of length 500,
    # with sentence boundaries respected.
    split_by="word",
    split_length=500,
    split_respect_sentence_boundary=True,
)

# Wire the nodes in order: crawler -> preprocessor -> document_store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(name="crawler", component=crawler, inputs=["File"])
indexing_pipeline.add_node(name="preprocessor", component=preprocessor, inputs=["crawler"])
indexing_pipeline.add_node(name="document_store", component=document_store, inputs=["preprocessor"])

# Run the whole indexing pipeline; no inputs are passed to run() here.
indexing_pipeline.run()

# Step 2: Use the data to answer questions

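The indexing pipeline we built in Step 1 contains:
  1. A Crawler node that fetches text from a website.
  2. A PreProcessor that makes the documents friendly to the Retriever.
  3. The DocumentStore that receives the documents and stores them.

Now let's create the query pipeline. It will contain:
  1. A Retriever that looks up documents in the DocumentStore for a given query.
  2. A Reader that receives the retrieved documents and produces the answers.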

In [None]:
# The retriever looks documents up in the store filled by the indexing pipeline.
retriever = BM25Retriever(document_store=document_store)
# The reader is built from the named question-answering model.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2-distilled",
)

# Wire the nodes in order: Query -> retriever -> reader.
query_pipeline = Pipeline()
query_pipeline.add_node(name="retriever", component=retriever, inputs=["Query"])
query_pipeline.add_node(name="reader", component=reader, inputs=["retriever"])



In [None]:
# Send one question through the retriever -> reader pipeline.
results = query_pipeline.run(
    query="What can I use Haystack for?",
)

# Echo the question back, then list the text of every returned answer.
print("\nQuestion: ", results["query"])
print("\nAnswers:")
for prediction in results["answers"]:
    print("- ", prediction.answer)
print("\n\n")