# Pipeline Testing

If running in Google Colab, see [notes about enabling GPU](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration#enabling-the-gpu-in-colab).

# Configure Logging

In [1]:
import logging

# Root configuration: by default only warnings and above are emitted.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger hierarchy is allowed to be more verbose (INFO).
logging.getLogger("haystack").setLevel(logging.INFO)

# Configure Haystack

In [None]:
import os
from haystack.telemetry import disable_telemetry
# Opt out of Haystack telemetry for this session.
disable_telemetry()
# Set TOKENIZERS_PARALLELISM to "false" for this process; presumably this
# avoids the Hugging Face tokenizers parallelism/fork warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Document Store

In [3]:
# Start a local Elasticsearch server
# NOTE(review): launch_es is assumed to start the server in the background and
# may need a moment before it accepts connections — confirm before connecting.
from haystack.utils import launch_es
launch_es()

In [None]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Connect to Elasticsearch on localhost with empty credentials and use the
# "document" index for everything written and retrieved in this notebook.
document_store = ElasticsearchDocumentStore(
    host="localhost",
    username="",
    password="",
    index="document",
)

# Preprocessing

In [None]:
import pandas as pd

# Load the segment CSV into a DataFrame
file_path = "../data/segments-200.csv"
df = pd.read_csv(file_path)

# Cleanup:

# Replace missing values (NaN) with "" so every cell holds a concrete value
df.fillna(value="", inplace=True)
# Strip leading/trailing whitespace from the segment text (vectorised)
df["text"] = df["text"].str.strip()
# Rename column "text" to "content" for the document store
df = df.rename(columns={"text": "content"})

# print(df.head())
# Print the number of non-NA values per column. `count` has to be called:
# printing the bare attribute only shows the bound method's repr.
print(df.count())

In [6]:
# Convert the DataFrame into a list of dicts, one per row, keyed by column name.
docs = df.to_dict(orient="records")

# pprint is only needed for the optional preview below.
from pprint import pprint
# pprint(docs[:3])

# Write documents

In [7]:
# docs = docs[:100] # max for testing
# Remove documents already in the store first (called without filters here);
# presumably so that re-running this cell does not accumulate duplicates.
document_store.delete_documents()
# Write the row dicts built from the DataFrame to the document store.
document_store.write_documents(docs)

# Initialize Retriever and Reader

## Retriever

In [8]:
from haystack.nodes import BM25Retriever
# BM25 retriever bound to the document store; it supplies candidate documents
# to the reader in the pipeline.
retriever = BM25Retriever(document_store=document_store)

## Reader

### FARMReader

In [None]:
from haystack.nodes import FARMReader
# Reader initialized from the "deepset/roberta-base-squad2" model identifier
# (presumably resolved via the Hugging Face model hub — confirm).
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

### TransformersReader

In [11]:
# from haystack.nodes import TransformersReader
# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1)

# Pipeline

In [47]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the reader and the retriever into one question-answering pipeline.
pipe = ExtractiveQAPipeline(reader, retriever)

# Persist the pipeline definition as YAML.
# NOTE(review): assumes the ../pipelines directory already exists — confirm.
pipe.save_to_yaml("../pipelines/Answer.yaml")

# Query

In [None]:
# Question to ask, plus how many results each pipeline node should return.
query = "What is the meaning of life?"
retrieverTopK = 10
readerTopK = 10

# Run the pipeline; per-node settings are keyed by node name.
prediction = pipe.run(
    query=query,
    params={
        "Retriever": {"top_k": retrieverTopK},
        "Reader": {"top_k": readerTopK},
    },
)

In [None]:
# from haystack.utils import print_answers

# # Change `minimum` to `medium` or `all` to raise the level of detail
# print_answers(prediction, details="all")

In [40]:
# Convert prediction dict to JSON and save
import json

# Build the output directory from the query, dropping every character that is
# neither alphanumeric nor whitespace (such characters are removed, not
# replaced with a space).
path = "../outputs/Query/" + "".join(e for e in query if e.isalnum() or e.isspace())
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(path, exist_ok=True)

# One file per (retriever top_k, reader top_k) combination.
filename = f"{path}/{retrieverTopK}-{readerTopK}.json"

# Wrap a single prediction dict in a list so it becomes one DataFrame row.
output_data = prediction
if isinstance(output_data, dict):
    output_data = [output_data]

# NOTE: this rebinds `df`, which earlier held the segment data; re-run the
# preprocessing cell before rebuilding `docs` from it.
df = pd.DataFrame(output_data)
df.to_json(filename, indent=2)