In [None]:
%%bash

pip install haystack-ai accelerate "sentence-transformers>=3.0.0" "datasets>=2.6.1"

Collecting haystack-ai
  Downloading haystack_ai-2.6.1-py3-none-any.whl.metadata (13 kB)
Collecting sentence-transformers>=3.0.0
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=2.6.1
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting haystack-experimental (from haystack-ai)
  Downloading haystack_experimental-0.2.0-py3-none-any.whl.metadata (11 kB)
Collecting lazy-imports (from haystack-ai)
  Downloading lazy_imports-0.3.1-py3-none-any.whl.metadata (10 kB)
Collecting openai>=1.1.0 (from haystack-ai)
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting posthog (from haystack-ai)
  Downloading posthog-3.7.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.6.1)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.6.1)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)


Knowing you’re using this tutorial helps us decide where to invest our efforts to build a better product, but you can always opt out by commenting out the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/enabling-telemetry) for more details.

In [None]:
from haystack.telemetry import tutorial_running

# Number identifying this tutorial in the telemetry call below.
TUTORIAL_ID = 34

# Report that this tutorial is being run; comment out this call to opt out.
tutorial_running(TUTORIAL_ID)



In [None]:
pip install langchain_community docx2txt

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3959 sha256=24f6e238d0ab7f281fff8ee46b751dfc936f8877796b5e8e830f388e247682e5
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8


In [None]:
from langchain_community.document_loaders import Docx2txtLoader

# Word document to ingest; the relative path is resolved against the notebook's
# current working directory.
docx_path = "IGI LIFE WTO Zeenat Takaful Plan.docx"

# Build the loader and read the file into a list of LangChain documents.
loader = Docx2txtLoader(docx_path)
data = loader.load()

# Last expression of the cell: display what was loaded.
data

[Document(metadata={'source': 'IGI LIFE WTO Zeenat Takaful Plan.docx'}, page_content='Zeenat Takaful Plan (Underwritten By IGI Life Insurance) is a saving and protection plan specifically designed for our Female Clientele along with spouse coverage.\n\nProduct Features/ Benefits:\n\nUnit Linked\n\nIt provides life cover as well as a return on investment.\n\nContributions Management\n\nOffers the flexibility to direct customer contributions in part or whole to any of the following funds\n\nBalanced Fund\n\nConservative Fund\n\nManaged Fund\n\nDeath Benefit\n\nDeath Benefit is Both of Sum Covered Plus Participant Account value (PIA) + Surplus (if any)\n\nMaturity Benefit\n\nOn completion of the membership term, the PIA value along with the account value of Top-up will be payable, which can be taken as a Lump sum or in applied to life time pensions\n\nOptional Riders\n\nAccidental Death Benefit\n\nIncome Benefit- Disability\n\nIncome Benefit-Death\n\nWaiver of Contribution\n\nAdditional P

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Raw text of the first loaded document.
data_content = data[0].page_content

# Splitter settings: chunks of at most 100 characters (measured with len) that
# overlap by 20 characters; separators are treated as plain strings, not regexes.
# The chunk size is deliberately tiny, for demonstration purposes.
splitter_options = dict(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the extracted text into chunk documents.
texts = text_splitter.create_documents([data_content])

# Last expression of the cell: display the resulting chunks.
texts

[Document(metadata={}, page_content='Zeenat Takaful Plan (Underwritten By IGI Life Insurance) is a saving and protection plan'),
 Document(metadata={}, page_content='and protection plan specifically designed for our Female Clientele along with spouse coverage.'),
 Document(metadata={}, page_content='Product Features/ Benefits:\n\nUnit Linked'),
 Document(metadata={}, page_content='Unit Linked\n\nIt provides life cover as well as a return on investment.\n\nContributions Management'),
 Document(metadata={}, page_content='Offers the flexibility to direct customer contributions in part or whole to any of the following'),
 Document(metadata={}, page_content='of the following funds'),
 Document(metadata={}, page_content='Balanced Fund\n\nConservative Fund\n\nManaged Fund\n\nDeath Benefit'),
 Document(metadata={}, page_content='Death Benefit is Both of Sum Covered Plus Participant Account value (PIA) + Surplus (if any)'),
 Document(metadata={}, page_content='Maturity Benefit'),
 Document(meta

In [None]:
from datasets import load_dataset
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter




# Re-wrap every LangChain chunk as a Haystack Document, carrying its text and
# metadata over unchanged.
textss = [Document(content=chunk.page_content, meta=chunk.metadata) for chunk in texts]

# Sentence-transformers model used to embed the documents.
model = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

# Components of the indexing pipeline.
document_store = InMemoryDocumentStore()
doc_embedder = SentenceTransformersDocumentEmbedder(model=model)
doc_writer = DocumentWriter(document_store=document_store)

# Wire the pipeline: the embedder's documents flow into the writer, which stores
# them in the in-memory document store.
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(name="embedder", instance=doc_embedder)
indexing_pipeline.add_component(name="writer", instance=doc_writer)
indexing_pipeline.connect("embedder.documents", "writer.documents")

# Last expression of the cell, so the run summary is shown as output.
indexing_pipeline.run({"documents": textss})

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

{'writer': {'documents_written': 56}}

In [None]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder


# Query-time components: a text embedder for the question, a retriever over the
# in-memory document store, and an extractive reader.
text_embedder = SentenceTransformersTextEmbedder(model=model)
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
reader = ExtractiveReader("deepset/roberta-base-squad2")
# Warm the reader up before it is added to the pipeline.
reader.warm_up()

# Assemble the pipeline: embedder -> retriever -> reader.
extractive_qa_pipeline = Pipeline()
extractive_qa_pipeline.add_component(name="embedder", instance=text_embedder)
extractive_qa_pipeline.add_component(name="retriever", instance=retriever)
extractive_qa_pipeline.add_component(name="reader", instance=reader)

# The query embedding feeds the retriever; the retrieved documents feed the reader.
extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
# Kept as the last expression so the pipeline summary is displayed.
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/295 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



<haystack.core.pipeline.pipeline.Pipeline object at 0x7ab768d3b0a0>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

Try extracting some answers.

In [None]:
# Question to ask; the same text is passed to the embedder and to the reader.
query = "Free Look Period?"

# Inputs keyed by component name: the retriever is asked for 5 documents and the
# reader for 1 answer.
pipeline_inputs = {
    "embedder": {"text": query},
    "retriever": {"top_k": 5},
    "reader": {"query": query, "top_k": 1},
}
answer = extractive_qa_pipeline.run(data=pipeline_inputs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Collect the content of the source document behind each returned answer.
# Entries with no answer text (`data`) or no attached document are skipped.
extracted_content = [
    ans.document.content
    for ans in answer['reader']['answers']
    if ans.data and ans.document
]

# Show the passage behind the first answer. The list is empty when no entry has
# both answer text and a document, so guard the [0] access instead of letting it
# raise IndexError.
if extracted_content:
    print(extracted_content[0])
else:
    print("No answer with a source document was found for the query.")

During a 14 day free look period, the customer can review the policy terms and conditions and
