In [None]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore

### Document Splitter

In [34]:
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
result = splitter.run(documents=[doc])
result

{'documents': [Document(id=b2c67a306f99f2d859c1bc0811c9ba449dd7288b7d3fcbc6591141b69fb2a341, content: 'Moonlight shimmered softly, ', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1}),
  Document(id=2b6b758e8432e27ee9ec0b15506e60506557cd77206e26b335f041f0c5462bb3, content: 'wolves howled nearby, ', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1}),
  Document(id=accbb0fd14e1ea1bb237a6d55e537e18938effe0a5dcb108b3c1f54d46d57f41, content: 'night enveloped everything.', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1})]}

### Document Cleaner

In [38]:
from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

doc = Document(content="This   is  a  document  to  clean\\n\\n\\nsubstring to remove")

cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
result = cleaner.run(documents=[doc])

#assert result["documents"][0].content == "This is a document to clean "
result

{'documents': [Document(id=cfa6114cbbb149e353c7c91bcf156714d7a030dd8e4a87e3ea3ae5e534a27eb3, content: 'This is a document to clean\n\n\n')]}

### 1. Word Document Converter

In [None]:
#!pip install python-docx

In [8]:
from CustomDOCXToDocument import CustomDOCXToDocument

converter = CustomDOCXToDocument()
results = converter.run(sources=["Documents/Week2.docx"])
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the DOCX file.'

Week 2: Communication and Middleware
Lesson 3: Communication in Distributed Systems
Communication models and protocols.
Sockets, message-passing, and remote procedure calls (RPC).
Lesson 4: Middleware and Message Brokers
Introduction to middleware.
Message queuing systems (e.g., RabbitMQ, Apache Kafka).

Class Activity: Hands-on lab on socket programming.
Group Communication in Distributed Systems
Communication between two processes in a distributed system is required to exchange various data, such as code or a file, between the processes. When one source process tries to communicate with multiple processes at once, it is called Group Communication. A group is a collection of interconnected processes with abstraction. This abstraction is to hide the message passing so that the communication looks like a normal procedure call. Group communication also helps the processes from different hosts to work together and perform operations in a synchronized manner, therefore increasing the overa

### 2. PDF Document Converter

In [4]:
from haystack.components.converters.pypdf import PyPDFToDocument
import datetime

converter = PyPDFToDocument()
results = converter.run(sources=["HealthPal.pdf"])
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.

Nishauri 2.0 - Leveraging Retrieval -Augmented Generation for 
Digital Health  
 
Introduction:  
The challenge posed in the Nishauri application development revolves around integrating 
Retrieval -Augmented Generation (RAG) to create an advanced chatbot system. This 
initiative aims to harness Generative AI to enhance digital health services, ensuring accurate 
and timely information dissemination to users/patients.  
 
Understanding of the Case Study:  
In the realm of healthcare, ensuring the precision of information shared is of utmost 
importance. Here, the integration of Retrieval -Augmented Generation (RAG) models 
emerges as a crucial strategy, acting as a bridge between dynamic technological 
advanceme nts and the essential need for accurate healthcare information dissemination.  
 
Definition of RAG:  
 
RAG is a sophisticated model combining a parametric (pre -trained large language model) 
and a non -parametric model. This fusion enables dynamic information retrieval and 
g

### 3. PPT Document Converter

In [None]:
#!pip install python-pptx

In [11]:
from CustomPPTXToDocument import CustomPPTXToDocument

converter = CustomPPTXToDocument()
results = converter.run(sources=["Documents/HCI_Lect.pptx"])
documents = results["documents"]
print(documents[0].content)

# 'This is the text from the PPTX file.'

Why Do We Prototype?
Get feedback on our design faster
saves money
Experiment with alternative designs
Fix problems before code is written
Keep the design centered on the customerFidelity in Prototyping
Fidelity refers to the level of detail
High fidelity?
prototypes look like the final product
 Low fidelity?
artists renditions with many details missingLow-fidelity Storyboards
Where do storyboards come from?
film & animation
From Star War VI:
Return of the Jedi
Give you a “script” of important events
leave out the details
concentrate on the important interactionsWhy Use Low-fi Prototypes?
Traditional methods take too long
sketches -> prototype -> evaluate -> iterate
Can simulate the prototype
sketches -> evaluate -> iterate
sketches act as prototypes
designer “plays computer”
other design team members observe & record
Kindergarten implementation skills
allows non-programmers to participateHi-fi Prototypes Warp
Perceptions of the customer/reviewer?
formal representation indicates “f

### 4. Normal Txt Document Converter

In [None]:
from haystack.components.converters.txt import TextFileToDocument

converter = TextFileToDocument()
results = converter.run(sources=["sample.txt"])
documents = results["documents"]
print(documents[0].content)
# 'This is the content from the txt file.'