In [None]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore

### Document Splitter

In [34]:
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter

doc = Document(content="Moonlight shimmered softly, wolves howled nearby, night enveloped everything.")

splitter = DocumentSplitter(split_by="word", split_length=3, split_overlap=0)
result = splitter.run(documents=[doc])
result

{'documents': [Document(id=b2c67a306f99f2d859c1bc0811c9ba449dd7288b7d3fcbc6591141b69fb2a341, content: 'Moonlight shimmered softly, ', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1}),
  Document(id=2b6b758e8432e27ee9ec0b15506e60506557cd77206e26b335f041f0c5462bb3, content: 'wolves howled nearby, ', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1}),
  Document(id=accbb0fd14e1ea1bb237a6d55e537e18938effe0a5dcb108b3c1f54d46d57f41, content: 'night enveloped everything.', meta: {'source_id': 'ea61b469f16c17b46ae667f395d7997c488dc7da96f6f4f9b4ac327f9a7b1abd', 'page_number': 1})]}

### Document Cleaner

In [38]:
from haystack import Document
from haystack.components.preprocessors import DocumentCleaner

doc = Document(content="This   is  a  document  to  clean\\n\\n\\nsubstring to remove")

cleaner = DocumentCleaner(remove_substrings = ["substring to remove"])
result = cleaner.run(documents=[doc])

#assert result["documents"][0].content == "This is a document to clean "
result

{'documents': [Document(id=cfa6114cbbb149e353c7c91bcf156714d7a030dd8e4a87e3ea3ae5e534a27eb3, content: 'This is a document to clean\n\n\n')]}

### 1. Word Document Converter

In [None]:
from haystack.components.converters.docx import DOCXToDocument

converter = DOCXToDocument()
results = converter.run(sources=["sample.docx"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the DOCX file.'

### 2. PDF Document Converter

In [None]:
from haystack.components.converters.pdfminer import PDFMinerToDocument

converter = PDFMinerToDocument()
results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.'

In [None]:
from haystack.components.converters.pypdf import PyPDFToDocument

converter = PyPDFToDocument()
results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)
# 'This is a text from the PDF file.

### 3. PPT Document Converter

In [None]:
from haystack.components.converters.pptx import PPTXToDocument

converter = PPTXToDocument()
results = converter.run(sources=["sample.pptx"], meta={"date_added": datetime.now().isoformat()})
documents = results["documents"]
print(documents[0].content)

# 'This is the text from the PPTX file.'

### 4. Normal Txt Document Converter

In [None]:
from haystack.components.converters.txt import TextFileToDocument

converter = TextFileToDocument()
results = converter.run(sources=["sample.txt"])
documents = results["documents"]
print(documents[0].content)
# 'This is the content from the txt file.'