In [45]:
from langchain_text_splitters import MarkdownTextSplitter

In [46]:
splitter = MarkdownTextSplitter(chunk_size=300, chunk_overlap=80)

In [47]:
md_text = """IRN/QR Code:\n\namazon.in\n\nAmazon Seller Services Private Limited 8th Floor, Brigade Gateway, World Trade Center, No.26/1, Dr Rajkumar Road, Malleshwaram(W), Bangalore, Karnataka 560055, IN Website:www.amazon.in\n\nPAN No: AAICA3918J\n\nGST Tax Registration No: 29AAICA3918J1ZE\n\nCIN No: U51900KA2010PTC053234\n\n# Bill to\n\nName: SHRASTI SHARMA\n\nAddress:  * NO. 569, CHA/59, PREM NAGAR, ALAMBAGH, Lucknow, Lucknow, UTTAR PRADESH, 226005, IN\n\nPlace of Supply: UTTAR PRADESH\n\nState/UT Code: 09\n\nGSTIN: 09DHEPS6726R1ZA"""

In [48]:
chunks = splitter.split_text(md_text)

In [49]:
# Print the resulting chunks
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}")

Chunk 1:
IRN/QR Code:

amazon.in

Amazon Seller Services Private Limited 8th Floor, Brigade Gateway, World Trade Center, No.26/1, Dr Rajkumar Road, Malleshwaram(W), Bangalore, Karnataka 560055, IN Website:www.amazon.in

PAN No: AAICA3918J

GST Tax Registration No: 29AAICA3918J1ZE
Chunk 2:
PAN No: AAICA3918J

GST Tax Registration No: 29AAICA3918J1ZE

CIN No: U51900KA2010PTC053234

# Bill to

Name: SHRASTI SHARMA

Address:  * NO. 569, CHA/59, PREM NAGAR, ALAMBAGH, Lucknow, Lucknow, UTTAR PRADESH, 226005, IN

Place of Supply: UTTAR PRADESH

State/UT Code: 09

GSTIN: 09DHEPS6726R1ZA


In [53]:
markdown_text = """
| SI No   | Category of Service   | Description of Service   | Tax Rate   | Amount     |\n|:--------|:----------------------|:-------------------------|:-----------|:-----------|\n| 1.      | 998599                | Order Cancellation Fee   |            | INR 85.41  |\n|         |                       | IGST                     | 18.00%     | INR 15.37  |\n| 2.      | 998599                | Refund Processing Fee    |            | INR 285.00 |\n|         |                       | IGST                     | 18.00%     | INR 51.30  |\n|         |                       | Total:                   |            | INR 437.08 |\n| 0                                  | 1                                  |\n|:-----------------------------------|:-----------------------------------|\n| Subtotal of fees amount INR 370.41 | Subtotal of fees amount INR 370.41 |\n| Subtotal for IGST                  | INR 66.67                          |\n| Subtotal of GST amount             | INR 66.67                          |\n| Total Invoice amount INR           | 437.08                             |
"""

In [54]:
def chunk_markdown_table(markdown_text, max_rows_per_chunk=5):
    # Extract only table rows (lines starting with |)
    rows = [line.strip() for line in markdown_text.strip().split("\n") if line.strip().startswith("|")]
    
    # Keep the header and alignment line
    header = rows[:2]
    data_rows = rows[2:]
    
    # Chunk data rows safely
    chunks = []
    for i in range(0, len(data_rows), max_rows_per_chunk):
        chunk_rows = header + data_rows[i:i + max_rows_per_chunk]
        chunk_text = "\n".join(chunk_rows)
        chunks.append(chunk_text)
    
    return chunks

In [56]:
chunks = chunk_markdown_table(markdown_text, max_rows_per_chunk=5)

for i, chunk in enumerate(chunks):
    print(f"\n--- Chunk {i+1} ---\n")
    print(chunk)


--- Chunk 1 ---

| SI No   | Category of Service   | Description of Service   | Tax Rate   | Amount     |
|:--------|:----------------------|:-------------------------|:-----------|:-----------|
| 1.      | 998599                | Order Cancellation Fee   |            | INR 85.41  |
|         |                       | IGST                     | 18.00%     | INR 15.37  |
| 2.      | 998599                | Refund Processing Fee    |            | INR 285.00 |
|         |                       | IGST                     | 18.00%     | INR 51.30  |
|         |                       | Total:                   |            | INR 437.08 |

--- Chunk 2 ---

| SI No   | Category of Service   | Description of Service   | Tax Rate   | Amount     |
|:--------|:----------------------|:-------------------------|:-----------|:-----------|
| 0                                  | 1                                  |
|:-----------------------------------|:-----------------------------------|
| Subtotal 

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
def markdown_table_splitter(markdown_text: str,
                            chunk_size: int = 1000,
                            chunk_overlap: int = 100):
    """
    Splits Markdown text intelligently, keeping tables intact
    (i.e., not splitting mid-row).
    
    Works with multiple tables or mixed markdown (headings, paragraphs, tables).
    """
    # Define logical Markdown-aware separators
    # We prioritize keeping full tables or rows together
    separators = [
        "\n\n|",  # split between tables if they are separated by blank lines
        "\n|",   # split between table rows
        "\n\n",  # split paragraphs
        "\n",    # then lines
        " ",     # then words
        ""       # fallback (character-level)
    ]
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
        length_function=len,
        is_separator_regex=False
    )
    
    chunks = splitter.split_text(markdown_text)
    return chunks


In [None]:
chunks = markdown_table_splitter(markdown_text, chunk_size=200, chunk_overlap=20)