## Supporting functions

In [None]:
import tiktoken
# Shared tiktoken encoding ("cl100k_base") used by num_tokens_from_string to count tokens.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_string(string: str) -> int:
    """Return the number of tokens `string` encodes to with the module-level `encoding`."""
    token_ids = encoding.encode(string)
    return len(token_ids)

def print_chunks_page_content(page_content, sparse=False):
    """Print a size summary, the text and the metadata of every chunk.

    Args:
        page_content: Sized, iterable collection of objects exposing
            `page_content` (the chunk text) and `metadata` attributes.
        sparse: When true, print only the first 50 characters of each
            chunk instead of its full text.
    """
    print(f"Number of chunks: {len(page_content)}")
    for position, doc in enumerate(page_content, start=1):
        text = doc.page_content
        print(f"Chunk {position} character count: {len(text)} token number: {num_tokens_from_string(text)}" )
        # Sparse mode shows a 50-character preview; otherwise the whole chunk.
        print(text[:50] if sparse else text)
        print("Meta data: ", doc.metadata)
        print()


# Extracting text using an open-source solution - pypdf (via LangChain's PyPDFLoader)

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sample PDF with PyPDFLoader and print every loaded document.
# `load()` is used instead of `load_and_split()`: the latter additionally runs
# LangChain's default text splitter over the loaded pages (and is documented
# as deprecated), which would pre-chunk the text before the dedicated
# "Chunking the text" step that follows.
file_path = "sample.pdf"
loader = PyPDFLoader(file_path)
pdf_pages = loader.load()
print_chunks_page_content(pdf_pages)

In [None]:
# Same documents in sparse mode: only the first 50 characters of each one are printed.
print_chunks_page_content(pdf_pages,sparse=True)

## Chunking the text

In [None]:


from langchain_text_splitters import RecursiveCharacterTextSplitter

# Re-split the loaded PDF documents with a recursive character splitter,
# then print every resulting chunk in full.
chunk_size, chunk_overlap = 1000, 200
rct_text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
splits = rct_text_splitter.split_documents(pdf_pages)
print_chunks_page_content(splits)

In [None]:
# Same splits in sparse mode: only the first 50 characters of each chunk are printed.
print_chunks_page_content(splits, sparse=True)


# Extracting text using Document Intelligence API

In [None]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from dotenv import load_dotenv
import os
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

file_path = "sample.pdf"

doc_intelligence_endpoint = os.getenv("DOCUMENTINTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("DOCUMENTINTELLIGENCE_API_KEY")

# os.getenv returns None for unset variables; stop here with a message that
# names the missing setting instead of handing None to the loader.
missing_settings = [
    name
    for name, value in (
        ("DOCUMENTINTELLIGENCE_ENDPOINT", doc_intelligence_endpoint),
        ("DOCUMENTINTELLIGENCE_API_KEY", doc_intelligence_key),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Extract the PDF as markdown using the "prebuilt-layout" model.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=doc_intelligence_endpoint,
    api_key=doc_intelligence_key,
    file_path=file_path,
    api_model="prebuilt-layout",
    mode="markdown",
)
di_documents = loader.load()

# Save the extracted markdown of the first document to md_sample.md.
with open("md_sample.md", "w", encoding="utf-8") as file:
    file.write(di_documents[0].page_content)
print_chunks_page_content(di_documents)

## Chunking the text 

### By Headers

In [None]:
# Import from langchain_text_splitters, the package the other chunking cells
# in this file already use, rather than the legacy langchain.text_splitter path.
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Map heading markers "#" .. "########" to metadata keys "Header 1" .. "Header 8".
# NOTE(review): Markdown normally defines six heading levels; levels 7 and 8
# are kept so the splitter configuration is unchanged.
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 9)]

# strip_headers=False is passed so the heading lines are not stripped from the chunks.
md_text_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
md_header_splits = md_text_splitter.split_text(di_documents[0].page_content)

print(f"Length of splits: {len(md_header_splits)}")
print_chunks_page_content(md_header_splits)

In [None]:
# Same header splits in sparse mode: only the first 50 characters of each chunk are printed.
print_chunks_page_content(md_header_splits, sparse=True)

### Within headers

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Further split the header-level sections with a recursive character
# splitter, then print every resulting chunk in full.
chunk_size, chunk_overlap = 1000, 200
rct_text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
splits = rct_text_splitter.split_documents(md_header_splits)
print_chunks_page_content(splits)

In [None]:
# Same splits in sparse mode: only the first 50 characters of each chunk are printed.
print_chunks_page_content(splits, sparse=True)

# Other tools

## unstructured

In [None]:
from unstructured.partition.md import partition_md
from unstructured.chunking.title import chunk_by_title

# Limits passed to chunk_by_title below.
NEW_AFTER_N_CHARS = 1000
MAX_CHARACTERS = 1000
COMBINE_UNDER_N_CHARS = 300

# Partition the extracted markdown into elements.
elements = partition_md(text=di_documents[0].page_content)
print(f"Number of elements: {len(elements)}")

chunks = chunk_by_title(
    elements,
    multipage_sections=True,
    max_characters=MAX_CHARACTERS,
    new_after_n_chars=NEW_AFTER_N_CHARS,
    combine_text_under_n_chars=COMBINE_UNDER_N_CHARS,
)
# NOTE(review): out_text is not read anywhere in the visible cells; kept in
# case other code relies on the name.
out_text = ''
print(f"Number of chunks: {len(chunks)}")
for i, chunk in enumerate(chunks):
    # Tables are shown as HTML when available. text_as_html may be unset, in
    # which case fall back to the plain text so len() below does not fail on None.
    chunk_text = chunk.text
    if chunk.category == 'Table' and chunk.metadata.text_as_html:
        chunk_text = chunk.metadata.text_as_html
    print(f'Chunk {i} ({chunk.category}): Chunk len ({len(chunk_text)}) Chunk token ({num_tokens_from_string(chunk_text)}) \n{chunk_text}')


## Semchunk

In [None]:

import semchunk
# Build a semchunk chunker for 'gpt-4' with chunk_size=250
# (presumably a token budget per chunk; confirm against the semchunk docs),
# then chunk the extracted markdown of the first document.
chunker = semchunk.chunkerify(
    'gpt-4',
    chunk_size=250,
)
data = chunker(di_documents[0].page_content)

def print_chunks(list_chunks):
    """Print the chunk count, then each chunk's character/token sizes and text.

    Args:
        list_chunks: Sized, iterable collection of chunk strings.
    """
    print(f"Number of chunks: {len(list_chunks)}")
    for number, text in enumerate(list_chunks, start=1):
        char_count = len(text)
        token_count = num_tokens_from_string(text)
        print(f"\nChunk {number} character count: {char_count} token number: {token_count}" )
        print(text)
# Print every chunk produced by the semchunk chunker.
print_chunks (data)