In [1]:
# NOTE(review): json is not referenced in the visible cells — confirm it is
# still needed before removing it.
import json
from dotenv import load_dotenv


# Load variables from a .env file into the process environment.
# override=True lets values from .env replace variables that are already set.
# NOTE(review): presumably this supplies the OpenAI credentials needed by the
# ChatOpenAI cell later in the notebook — confirm against the .env contents.
load_dotenv(override=True)

True

In [2]:
def pretty_print(d, min_width=20):
    """Print each key/value pair of a mapping on its own line as ``key: value``.

    Keys are left-aligned and padded to a common column so the colons line
    up. The column is at least ``min_width`` characters wide and grows to fit
    the longest key, so a long key (e.g. ``metadata_extraction_error``) no
    longer pushes its colon out of alignment with the other rows.

    Args:
        d: Mapping to display. Keys are converted with ``str``; values are
            interpolated into an f-string.
        min_width: Minimum width of the key column. Defaults to 20, the
            width that used to be hard-coded.

    Returns:
        None. All output goes to stdout; an empty mapping prints nothing.
    """
    # Convert keys up front: objects such as None or tuples reject a padding
    # format spec like "<20" and would raise TypeError if formatted directly.
    labels = [str(key) for key in d]
    width = max([min_width, *map(len, labels)])
    for label, value in zip(labels, d.values()):
        print(f"{label:<{width}}: {value}")

def show_page(result, page_num=0, char_count=100):
    """Print a summary of one loaded document: count, metadata, content preview.

    The total number of documents is always printed first. If ``result`` is
    empty the function then returns, since there is nothing to preview.

    Args:
        result: Sequence of loaded documents. Each item must expose a
            ``metadata`` mapping and a sliceable ``page_content``.
        page_num: Index into ``result`` of the document to show. Negative
            indices count from the end, as with normal sequence indexing.
        char_count: Maximum number of leading characters of the content
            to print.

    Raises:
        IndexError: If ``page_num`` is out of range for a non-empty ``result``.
    """
    total = len(result)
    print(f"{total} documents returned\n")
    if total == 0:
        # Nothing to preview; indexing an empty result would raise IndexError.
        return
    if not -total <= page_num < total:
        raise IndexError(
            f"page_num {page_num} is out of range for {total} documents"
        )
    doc = result[page_num]
    pretty_print(doc.metadata)
    print("\nContent:")
    print(doc.page_content[:char_count])


### XLSX    


In [3]:
# Relative path to the sample Excel workbook used by the XLSX loader cell.
xls = "../test_data/test_xlsx.xlsx"

In [4]:
from langchain_markitdown import XlsxLoader

# Load the workbook with page splitting switched on; in the recorded output
# the returned document carries the sheet name ("Sheet1") as page_number.
md = XlsxLoader(
    file_path=xls,
    split_by_page=True,
)
result = md.load()



In [5]:
# Preview the first returned document: metadata plus the start of its content.
show_page(result, page_num=0)

1 documents returned

source              : ../test_data/test_xlsx.xlsx
file_name           : test_xlsx.xlsx
file_size           : 16213
conversion_success  : True
page_number         : Sheet1

Content:
| Unnamed: 0 | First Name | Last Name | Gender | Country | Age | Date | Id |
| --- | --- | --- | ---


### Word

In [6]:
from langchain_markitdown import DocxLoader

# Relative path to the sample Word document.
word = "../test_data/test_docx.docx"

# split_by_page=False: no splitting is requested; the recorded output shows a
# single document with content_type "document_full".
md = DocxLoader(file_path=word, split_by_page=False)
result = md.load()

In [7]:
# Preview the first (and, per the recorded output, only) returned document.
# NOTE(review): the recorded metadata contains
# "metadata_extraction_error: No module named 'exceptions'", so metadata
# extraction appears to have failed during load. The cause is not visible in
# this notebook — investigate the environment before relying on the metadata.
show_page(result, page_num=0)

1 documents returned

source              : ../test_data/test_docx.docx
file_name           : test_docx.docx
file_size           : 327119
conversion_success  : True
metadata_extraction_error: No module named 'exceptions'
content_type        : document_full

Content:
Lorem ipsum

# Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.

Vest


### PPT

In [8]:
from langchain_markitdown import PptxLoader

# Relative path to the sample PowerPoint deck; the LLM captioning cell reuses it.
ppt_file = "../test_data/test_pptx.pptx"

# Load the deck with page splitting switched on; the recorded output labels
# each returned document with content_type "presentation_slide".
md = PptxLoader(
    file_path=ppt_file,
    split_by_page=True,
)
result = md.load()

In [9]:
# Show the document at list index 4 with a generous character limit so the
# whole slide is printed. The index is a position in the returned list, not a
# slide number: the recorded output reports page_number 3 for this entry.
show_page(result, page_num=4, char_count=10000)

15 documents returned

source              : ../test_data/test_pptx.pptx
file_name           : test_pptx.pptx
file_size           : 33213951
conversion_success  : True
slide_count         : 8
author              : Lingineni, Raviteja
title               : feedback@customer.cool
created             : 2018-06-23 03:43:30
modified            : 2025-04-10 19:35:33
last_modified_by    : Nathan Sasto
revision            : 51
image_count         : 12
text_box_count      : 2
chart_count         : 0
table_count         : 0
page_number         : 3
content_type        : presentation_slide

Content:
# Pictures and lines

![Close-up of panellets](Picture2.jpg)

![Person packing yellow suitcase](Picture8.jpg)

![Person planking](Picture12.jpg)

![Wedding by the beach](Picture10.jpg)



### Using an LLM for image captioning

In [10]:
from langchain_openai import ChatOpenAI
from langchain_markitdown import PptxLoader

# Chat model handed to the loader; the log output reports it as the
# "Langchain LLM for image captioning".
llm = ChatOpenAI(model_name="gpt-4o")

# Same deck and splitting as the plain PPTX example, now with the LLM attached.
# verbose=True is what produces the INFO log lines in the recorded output.
loader = PptxLoader(
    file_path=ppt_file,
    llm=llm,
    split_by_page=True,
    verbose=True,
)
documents = loader.load()

# Same list index as the plain PPTX example, so the generated image
# descriptions can be compared with the original alt text.
show_page(documents, page_num=4, char_count=10000)

2025-04-11 11:29:59,122 - langchain_markitdown.base_loader.PptxLoader - INFO - Initialized PptxLoader for ../test_data/test_pptx.pptx
2025-04-11 11:29:59,130 - langchain_markitdown.base_loader.PptxLoader - INFO - Initialized PptxLoader for ../test_data/test_pptx.pptx with split_by_page=True
2025-04-11 11:29:59,131 - langchain_markitdown.base_loader.PptxLoader - INFO - Langchain LLM for image captioning: ChatOpenAI
2025-04-11 11:29:59,133 - langchain_markitdown.base_loader.PptxLoader - INFO - Starting to load PPTX file: ../test_data/test_pptx.pptx
2025-04-11 11:29:59,215 - langchain_markitdown.base_loader.PptxLoader - INFO - Extracting metadata from PPTX file
2025-04-11 11:29:59,216 - langchain_markitdown.base_loader.PptxLoader - INFO - Found 8 slides in the presentation
2025-04-11 11:29:59,302 - langchain_markitdown.base_loader.PptxLoader - INFO - Converting PPTX to markdown
2025-04-11 11:29:59,482 - langchain_markitdown.base_loader.PptxLoader - INFO - Processing images and generating 

15 documents returned

source              : ../test_data/test_pptx.pptx
file_name           : test_pptx.pptx
file_size           : 33213951
conversion_success  : True
slide_count         : 8
author              : Lingineni, Raviteja
title               : feedback@customer.cool
created             : 2018-06-23 03:43:30
modified            : 2025-04-10 19:35:33
last_modified_by    : Nathan Sasto
revision            : 51
image_count         : 12
text_box_count      : 2
chart_count         : 0
table_count         : 0
page_number         : 3
content_type        : presentation_slide

Content:
# Pictures and lines

![A close-up shot of traditional Catalan cookies known as “panellets.” These spherical treats are coated with pine nuts, giving them a distinctive texture. The cookies have a golden-brown hue, suggesting they are freshly baked, while the pine nuts add a nutty aroma and flavor. The warm lighting highlights their appetizing appeal, making them look like an inviting treat typically e