In [1]:
import json
from dotenv import load_dotenv


# Load variables from a local .env file into the process environment.
# override=True lets values from .env replace variables that are already set
# (python-dotenv leaves existing variables untouched by default).
# NOTE(review): presumably this supplies the OpenAI credentials needed by the
# ChatOpenAI cell further down — confirm.
load_dotenv(override=True)

True

In [2]:
def pretty_print(d):
    """Print every entry of the mapping ``d`` as one ``key: value`` line.

    Keys are left-aligned and padded to 20 characters so the colons line up;
    keys longer than 20 characters are printed in full (no truncation).
    """
    # Lazy generator: each line is formatted right before it is printed.
    rows = (f"{format(name, '<20')}: {val}" for name, val in d.items())
    for row in rows:
        print(row)

def show_page(result, page_num=0, char_count=100):
    """Print a short preview of one document from a loader result.

    Args:
        result: Sequence of document objects exposing ``.metadata`` (a mapping)
            and ``.page_content`` (a string).
        page_num: Index of the document to preview. Negative indices count
            from the end, as with normal sequence indexing.
        char_count: Maximum number of content characters to print.

    Raises:
        IndexError: If ``result`` is non-empty and ``page_num`` is outside
            its valid index range.

    The number of documents is always printed first. When ``result`` is empty
    there is nothing to preview, so the function returns right after that line
    instead of failing with a bare ``IndexError``.
    """
    total = len(result)
    print(f"{total} documents returned\n")

    if total == 0:
        # Nothing to index into; the count line above already says so.
        return

    if not -total <= page_num < total:
        # Same exception type as plain indexing, but with an actionable message.
        raise IndexError(
            f"page_num {page_num} is out of range for {total} documents"
        )

    doc = result[page_num]
    pretty_print(doc.metadata)
    print("\nContent:")
    print(doc.page_content[:char_count])


### XLSX    


In [3]:
# Relative path to the sample Excel workbook used by the XLSX loader cell below.
xls= "../test_data/test_xlsx.xlsx"

In [4]:
from langchain_markitdown import XlsxLoader
# split_by_page=True: in the run recorded below this returned one Document
# whose page_number metadata is the sheet name ("Sheet1").
# NOTE(review): presumably one Document per worksheet — confirm in the loader docs.
md = XlsxLoader(file_path=xls, split_by_page=True) 
# `result` is previewed with show_page in the next cell.
result = md.load()



In [5]:
# Preview the first document (show_page defaults: page_num=0, first 100 characters).
show_page(result)

1 documents returned

source              : ../test_data/test_xlsx.xlsx
file_name           : test_xlsx.xlsx
file_size           : 16213
conversion_success  : True
page_number         : Sheet1

Content:
| Unnamed: 0 | First Name | Last Name | Gender | Country | Age | Date | Id |
| --- | --- | --- | ---


### Word

In [6]:
# Relative path to the sample Word document.
word = "../test_data/test_docx.docx"
from langchain_markitdown import DocxLoader
# split_by_page=False: in the run recorded below the whole file came back as a
# single Document with content_type "document_full".
md = DocxLoader(file_path=word, split_by_page=False)
result = md.load()

In [7]:
# Preview the first document (show_page defaults: page_num=0, first 100 characters).
show_page(result)

1 documents returned

source              : ../test_data/test_docx.docx
file_name           : test_docx.docx
file_size           : 327119
conversion_success  : True
metadata_extraction_error: No module named 'exceptions'
content_type        : document_full

Content:
Lorem ipsum

# Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio.

Vest


### PPT

In [8]:
# Relative path to the sample PowerPoint deck; also reused by the "Using LLM" cell.
ppt_file = "../test_data/test_pptx.pptx"
from langchain_markitdown import PptxLoader
# split_by_page=True: each returned Document carries a page_number and
# content_type "presentation_slide" in the run recorded below.
md = PptxLoader(file_path=ppt_file, split_by_page=True)
result = md.load()

In [9]:
# Preview the first document (show_page defaults: page_num=0, first 100 characters).
# NOTE(review): the recorded run returned 18 Documents although slide_count is 8,
# so the split is evidently finer than one Document per slide — confirm the rule.
show_page(result)

18 documents returned

source              : ../test_data/test_pptx.pptx
file_name           : test_pptx.pptx
file_size           : 4252311
conversion_success  : True
slide_count         : 8
author              : Lingineni, Raviteja
title               : feedback@customer.cool
created             : 2018-06-23 03:43:30
modified            : 2018-11-23 21:16:55
last_modified_by    : Lingineni, Raviteja
revision            : 50
image_count         : 8
text_box_count      : 2
chart_count         : 0
table_count         : 0
page_number         : 1
content_type        : presentation_slide

Content:
<!-- Slide number: 1 -->


### Using an LLM (image captioning)

In [10]:
from langchain_openai import ChatOpenAI
from langchain_markitdown import PptxLoader

# With a Langchain model
# NOTE(review): ChatOpenAI presumably picks up its API key from the environment
# populated by load_dotenv in the first cell — confirm.
llm = ChatOpenAI(model_name="gpt-4o")
# Reuses `ppt_file` from the PPT section above; run that cell first.
# In the recorded run the loader logged "Error during LLM captioning" for images
# in a format the API rejected, yet load() still returned documents (18 in the next cell).
loader = PptxLoader(file_path=ppt_file, llm=llm, split_by_page=True, verbose=True)
documents = loader.load()


Error during LLM captioning: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image has of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_format'}}
Error during LLM captioning: Error code: 400 - {'error': {'message': "You uploaded an unsupported image. Please make sure your image has of one the following formats: ['png', 'jpeg', 'gif', 'webp'].", 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_image_format'}}


In [11]:
# Preview the document at index 4, printing up to 10000 characters of its content.
show_page(documents,4, 10000)

18 documents returned

Header 1            : Group [Not supported]
source              : ../test_data/test_pptx.pptx
file_name           : test_pptx.pptx
file_size           : 4252311
conversion_success  : True
slide_count         : 8
author              : Lingineni, Raviteja
title               : feedback@customer.cool
created             : 2018-06-23 03:43:30
modified            : 2018-11-23 21:16:55
last_modified_by    : Lingineni, Raviteja
revision            : 50
image_count         : 8
text_box_count      : 2
chart_count         : 0
table_count         : 0
page_number         : 4
content_type        : presentation_slide

Content:
![Laptop](Graphic7.jpg)
