# Text Extraction

In [1]:
from llama_index.readers.file import PDFReader, DocxReader
from llama_index.readers.web import SimpleWebPageReader
import pathlib
from pprint import pprint

In [None]:
# Build one reader per source type up front
pdf_reader = PDFReader()
docx_reader = DocxReader()
web_reader = SimpleWebPageReader()

# Sample PDF report
pdf_documents = pdf_reader.load_data(file=pathlib.Path("./samples/pdf-report.pdf"))

# Sample Word (.docx) report
docx_documents = docx_reader.load_data(file=pathlib.Path("./samples/docx-report.docx"))

# Web page addressed by URL rather than by file path
web_documents = web_reader.load_data(urls=["https://quotes.toscrape.com/"])

In [4]:
# Preview the first 300 characters of the first document from each source,
# separating consecutive sections with a blank line
previews = [
    ("PDF Extraction :", pdf_documents),
    ("Docx Extraction :", docx_documents),
    ("Web Extraction :", web_documents),
]
for position, (heading, documents) in enumerate(previews):
    if position > 0:
        print()
    print(heading)
    pprint(documents[0].text[:300])

PDF Extraction :
('Renewable Energy Market Trends: A 2025 Overview Executive Summary This '
 'report examines the current state of renewable energy markets globally, '
 'highlighting key trends, challenges, and opportunities. In the ﬁrst quarter '
 'of 2025, renewable energy installations have continued their strong growth '
 'trajec')

Docx Extraction :
('Renewable Energy Market Trends: A 2025 Overview\n'
 '\n'
 'Executive Summary\n'
 '\n'
 'This report examines the current state of renewable energy markets globally, '
 'highlighting key trends, challenges, and opportunities. In the first quarter '
 'of 2025, renewable energy installations have continued their strong growth '
 'tra')

Web Extraction :
('<!DOCTYPE html>\n'
 '<html lang="en">\n'
 '<head>\n'
 '\t<meta charset="UTF-8">\n'
 '\t<title>Quotes to Scrape</title>\n'
 '    <link rel="stylesheet" href="/static/bootstrap.min.css">\n'
 '    <link rel="stylesheet" href="/static/main.css">\n'
 '    \n'
 '    \n'
 '</head>\n'
 '<bod

In [5]:
from llama_index.readers.file import CSVReader, MarkdownReader
from llama_index.readers.json import JSONReader
from llama_index.readers.database import DatabaseReader

In [6]:
# Build one reader per structured source type up front
csv_reader = CSVReader()
json_reader = JSONReader()
md_reader = MarkdownReader()
db_reader = DatabaseReader(uri="sqlite:///samples/database.db")

# Sample CSV file
csv_documents = csv_reader.load_data(file=pathlib.Path("./samples/csv-data.csv"))

# Sample JSON file (this reader names its path argument `input_file`)
json_documents = json_reader.load_data(input_file=pathlib.Path("./samples/json-data.json"))

# Sample Markdown file
md_documents = md_reader.load_data(file=pathlib.Path("./samples/README.md"))

# Rows selected from the `orders` table of the SQLite sample database
db_documents = db_reader.load_data(query="SELECT * FROM orders")

In [7]:
# Preview the first 300 characters of the first document from each source.
# Each heading is paired with its own document list so a section cannot
# accidentally show another source's content (the database section used to
# print the Markdown documents).
previews = [
    ("CSV Extraction :", csv_documents),
    ("JSON Extraction :", json_documents),
    ("Markdown Extraction :", md_documents),
    ("Database Extraction :", db_documents),
]
for position, (heading, documents) in enumerate(previews):
    if position > 0:
        print()  # blank line between sections, none after the last one
    print(heading)
    pprint(documents[0].text[:300])

CSV Extraction :
('OrderID, OrderDate, Customer, Product, Quantity, OrderStatus, Misc\n'
 '4321, 1/30/2010, BX30550, ABQ008, 163, Complete, 54\n'
 '4352, 1/15/2010, DY55760, ABQ008, 107, Complete, 36\n'
 '4353, 1/29/2010, BC13961, ABQ016, 110, Complete, 37\n'
 '4317, 3/2/2010, FV41827, EV008, 80, Complete, 27\n'
 '4320, 4/17/2010, WJ72349, EV')

JSON Extraction :
('"OrderID": 4321,\n'
 '"OrderDate": "1/30/2010",\n'
 '"Customer": "BX30550",\n'
 '"Product": "ABQ008",\n'
 '"Quantity": 163,\n'
 '"OrderStatus": "Complete",\n'
 '"Misc": 54\n'
 '"OrderID": 4352,\n'
 '"OrderDate": "1/15/2010",\n'
 '"Customer": "DY55760",\n'
 '"Product": "ABQ008",\n'
 '"Quantity": 107,\n'
 '"OrderStatus": "Complete",\n'
 '"Misc": 36\n'
 '"OrderID": 4353,')

Markdown Extraction :
('\n'
 '\n'
 'Sample Data Files Files Overview\n'
 'This directory contains sample data files in various formats for testing and '
 'demonstration purposes.\n'
 '- **csv-data.csv**: Tabular data in CSV format\n'
 '- **json-data.json**: 

In [8]:
import re
from llama_index.core.schema import Document

In [9]:
# Get the raw text of the first document returned by the PDF reader
raw_text = pdf_documents[0].text

def clean_text(text):
    """Normalize extracted text.

    Steps, in order:
      1. Repair known OCR misreads.
      2. Collapse every run of whitespace (spaces, tabs, newlines) into a
         single space.
      3. Drop every character that is not a word character, whitespace, or
         one of the punctuation marks . , ; : - ( ) [ ] { } " '
      4. Strip leading and trailing whitespace.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # Fix common OCR errors (example: "k" misread as "l<").
    # This must run before the special-character filter below, because that
    # filter removes "<" and the misread pattern would no longer be found.
    text = text.replace("l<eyword", "keyword")

    # Collapse whitespace runs into a single space
    text = re.sub(r"\s+", " ", text)

    # Remove special characters but keep basic punctuation and brackets
    text = re.sub(r"[^\w\s\.\,\;\:\-\(\)\[\]\{\}\"\'\n\t]", "", text)

    return text.strip()

# Clean the raw PDF text with the function defined above
cleaned_text = clean_text(raw_text)

In [11]:
# Compare the text size before and after cleaning
print(
    f"Original length : {len(raw_text)} characters",
    f"Cleaned length : {len(cleaned_text)} characters",
    sep="\n",
)

Original length : 2180 characters
Cleaned length : 2173 characters


In [13]:
# Function to extract basic metadata
def extract_metadata(text, filename):
    """Build a small metadata dictionary for a document.

    Args:
        text (str): Extracted document text.
        filename (str or os.PathLike): Path the text was read from.

    Returns:
        dict: Always contains
            "source"    - the path as a string,
            "file_type" - the extension without its leading dot, or "" when
                          the file name has no extension.
        Optionally contains
            "title" - the first non-blank line, when shorter than 100 characters,
            "date"  - the first substring shaped like d/m/y, d-m-y or d.m.y.
    """
    source = str(filename)
    metadata = {
        "source": source,
        # Take the extension from the final path component only, so dots in
        # directory names or a missing extension cannot leak path fragments
        # into the file type.
        "file_type": pathlib.PurePath(source).suffix.lstrip('.'),
    }

    # Extract title: the first non-blank line is assumed to be the title.
    # Leading blank lines are skipped so they do not produce an empty title.
    title = next(
        (line.strip() for line in text.split('\n') if line.strip()),
        "",
    )
    if title and len(title) < 100:
        metadata["title"] = title

    # Try to extract a date with a regex (simple example)
    date_match = re.search(r'\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4}', text)
    if date_match:
        metadata["date"] = date_match.group(0)

    return metadata

In [14]:
# Extract metadata from the raw text of the PDF document
metadata = extract_metadata(raw_text, "./samples/pdf-report.pdf")
# Bare expression: the notebook displays its value as the cell result
f"metadata : {metadata}"

"metadata : {'source': './samples/pdf-report.pdf', 'file_type': 'pdf'}"

In [15]:
# Function to combine all steps
def process_document(file_path):
    """Read a document, clean its text and attach basic metadata.

    The reader is chosen from the file extension:
      * ``pdf``          -> PDFReader
      * ``docx``/``doc`` -> DocxReader
      * ``html``/``htm`` -> SimpleWebPageReader, only when ``file_path`` is an
        http(s) URL, because that reader is addressed by URL
      * anything else (including local HTML files) -> read as plain UTF-8 text

    Args:
        file_path (str or os.PathLike): Path of the file to process, or an
            http(s) URL ending in ``.html``/``.htm``.

    Returns:
        Document or None: For reader-backed types, a Document holding the
        cleaned text of everything the reader returned plus the extracted
        metadata, or None when the reader returned nothing. For the plain-text
        fallback, a Document with the raw file content and only a "source"
        metadata entry.
    """
    source = str(file_path)

    # Determine file type from the final path component's extension
    file_type = pathlib.PurePath(source).suffix.lstrip('.').lower()

    # Select appropriate reader and load the documents
    if file_type == 'pdf':
        # File readers are given a Path object, as elsewhere in this notebook
        docs = PDFReader().load_data(file=pathlib.Path(source))
    elif file_type in ('docx', 'doc'):
        # NOTE(review): legacy binary .doc files may not be readable by
        # DocxReader - confirm before relying on this branch for them.
        docs = DocxReader().load_data(file=pathlib.Path(source))
    elif file_type in ('html', 'htm') and source.startswith(('http://', 'https://')):
        # SimpleWebPageReader takes a list of URLs, not a `file` argument
        docs = SimpleWebPageReader().load_data(urls=[source])
    else:
        # Default to simple text reading. The explicit encoding keeps the
        # result independent of the platform's default encoding.
        with open(source, 'r', encoding='utf-8') as f:
            return Document(text=f.read(), metadata={"source": source})

    if not docs:
        return None

    # A reader may return several documents (presumably one per PDF page -
    # TODO confirm); keep all of them instead of only the first.
    full_text = "\n".join(doc.text for doc in docs)

    # Clean the text
    cleaned_text = clean_text(full_text)

    # Extract metadata from the uncleaned text so line breaks are still
    # available for title detection
    metadata = extract_metadata(full_text, source)

    # Create processed document
    return Document(text=cleaned_text, metadata=metadata)

In [16]:
# Run the combined pipeline on the sample PDF and report what came back
new_document = process_document("./samples/pdf-report.pdf")
print(
    f"Length : {len(new_document.text)}",
    f"Metadata : {new_document.metadata}",
    sep="\n",
)



Length : 2173
Metadata : {'source': './samples/pdf-report.pdf', 'file_type': 'pdf'}
