# Why Metadata Matters
- It provides context about documents (author, date, source, topic)
- It enables filtering and sorting (find all documents from a specific date range)
- It improves search relevance (identifying which documents are most important)
- It helps with organization (grouping related documents together)

In [1]:
import os
import re
from datetime import datetime

from llama_index.core import Document
from llama_index.core.node_parser import SimpleFileNodeParser

In [2]:
# Markdown text of a small sample annual report, used as input below.
sample_doc = """
# Annual Report 2023
## Financial Performance
Our company achieved record profits in 2023, with revenue increasing 15% compared to 2022.

## Product Launches
The new X1000 product line was launched in March 2023 and has exceeded sales expectations.

## Future Outlook
We expect continued growth in 2024, driven by expansion into European markets.
"""

# File name and path that describe the sample text; this cell only builds
# the strings and does not touch the filesystem.
filename = "annual_report_2023.md"
file_path = "/documents/" + filename

In [3]:
# Extract basic metadata from filename and content
def extract_basic_metadata(content, filename, file_path):
    """Extract basic metadata from document content and file information.

    Args:
        content: Full text of the document.
        filename: Name of the file, including its extension.
        file_path: Path of the file; stored unchanged.

    Returns:
        A dict with the keys ``filename``, ``file_path``, ``file_type``
        (extension without the leading dot, ``""`` when there is none),
        ``file_size`` (number of characters in ``content``, not bytes on
        disk) and ``extracted_date`` (current local date as ``DD-MM-YYYY``).
        ``year`` (a four-digit string) and ``document_type`` are added only
        when they can be detected.
    """
    # Get file information
    file_metadata = {
        "filename": filename,
        "file_path": file_path,
        "file_type": os.path.splitext(filename)[1][1:],
        "file_size": len(content),
        "extracted_date": datetime.now().strftime("%d-%m-%Y"),
    }

    # Year: the first standalone 19xx/20xx number in the content, falling
    # back to the filename. Digit lookarounds are used instead of \b because
    # "_" is a word character, so \b would miss "annual_report_2023.md".
    year_pattern = r"(?<!\d)(?:19|20)\d{2}(?!\d)"
    year_match = re.search(year_pattern, content) or re.search(
        year_pattern, filename
    )
    if year_match:
        file_metadata["year"] = year_match.group()

    # Document type: filenames usually separate words with "_", "-" or ".",
    # so collapse those to spaces before looking for the phrase.
    normalized_filename = re.sub(r"[\W_]+", " ", filename.lower())
    if "annual report" in content.lower() or "annual report" in normalized_filename:
        file_metadata["document_type"] = "annual_report"

    return file_metadata

In [4]:
# Run the extractor on the sample report and keep the resulting dict
metadata = extract_basic_metadata(
    content=sample_doc,
    filename=filename,
    file_path=file_path,
)

In [5]:
# Create document with metadata: wrap the sample text in a llama_index
# Document and attach the extracted metadata dict to it.
document = Document(text=sample_doc, metadata=metadata)

In [6]:
# Print a short preview of the text, then one line per metadata entry
preview = document.text[:50]
print(f"Document Text (first 50 chars) : {preview}")
print()
print("Document Metadata :")
for field in document.metadata:
    print(f"- {field} : {document.metadata[field]}")

Document Text (first 50 chars) : 
# Annual Report 2023
## Financial Performance
Our

Document Metadata :
- filename : annual_report_2023.md
- file_path : /documents/annual_report_2023.md
- file_type : md
- file_size : 348
- extracted_date : 03-12-2025
- year : 2023
- document_type : annual_report


In [7]:
# Create nodes with the metadata: build a SimpleFileNodeParser with its
# default settings and turn the single document into a list of nodes.
# NOTE(review): the recorded run below yields one node for the whole text
# rather than one per markdown section. Presumably the parser picks a
# format-specific splitter from a metadata key that this document does not
# set (it carries "file_type") — confirm against the llama_index docs.
parser = SimpleFileNodeParser.from_defaults()
nodes = parser.get_nodes_from_documents([document])

In [8]:
# Display the metadata dict carried by the first node
nodes[0].metadata

{'filename': 'annual_report_2023.md',
 'file_path': '/documents/annual_report_2023.md',
 'file_type': 'md',
 'file_size': 348,
 'extracted_date': '03-12-2025',
 'year': '2023',
 'document_type': 'annual_report'}

In [9]:
# The first node's metadata shows the document metadata propagated to it
node_count = len(nodes)
print(f"Nodes created : {node_count}")
print("Node metadata :")
first_node_metadata = nodes[0].metadata
for field in first_node_metadata:
    print(f"-{field} : {first_node_metadata[field]}")

Nodes created : 1
Node metadata :
-filename : annual_report_2023.md
-file_path : /documents/annual_report_2023.md
-file_type : md
-file_size : 348
-extracted_date : 03-12-2025
-year : 2023
-document_type : annual_report
