# 📄 Demo: Authoring + ETL Pipeline

This notebook demonstrates a basic ETL pipeline that:

- Loads Markdown (`*.md`) files from the `content/` folder (`../content`, relative to where the notebook runs)
- Parses YAML front matter and section headings
- Breaks content into heading-based chunks (one per `##` section, plus any text before the first heading as "Intro")
- Structures the output for storage, publishing, or graph modeling


In [None]:
# 📦 Imports
import os
import yaml
import re
from pathlib import Path
from typing import List, Dict

# Folder that is scanned for Markdown sources. The path is relative, so it is
# resolved against the current working directory at run time.
CONTENT_DIR = Path("../content")

In [None]:
# 🛠 Utility: Split YAML front matter and Markdown body
# Front matter layout: an opening `---` line at the very start of the text (an
# optional BOM character is tolerated), the YAML payload, and a closing `---`
# line. The closing line may be the last line of the text, the payload may be
# empty, and both LF and CRLF line endings are accepted.
_FRONT_MATTER_RE = re.compile(
    r'\A\ufeff?---\r?\n(.*?)(?:\r?\n)?^---(?:\r?\n|\Z)(.*)',
    re.DOTALL | re.MULTILINE,
)


def split_front_matter(md_text: str):
    """Split a Markdown document into its YAML front matter and its body.

    Args:
        md_text: Full text of a Markdown document.

    Returns:
        A ``(meta, body)`` tuple. ``meta`` is the value parsed from the front
        matter block by ``yaml.safe_load``, or ``{}`` when the document has no
        front matter or the block carries no data. ``body`` is everything
        after the closing ``---`` line, or ``md_text`` unchanged when no front
        matter block is found.

    Raises:
        Whatever ``yaml.safe_load`` raises for malformed YAML; the error is
        not caught here.
    """
    match = _FRONT_MATTER_RE.match(md_text)
    if match is None:
        return {}, md_text

    raw_meta, body = match.groups()
    # A blank block has nothing to parse, so the YAML parser is skipped.
    meta = yaml.safe_load(raw_meta) if raw_meta.strip() else None
    # safe_load also yields None for a block without data (e.g. comments
    # only); normalise that to an empty mapping so callers always get a value
    # they can index into.
    if meta is None:
        meta = {}
    return meta, body

In [None]:
# 🛠 Utility: Break content into heading-based chunks
# Zero-width split point in front of every line that starts with "##" followed
# by whitespace (level-2 headings; "###" and deeper do not match).
_H2_BOUNDARY_RE = re.compile(r'(?=^##\s)', re.MULTILINE)
# Captures the heading text from the first line of a section.
_H2_TITLE_RE = re.compile(r'^##\s+(.*)')


def chunk_markdown_by_heading(body: str) -> List[Dict]:
    """Split a Markdown body into one chunk per level-2 (``##``) heading.

    Text that precedes the first ``##`` heading becomes a chunk whose heading
    is ``"Intro"``. Sections that are empty after stripping whitespace are
    skipped, so a body that starts with a heading (or is empty) does not
    produce a blank "Intro" chunk.

    NOTE: the split is purely line-based; a line starting with ``## `` inside
    a fenced code block is treated as a heading as well.

    Args:
        body: Markdown text without front matter.

    Returns:
        A list of dicts with the keys ``"heading"`` (heading text, stripped)
        and ``"content"`` (the stripped section text, including its heading
        line), in document order.
    """
    chunks = []
    for section in _H2_BOUNDARY_RE.split(body):
        text = section.strip()
        if not text:
            # Splitting on a zero-width boundary yields an empty leading piece
            # when the body begins with a heading; nothing to record for it.
            continue
        heading_match = _H2_TITLE_RE.match(text)
        title = heading_match.group(1) if heading_match else "Intro"
        chunks.append({
            "heading": title.strip(),
            "content": text,
        })
    return chunks

In [None]:
# 🔄 Run ETL over all Markdown chapters
def extract_chunks_from_folder(folder: Path) -> List[Dict]:
    """Collect heading-based chunks from every ``*.md`` file directly in *folder*.

    Files are processed in sorted path order. Each file is read as UTF-8,
    split into front matter and body, and the body is chunked by heading.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A flat list of chunk dicts. Each chunk is tagged with
        ``"source_file"`` (the file name) and ``"metadata"`` (the parsed
        front matter). All chunks from one file reference the same metadata
        object, not copies of it.
    """
    collected: List[Dict] = []
    for md_path in sorted(folder.glob("*.md")):
        front_matter, markdown_body = split_front_matter(
            md_path.read_text(encoding="utf-8")
        )
        for piece in chunk_markdown_by_heading(markdown_body):
            piece["source_file"] = md_path.name
            piece["metadata"] = front_matter
            collected.append(piece)
    return collected

In [None]:
# 🚀 Extract!
# Run the pipeline over CONTENT_DIR. `chunks` stays in the notebook namespace
# and is reused by the preview cell that builds the DataFrame.
chunks = extract_chunks_from_folder(CONTENT_DIR)
# Report how many chunks were produced and which folder they came from.
print(f"Extracted {len(chunks)} content chunks from {CONTENT_DIR}")

In [None]:
# 🔍 Preview the first few chunks
import pandas as pd

# Name the columns explicitly (in the order the chunk dicts are built) so the
# frame has them even when no chunks were extracted; without this, an empty
# `chunks` list gives a column-less frame and the selection below raises
# KeyError.
df = pd.DataFrame(
    chunks,
    columns=["heading", "content", "source_file", "metadata"],
)
# Last expression of the cell: the notebook renders the first five rows.
df[["source_file", "heading", "content"]].head(5)

## ✅ Next Steps

- Validate structured data using JSON Schema
- Store chunks in SQLite, Postgres, or a document store
- Map content to a graph using `py2neo` and Schema.org types
- Generate summaries and embeddings for AI retrieval
