In [15]:
import os
import book_parser
import json

# SECURITY: the API key must never be stored in the notebook. Use the value
# already present in the environment, otherwise prompt for it without echo.
# NOTE(review): a key was previously hard-coded in this cell; treat that key
# as leaked and rotate it.
if not os.environ.get("UNSTRUCTURED_API_KEY"):
    from getpass import getpass

    os.environ["UNSTRUCTURED_API_KEY"] = getpass("Unstructured API key: ")

# Endpoint passed to partition() as partition_endpoint.
os.environ["UNSTRUCTURED_API_URL"] = "https://api.unstructuredapp.io/general/v0/general"


In [17]:
from unstructured.partition.auto import partition

# Document to parse. Other inputs tried previously:
# pdf_path = "./.dev/output/asyncio.pdf"
# pdf_path="./.dev/data/asyncio/asyncio_clean.pdf",
# pdf_path="./.dev/output/asyncio.md",
pdf_path = "./.dev/data/portfolio/portfolio.pdf"

# Element types that are removed before elements are assigned to chapters.
excluded_types = ["Footer", "PageBreak"]

# Partition the document through the API configured in the environment.
raw_elements = partition(
    filename=pdf_path,
    api_key=os.environ["UNSTRUCTURED_API_KEY"],
    partition_endpoint=os.environ["UNSTRUCTURED_API_URL"],
    partition_by_api=True,
    # include_page_breaks=True,
    include_metadata=True,
)
book_elements = [
    raw_element
    for raw_element in raw_elements
    if raw_element.to_dict()["type"] not in excluded_types
]

# Document-level information comes from book_parser, read from the same file.
metadata = book_parser.extract_pdf_metadata(pdf_path)
sections = book_parser.get_section_hierarchy(pdf_path)
chapters = book_parser.extract_chapters(sections)


def _with_partitioned_elements(chapter):
    """Attach the chapter's elements, then return the partitioned chapter."""
    chapter.elements = book_parser.get_elements_for_chapter(book_elements, chapter)
    return book_parser.partition_elements(chapter)


new_chapters = [_with_partitioned_elements(chapter) for chapter in chapters]

book = book_parser.Book(metadata=metadata, sections=new_chapters)


In [None]:
# Cell output is whatever book.toc() returns or prints
# (presumably the table of contents; confirm in book_parser).
book.toc()

In [6]:
# Dump the Book model; book_dict is what the later cells index and export to JSON.
book_dict = book.model_dump()
# NOTE(review): the return value is not assigned (it is only shown as the cell
# output), so this presumably converts nested values of book_dict in place.
# Confirm in book_parser.
book_parser.recur_to_dict(book_dict)

In [None]:
# Inspect the elements of the first top-level section.
book_dict['sections'][0]['elements']

In [7]:
# Persist the book dictionary as indented JSON in the working directory.
with open("book_dict.json", "w") as json_file:
    json_file.write(json.dumps(book_dict, indent=2))


## Chunks

**TODO:** Chunks need to store their metadata and references to the original content.

In [None]:
# book_parser.view_chunks()
# Text of every element in the first section, separated by blank lines.
first_section_elements = book_dict['sections'][0]['elements']
book_str = "\n\n".join(entry['text'] for entry in first_section_elements)
print(book_str)

In [None]:
## generate content from the book
from lumos import lumos
from pydantic import BaseModel

# Response schema for the summary request below (passed as response_format).
class BookContent(BaseModel):
    content: str


summary_messages = [
    {"role": "system", "content": "You are a helpful assistant that can generate content from a book."},
    {"role": "user", "content": f"Generate a summary of the content:\n {book_str}."},
]

# Last expression of the cell, so the call's return value is the cell output.
lumos.call_ai(
    messages=summary_messages,
    model="gpt-4o-mini",
    response_format=BookContent,
)


In [8]:
def get_leaf_sections(section: dict) -> list[tuple[str, str]]:
    """Collect ``(title, text)`` pairs for every leaf section under ``section``.

    A leaf is a section with a non-empty ``elements`` list and no (or an
    empty) ``subsections`` list. The ``text`` of a leaf's elements is joined
    with blank lines. A section that has subsections contributes only through
    its descendants; its own elements are not included.

    Args:
        section: Section dictionary. ``elements`` and ``subsections`` are
            optional; ``title`` is read only for leaf sections.

    Returns:
        ``(title, joined_text)`` tuples in depth-first order. Empty when the
        section has neither elements nor subsections.
    """
    subsections = section.get('subsections')

    if not subsections:
        elements = section.get('elements')
        if not elements:
            # Neither content nor children: nothing to report.
            return []
        joined_text = "\n\n".join(element['text'] for element in elements)
        return [(section['title'], joined_text)]

    # Non-leaf: descend into every subsection, keeping their order.
    results = []
    for subsection in subsections:
        results.extend(get_leaf_sections(subsection))
    return results

# Flatten the leaf sections of every root section into a single list,
# keeping the order in which the root sections appear.
leaf_sections = [
    leaf
    for root_section in book_dict['sections']
    for leaf in get_leaf_sections(root_section)
]


In [9]:
from lumos import lumos
from pydantic import BaseModel, Field

# Structured response used as response_format in get_lesson_content.
# The Field descriptions are runtime strings on the model; edit them deliberately.
class LessonContent(BaseModel):
    # Short (one or two line) description of the section.
    description: str = Field(..., description="One or two line description of the content. Get the most information across.")
    # Concise summary of the section.
    summary: str = Field(..., description="A concise summary of the content. Be to the point and concise.")

async def get_lesson_content(title, content):
    """Request a structured description and summary for one book section.

    Args:
        title: Section title, inserted inside the ``<Title>`` tag of the prompt.
        content: Section text, inserted inside the ``<Content>`` tag.

    Returns:
        The value returned by ``lumos.call_ai_async`` when called with
        ``response_format=LessonContent``.
    """
    system_prompt = "You are a helpful assistant that will help me creating insightful lessons and summaries for technical books. You will be provided with a section of a book and you will need to generate a summary of the content. Be concise and to the point. "

    # Spelled out line by line so the exact whitespace sent to the model is visible.
    prompt_template = (
        "Generate a summary of the content: \n"
        "    <Title>\n"
        "    {title}\n"
        "    </Title>\n"
        "\n"
        "    <Content>\n"
        "    {content}\n"
        "    </Content>"
    )
    user_prompt = prompt_template.format(title=title, content=content)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    return await lumos.call_ai_async(
        messages=conversation,
        model="gpt-4o",
        response_format=LessonContent,
    )

In [None]:
from rich.console import Console
from rich.panel import Panel
import nest_asyncio
import asyncio

# Lets asyncio.run() be used while the notebook's own event loop is running.
nest_asyncio.apply()
console = Console()


async def _gather_lessons():
    """Summarise every leaf section concurrently; results follow leaf_sections order."""
    return await asyncio.gather(
        *(get_lesson_content(title, content) for title, content in leaf_sections)
    )


# NOTE: this issues one model call per leaf section.
# Stored under its own name so it is not mixed up with `results`, which the
# quiz cells iterate as quiz objects.
lesson_results = asyncio.run(_gather_lessons())

for (title, content), lesson in zip(leaf_sections, lesson_results):
    console.print()
    console.print(Panel(
        f"[bold magenta]{title}[/bold magenta]\n\n"
        f"[yellow]Description:[/yellow] {lesson.description}\n\n"
        f"[green]Summary:[/green] {lesson.summary}",
        expand=True,
    ))



In [20]:
## quiz gen

In [21]:
from datetime import datetime
from enum import Enum
from typing import Any, Optional
from uuid import UUID

from pydantic import BaseModel, Field

# Difficulty labels for quiz questions. Mixing in `str` makes every member a
# string equal to its value (e.g. DifficultyLevel.EASY == "Easy").
class DifficultyLevel(str, Enum):
    EASY = "Easy"
    INTERMEDIATE = "Intermediate"
    ADVANCED = "Advanced"


# One multiple-choice question: the correct answer, the wrong answers, and a
# difficulty label. Documented with comments only, so nothing is added to the
# model definition itself.
class QuizQuestionBase(BaseModel):
    question: str
    correct_answer: str
    incorrect_answers: list[str] = Field(
        ..., description="Must be a strictly a list of 3 items. "
    )
    # May be missing or None: the annotation now matches the None default, and
    # the display code checks this field for truthiness before printing it.
    explanation: Optional[str] = None
    difficulty: DifficultyLevel
    
# Wrapper holding a list of questions; used as response_format in get_quiz_content.
class QuizItems(BaseModel):
    items: list[QuizQuestionBase]

async def get_quiz_content(title, content):
    """Request quiz questions for one book section.

    Args:
        title: Section title, inserted inside the ``<Title>`` tag of the prompt.
        content: Section text, inserted inside the ``<Content>`` tag.

    Returns:
        The value returned by ``lumos.call_ai_async`` when called with
        ``response_format=QuizItems``.
    """
    # Written as explicit lines so the whitespace sent to the model is visible.
    user_prompt = "\n".join([
        "Generate quiz questions based on the content: ",
        "    <Title>",
        "    {title}",
        "    </Title>",
        "",
        "    <Content>",
        "    {content}",
        "    </Content>",
    ]).format(title=title, content=content)

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that will help me creating insightful quizzes for technical books. You will be provided with a section of a book and you will need to generate a quiz based on the content. The quizzes must not be trivial, generate quizzes that are challenging and must require some thinking. It is not neccesary to generate a quiz, do so only if the content is worthy enough to generate a quiz. We do not want to generate low quality questions from chapters. Since, we are generating quizzes for all chapters, we want only the best.",
    }
    user_message = {"role": "user", "content": user_prompt}

    quiz = await lumos.call_ai_async(
        messages=[system_message, user_message],
        model="gpt-4o",
        response_format=QuizItems,
    )
    return quiz

In [None]:
# NOTE(review): no uncommented assignment to `results` is visible in the cells
# above (only commented-out ones), so this cell relies on state left in the
# kernel by an earlier run.
results

In [None]:
from rich.console import Console
from rich.panel import Panel
import nest_asyncio
import asyncio

# Lets asyncio.run() be used while the notebook's own event loop is running.
nest_asyncio.apply()
console = Console()


async def _gather_quizzes():
    """Generate the quiz for every leaf section concurrently; results follow leaf_sections order."""
    return await asyncio.gather(
        *(get_quiz_content(title, content) for title, content in leaf_sections)
    )


# NOTE: this issues one model call per leaf section.
# The name `results` is kept because the following cells read it.
results = asyncio.run(_gather_quizzes())

for (title, content), quiz in zip(leaf_sections, results):
    console.print()
    quiz_parts = []
    for i, item in enumerate(quiz.items, 1):
        quiz_parts.append(f"\nQuestion {i}:\n{item.question}\n")
        quiz_parts.append(f"✓ Correct: {item.correct_answer}\n")
        quiz_parts.append("✗ Incorrect:\n")
        for wrong in item.incorrect_answers:
            quiz_parts.append(f"  • {wrong}\n")
        # explanation is optional; skip the block when it is missing or empty.
        if item.explanation:
            quiz_parts.append(f"\nExplanation: {item.explanation}\n")
        quiz_parts.append(f"Difficulty: {item.difficulty.value}\n")
    quiz_text = "".join(quiz_parts)

    console.print(Panel(
        f"[bold magenta]{title}[/bold magenta]\n\n"
        f"[yellow]Quiz Questions:[/yellow]{quiz_text}",
        expand=True
    ))



In [None]:
# Flatten every quiz into one list of plain dicts, one per question.
res = [
    question.model_dump()
    for quiz_result in results
    for question in quiz_result.items
]

res

In [None]:
# Display `res` again (the list that the previous cell builds and displays).
res