In [1]:
import asyncio
from collections import defaultdict
from openai import AsyncOpenAI

# Upper bound on chapter-summary requests that may be in flight at once.
MAX_PARALLEL_REQUESTS = 4

# Shared async client used by the summarizer coroutines in this notebook.
client = AsyncOpenAI()
# Acquired around each chapter request to enforce MAX_PARALLEL_REQUESTS.
semaphore = asyncio.Semaphore(MAX_PARALLEL_REQUESTS)


# --------------------------
# Summarizer Functions
# --------------------------

async def summarize_chapter(text, sid, text_id):
    """Summarize a single chapter with the chat-completions API.

    The request is sent while holding the module-level ``semaphore``, which
    bounds how many chapter requests run at the same time.

    Args:
        text: Chapter text that is embedded verbatim in the prompt.
        sid: Chapter/section identifier; echoed back as ``chapter_id``.
        text_id: Identifier of the book; echoed back as ``text_id``.

    Returns:
        dict with keys ``text_id``, ``chapter_id`` and ``summary``. The
        summary is whitespace-stripped and is an empty string when the
        response carries no message content.
    """
    async with semaphore:
        prompt = f"""
        Summarize this chapter from the book into 1–2 concise paragraphs.
        Capture key events, themes, and character actions.
        Avoid bullet points. Do not mention chunking.

        Chapter text:
        {text}
        """
        resp = await client.chat.completions.create(
            model="gpt-4o-mini",  # fast + cheap
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3,
        )

    # The message content can be None; str(None) would store the literal
    # text "None" as the summary, so fall back to an empty string instead.
    content = resp.choices[0].message.content
    return {
        "text_id": text_id,
        "chapter_id": sid,
        "summary": (content or "").strip(),
    }


async def summarize_book(chapter_summaries, text_id):
    """Synthesize a whole-book summary from per-chapter summaries.

    Args:
        chapter_summaries: Iterable of dicts with at least the keys
            ``chapter_id`` and ``summary``; they are listed in the prompt in
            the order given.
        text_id: Identifier of the book; echoed back as ``book_id``.

    Returns:
        dict with keys ``book_id`` and ``summary``. The summary is
        whitespace-stripped and is an empty string when the response carries
        no message content.
    """
    joined = "\n\n".join(
        f"Chapter {c['chapter_id']}: {c['summary']}" for c in chapter_summaries
    )
    prompt = f"""
    Here are summaries of each chapter of a book.
    Write a single cohesive overall summary of the book in 2–3 paragraphs. 
    Do NOT enumerate chapter by chapter. Instead, merge into one flowing narrative. 
    Focus on major themes, central characters, and the overall arc.

    Chapter summaries:
    {joined}
    """
    resp = await client.chat.completions.create(
        model="gpt-4o",  # can use bigger model for better synthesis
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
    )

    # The message content can be None; calling .strip() on it directly would
    # raise AttributeError, so normalise to an empty string first.
    content = resp.choices[0].message.content
    return {
        "book_id": text_id,
        "summary": (content or "").strip(),
    }


async def summarize_hierarchy(chunks, text_id):
    """Summarize a book bottom-up: chunks -> chapter summaries -> book summary.

    Chunks are grouped by the section number embedded in their ``id``, the
    text of each group is joined with newlines, every group is summarized
    concurrently, and the chapter summaries are then merged into one book
    summary.

    Args:
        chunks: Iterable of dicts with keys ``id`` and ``text``. ``id`` is
            expected to look like ``"<text_id>_<sid>_<cid>_<hash>"``.
        text_id: Identifier of the book, passed through to the summarizers.

    Returns:
        Tuple ``(chapter_summaries, book_summary)``: the list of per-chapter
        result dicts (in first-appearance order of the sections) and the
        whole-book result dict.

    Note:
        Chunk texts are concatenated as-is; if the chunks were produced with
        token overlap, the overlapping passages are repeated in the chapter
        text sent to the model.
    """
    # Group chunk texts by section id (sid)
    section_map = defaultdict(list)
    for c in chunks:
        # Split from the right so that a text_id which itself contains "_"
        # cannot shift the position of the sid field. For ids with at most
        # three underscores this is identical to split("_")[1].
        sid = int(c["id"].rsplit("_", 3)[1])
        section_map[sid].append(c["text"])

    # Join chapter text
    chapters = {sid: "\n".join(texts) for sid, texts in section_map.items()}

    # Summarize chapters in parallel; gather() keeps the task order
    tasks = [summarize_chapter(text, sid, text_id) for sid, text in chapters.items()]
    chapter_summaries = await asyncio.gather(*tasks)

    # Summarize entire book
    book_summary = await summarize_book(chapter_summaries, text_id)

    return chapter_summaries, book_summary


In [2]:

import re
import tiktoken
import hashlib

from pathlib import Path


class TextReader:
    """Split a text into regex-delimited sections and token-bounded chunks.

    The class itself never reads ``file_path``; callers (or subclasses) are
    expected to assign the text to ``self.text`` before calling
    ``parse_into_chunks``.

    Args:
        file_path: Location of the source file; stored as a ``Path``.
        source: Label describing where the text comes from; stored as-is.
        text_id: Short identifier used as the prefix of every chunk id.
        split_pattern: Regex that separates sections. Defaults to a line
            consisting of ``CHAPTER <roman numeral>.``.
    """

    def __init__(self, file_path, source, text_id, split_pattern=None) -> None:
        self.file_path = Path(file_path)
        self.source = source
        self.text = ""
        self.enc = tiktoken.get_encoding('cl100k_base')
        self.pattern = split_pattern or r"^(?:CHAPTER [IVXLCDM]+\.)\s*\n"
        self.text_id = text_id

    @staticmethod
    def simple_hash(text: str, length: int = 7) -> str:
        """Return the first ``length`` hex digits of the MD5 digest of ``text``.

        Used only as a short content fingerprint inside chunk ids, not for
        anything security related.
        """
        return hashlib.md5(text.encode("utf-8")).hexdigest()[:length]

    def _section_split(self):
        """Split ``self.text`` on ``self.pattern`` (case-insensitive, multiline).

        ``re.split`` returns whatever precedes the first match as element 0,
        so the first section is the text before the first heading (possibly
        an empty string).
        """
        return re.split(self.pattern, self.text, flags=re.IGNORECASE | re.MULTILINE)

    def _chunk_split(self, section: str, max_tokens=1000, overlap=100):
        """Cut ``section`` into windows of at most ``max_tokens`` tokens.

        Consecutive windows share ``overlap`` tokens. An empty section
        yields an empty list.

        Raises:
            ValueError: if ``overlap`` is negative or not smaller than
                ``max_tokens``. Such values would make the window stand
                still, move backwards (silently producing no chunks), or
                skip tokens between windows.
        """
        if not 0 <= overlap < max_tokens:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < max_tokens "
                f"(got overlap={overlap}, max_tokens={max_tokens})"
            )

        tokens = self.enc.encode(section)
        step = max_tokens - overlap
        chunks = []
        for start in range(0, len(tokens), step):
            chunks.append(self.enc.decode(tokens[start:start + max_tokens]))
            # Once a window reaches the end of the section, any further
            # window would contain only tokens already emitted as overlap.
            if start + max_tokens >= len(tokens):
                break
        return chunks

    def parse_into_chunks(self, max_tokens=800, overlap=100):
        """Split ``self.text`` into sections, then each section into chunks.

        Returns:
            List of dicts with keys ``id``, ``text``, ``num_tokens`` and
            ``num_chars``. ``id`` has the form
            ``"<text_id>_<section:02d>_<chunk:03d>_<hash>"`` where section
            and chunk numbers are 1-based. Section 1 is the text before the
            first match of the split pattern, so section numbers are offset
            by one from the headings that follow.

        Raises:
            ValueError: propagated from ``_chunk_split`` for an invalid
                ``max_tokens`` / ``overlap`` combination.
        """
        sections = self._section_split()
        all_chunks = []
        for sid, section in enumerate(sections):
            section_chunks = self._chunk_split(section, max_tokens, overlap)
            for cid, sub in enumerate(section_chunks):
                hash_id = TextReader.simple_hash(sub)
                all_chunks.append(
                    {
                        "id": f"{self.text_id}_{sid+1:02d}_{cid+1:03d}_{hash_id}",
                        "text": sub,
                        # Re-encoded from the decoded text rather than taken
                        # from the slice length.
                        "num_tokens": len(self.enc.encode(sub)),
                        "num_chars": len(sub)
                    }
                )
        return all_chunks



class GutenbergReader(TextReader):
    """TextReader for plain-text files that carry ``*** START OF ... ***`` /
    ``*** END OF ... ***`` marker lines around the actual book text.

    Args:
        file_path: Path of the UTF-8 text file to read.
        text_id: Short identifier used as the prefix of every chunk id.
    """

    # Marker lines delimiting the book body; "." does not cross newlines, so
    # each pattern stays within a single line.
    _START_MARKER = re.compile(r"\*\*\* START OF.*\*\*\*")
    _END_MARKER = re.compile(r"\*\*\* END OF.*\*\*\*")

    def __init__(self, file_path, text_id) -> None:
        super().__init__(file_path, text_id=text_id, source="gutenberg")

    def _strip_gutenberg(self, text: str) -> str:
        """Return the text between the START and END marker lines.

        Each marker is handled on its own: a missing START marker keeps the
        beginning of the text, a missing END marker keeps the end. The END
        marker is searched for only after the START marker, so the result
        never depends on a stray END line in the header.
        """
        start_match = self._START_MARKER.search(text)
        body_start = start_match.end() if start_match else 0

        end_match = self._END_MARKER.search(text, body_start)
        body_end = end_match.start() if end_match else len(text)

        return text[body_start:body_end]

    def parse(self, max_tokens=500, overlap=100):
        """Read the file, strip the marker-delimited boilerplate and chunk it.

        Stores the cleaned text on ``self.text`` and prints its word count.

        Args:
            max_tokens: Maximum number of tokens per chunk.
            overlap: Number of tokens shared by consecutive chunks.

        Returns:
            The list of chunk dicts produced by ``parse_into_chunks``.
        """
        raw_text = Path(self.file_path).read_text(encoding="utf-8")

        self.text = self._strip_gutenberg(raw_text)
        print("Clean word count:", len(self.text.split()))

        return self.parse_into_chunks(max_tokens, overlap)

In [3]:
import os
import psycopg2
from psycopg2.extras import execute_values

from dotenv import load_dotenv

load_dotenv()  # reads .env

# Keyword arguments for psycopg2.connect(). Every setting can be overridden
# through a PG_* environment variable (e.g. from the .env file); the fallbacks
# match the local database used in this notebook.
DB_CONFIG = {
    "dbname": os.getenv("PG_DB", "booksdb"),
    "user": os.getenv("PG_USER", "bookuser"),
    "password": os.getenv("PG_PASS", "bookpass"),
    # Previously hard-coded; PG_HOST lets the notebook reach a non-local server.
    "host": os.getenv("PG_HOST", "localhost"),
    # NOTE: a str when PG_PORT is set, the int 5432 otherwise.
    "port": os.getenv("PG_PORT", 5432),
}

class Summarizer:

    def __init__(self, conn=None) -> None:
        self.conn = conn or psycopg2.connect(**DB_CONFIG)


    def store_summaries(self, chapter_summaries, book_summary):
        conn = psycopg2.connect(**DB_CONFIG)
        with conn.cursor() as cur:
            # Insert chapters
            rows = [(c["text_id"], c["chapter_id"], c["summary"]) for c in chapter_summaries]
            execute_values(cur, """
                INSERT INTO chapter_summaries (text_id, chapter_id, summary)
                VALUES %s
                ON CONFLICT (text_id, chapter_id) DO UPDATE SET summary = excluded.summary
            """, rows)

            # Insert book summary
            cur.execute("""
                INSERT INTO book_summaries (book_id, summary)
                VALUES (%s, %s)
                ON CONFLICT (book_id) DO UPDATE SET summary = excluded.summary
            """, (book_summary["book_id"], book_summary["summary"]))

        conn.commit()
        conn.close()


    def get_chapter_summary(self, book_id: str, chapter_id: int) -> str | None:
        """Fetch one chapter summary from DB."""
        with self.conn.cursor() as cur:
            cur.execute(
                """
                SELECT summary 
                FROM chapter_summaries
                WHERE text_id = %s AND chapter_id = %s
                """,
                (book_id, chapter_id),
            )
            row = cur.fetchone()
        self.conn.close()
        return row[0] if row else None


    def get_all_chapter_summaries(self, book_id: str) -> list[tuple[int, str]]:
        """Fetch all chapter summaries for a book, ordered by chapter_id."""
        with self.conn.cursor() as cur:
            cur.execute(
                """
                SELECT chapter_id, summary
                FROM chapter_summaries
                WHERE text_id = %s
                ORDER BY chapter_id
                """,
                (book_id,),
            )
            rows = cur.fetchall()
        self.conn.close()
        return rows 


    def get_book_summary(self, book_id: str) -> str | None:
        """Fetch the overall book summary."""
        with self.conn.cursor() as cur:
            cur.execute(
                """
                SELECT summary
                FROM book_summaries
                WHERE book_id = %s
                """,
                (book_id,),
            )
            row = cur.fetchone()
        self.conn.close()
        return row[0] if row else None
            

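### Connecting to the database for debugging

To inspect the stored summaries directly, open a `psql` session (the values below are the defaults from `DB_CONFIG`):

`psql -h localhost -U bookuser -d booksdb`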



In [4]:

# Read the book, strip the Gutenberg boilerplate and split it into
# overlapping token chunks. parse() also prints the cleaned word count.
FILE_PATH = "../DATA/alice_in_wonderland.txt"
reader = GutenbergReader(FILE_PATH, "aiw")
chunks = reader.parse(max_tokens=500, overlap=100)


# One time load
# Currently disabled: generates the chapter and book summaries and writes
# them to the database. Two ways of driving the coroutine are kept below
# (asyncio.run and a top-level await); presumably the await form is the one
# intended for use inside the notebook -- confirm before re-enabling.

# chapter_summaries, book_summary = asyncio.run(
#     summarize_hierarchy(chunks, text_id="aiw")
# )
# chapter_summaries, book_summary = await summarize_hierarchy(chunks, text_id="aiw")

# s = Summarizer()
# s.store_summaries(chapter_summaries, book_summary)
# NOTE(review): this message prints unconditionally, even while the load and
# store steps above are commented out.
print("Summaries stored in DB.")


Clean word count: 26525
Summaries stored in DB.


In [5]:
# Read back the stored whole-book summary for text id "aiw".
s = Summarizer()
print(s.get_book_summary("aiw"))

"Alice's Adventures in Wonderland" is a whimsical tale that follows the journey of a young girl named Alice, who stumbles into a fantastical world after following a peculiar White Rabbit down a rabbit hole. This world, known as Wonderland, is filled with bizarre creatures and illogical events that challenge Alice's understanding of reality. Throughout her journey, Alice encounters a series of eccentric characters, including the Cheshire Cat, the Mad Hatter, and the Queen of Hearts, each contributing to the chaotic and nonsensical nature of Wonderland. As Alice navigates this strange environment, she grapples with questions of identity and self-perception, often feeling isolated and confused by the absurdity surrounding her.

The narrative explores themes of curiosity, the quest for identity, and the transition from childhood innocence to the complexities of adulthood. Alice's interactions with the inhabitants of Wonderland highlight her growing assertiveness and resilience in the face 

In [8]:
# Read back one stored chapter summary for text id "aiw".
# NOTE(review): chapter_id comes from the section number embedded in the
# chunk ids, which counts the text before the first chapter heading as
# section 1 -- so it may be one higher than the printed chapter number.
s = Summarizer()
print(s.get_chapter_summary("aiw", 5))

In "The Rabbit Sends in a Little Bill," Alice encounters the White Rabbit, who is frantically searching for a fan and gloves, mistaking her for his housemaid, Mary Ann. Complying with his demands, Alice rushes to the Rabbit's house, where she finds the items but also discovers a mysterious bottle that causes her to grow uncontrollably large. As she struggles to fit in the room, she becomes increasingly uncomfortable and anxious about her situation. The Rabbit and other characters outside, including a creature named Bill, attempt to figure out how to deal with her size, leading to a chaotic scene filled with broken glass and confusion.

As Alice continues to grow, she eventually finds a way to shrink by eating a cake that turns into little cakes scattered on the floor. Once she returns to a manageable size, she escapes the house and encounters a large puppy, which she initially fears might eat her. After a playful yet frightening interaction with the puppy, Alice reflects on her desire 