# PlotTrace — Chapter‑Aware RAG for Books

> Autogenerated header added on 2025-10-20 07:03:44. A cleaned script and README are included alongside this notebook.

In [1]:
# Install the Cassandra Python driver quietly (-q); it provides the `cassandra` package imported below.
# NOTE(review): the same package is installed again in a later cell — one of the two looks redundant.
!pip install -q cassandra-driver

In [2]:
# Install the remaining dependencies quietly (-q), one package per command:
# LangChain, the OpenAI client, pypdf, FAISS, tiktoken and CassIO.
# NOTE(review): `faiss` is never imported in the visible notebook — confirm faiss-cpu is needed.
!pip install -q langchain
!pip install -q openai
!pip install -q pypdf
!pip install -q faiss-cpu
!pip install -q tiktoken
!pip install -q cassio


In [3]:
!pip install -q cassandra-driver
!pip install -q cassio>=0.1.1
!pip install -q tiktoken==0.4.0

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for tiktoken [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for tiktoken (pyproject.toml) ... [?25l[?25herror
[31m  ERROR: Failed building wheel for tiktoken[0m[31m
[0m[31mERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (tiktoken)[0m[31m
[0m

In [4]:
import cassandra

In [5]:
from cassandra.cluster import Cluster

In [6]:
from cassandra.auth import PlainTextAuthProvider

In [7]:
import json

In [8]:
# Location of the Astra secure-connect bundle handed to the Cassandra driver.
SECURE_CONNECT_BUNDLE = "/content/secure-connect-pdf-qna-rag (1) (1).zip"
cloud_config = dict(secure_connect_bundle=SECURE_CONNECT_BUNDLE)

In [9]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
import json


# Read the Astra token file; it is expected to hold `clientId` and `secret` entries.
with open("/content/xcroxx3@gmail.com-token.json") as token_file:
    secrets = json.load(token_file)

CLIENT_ID, CLIENT_SECRET = secrets['clientId'], secrets['secret']

# Authenticate with the token pair and open a driver session through the
# secure-connect bundle described by `cloud_config`.
auth_provider = PlainTextAuthProvider(CLIENT_ID, CLIENT_SECRET)
cluster = Cluster(
    cloud=cloud_config,
    auth_provider=auth_provider,
)
session = cluster.connect()



In [10]:
# Smoke-test the connection by asking the cluster for its release version.
row = session.execute("select release_version from system.local").one()

# A falsy `row` means nothing came back; print "Error" in that case.
print(row[0] if row else "Error")

4.0.11.0-14371b2a5645


In [11]:
# 1) Install the split LangChain packages (core, community, OpenAI, text splitters)
#    together with CassIO, astrapy and pypdf. `-U` upgrades anything already installed.
!pip install -U langchain langchain-core langchain-community langchain-openai langchain-text-splitters cassio astrapy pypdf

# 2) Restart the runtime so the upgraded packages are picked up.
#    Left commented out on purpose: uncommenting kills the current kernel process
#    (signal 9), after which the notebook has to be run again.
# import os, sys, time
# os.kill(os.getpid(), 9)




In [12]:
# ✅ OpenAI integrations live here now (NOT in langchain_community.llms)
from langchain_openai import OpenAI, OpenAIEmbeddings

# ✅ Cassandra vector store lives under community
from langchain_community.vectorstores import Cassandra

# ✅ Correct splitter names & package
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

# ✅ Document/loaders locations
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader, PyPDFLoader


In [13]:
import os
from getpass import getpass

# Never store the key in the notebook itself. Reuse OPENAI_API_KEY when the
# environment already provides it; otherwise prompt for it without echoing,
# so the secret does not end up in the saved file or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [14]:
# Completion model used later by the RAG chain; it is created with temperature=0.
llm = OpenAI(temperature=0)
# NOTE(review): `openai_embeddings` is not referenced again in the visible notebook —
# the vector store is built from a separate `embeddings` instance. Confirm both are needed.
openai_embeddings = OpenAIEmbeddings()

In [15]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Cassandra
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [16]:
# Where the vector table lives in Astra DB.
KEYSPACE = "default_keyspace"      # Must exist in your Astra DB
TABLE_NAME = "pdf_novel_table"     # same value the vector-store cell passes as `table_name`

In [17]:
import cassio

# Tell CassIO to use the CQL `session` opened earlier in this notebook.
# Only the session is supplied here; the keyspace is passed explicitly when the
# vector store is created.
cassio.init(session=session)

In [24]:
vectorstore = Cassandra.from_documents(
    documents=docs,
    embedding=embeddings,
    session=session,          # keep passing it
    keyspace=KEYSPACE,
    table_name="pdf_novel_table"
)

In [20]:
# 1. Split the documents
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=30)
docs = splitter.split_documents(pages)

In [19]:
loader = PyPDFLoader("/content/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf")
pages = loader.load_and_split()

In [22]:
embeddings = OpenAIEmbeddings()

In [25]:
# Requires the `vectorstore` built earlier; k=4 is forwarded as the search setting.
retriever_search_kwargs = {"k": 4}
retriever = vectorstore.as_retriever(search_kwargs=retriever_search_kwargs)

In [27]:
query = "Summarize the main topic of the PDF."
# Keep the hits under their own name. Reusing `docs` would overwrite the full
# chunk list prepared for indexing, so re-running the indexing cell afterwards
# would store only these few hits.
retrieved_docs = retriever.invoke(query)          # returns List[Document]

# Show where each hit came from and how long its text is.
for i, d in enumerate(retrieved_docs, 1):
    print(f"[{i}] {d.metadata.get('source','unknown')} | chunk_len={len(d.page_content)}")



[1] /content/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf | chunk_len=370
[2] /content/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf | chunk_len=385
[3] /content/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf | chunk_len=41
[4] /content/harry-potter-and-the-philosophers-stone-by-jk-rowling.pdf | chunk_len=379


In [26]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [28]:
# Template with two placeholders, {question} and {context}; it tells the model to
# answer only from the supplied context and to admit when the answer is missing.
RAG_PROMPT_TEMPLATE = """You are a novel analyzer assistant. Use only the context to answer.
If the answer isn't in the context, say you don't know.

Question: {question}

Context:
{context}"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

def format_docs(docs):
    """Concatenate the text of each document into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable gives an empty string.
    """
    chunk_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(chunk_texts)

# Retrieval branch: look up chunks for the question, then flatten them to text.
context_branch = retriever | format_docs

# The incoming question is fanned out: once through retrieval, once unchanged.
chain_inputs = {"context": context_branch, "question": RunnablePassthrough()}

# Fill the prompt, call the model, and reduce the result to a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [30]:
# Run one question through the chain and print the answer it returns.
question = "who does Harry lives with?"
answer = rag_chain.invoke(question)
print(answer)





Harry lives with his aunt and uncle, as well as his cousin Dudley.
