# Book Reading Assignment Pipeline 

## Prerequisites

In [None]:
!pip install llama-index --upgrade --force
!pip install langchain --upgrade
!pip install langchain_community --upgrade --force
!pip install boto3 --upgrade --force
!pip install pinecone-client --upgrade --force
!pip install python-dotenv
!pip install cohere --upgrade --force

## Data Preparation and ingestion pipeline

The first step of the assignment is reading the booking right?! So let's start reading then! We will use Pinecone as vector store. Llamaindex will help us load and index our raw text into Pinecone. We will also use it to access and query the indexed data.

Let's prepare some metadata first. The book pdf was split into separate files, one for reach chapter. We'll use this breakdown to assign metadata to each node inserted as part of our index. We won't cover that step in details but please check out the BookSplitter notebook if you are curious.

In [17]:
from dotenv import load_dotenv
import os 
load_dotenv()

environment = os.environ["PINECONE_ENV"]
index_name = os.environ["PINECONE_INDEX_NAME"]

In [18]:
# Initializing API Keys, credentials, imports and context variables
import logging
import sys
import os

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [19]:
from llama_index import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext,
    Document,
    download_loader,
    set_global_service_context,
    get_response_synthesizer,
)
from llama_index.vector_stores import PineconeVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.indices.document_summary import DocumentSummaryIndex
from llama_index.llms import Bedrock
from pathlib import Path
from llama_index.prompts import PromptTemplate
from IPython.display import Markdown, display

In [20]:
import json

#book and chapters metadata are pre-stored into a a JSON Configuration file
with open("../config/book_2.json", "r") as read_content: 
	books_config = json.load(read_content)

file_name = books_config['books'][0]['fileName']
book_title = books_config['books'][0]['title']
chapter_titles = books_config['books'][0]['chapters']

Initialize Pinecone client

In [None]:
from langchain_community.embeddings import BedrockEmbeddings

embeddings = BedrockEmbeddings(credentials_profile_name="genai-demo", region_name="us-east-1", model_id="amazon.titan-embed-text-v1")
llm = Bedrock(model="anthropic.claude-v2", profile_name="genai-demo")
set_global_service_context(None)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embeddings)
vector_store = PineconeVectorStore(
        index_name=index_name,
        environment=environment
    )
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Let's start indexing! Notice how llama-index abstracts all the communication with the Vector store so far ...


In [30]:
summary_template_str = (
    "Human: The <passage></passage> xml tags below contain a passage of the book.\n"
    " <passage>{context_str}</passage>\n"
    "Please summarize the passage. Provide as many details as possible.\n"
    "Assistant:"
)
summary_template = PromptTemplate(summary_template_str)

def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))


In [32]:
import traceback

PDFReader = download_loader("PDFReader")
chapter_docs = []
loader = PDFReader()

#Let's index each chapter
for pos in range(0, len(chapter_titles)):
    documents = loader.load_data(file=Path(f"{file_name}-{pos}.pdf"))
    chapter_title = chapter_titles[pos]['title']
    for doc in documents:
        doc.metadata['book_title'] = book_title
        doc.metadata['chapter_number'] = pos
        doc.metadata['chapter_title'] = chapter_title
    chapter_docs.append(documents)

#Let's generate chapter summaries
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", 
    use_async=True, 
    service_context=service_context,
    summary_template=summary_template,
)
summary_docs = []
for chapter_pos in range(0, len(chapter_titles)):
    try:
        doc_index = VectorStoreIndex.from_documents(
            chapter_docs[chapter_pos],
            storage_context=storage_context,
            service_context=service_context,
        )
    except Exception as e:
        print("Error when indexing chapter " + str(chapter_pos))
        traceback.print_exc()
        continue
    
    try:
        if chapter_pos>0:
            query_summary_engine = doc_index.as_query_engine(response_synthesizer=response_synthesizer,)
            summary = query_summary_engine.query("Could you summarize the given context?")
            print("Summary " + str(chapter_pos) + ": " + summary.response)
            chapter_title = chapter_titles[chapter_pos]['title']
            summary_doc = Document(text=summary.response)
            summary_doc.metadata['book_title'] = book_title
            summary_doc.metadata['chapter_number'] = chapter_pos
            summary_doc.metadata['chapter_title'] = chapter_title
            summary_doc.metadata['is_summary'] = "Y"
            summary_docs.append(summary_doc)
    except Exception as e:
        print("Error when generating chapter " + str(chapter_pos) + " summary")
        traceback.print_exc()
        continue

summary_index = VectorStoreIndex.from_documents(
    summary_docs,
    storage_context=storage_context,
    service_context=service_context,
)


Upserted vectors:   0%|          | 0/3 [00:00<?, ?it/s]

Upserted vectors:   0%|          | 0/30 [00:00<?, ?it/s]

Summary 1: Empty Response


Upserted vectors:   0%|          | 0/17 [00:00<?, ?it/s]

Summary 2: Empty Response


Upserted vectors:   0%|          | 0/19 [00:00<?, ?it/s]

Summary 3:  Here is a summary of the key points from the passage:

- The passage contains excerpts from chapters 1 and 2 of the book Ishmael by Daniel Quinn. 

- In chapter 1, the narrator describes being captured from the wild as a young gorilla and brought to live in a zoo. He notices the zoo animals seem more "thoughtful" and wonders "why" life is divided into a boring, unpleasant captivity vs an interesting, pleasant life in the wild. 

- In chapter 2, a character called "he" provides three definitions to the narrator: 1) Story - a scenario interrelating man, the world, and the gods 2) To enact - to live so as to make a story into reality 3) Culture - a people enacting a story. 

- He explains there are two fundamentally different stories enacted on earth - one by "Leavers" (hunter gatherers) beginning millions of years ago, and one by "Takers" (civilized man) beginning 10-12 thousand years ago that may end in catastrophe. 

- The narrator doesn't fully understand yet but will lear

Upserted vectors:   0%|          | 0/12 [00:00<?, ?it/s]

Summary 4:  Here is a summary of the key points in the passage:

- The passage contains excerpts from chapters 1 and 2 of the book Ishmael by Daniel Quinn. 

- In chapter 1, the narrator describes how animals in captivity, like a tiger pacing in a cage, often become preoccupied with the question "Why?". The narrator realized he was also asking himself "Why?" about why life is divided into a boring, unpleasant captivity vs an interesting, pleasant freedom. 

- In chapter 2, Ishmael the teacher defines some key terms: "story" as a scenario about man, the world, and gods; "enact" as living to make a story real; "culture" as a people enacting a story. 

- Ishmael states two fundamentally different stories have been enacted on Earth - one by "Leavers" (hunter-gatherers) beginning millions of years ago, and one by "Takers" (civilized cultures) beginning 10-12 thousand years ago that may end in catastrophe. 

- The narrator doesn't yet understand what the Takers' story is that his own culture

Upserted vectors:   0%|          | 0/15 [00:00<?, ?it/s]

Summary 5:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted by the "Takers" (modern human civilization) also has a premise that the narrator should be able to figure out. 

- Ishmael hints that the premise is that the world was made for humans/man. The narrator agrees this is a common belief even among atheists.

- Ishmael explains key definitions: 

- Story: a scenario interrelating man, the world, and the gods
- To enact: to live so as to make the story a reality 
- Culture: a people enacting a story

- Ishmael says there are two fundamentally different stories enacted on Earth: 

- The "Leavers" - enacted for 2-3 million years, still successful
- The "Takers" - enacted for 10-12 

Upserted vectors:   0%|          | 0/18 [00:00<?, ?it/s]

Summary 6:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise. He asks the narrator to figure out what this premise is. 

- The narrator cannot figure it out. Ishmael reveals the premise - that the world was made for man. The narrator agrees this is a common belief.

- Ishmael defines some key terms: 

- Story: A scenario interrelating man, the world, and the gods. 

- To enact: To live so as to make the story a reality. 

- Culture: A people enacting a story.

- Ishmael says there are two fundamentally different stories enacted on earth - one by the "Leavers" beginning 2-3 million years ago, and one by the "Takers" beginning 10-12 thous

Upserted vectors:   0%|          | 0/12 [00:00<?, ?it/s]

Summary 7:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted by the "Takers" (modern civilization) also has a premise that the narrator should be able to figure out. 

- Ishmael gives hints, saying the world was not made for jellyfish, frogs, lizards or rabbits, but for man. The narrator agrees everyone knows the world was made for man.

- Ishmael defines some key terms: 

- Story: a scenario interrelating man, the world, and the gods
- To enact: to live so as to make the story a reality
- Culture: a people enacting a story

- Ishmael says the Leavers have enacted one story and the Takers another. The Leavers' story began 2-3 million years ago and continues successfully. The Takers' 

Upserted vectors:   0%|          | 0/28 [00:00<?, ?it/s]

Summary 8:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise that the narrator should be able to figure out. 

- Ishmael hints that the premise is that the world was made for humans/man. The narrator agrees this is something everyone in his culture knows.

- Ishmael explains some definitions: a story is a scenario interrelating man, the world, and the gods; to enact a story is to live to make it come true; a culture is a people enacting a story. 

- Ishmael says two different stories have been enacted on Earth - one by the "Leavers" beginning millions of years ago, and one by the "Takers" beginning thousands of years ago that may end in

Upserted vectors:   0%|          | 0/41 [00:00<?, ?it/s]

Summary 9:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise of two children from warring families falling in love. 

- Ishmael says the story being enacted by the "Takers" also has a premise that the narrator should be able to figure out. The narrator cannot guess it. 

- Ishmael reveals the premise - that the world was made for man. The narrator agrees this is something everyone in his culture knows.

- Ishmael defines some key terms: 

- Story: a scenario interrelating man, the world, and the gods

- To enact: to live so as to make the story a reality 

- Culture: a people enacting a story

- Ishmael says there are two fundamentally different stories enacted on Earth - one by the "Leavers" and one by the "Takers." 

- The Leavers' story began 2-3 million years ago and conti

Upserted vectors:   0%|          | 0/25 [00:00<?, ?it/s]

Summary 10:  Here is a summary of the key points in the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise of two children from warring families falling in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise that the narrator should be able to figure out. 

- Ishmael hints that the premise is that the world was made for humans/man. The narrator agrees this is something everyone in his culture knows.

- Ishmael explains some key definitions: 

- Story: A scenario interrelating man, the world, and the gods. 

- To enact: To live so as to make the story a reality.

- Culture: A people enacting a story.

- Ishmael says there are two fundamentally different stories enacted on Earth: 

- The Leavers' story began 2-3 million years ago and continues successfully. 

- The Takers' story beg

Upserted vectors:   0%|          | 0/22 [00:00<?, ?it/s]

Summary 11:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet having the premise of two children from warring families falling in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise that the narrator should be able to figure out. 

- Ishmael hints that the premise is that the world was made for humans/man. The narrator agrees this is something everyone in his culture knows.

- Ishmael explains some definitions: a story is a scenario interrelating man, the world, and the gods; to enact a story is to live to make it come true; a culture is a people enacting a story. 

- Ishmael says two fundamentally different stories have been enacted on Earth - one by the "Leavers" beginning 2-3 million years ago that continues today, and one by the "Takers" beginning 

Upserted vectors:   0%|          | 0/27 [00:00<?, ?it/s]

Summary 12:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise that the narrator should be able to figure out. 

- Ishmael hints that the premise is that the world was made for humans/man. The narrator agrees this is something everyone in his culture knows.

- Ishmael explains some definitions: a story is a scenario interrelating man, the world, and the gods; to enact a story is to live to make it come true; a culture is a people enacting a story. 

- Ishmael says two fundamentally different stories have been enacted on Earth - one by the "Leavers" beginning millions of years ago, and one by the "Takers" beginning thousands of years ago 

Upserted vectors:   0%|          | 0/7 [00:00<?, ?it/s]

Summary 13:  Here is a summary of the key points in the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3. 

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet, with the premise that two children from warring families fall in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise. He asks the narrator to figure out what this premise is. 

- The narrator cannot figure it out. Ishmael reveals the premise - that the world was made for man. The narrator agrees this is a common belief even among atheists.

- Ishmael explains key definitions: 

- Story: a scenario interrelating man, the world, and the gods
- To enact: to live so as to make the story a reality 
- Culture: a people enacting a story

- Ishmael states there are two fundamentally different stories enacted on earth: 

- The "Leavers" story began 2-3 million years ago and continues successfully. 

-

Upserted vectors:   0%|          | 0/6 [00:00<?, ?it/s]

Summary 14:  Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by Daniel Quinn. It is from chapters 2 and 3.

- Ishmael tells the narrator that every story has a premise. He gives the example of Romeo and Juliet having the premise of two children from warring families falling in love. 

- Ishmael says the story being enacted in the world by the "Takers" also has a premise that the narrator should be able to figure out. The narrator cannot guess what it is. 

- Ishmael reveals the premise - that the world was made for man. The narrator agrees this is something everyone in his culture knows.

- Ishmael defines some key terms: 

- Story - a scenario interrelating man, the world, and the gods
- To enact - to live so as to make the story a reality
- Culture - a people enacting a story

- Ishmael says there are two fundamentally different stories enacted on earth - one by the "Leavers" and one by the "Takers." 

- The Leavers' story began 2-3 milli

Upserted vectors:   0%|          | 0/14 [00:00<?, ?it/s]

## Let's run a few tests
### Simple query

In [33]:
from llama_index.postprocessor.cohere_rerank import CohereRerank

text_qa_template_str = (
    "Human: The <passage></passage> xml tags below contain a passage of the book.\n"
    " <passage>{context_str}</passage>\n"
    "Using the provided passage answer the following question:\n"
    "{query_str}\nIf the context isn't helpful, please reply that you don't know.\n\n"
    "Assistant:"
)
text_qa_template = PromptTemplate(text_qa_template_str)

refine_template_str = (
    "Human: The original question is within tghe <original_question></original_question> XML tags below.\nWe have provided an"
    " existing answer within the <existing_answer>{context_str}</existing_answer> XML tags below. \nWe have the opportunity to refine"
    " the existing answer (only if needed) with some more context available within the <passage></passage> XML tags below."
    " <original_question>{query_str}</original_question>\n"
    " <existing_answer>{context_str}</existing_answer>\n"
    " <passage>{context_str}</passage>\n"
    "Rewrite a new answer. Use the existing answer and context as new context to expand the existing answer.\n\n"
    "Assistant:"
)
refine_template = PromptTemplate(refine_template_str)

api_key = os.environ["COHERE_API_KEY"]
cohere_rerank = CohereRerank(api_key=api_key, top_n=100)

book_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)
query_engine = book_index.as_query_engine(
    similarity_top_k=10,
    text_qa_template=text_qa_template,
    refine_template = refine_template,
    node_postprocessors=[cohere_rerank],
    )

prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)

response = query_engine.query("Who are the main characters of the book?")
#response = query_engine.query("What does the book talk about?") #may work better by putting entire book in context
#response = query_engine.query("What is the significance of the jellyfish story?")
print(str(response))
print(response.get_formatted_sources())

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Human: The <passage></passage> xml tags below contain a passage of the book.
 <passage>{context_str}</passage>
Using the provided passage answer the following question:
{query_str}
If the context isn't helpful, please reply that you don't know.

Assistant:


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

Human: The original question is within tghe <original_question></original_question> XML tags below.
We have provided an existing answer within the <existing_answer>{context_str}</existing_answer> XML tags below. 
We have the opportunity to refine the existing answer (only if needed) with some more context available within the <passage></passage> XML tags below. <original_question>{query_str}</original_question>
 <existing_answer>{context_str}</existing_answer>
 <passage>{context_str}</passage>
Rewrite a new answer. Use the existing answer and context as new context to expand the existing answer.

Assistant:


<br><br>

 Based on the summary, the main characters of the book Ishmael seem to be:

- Ishmael - a teacher figure who is a gorilla
- The narrator - a human student of Ishmael

The passage mentions Ishmael speaking to and teaching the narrator, so they appear to be the central characters having a dialogue in the book. No other character names are mentioned.
> Source (Doc id: 21b0ad00-d029-406b-9134-69893917c9ec): Here is a summary of the key points in the passage:

- The passage is from the book Ishmael by Da...

> Source (Doc id: 6f1404c5-4388-449b-9cd6-c2c968ae3610): Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by ...

> Source (Doc id: 1f8aa270-1228-4e5b-91a2-32e0d2a59411): Here is a summary of the key points from the passage:

- The passage is from the book Ishmael by ...

> Source (Doc id: f2432ed6-cb26-401a-b8e9-510571b57ba1): Here is a summary of the key points in the passage:

- The passage is from the book Ishmael by Da...

> Source (Doc 

### Using metadata to summarize/query a specific chapter

In [None]:
import pinecone

from llama_index.vector_stores.types import (
    MetadataFilters,
    ExactMatchFilter,
)

query_chapter_title = "SIX"
query_book_title = book_title
query_engine_six = book_index.as_query_engine(
    similarity_top_k=20,
    text_qa_template=text_qa_template,
    refine_template = refine_template,
    node_postprocessors=[cohere_rerank],
    filters=MetadataFilters(
        filters=[
            ExactMatchFilter(key="chapter_title", value=query_chapter_title),
            ExactMatchFilter(key="book_title", value=query_book_title),
        ]
    ),
)
response = query_engine_six.query("What does the passage talk about?")
print("Response = "+ repr(response))

## Time to do some homework

In [None]:
import pandas as pd

assignment = pd.read_csv('../data/assignment_questions.csv')

role_prompt = "You are a 12th grader doing an English assignment. Based on the provided context, please answer the following question.\n QUESTION:"

def get_query_engine_by_chapter(book_index, query_book_title, query_chapter_title="ALL"):
    similarity_top_k_val=20
    if query_chapter_title == "ALL":
        chapter_filter = ExactMatchFilter(key="is_summary", value="Y")
        similarity_top_k_val = 100
    else:
        chapter_filter = ExactMatchFilter(key="chapter_title", value=query_chapter_title)
        
    query_engine = book_index.as_query_engine(
        similarity_top_k=similarity_top_k_val,
        text_qa_template=text_qa_template,
        refine_template = refine_template,
        node_postprocessors=[cohere_rerank],
        filters=MetadataFilters(
            filters=[
                chapter_filter,
                ExactMatchFilter(key="book_title", value=query_book_title),
            ]
        ),
    )
    return query_engine

book_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=service_context)

questions_one = assignment[assignment['chapter'] == "ONE"]
query_engine_one = get_query_engine_by_chapter(book_index, book_title, "ONE")
questions_one['answer'] = questions_one['question'].map(lambda q: query_engine_one.query(role_prompt + q).response, na_action='ignore')

questions_two = assignment[assignment['chapter'] == "TWO"]
query_engine_two = get_query_engine_by_chapter(book_index, book_title, "TWO")
questions_two['answer'] = questions_two['question'].map(lambda q: query_engine_two.query(role_prompt + q).response, na_action='ignore')

#General questions
questions_all = assignment[assignment['chapter'] == "ALL"]
query_engine_all = get_query_engine_by_chapter(book_index, book_title)
questions_all['answer'] = questions_all['question'].map(lambda q: query_engine_all.query(role_prompt + q).response, na_action='ignore') #re-initialize query_engine if needed by running beginning of previous section

questions = pd.concat([questions_one, questions_two, questions_all], sort=False)

pd.options.display.max_colwidth = 1000
questions.head(5)

In [36]:
questions.to_csv('../data/assignment_answers.csv', index=False)