# Document Ingestion (Indexing)

In [32]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import WebshareProxyConfig

from langchain_text_splitters import RecursiveCharacterTextSplitter


In [33]:
import os
from dotenv import load_dotenv
load_dotenv()


True

## Getting trascript of the yt video 

In [34]:
ytt_api = YouTubeTranscriptApi(
#    proxy_config=WebshareProxyConfig(
#         proxy_username=os.getenv("PROXY_USERNAME"),
#         proxy_password=os.getenv("PROXY_PASSWORD"),
#     )
    )

# fetched_transcript = ytt_api.fetch("Pmd6knanPKw",languages=["en","hi"])
video_id = "P14cRV-m6ZY"
fetched_transcript = ytt_api.fetch(video_id,languages=["en","hi"])

transcript = ""

for snippet in fetched_transcript:
   transcript= transcript + " " + snippet.text

print(len(transcript.split(" "))) #~13000 tokens

9915


In [35]:
fetched_transcript[0]

FetchedTranscriptSnippet(text='You know, whenever I read the Bhagavad Gita, especially commentaries upon it, explanations,', start=13.003, duration=5.797)

In [36]:
for snippet in fetched_transcript:
   if "?" in snippet.text:
      print(snippet.text)

You might think, Atman, who or what is that?
who am I?
Or more precisely, what am I?
artist or am I a good person or a bad person?
What does Sri Krishna say?
Tvameva viditva atimrityu meti, nanyah panthah vidyate ayanayaÓ. Beautiful verse. What does it mean? ŌShrinwantu Vishwe Amritasya Putraaha.Ó
What is this truth?
What is it like?
sand between land, between sea and land, what shall I build against the falling of the night?
What shall I make against the falling?
Falling of the night is death, it's coming, so what shall I build?
And what happens if you realize this reality, if you find this reality?
Any other way?
Can we have genetic engineering or something like that to transcend death?
If that is beyond death, by experiencing it, how do I go beyond death?
Do you see the question?
But how does that help me?
Do you know what it is?
Did you catch that?
I go beyond death by realizing that reality?
Ultimately, what good is it to you or me?
anything, I have not attained anything, then wha

In [37]:
print(transcript)



## Using Text splitter to create chunks 

In [38]:
# chunk_size, chunk_overlap, seperator, 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)

chunks = text_splitter.create_documents([transcript],metadatas=[{"video_id":video_id}])

print(len(chunks))

# to see how a chunk looks like
for chunk in chunks[:4] :
   print(chunk)

69
page_content='You know, whenever I read the Bhagavad Gita, especially commentaries upon it, explanations, there are different approaches to it. Some say the central teaching of the Gita is bhakti, devotion. Some say the central teaching of the Gita is action. Sri Krishna relentlessly urges Arjuna into action. Some say it is duty. Another way of looking at the Bhagavad Gita is that it is a manual on spiritual life with teachings about God and avatar and meditation and so on. All of which, no doubt, are true. But if you actually look at the Bhagavad Gita itself, when Sri Krishna starts teaching Arjuna, the first thing he teaches Arjuna is probably the central message of Vedanta. I'm saying it very carefully, central message of Vedanta. I did not say central message of Gita because the central message of Gita and the central message of Vedanta are one and the same. The Gita is part of Vedanta. Gita in itself may have a unique approach to it. There may be something special in the Gita o

In [39]:
chunks[9].page_content

"level sand between land, between sea and land, what shall I build against the falling of the night? What shall I make against the falling? Falling of the night is death, it's coming, so what shall I build? And it's a very evocative thing, whatever you're building on sand also is built in sand, it will go away one day. So, Tamasah parastath, he says against the falling of the light, there is something, against the falling of the night, he says there is something beyond darkness. This thing which he has realized, he says, is beyond darkness, is beyond the falling of the night. There's this famous poem, do not go gentle into the, Dylan Thomas, into the night, rage, rage, rage against the dying of the light, I've forgotten the exact poem. It's what, understanding what he's trying to say and what is the response of Vedanta to that, we'll see. In fact, Krishna's teaching about the Atman here is a direct response to that. Tamasah parastath, beyond darkness, there is a reality. And what"

## Indexing 

### Embedding Model

In [40]:
# Setting up Env Variables
import os
import getpass
from dotenv import load_dotenv

load_dotenv()

if "GOOGLE_API_KEY" not in os.environ:
   # os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
   os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key:")

In [41]:
# Using Embedding Models to create vector embeddings.
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# configure embdding model
embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# checking: embedd the chunks
# vectors = embeddings.embed_documents([chunk.page_content for chunk in chunks])

# vectors = await embeddings.aembed_documents(texts_to_embed)
# async for vector in vectors:
#    print(vector)

# for vector in vectors:
#    print(vector)

# len(vectors)

E0000 00:00:1760843445.402938 2642520 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1760843445.403517 2642520 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


### FAISS vector store
> local, simple storage

In [42]:
# Loading documents to vector store/DB
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

vector_store = FAISS.from_documents(chunks,embeddings)


### Pinecone vector storage
> serverless, scalable, awesome.

In [1]:
# experimenting with pinecode
from pinecone import Pinecone , ServerlessSpec
import os 
import dotenv

dotenv.load_dotenv()

# pc_api_key = os.getenv("PINECONE_API_KEY")
# pc = Pinecone(api_key=pc_api_key)
pc = Pinecone()

In [44]:
# creating index in pinecone
index_name = "video-transcript" # 1. intialize the name of the index.

if index_name not in pc.list_indexes().names(): #2. create a index with that name
   pc.create_index(
      name=index_name,
      dimension=3072, # gemini-embedding-001 has 3072 dimensions by default
      metric='cosine', # specifies the distance calculation method used to find the "nearest" vectors during a search [cosine, euclidean, and dotproduct]
      spec=ServerlessSpec(cloud='aws', region='us-east-1')
   ) 




In [None]:
# indexing(Upserting) the vecotrs in pinecone
index = pc.Index(index_name)

vectors = []
for i,chunk in enumerate(chunks):
   embedding = embeddings.embed_query(chunk.page_content) # this is creating one request for each chunk. practically, not the best method, we can just directly get all the chunks retrieved in on request....
   # vectors = embeddings.embed_documents([chunk.page_content for chunk in chunks]) like this and mapping it later, in the function.

   vector = {
      "id": f"chunk_{i}_{chunk.metadata.get("video_id")}",
      "values":embedding,
      "metadata":{
         "text": chunk.page_content, 
         "video_id":chunk.metadata.get("video_id"),
         "chunk_index":i
      }
   }
   vectors.append(vector)


upserted_data = index.upsert(vectors)


## Building a Retriever

there are multiple types of retrieveres , we are building one from the vector store, as we say "all vector stores can be cast to retrievers". we can do this by using `.as_retriever()` method which creates a  `VectorStoreRetriever` instance

### Building from langchain's `as_retriever` method

In [47]:
retriever = vector_store.as_retriever(
   search_type = "similarity", # Other types: mmr, similarity_score_threshold, 
   search_kwargs = {
      "k":4,
      # other kwargs : 
      # "score_threshold":0.5
   }
)
retriever

VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x10efe3e00>, search_kwargs={'k': 4})

In [51]:
query = "what is the true nature of atman?"
relevant_chunks= retriever.invoke(query)
relevant_chunks

[Document(id='73ee3412-e53f-432d-88bb-69c8d0717be9', metadata={'video_id': 'P14cRV-m6ZY'}, page_content="just said, the soul or atman is eternal. He just said it's immortal. It is not born, it does not die. If it is not born and if it does not die, if it is eternal, then it has existence as an intrinsic characteristic. This having existence as an intrinsic characteristic is called sat, pure being. This is one more clue about the nature of the atman. One is that it is eternal. Second one, if you ask what is it actually? If it is not the body, if it is not the mind, then what is it? What's the nature? What's the material? What will you say? It is pure being or sat, existence itself. Atman is existence itself. Now this leads to a very interesting conclusion. If the soul or the atman is existence, if your nature is existence itself, if it is sat, then something other than you, follow this carefully, this is very subtle, sounds very metaphysical, very simple also, but very profound. Somethi

### Retriever from pinecone.

> there are quite a few ways to search in the vector store in pinecone, we are going to search using query vector.

In [49]:
# Supported only for indexes with integrated embedding
# search_with_text = index.search(
#     namespace="example-namespace", 
#     query={
#         "inputs": {"text": "Disease prevention"}, 
#         "top_k": 4
#     },
#     fields=["category", "chunk_text"],
#     rerank={
#         "model": "bge-reranker-v2-m3",
#         "top_n": 2,
#         "rank_fields": ["chunk_text"] # Specified field must also be included in 'fields'
#     }
# )
query = "what is the true nature of atman?"
query_vector = embeddings.embed_query(chunk.page_content) #first embedd the query

search_with_vector= index.query(
   vector = query_vector,
   top_k=4,
   include_metadata=True,
   include_values=False
)


In [50]:
print(type(search_with_vector))
search_with_vector.matches[0]


<class 'pinecone.core.openapi.db_data.model.query_response.QueryResponse'>


{'id': 'chunk_68_P14cRV-m6ZY',
 'metadata': {'chunk_index': 68.0,
              'text': 'a change in the thought, in the emotions, the way you '
                      'think and feel and the way you deal with people and '
                      'events in your life. A lot of peace will come. A great '
                      'release will come upon you. The more we think about '
                      'this, great lightness comes upon us. Fear of death '
                      "diminishes. I won't say disappears, it diminishes. It "
                      'becomes diminished. The horrid reality of a world '
                      "oppressing us, it becomes shimmery because it's not "
                      'real apart from you. You feel free of the bondage of '
                      'karma. You realize yourself as light, as pure awareness '
                      'right now. I pray to them that this may become a '
                      'reality in our lives. Om shanti shanti shanti Hari om '
  

In [None]:
# Extract and return the relevant documents

retrieved_documents = []
for match in search_with_vector.matches:
   retrieved_documents.append({
        "id": match.id,
        "score": match.score,
        "text": match.metadata.get("text")
        })
   
retrieved_documents

[{'id': 'chunk_68_P14cRV-m6ZY',
  'score': 0.999923706,
  'text': "a change in the thought, in the emotions, the way you think and feel and the way you deal with people and events in your life. A lot of peace will come. A great release will come upon you. The more we think about this, great lightness comes upon us. Fear of death diminishes. I won't say disappears, it diminishes. It becomes diminished. The horrid reality of a world oppressing us, it becomes shimmery because it's not real apart from you. You feel free of the bondage of karma. You realize yourself as light, as pure awareness right now. I pray to them that this may become a reality in our lives. Om shanti shanti shanti Hari om tat sat Sri Ramakrishnaarpanamastu"},
 {'id': 'chunk_14_P14cRV-m6ZY',
  'score': 0.86605835,
  'text': "if the whole world were to turn into Ramakrishna, everybody is Sri Ramakrishna, an incarnation of God. Ultimately, what good is it to you or me? Whole world, they all become spiritual giants, but w

## Creating a Template 
relevant chunks + the query provided by the user

In [None]:
# building from the chunks retrieved from FAISS
context = "\n\n".join([chunk.page_content for chunk in relevant_chunks])
template = f'''You are the most spiritual swamy, and you will explain the answers to the questions in very comprehensive and easy manner. so based on this consider the below data: 
{context}

Now, consider the above points and based on those points only answer the question: {query}'''

In [None]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate(
   template=f'''You are the most spiritual swamy, and you will explain the answers to the questions in very comprehensive and easy manner. so based on this consider the below data: 
   {context}

   Now, consider the above points and based on those points only answer the question: {query}''',
   input_variables = ['context','query']
)

prompt = prompt_template.invoke({"context":context,"query":query})

In [18]:
prompt.__dict__

{'text': "You are the most spiritual swamy, and you will explain the answers to the questions in very comprehensive and easy manner. so based on this consider the below data: \n   this is the central teaching of Vedanta, that if we would really know ourselves as we truly are, Atman means the self, the essence, the self, what we truly are. If we know that, who am I or what am I, then all our problems would actually be solved. All the rest, God, the teachings about God, the teachings about the creation of the universe, the teachings about incarnation, Avatar, the teachings about Bhakti and Jnana, love and devotion and knowledge, the teachings about meditation, teachings about action, Karma, all of them come afterwards. Chronologically in the Gita also, you see that they come afterwards. The first teaching, the very first teaching from approximately the 11th verse where Sri Krishna starts speaking, starts teaching Arjuna, to about the 25th verse of the second chapter is a concentrated tea

### Generating Output

In [19]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")
ai_msg = llm.invoke(prompt)
ai_msg.content

E0000 00:00:1759848160.212319 1856769 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


'My dear seekers, listen with an open heart as we delve into the most profound truth, the very core of our existence, as revealed in the sacred teachings.\n\nYou ask, "What is Atman?" Ah, this is the very first and most essential question that Sri Krishna addresses, the foundational teaching of Vedanta itself!\n\nSimply put, **Atman is the Self, the essence, what we truly are.** It is not what we *think* we are, nor what the world defines us to be. It is our **real nature**, our **spiritual nature**, something far deeper than our personality, our mind, or our physical body. It is the real nature that exists beyond all our differing personalities and dispositions. When we ask "Who am I?" or, more precisely, "What am I?", the answer is Atman.\n\nNow, let us understand its glorious characteristics, as given in the wisdom:\n\n1.  **Eternal and Immortal:** The Atman is beyond death. It is not born, and it does not die. Sri Krishna repeats this truth again and again – it is an eternal, immor

### Implementing Langchain

Every Runnable component is designed to perform a specific task: it accepts an input, processes it, and returns an output in a consistent and predictable manner. A chain composed of multiple Runnables is also a Runnable itself, meaning it can be invoked, streamed, or further combined with other components.

**Core concepts of a Runnable**
* **Standard interface:** All Runnables expose standard methods like invoke, batch, and stream for handling single inputs, multiple inputs, or streaming outputs.
* **Composability:** You can chain Runnables together using the pipe | operator, where the output of one component automatically becomes the input for the next.
* **Asynchronous support:** All Runnables can be executed asynchronously, allowing for efficient, concurrent processing. 


In [None]:
from langchain_core.runnables import RunnableParallel , RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [None]:
def format_docs(relevant_chunks):
   context_text = "\n\n".join(doc.page_content for doc in relevant_chunks)
   return context_text


In [None]:
''' 
RunnableParallel:	Executes multiple Runnables concurrently with the same input, collecting their outputs into a single dictionary.

RunnableLambda:	Wraps any standard Python function, allowing you to insert custom logic into a chain.

RunnablePassthrough:	Acts as an identity function, passing its input through unchanged. It's often used with RunnableParallel to preserve the original input for later steps.
'''

parallel_chain = RunnableParallel({
   'context':retriever | RunnableLambda(format_docs),
   'query': RunnablePassthrough() # This tells the chain to take the original input (the query "What is a runnable?") and pass it to this key
})

In [29]:
parallel_chain.invoke("Define productivity in 5 words")

{'context': "ensure that it happens and that it happens with the highest degree of efficiency even though that morning 90minute work block is so vital. Of course there's a second work block. So, combined that's just three hours of focused work, which may not seem like a lot, but if you were to dissect your day and kind of look at the arc and structure of your day, I'd be willing to bet that if we added up the total period of time in which you were in deep work, really focused, dedicated work, that it would probably amount to about 3 or 4 hours. And of course, throughout the day, there are other things happening outside of those 90-minute work blocks. I'm checking my text messages. I'm checking my email. I'm responding to various demands. I'm working and tending to life. So, while I've carved some boundaries or delineated some boundaries around those work blocks, and I'm certain that if you do too, you will benefit from them. Of course, please adapt and modify what I've described today 

In [None]:
'''
StrOutputParser(): is the simplest and most common output parser in LangChain. Its purpose is to take the output from a language model, which can be a complex object like a BaseMessage, and extract the plain string content. This ensures the final output of a chain is a simple, usable string that can be easily displayed, logged, or passed to the next component.
'''

parser = StrOutputParser()

In [37]:
main_chain = parallel_chain | prompt_template | llm | parser

In [None]:
main_chain.invoke("summarize the video .")

'Hello there! As your productivity coach, I\'m thrilled to break down the essence of a highly effective daily routine for you. What we\'ve just explored highlights some incredibly powerful strategies for maximizing focus and efficiency, not just in your work, but in how you navigate your entire day.\n\nIt\'s not about working *more* hours, but about working *smarter* and with deeper intention. By understanding your brain\'s natural rhythms and setting up your day strategically, you can achieve remarkable results.\n\nLet\'s distill these insights into a practical, 3-step daily routine designed to significantly boost your productivity:\n\n---\n\n### Your 3-Step Daily Routine for Enhanced Productivity:\n\n#### **Step 1: The Energized Start & Your Cognitive Peak**\n\nThis step is all about setting the stage for a highly productive day by leveraging your natural morning energy and tackling your most demanding tasks when your brain is freshest.\n\n*   **Wake Up & Get Moving (Optic Flow):** S