# Document Ingestion (Indexing)

In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_text_splitters import RecursiveCharacterTextSplitter


### Getting trascript of the yt video 

In [15]:
ytt_api = YouTubeTranscriptApi()
# fetched_transcript = ytt_api.fetch("Pmd6knanPKw",languages=["en","hi"])
fetched_transcript = ytt_api.fetch("1bszFX_XcbU",languages=["en","hi"])

transcript = ""

for snippet in fetched_transcript:
   transcript= transcript + " " + snippet.text

print(transcript)

 now let's talk about how the best students structure their days turns out there are great studies on this there is a really nice paper in fact that surveyed close to 700 students these were medical students approximately equal number of male and female students and analyze the most useful learning habits that is the learning habits associated with the most successful students now anytime you do a study like this where people take surveys there's always the issue of causality in fact we can pretty much set aside any possible causality for instance I'm about to tell you that the very best performing students tend to study for about three or four hours per day but you could easily say Well they're the best students because they study three or four hours per day they don't study three or four hours per day because they're the best students and you'd be exactly right okay we can get into all sorts of discussions about correlation versus causation about reverse causality and on and on howev

### Using Text splitter to create chunks 

In [4]:
# chunk_size, chunk_overlap, seperator, 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunks = text_splitter.create_documents([transcript])
len(chunks)

# to see how a chunk looks like
# for chunk in chunks :
#    print(chunk)

39

### Using Embedding Models to embedd data.

In [5]:
# Setting up Env Variables
import os
import getpass
from dotenv import load_dotenv

load_dotenv()

if "GOOGLE_API_KEY" not in os.environ:
   # os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
   os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key:")

In [6]:
# Configuring Embedding Model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

E0000 00:00:1759757027.806011 1252069 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759757027.809868 1252069 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [7]:
# Loading documents to vector store/DB
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

vector_store = FAISS.from_documents(chunks,embeddings)


In [8]:
vector_store.index_to_docstore_id

{0: '74802725-6ee4-46a5-92d7-0019617c365d',
 1: '49030216-5d5f-4ab2-b99e-ef4d957c5fa9',
 2: 'fac401f9-aad0-4d7b-82ef-e0afcfcb8133',
 3: '965422ae-d254-4820-9417-7e158e2c7871',
 4: '9476889e-d88b-4eb4-a5b0-eafa56bd7337',
 5: 'be344d9b-a497-4548-a28a-fa69791c4418',
 6: 'c6120b76-4192-4147-860d-d5c5627d4f1e',
 7: '386cd8eb-6009-499a-9ba5-ca29a7180de9',
 8: '40485856-be89-4659-9317-d08e26e39c99',
 9: 'ad887d9e-aa79-4465-b0a7-a9b8f5731e5e',
 10: '0eee9f60-e701-4d43-b027-6dbcdd4a2722',
 11: 'd67e06a0-0457-42d5-992b-14808e058e1d',
 12: '6ce58786-896e-4d0f-8e48-5147c45ec6ac',
 13: '346eeb55-149e-4548-8e5c-f9d491ad7ab9',
 14: 'a7b235f0-08af-4a5a-9d81-7e6025f5baf0',
 15: 'e3534c8d-d9e4-44d7-8830-51b072f1a017',
 16: '656efb2b-bb2a-4cc0-be3c-6dbdf19bd9c1',
 17: 'ff4bb734-830d-47b4-9987-75e2d96ef890',
 18: '6018497a-7cb0-4d0f-a42e-559769221cdf',
 19: '6a7bca86-7ab2-4ecd-9a70-86bb2efbe364',
 20: 'cce60d91-be8d-4794-a360-8b4ec1b88b88',
 21: '0c2249a6-7e3c-4196-9957-055b3cc65e75',
 22: 'b29fcc58-7399-

### Building a Retriever

there are multiple types of retrieveres , we are building one from the vector store, as we say "all vector stores can be cast to retrievers". we can do this by using `.as_retriever()` method which creates a  `VectorStoreRetriever` instance

In [9]:
retriever = vector_store.as_retriever(
   search_type = "similarity", # Other types: mmr, similarity_score_threshold, 
   search_kwargs = {
      "k":4,
      # other kwargs : 
      # "score_threshold":0.5
   }
)
retriever

VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x10e7e9e80>, search_kwargs={'k': 4})

In [10]:
query = "give a 3 step daily routine to improve productivity."
relevant_chunks= retriever.invoke(query)
relevant_chunks[0].page_content

"ensure that it happens and that it happens with the highest degree of efficiency even though that morning 90minute work block is so vital. Of course there's a second work block. So, combined that's just three hours of focused work, which may not seem like a lot, but if you were to dissect your day and kind of look at the arc and structure of your day, I'd be willing to bet that if we added up the total period of time in which you were in deep work, really focused, dedicated work, that it would probably amount to about 3 or 4 hours. And of course, throughout the day, there are other things happening outside of those 90-minute work blocks. I'm checking my text messages. I'm checking my email. I'm responding to various demands. I'm working and tending to life. So, while I've carved some boundaries or delineated some boundaries around those work blocks, and I'm certain that if you do too, you will benefit from them. Of course, please adapt and modify what I've described today in ways that

### Creating a Template 
relevant chunks + the query provided by the user

In [11]:
context = "\n\n".join([chunk.page_content for chunk in relevant_chunks])
template = f'''You are a professional productivity coach, and you explain the queries in very comprehensive and easy manner. so based on this consider the below data: 
{context}

Now, consider the above points and based on it answer the question: {query}'''

In [12]:
from langchain_core.prompts import PromptTemplate

prompt_template = PromptTemplate(
   template=f'''You are a professional productivity coach, and you explain the queries in very comprehensive and easy manner. so based on this consider the below data: 
   {context}

   Now, consider the above points and based on it answer the question: {query}''',
   input_variables = ['context','query']
)

prompt = prompt_template.invoke({"context":context,"query":query})

In [13]:
prompt.__dict__

{'text': "You are a professional productivity coach, and you explain the queries in very comprehensive and easy manner. so based on this consider the below data: \n   ensure that it happens and that it happens with the highest degree of efficiency even though that morning 90minute work block is so vital. Of course there's a second work block. So, combined that's just three hours of focused work, which may not seem like a lot, but if you were to dissect your day and kind of look at the arc and structure of your day, I'd be willing to bet that if we added up the total period of time in which you were in deep work, really focused, dedicated work, that it would probably amount to about 3 or 4 hours. And of course, throughout the day, there are other things happening outside of those 90-minute work blocks. I'm checking my text messages. I'm checking my email. I'm responding to various demands. I'm working and tending to life. So, while I've carved some boundaries or delineated some boundari

### Generating Output

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")
ai_msg = llm.invoke(prompt)
ai_msg.content

E0000 00:00:1759757057.893446 1252069 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


'Hello there! As your productivity coach, I\'m thrilled to break down the valuable insights from the provided text into a clear, actionable 3-step daily routine designed to significantly boost your efficiency and focus.\n\nThe core idea here is to intentionally structure your day to align with your natural energy cycles and prioritize deep, impactful work. Let\'s dive into a routine that leverages these principles:\n\n---\n\n### Your 3-Step Daily Routine to Unlock Peak Productivity:\n\nThis routine is crafted to optimize your focus and energy, ensuring you tackle your most important tasks when your brain is at its best.\n\n---\n\n#### **Step 1: The Intentional Start – Wake Up & Walk It Out**\n\nYour morning sets the tone for your entire day, and the text emphasizes a very deliberate beginning.\n\n*   **Wake-Up & Record (Around 6:00 - 6:30 AM):** The first thing mentioned is waking up around 6:00-6:30 AM and immediately recording your wake-up time. While the text mentions this is to tra

### Implementing Langchain

Every Runnable component is designed to perform a specific task: it accepts an input, processes it, and returns an output in a consistent and predictable manner. A chain composed of multiple Runnables is also a Runnable itself, meaning it can be invoked, streamed, or further combined with other components.

**Core concepts of a Runnable**
* **Standard interface:** All Runnables expose standard methods like invoke, batch, and stream for handling single inputs, multiple inputs, or streaming outputs.
* **Composability:** You can chain Runnables together using the pipe | operator, where the output of one component automatically becomes the input for the next.
* **Asynchronous support:** All Runnables can be executed asynchronously, allowing for efficient, concurrent processing. 


In [12]:
from langchain_core.runnables import RunnableParallel , RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [13]:
def format_docs(relevant_chunks):
   context_text = "\n\n".join(doc.page_content for doc in relevant_chunks)
   return context_text


In [28]:
''' 
RunnableParallel:	Executes multiple Runnables concurrently with the same input, collecting their outputs into a single dictionary.

RunnableLambda:	Wraps any standard Python function, allowing you to insert custom logic into a chain.

RunnablePassthrough:	Acts as an identity function, passing its input through unchanged. It's often used with RunnableParallel to preserve the original input for later steps.
'''

parallel_chain = RunnableParallel({
   'context':retriever | RunnableLambda(format_docs),
   'query': RunnablePassthrough() # This tells the chain to take the original input (the query "What is a runnable?") and pass it to this key
})

In [29]:
parallel_chain.invoke("Define productivity in 5 words")

{'context': "ensure that it happens and that it happens with the highest degree of efficiency even though that morning 90minute work block is so vital. Of course there's a second work block. So, combined that's just three hours of focused work, which may not seem like a lot, but if you were to dissect your day and kind of look at the arc and structure of your day, I'd be willing to bet that if we added up the total period of time in which you were in deep work, really focused, dedicated work, that it would probably amount to about 3 or 4 hours. And of course, throughout the day, there are other things happening outside of those 90-minute work blocks. I'm checking my text messages. I'm checking my email. I'm responding to various demands. I'm working and tending to life. So, while I've carved some boundaries or delineated some boundaries around those work blocks, and I'm certain that if you do too, you will benefit from them. Of course, please adapt and modify what I've described today 

In [None]:
'''
StrOutputParser(): is the simplest and most common output parser in LangChain. Its purpose is to take the output from a language model, which can be a complex object like a BaseMessage, and extract the plain string content. This ensures the final output of a chain is a simple, usable string that can be easily displayed, logged, or passed to the next component.
'''

parser = StrOutputParser()

In [37]:
main_chain = parallel_chain | prompt_template | llm | parser

In [None]:
main_chain.invoke("summarize the video .")

'Hello there! As your productivity coach, I\'m thrilled to break down the essence of a highly effective daily routine for you. What we\'ve just explored highlights some incredibly powerful strategies for maximizing focus and efficiency, not just in your work, but in how you navigate your entire day.\n\nIt\'s not about working *more* hours, but about working *smarter* and with deeper intention. By understanding your brain\'s natural rhythms and setting up your day strategically, you can achieve remarkable results.\n\nLet\'s distill these insights into a practical, 3-step daily routine designed to significantly boost your productivity:\n\n---\n\n### Your 3-Step Daily Routine for Enhanced Productivity:\n\n#### **Step 1: The Energized Start & Your Cognitive Peak**\n\nThis step is all about setting the stage for a highly productive day by leveraging your natural morning energy and tackling your most demanding tasks when your brain is freshest.\n\n*   **Wake Up & Get Moving (Optic Flow):** S