In [248]:
!pip install langchain_groq



In [249]:
!pip install -q youtube-transcript-api langchain-community langchain-openai \
               faiss-cpu tiktoken python-dotenv

In [250]:
pip install langchain_huggingface langchain_community langchain_text_splitters



In [251]:
from langchain_huggingface import ChatHuggingFace,HuggingFaceEndpoint
from dotenv import load_dotenv
import os
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
load_dotenv()

hf_token=os.getenv("HF_TOKEN")

llm = HuggingFaceEndpoint(
    repo_id="moonshotai/Kimi-K2-Instruct",  # Use the correct model repo_id
    task="text-generation",  # You can adjust this based on your model task
    model_kwargs={"headers": {"Authorization": f"Bearer {hf_token}"}}
)

model=ChatHuggingFace(llm=llm)

In [252]:
from langchain_groq import ChatGroq
import os

groq_token = os.getenv("test_groq")
llm = ChatGroq(
    groq_api_key=groq_token,
    model_name="meta-llama/llama-4-scout-17b-16e-instruct"
)

# Use llm directly - no need for ChatHuggingFace wrapper
response = llm.invoke("Your prompt here")

In [253]:
from youtube_transcript_api import YouTubeTranscriptApi

api = YouTubeTranscriptApi()

# List available transcripts with types, dialects, and translation support
transcript_list = api.list("Gfr50f6ZBvo")

for t in transcript_list:
    print(
        f"- {t.language_code} "
        f"({'manual' if not t.is_generated else 'auto-generated'}) "
        f"{'[translatable]' if t.is_translatable else ''}"
    )


- en (auto-generated) [translatable]


In [254]:
from youtube_transcript_api import YouTubeTranscriptApi
print(dir(YouTubeTranscriptApi))  # Should list 'get_transcript'

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'fetch', 'list']


In [255]:
from youtube_transcript_api import YouTubeTranscriptApi

video_id = "Gfr50f6ZBvo"  # Example video

# Create an instance first
api = YouTubeTranscriptApi()

# Now call fetch()
transcript = api.fetch(video_id, languages=['en'])
print(transcript)

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='the following is a conversation with', start=0.08, duration=3.44), FetchedTranscriptSnippet(text='demus hasabis', start=1.76, duration=4.96), FetchedTranscriptSnippet(text='ceo and co-founder of deepmind', start=3.52, duration=5.119), FetchedTranscriptSnippet(text='a company that has published and builds', start=6.72, duration=4.48), FetchedTranscriptSnippet(text='some of the most incredible artificial', start=8.639, duration=4.561), FetchedTranscriptSnippet(text='intelligence systems in the history of', start=11.2, duration=4.8), FetchedTranscriptSnippet(text='computing including alfred zero that', start=13.2, duration=3.68), FetchedTranscriptSnippet(text='learned', start=16.0, duration=2.96), FetchedTranscriptSnippet(text='all by itself to play the game of gold', start=16.88, duration=4.559), FetchedTranscriptSnippet(text='better than any human in the world and', start=18.96, duration=5.6), FetchedTranscriptSnippet(text='alph

In [256]:
# Assuming 'transcript' is your FetchedTranscript object
transcript_dict_list = [
    {
        "text": snippet.text,
        "start": snippet.start,
        "duration": snippet.duration
    }
    for snippet in transcript.snippets
]

# Print the first 5 entries to verify
print(transcript_dict_list[:5])

[{'text': 'the following is a conversation with', 'start': 0.08, 'duration': 3.44}, {'text': 'demus hasabis', 'start': 1.76, 'duration': 4.96}, {'text': 'ceo and co-founder of deepmind', 'start': 3.52, 'duration': 5.119}, {'text': 'a company that has published and builds', 'start': 6.72, 'duration': 4.48}, {'text': 'some of the most incredible artificial', 'start': 8.639, 'duration': 4.561}]


In [257]:
import json
json_output = json.dumps(transcript_dict_list, indent=2)
print(json_output)

[
  {
    "text": "the following is a conversation with",
    "start": 0.08,
    "duration": 3.44
  },
  {
    "text": "demus hasabis",
    "start": 1.76,
    "duration": 4.96
  },
  {
    "text": "ceo and co-founder of deepmind",
    "start": 3.52,
    "duration": 5.119
  },
  {
    "text": "a company that has published and builds",
    "start": 6.72,
    "duration": 4.48
  },
  {
    "text": "some of the most incredible artificial",
    "start": 8.639,
    "duration": 4.561
  },
  {
    "text": "intelligence systems in the history of",
    "start": 11.2,
    "duration": 4.8
  },
  {
    "text": "computing including alfred zero that",
    "start": 13.2,
    "duration": 3.68
  },
  {
    "text": "learned",
    "start": 16.0,
    "duration": 2.96
  },
  {
    "text": "all by itself to play the game of gold",
    "start": 16.88,
    "duration": 4.559
  },
  {
    "text": "better than any human in the world and",
    "start": 18.96,
    "duration": 5.6
  },
  {
    "text": "alpha fold two

# 1.**Indexing** (Text Splitting)

In [258]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

In [259]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Extract just the text from your transcript
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,  # Focused paragraphs
    chunk_overlap=600,  # Preserves context
    separators=["\n\n", "\n", ". ", "? ", "! "]  # Split at natural boundaries
)

chunks = splitter.create_documents(texts)

In [260]:
len(chunks)

3790

In [261]:
chunks[0]

Document(metadata={}, page_content='the following is a conversation with')

# **1.Indexing (Embedding Generation and storing in vector store)**

In [262]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Lightweight model (good for CPU)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Higher quality (requires more RAM)
# embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")

vector_store=FAISS.from_documents(chunks,embedding=embeddings)

  return forward_call(*args, **kwargs)


In [263]:
vector_store.index_to_docstore_id

{0: '37e00221-24c8-48dd-bda0-e5fb03b95e95',
 1: 'bb1d7ee2-8611-4f27-aefd-6ef62b1c0924',
 2: '2d27cfd7-61e6-489b-9540-6c387d6c5da7',
 3: '06312a1e-f614-4e88-8747-a6bc302b797a',
 4: '1962371e-a11a-4d3e-afc8-920b1a80b261',
 5: '6e3c35a7-5285-4b93-91e3-3050feb37288',
 6: '596c9027-0f8b-4ae2-bffc-23051c278edf',
 7: 'd1b6e891-3805-485c-8e28-1b2fb98992f2',
 8: 'ba37f72a-63ea-4b62-af10-b8ade9c9859f',
 9: '494ed4bf-6cf8-48f7-b246-647a6e765a2d',
 10: '3c16a8e2-34d5-4793-81d0-9e2710969b0d',
 11: '1cf956ee-408d-4280-b23e-1385140e6689',
 12: 'eb05ff4c-ae74-4f34-be07-af8d457df32d',
 13: '0b43ed7c-f86b-4da5-a3a4-4d090637e95d',
 14: '3a58c044-a58c-46ff-9076-a9dea08ba82e',
 15: 'e8bc3a13-dc8b-408f-9673-64aecab51512',
 16: 'e7d0a6c6-3a07-411b-81f7-6bae006ef688',
 17: 'a811551a-8a95-4e7e-ada4-686b0107dbf7',
 18: '1994c6ec-a191-4c7f-9b1f-608192cfcb22',
 19: '564c04bc-8abd-4136-817a-0248cf82e013',
 20: '06e48ba9-28a2-42e0-bf71-fce1370957bd',
 21: '663febcb-e1b1-46a8-b7d9-c2c3b7172295',
 22: 'f68ff539-8f37-

In [264]:
vector_store.get_by_ids(['d87c9b26-c4c2-48de-9f68-40a4793677e0'])

[]

# **2.Retrieval**

In [265]:
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={
        'k': 8,  # More documents
        'lambda_mult': 0.6,  # Balanced diversity
        'fetch_k': 20  # Larger candidate pool
    }
)

In [266]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7dd680c30110>, search_type='mmr', search_kwargs={'k': 8, 'lambda_mult': 0.6, 'fetch_k': 20})

In [267]:
retriever.invoke('what is deepmind')

  return forward_call(*args, **kwargs)


[Document(id='60841619-7951-49b9-b7c3-30e50d7216f4', metadata={}, page_content='deepmind originally was a confluence of'),
 Document(id='45d1f6b5-1bfd-4b8c-a1bb-0880f676d9aa', metadata={}, page_content='deepmind um and my current thinking on'),
 Document(id='1c8c3320-8e1e-407f-bc6c-86fa0926920f', metadata={}, page_content='interesting position where deepmind is'),
 Document(id='c32da91d-f6bb-405a-a999-fa09f9d11b94', metadata={}, page_content='deepmind and what my career is being'),
 Document(id='4104bba1-b946-42b3-8977-9785c927fe2d', metadata={}, page_content='big goal for deepmind how much of it is'),
 Document(id='2d27cfd7-61e6-489b-9540-6c387d6c5da7', metadata={}, page_content='ceo and co-founder of deepmind'),
 Document(id='d5e5bb28-4951-4b6d-9fe8-3d229ccb48e7', metadata={}, page_content='innovated with at deepmind to encourage'),
 Document(id='a20a632e-0983-4301-97c6-56694910718d', metadata={}, page_content='start of deepmind was that we would use')]

# **3.Augmentation**

In [268]:
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7dd4edff16d0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7dd4e1171e90>, model_name='meta-llama/llama-4-scout-17b-16e-instruct', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [269]:
prompt = PromptTemplate(
    template="""
    Analyze this context thoroughly:
    {context}

    Question: {question}

    Guidelines:
    1. If the name appears multiple times, identify all relevant positions
    2. Connect related information across chunks
    3. State if information is incomplete
    """,
    input_variables=['context', 'question']
)

In [270]:
question="is the topic of aliens discussed in this video ? if yes then what was discussed?"
retrieved_docs=retriever.invoke(question)

  return forward_call(*args, **kwargs)


In [271]:
context_text=f" \n\n ".join(doc.page_content for doc in retrieved_docs)

In [272]:
final_prompt=prompt.invoke({"context":context_text,"question":question})

# **4.Generation**

In [273]:
retrieved_docs

[Document(id='4356e48c-f710-44ab-9056-48e117d8dec0', metadata={}, page_content="do you think there's a lot of alien"),
 Document(id='9383ad0c-522f-4a3b-9a15-6055608e2ab0', metadata={}, page_content='what we spoke about at the beginning'),
 Document(id='b2f21b2f-789b-4695-b512-816170e65180', metadata={}, page_content='and it was projected by the aliens or'),
 Document(id='5d5b30a6-69f4-4ec7-b0df-d96ff139a3b2', metadata={}, page_content='forever right and sci-fi has talked'),
 Document(id='2f34c400-e07a-4c94-974e-3dc12893b642', metadata={}, page_content='other explanation did was there um 2001'),
 Document(id='d6e62bc7-f088-4572-95f3-c770731f5937', metadata={}, page_content="dawning of the the space age uh we've"),
 Document(id='f32be65c-6e91-4763-8075-930e38894b23', metadata={}, page_content="aliens would say well we haven't really"),
 Document(id='ed8e2829-f8dc-40c1-bbdc-cc89889ca8c3', metadata={}, page_content='it was a it was a really interesting it')]

In [274]:
context_text

"do you think there's a lot of alien \n\n what we spoke about at the beginning \n\n and it was projected by the aliens or \n\n forever right and sci-fi has talked \n\n other explanation did was there um 2001 \n\n dawning of the the space age uh we've \n\n aliens would say well we haven't really \n\n it was a it was a really interesting it"

In [275]:
final_prompt

StringPromptValue(text="\n    Analyze this context thoroughly:\n    do you think there's a lot of alien \n\n what we spoke about at the beginning \n\n and it was projected by the aliens or \n\n forever right and sci-fi has talked \n\n other explanation did was there um 2001 \n\n dawning of the the space age uh we've \n\n aliens would say well we haven't really \n\n it was a it was a really interesting it\n    \n    Question: is the topic of aliens discussed in this video ? if yes then what was discussed?\n    \n    Guidelines:\n    1. If the name appears multiple times, identify all relevant positions\n    2. Connect related information across chunks\n    3. State if information is incomplete\n    ")

In [276]:
answer=llm.invoke(final_prompt)
print(answer)

content='After thoroughly analyzing the context, I can conclude that:\n\n**Yes, the topic of aliens is discussed in this video.**\n\nHere\'s a breakdown of the relevant information:\n\n1. The topic of aliens is first mentioned in the phrase "do you think there\'s a lot of alien".\n2. The speaker then seems to connect the idea of aliens to a previous conversation, mentioning "what we spoke about at the beginning".\n3. The speaker suggests that this previous conversation might have been "projected by the aliens".\n4. The discussion appears to be related to science fiction (sci-fi), which has explored the idea of aliens, as mentioned in the phrase "sci-fi has talked".\n5. The speaker also references a specific example, "2001: A Space Odyssey" (implied by "um2001"), which is a classic sci-fi movie that explores the theme of aliens and space.\n6. The speaker seems to be considering alternative explanations for something, mentioning "other explanation did was there".\n7. The aliens are quote

In [277]:
print(answer.content)

After thoroughly analyzing the context, I can conclude that:

**Yes, the topic of aliens is discussed in this video.**

Here's a breakdown of the relevant information:

1. The topic of aliens is first mentioned in the phrase "do you think there's a lot of alien".
2. The speaker then seems to connect the idea of aliens to a previous conversation, mentioning "what we spoke about at the beginning".
3. The speaker suggests that this previous conversation might have been "projected by the aliens".
4. The discussion appears to be related to science fiction (sci-fi), which has explored the idea of aliens, as mentioned in the phrase "sci-fi has talked".
5. The speaker also references a specific example, "2001: A Space Odyssey" (implied by "um2001"), which is a classic sci-fi movie that explores the theme of aliens and space.
6. The speaker seems to be considering alternative explanations for something, mentioning "other explanation did was there".
7. The aliens are quoted as saying "we haven't

# 5.Building a **Chain**

In [278]:
from langchain_core.runnables import RunnableParallel,RunnableSequence,RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda

In [279]:
def format_docs(retrieved_docs):
  context_text="\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [280]:
parallel_chain=RunnableParallel({
    'context':retriever | RunnableLambda(format_docs),
    'question':RunnablePassthrough()
})

In [281]:
parallel_chain.invoke('who is demis')

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


{'context': "here's demis\n\nceo and co-founder of deepmind\n\nand and actually levantal who is another\n\nthe world to judith polgar who obviously\n\nobviously nick bostrom i think famously\n\nconversation with demas establish to\n\nher to have a life they can be proud of\n\nconsciousness gravity",
 'question': 'who is demis'}

In [282]:
parser=StrOutputParser()

In [283]:
main_chain=parallel_chain | prompt| llm |parser

In [284]:
main_chain.invoke("Can you summarize the video")

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


'After thoroughly analyzing the context, here is a summarization of the information:\n\n**Summary:** The speaker appears to be discussing a video or animation that explains how something works. They express their enjoyment and amazement at seeing the animation, stating it was "incredible" and "pretty much amazing to see."\n\n**Key Points:**\n\n* The speaker mentions a video/animation that explains how something works.\n* They express their enthusiasm and appreciation for the video, thanking someone named "Lex".\n* The speaker is asked if they can summarize the video and explain how it works.\n* The speaker seems to be having a conversation with someone, possibly Lex, and is asked to analyze something, but they interrupt to say they have to run.\n\n**Positions of Names:**\n\n* "Lex" is mentioned once, as someone the speaker is conversing with and thanking.\n\n**Related Information:**\n\n* The speaker\'s enthusiasm and appreciation for the video/animation are connected across chunks, sug