# YouTube ChatBot (RAG)
Chat with any YouTube video using transcript + embeddings + retrieval


In [None]:
import re
from youtube_transcript_api import YouTubeTranscriptApi

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS



  from .autonotebook import tqdm as notebook_tqdm


In [33]:
from langchain_huggingface import HuggingFaceEndpoint , ChatHuggingFace

In [2]:
# Model name passed to HuggingFaceEmbeddings in create_vector_store.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# chunk_size / chunk_overlap handed to RecursiveCharacterTextSplitter in split_text.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
# Value of "k" in the retriever's search_kwargs (see get_retriever).
TOP_K = 3


In [3]:
def extract_video_id(youtube_url: str) -> str:
    """Extract the 11-character video ID from a YouTube URL.

    The ID is taken from the first place in the URL where exactly eleven
    ID characters (letters, digits, ``_`` or ``-``) follow either a ``v=``
    query parameter or a ``/`` path separator.

    Args:
        youtube_url: URL of the video, e.g. ``https://youtu.be/<id>`` or
            ``https://www.youtube.com/watch?v=<id>``.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be found in ``youtube_url``.
    """
    # The word boundary before "v=" only accepts a parameter literally named
    # "v" (not e.g. "rev="). The trailing negative lookahead rejects runs
    # longer than eleven ID characters, so an unrelated path segment such as
    # "/subscriptions" is not silently truncated into a bogus ID.
    pattern = r"(?:\bv=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])"
    match = re.search(pattern, youtube_url)
    if match is None:
        raise ValueError("Invalid YouTube URL")
    return match.group(1)


In [4]:
# def load_youtube_transcript(video_id: str) -> str:
#     transcript = YouTubeTranscriptApi.get_transcript(video_id)
#     return " ".join(segment["text"] for segment in transcript)

def load_youtube_transcript(video_id: str) -> str:
    """Fetch the transcript for ``video_id`` and return it as a single string.

    The ``text`` of every fetched transcript item is joined with single
    spaces, in the order the items are returned.
    """
    snippets = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(snippet.text for snippet in snippets)


In [5]:
def split_text(text: str):
    """Split ``text`` into documents using the module's chunking settings.

    Returns whatever ``RecursiveCharacterTextSplitter.create_documents``
    produces for the single input text.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    chunks = chunker.create_documents([text])
    return chunks


In [6]:
def create_vector_store(documents):
    """Embed the given documents and index them in a FAISS vector store.

    Args:
        documents: Iterable of LangChain ``Document`` objects (for example
            the output of ``split_text``).

    Returns:
        A ``FAISS`` vector store built from the documents.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    docs = list(documents)
    if not docs:
        # Fail early with a clear message: there is nothing to embed or
        # search, so building an index would be meaningless.
        raise ValueError("Cannot build a vector store from zero documents")

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    # from_documents indexes each document's metadata alongside its text;
    # passing only page_content to from_texts would drop it.
    return FAISS.from_documents(docs, embeddings)


In [7]:
def get_retriever(vector_store):
    """Return a similarity retriever over ``vector_store`` with ``k=TOP_K``."""
    search_options = {"k": TOP_K}
    retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=search_options,
    )
    return retriever


In [15]:
def load_llm():
    """Build the chat model used to answer questions.

    Wraps a ``HuggingFaceEndpoint`` for the Mistral-7B-Instruct-v0.2 repo in
    a ``ChatHuggingFace`` chat interface and returns the wrapper.
    """
    endpoint_options = {
        "repo_id": "mistralai/Mistral-7B-Instruct-v0.2",
        "temperature": 0,
        "max_new_tokens": 512,
    }
    endpoint = HuggingFaceEndpoint(**endpoint_options)
    return ChatHuggingFace(llm=endpoint)

In [11]:
# =========================
# Answer Generation (LLM)
# =========================

def generate_answer(llm, question: str, retrieved_docs):
    """Ask ``llm`` to answer ``question`` from the retrieved passages.

    Args:
        llm: Object exposing ``invoke(prompt)``.
        question: The user's question, inserted verbatim into the prompt.
        retrieved_docs: Iterable of objects with a ``page_content`` attribute;
            their contents are joined with blank lines to form the context.

    Returns:
        Whatever ``llm.invoke`` returns for the assembled prompt.
    """
    passages = [doc.page_content for doc in retrieved_docs]
    context = "\n\n".join(passages)

    # Assembled piecewise; the resulting text (including the leading and
    # trailing newline) is what gets sent to the model.
    prompt = (
        "\n"
        "You are a helpful assistant.\n"
        "Answer the question using ONLY the context below.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{question}\n"
        "\n"
        "Answer:\n"
    )

    return llm.invoke(prompt)


In [31]:
# =========================
# Main Pipeline
# =========================

def run_youtube_chatbot(youtube_url: str, question: str):
    """Answer ``question`` about the video at ``youtube_url``.

    Runs the whole pipeline on every call: the transcript is fetched, split
    and indexed, the chunks retrieved for the question are collected, and
    the chat model is loaded and asked to answer from them.

    Returns:
        The value returned by ``generate_answer``.
    """
    # Indexing: URL -> transcript -> chunks -> vector store.
    transcript = load_youtube_transcript(extract_video_id(youtube_url))
    store = create_vector_store(split_text(transcript))

    # Retrieval: chunks the retriever returns for the question.
    relevant_chunks = get_retriever(store).invoke(question)

    # Generation: the model is loaded only after retrieval has succeeded.
    return generate_answer(
        llm=load_llm(),
        question=question,
        retrieved_docs=relevant_chunks,
    )


In [None]:
# Example run: ask one question about a single video.
youtube_url = "https://youtu.be/gIrMptNPf5M?si=UJ0D5PUVHP5MCyYL"
question = "What is this video about?"

# NOTE: despite its name, `docs` receives the value returned by
# run_youtube_chatbot (the generated answer), not retrieved documents.
docs = run_youtube_chatbot(youtube_url, question)
print(docs)



content=' This video is about optimization of algorithms, specifically for finding the sum of a subarray with a minimum value and the time and space complexity analysis of different approaches. The speaker emphasizes the importance of understanding concepts and keeping them intact, and provides an analysis of the time and space complexity for various methods, including one with nested loops. The video references a previous problem in the same playlist which is related to finding the minimum from every subarray.' additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 89, 'prompt_tokens': 554, 'total_tokens': 643}, 'model_name': 'mistralai/Mistral-7B-Instruct-v0.2', 'system_fingerprint': '', 'finish_reason': 'stop', 'logprobs': None} id='lc_run--019b1c05-65f2-7c81-99d3-18124cbee0eb-0' usage_metadata={'input_tokens': 554, 'output_tokens': 89, 'total_tokens': 643}


In [42]:
# Inspect what run_youtube_chatbot returned: the object's type and the type
# of its `content` attribute.
print(type(docs))
print(type(docs.content))

<class 'langchain_core.messages.ai.AIMessage'>
<class 'str'>
