# Youtube Video Chatbot

In [1]:
import os
import secret_key
os.environ['GROQ_API_KEY'] = secret_key.groq_key
os.environ["LANGCHAIN_TRACING_V2"] = 'True'
os.environ["LANGCHAIN_API_KEY"] = secret_key.langchain_key

In [2]:
from langchain_groq import ChatGroq

llm = ChatGroq(model = 'llama-3.1-70b-versatile', temperature = 0.5)

In [75]:
llm.invoke("Write a short poem on love")

AIMessage(content="Soft whispers in the night,\nA gentle touch, a loving light,\nTwo hearts beat as one, a flame,\nLove's warmth that time cannot claim.\n\nIn eyes that shine like stars above,\nA love so strong, a tender love,\nIt guides us through life's joys and fears,\nAnd wipes away our bitter tears.\n\nWith every breath, it grows anew,\nA love that's pure, and forever true,\nA bond that's strong, a love that's free,\nA treasure that's meant to be.", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 104, 'prompt_tokens': 41, 'total_tokens': 145, 'completion_time': 0.419413639, 'prompt_time': 0.01040309, 'queue_time': 0.8908361220000001, 'total_time': 0.429816729}, 'model_name': 'llama-3.1-70b-versatile', 'system_fingerprint': 'fp_b3ae7e594e', 'finish_reason': 'stop', 'logprobs': None}, id='run-7f5d71f4-b182-4734-8225-a690f463476f-0', usage_metadata={'input_tokens': 41, 'output_tokens': 104, 'total_tokens': 145})

## Getting the Transcript and Summarizing it

In [64]:
from langchain.prompts import PromptTemplate

summary_prompt = PromptTemplate(
    input_variables = ['transcripts'],
    template = """You are a YouTube video explainer. You will recieve a transcript for a video and your task is to summarize it for others to understand. 
    You will output the complete summary of the transcript without missing any detail.
    Here is the transcript:\n {transcript}"""
)

In [72]:
from langchain_core.output_parsers import StrOutputParser
summary_chain = summary_prompt | llm

In [38]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_transcript(video_url):
    video_id = video_url.split("v=")[1]  # Extract video ID from URL
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_text = " ".join([t['text'] for t in transcript])
    return transcript_text

transcript = get_transcript("https://www.youtube.com/watch?v=tuEYQvEYR-M")

In [70]:
ans = summary_chain.invoke({'transcript':transcript})

We now need to split this transcript in chunks so that we can embed and store it in out vector db.

In [41]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_text(transcript)

## Initializing Pinecone

In [13]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key = 'b670b069-a28b-4b19-85bf-bde5481cc688')

In [78]:
index_name = 'youtube-video-chatbot-trial'

if index_name not in pc.list_indexes():
    pc.create_index(
        name = index_name,
        dimension=384,
        metric='cosine',
        spec = ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [15]:
index = pc.Index(index_name)

## Storing in VectorDB

In [43]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")

for i, chunk in enumerate(chunks):
        embeddings = embedder.encode(chunk)
        index.upsert([(str(i), embeddings.tolist())])

## Querying the VectorDB

In [51]:
question = 'What is the mechanism of levitation?'

query_embedding = embedder.encode(question)

In [52]:
vec_embed_list = query_embedding.tolist()

result = index.query(vector = vec_embed_list, top_k=5, include_values=True)

In [53]:
result

{'matches': [{'id': '9',
              'score': 0.451150805,
              'values': [-0.0930360109,
                         -0.0444561951,
                         0.0274823736,
                         0.049454283,
                         0.0488982387,
                         0.0144062527,
                         0.00230381568,
                         0.0383606441,
                         -0.0346764587,
                         0.0133797657,
                         -0.035236802,
                         -0.00413212925,
                         0.0450203829,
                         0.0460022092,
                         -0.0411134064,
                         0.0173908342,
                         0.0576756559,
                         -0.0096150646,
                         -0.0224970877,
                         0.0260922089,
                         0.0748292506,
                         -0.078463912,
                         -0.012760465,
                         0.0203639

In [54]:
chunk_ids = [match['id'] for match in result['matches']]

In [55]:
chunk_ids

['9', '8', '3', '11', '4']

In [56]:
relevant_chunks = [chunks[int(i)] for i in chunk_ids]   
relevant_chunks

['diamagnetism water is a little diamagnetic so we can actually levitate small animals with this effect diamagnetism simply means that when the material becomes magnetic when placed in a magnetic field its own magnetic field is in the opposite direction magnetic fields cannot be canceled so these materials just redirect the lines around themselves this results in a measurable repulsive force between the material and the magnetic field quantifying this effect involves suspending the material with a force caliper in a magnetic field and measuring the change in weight caused by the repulsion this is what the graph would look like when quantifying the diomagnetism of a superconductor a straight line at high temperatures but a sharp change at their critical temperature becoming a perfect diamagnus at this point all magnetic field lines are expelled outside the material and the material just levitates above the magnet this is called the Meisner effect but even though we have tests like this'

In [57]:
context = " ".join(relevant_chunks)

## Getting Final Answer

In [86]:
qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""You are an answering bot. You recieve a context and question. The context is taken from a youtube video transcript. 
        You have to answer the user question directly without any additional text other than answer to the question.
        However this doesnt mean that you always have to answer in short. Make use of the context and question to determine how long your answer should be.
        Just make sure to not include any irrelevant text in the answer. 
        Here is the context: {context}, 
        and the question: {question}""",
    )

qa_chain = qa_prompt | llm

In [58]:
final_ans = qa_chain.invoke({'context':context, 'question':question})

In [59]:
print(final_ans.content)

The mechanism of levitation is diamagnetism. Diamagnetism is a property of materials that become magnetic when placed in a magnetic field, but their own magnetic field is in the opposite direction. This results in a measurable repulsive force between the material and the magnetic field, causing the material to levitate.
