In [1]:
# Necessary imports
import os
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.retrievers import ContextualCompressionRetriever, MultiQueryRetriever
from langchain_classic.retrievers.document_compressors import LLMChainExtractor
from langchain_pinecone import PineconeVectorStore
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings, ChatHuggingFace
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv

# Load environment variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face, Groq and Pinecone
# credentials as well as PINECONE_INDEX_NAME (read further down) — confirm
# against the project's .env file.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# ---- Model configuration ----
# Model identifiers are collected here so they can be swapped in one place.
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
COMPRESSOR_LLM_ID = "llama-3.1-8b-instant"
GENERATION_REPO_ID = "google/gemma-2-2b-it"

# Embedding model used to vectorise the transcript chunks
embedding_model = HuggingFaceEmbeddings(model=EMBEDDING_MODEL_ID)

# Chat model handed to the document compressor in the retrieval step
llm = ChatGroq(model=COMPRESSOR_LLM_ID)

# Text-generation endpoint, wrapped in a chat interface; used by the final chain
main_llm = HuggingFaceEndpoint(repo_id=GENERATION_REPO_ID, task="text-generation")
model = ChatHuggingFace(llm=main_llm)

### 1. INDEXING

In [3]:
# ============================== Step1: Document Ingestion ==============================
video_id = "Gfr50f6ZBvo"
transcript_languages = ["en", "hi"]  # tried in order of preference

try:
    api = YouTubeTranscriptApi()
    transcript_list = api.list(video_id=video_id)
    transcripts = transcript_list.find_transcript(transcript_languages).fetch()
except (TranscriptsDisabled, NoTranscriptFound) as exc:
    # Fail fast: without a transcript there is nothing to index. Merely printing
    # and continuing would hit a NameError on `result` below (fresh kernel) or,
    # worse, silently index a stale `result` left over from an earlier run.
    raise RuntimeError(
        f"No usable transcript for video id {video_id!r} "
        f"(languages tried: {transcript_languages})"
    ) from exc
# Any other exception (network, parsing, ...) is left to propagate with its
# full traceback instead of being reduced to a printed type name.

# Flatten the transcript snippets into a single string
result = " ".join(snippet.text for snippet in transcripts)
if not result.strip():
    raise ValueError(f"Transcript for video id {video_id!r} is empty")

# ============================== Step2: Text Splitting ==============================
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
# The transcript split into smaller, overlapping Document chunks
chunk = splitter.create_documents(texts=[result])

# ============================== Step3: Embedding and Storing into Vector Store ==============================
index_name = os.getenv("PINECONE_INDEX_NAME")
if not index_name:
    raise RuntimeError("PINECONE_INDEX_NAME is not set; add it to the environment or .env file")

# NOTE(review): re-running this cell presumably uploads the same chunks again
# and duplicates them in the index — confirm, and clear the index or reuse the
# existing one if that matters.
vector_store = PineconeVectorStore.from_documents(
    documents=chunk,
    embedding=embedding_model,
    index_name=index_name
)

### 2. RETRIEVAL

In [4]:
# Base retriever: MMR search over the vector store.
# The search parameters must be passed as `search_kwargs`; the previous
# `kwargs=` keyword is not the documented parameter, so the requested `k`
# was never handed to the search.
base_retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4}
)

# LLM-based compressor applied to each retrieved document for the given query
base_compressor = LLMChainExtractor.from_llm(llm=llm)

retriever = ContextualCompressionRetriever(
    base_compressor=base_compressor,
    base_retriever=base_retriever
)

query = "Is the topic of nuclear fusion discussed ? if yess what was the disscussion"

# Compressed documents retrieved for `query`
context = retriever.invoke(input=query)

print("========== Retrireved Documents: ==========\n")
for i, res in enumerate(context, start=1):
    print(f"Result {i}")
    print(res.page_content)


Result 1
in this case in fusion we we collaborated with epfl in switzerland the swiss technical institute who are amazing they have a test reactor that they were willing to let us use which you know i double checked with the team we were going to use carefully and safely i was impressed they managed to persuade them to let us use it and um and it's a it's an amazing test reactor they have there and they try all sorts of pretty crazy experiments on it and um the the the what we tend to look at is if we go into a new domain like fusion what are all the bottleneck problems uh like thinking from first principles you know what are all the bottleneck problems that are still stopping fusion working today and then we look at we you know we get a fusion expert to tell us and then we look at those bottlenecks and we look at the ones which ones are amenable to our ai methods today yes right and and and then and would be interesting from a research perspective from our point of view from an ai po

### 3. AUGMENTATION

In [5]:
# Prompt text for the answer step. `{result}` is the slot for the context the
# model may use and `{query}` is the slot for the user's question.
qa_template = (
    "\n"
    'Answer the question only using the context given below, if the context is not available return "I don\'t know"\n'
    "context: {result}\n"
    "question: {query}\n"
)

prompt = PromptTemplate(
    input_variables=["result", "query"],
    template=qa_template,
)

### 4. GENERATION

In [6]:
parser = StrOutputParser()

# prompt -> chat model -> plain string
chain = prompt | model | parser

# Feed the model the *retrieved* (compressed) documents, not the whole raw
# transcript. Passing the full transcript `result` bypasses the retrieval step
# and sends a very large prompt to the endpoint.
# NOTE(review): that oversized prompt is a likely cause of the BadRequestError
# previously seen here — confirm against the endpoint's input limits.
context_text = "\n\n".join(doc.page_content for doc in context)

# The prompt's context slot is named "result", so the key is kept as-is.
final_result = chain.invoke({"result": context_text, "query": query})
print(final_result)

BadRequestError: (Request ID: Root=1-69542fc1-1dd993a1617a3e26295df97c;9d401bac-483b-473b-8856-6457c7576900)

Bad request: