In [2]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

##Step 1a - Indexing (Document Ingestion)

In [4]:
video_id = "Gfr50f6ZBvo" 
transcript_list = YouTubeTranscriptApi().fetch(video_id=video_id).to_raw_data()
print("Transcript fetched successfully.")

# Convert the transcript to a string format
transcript_text = " ".join([entry['text'] for entry in transcript_list])
print(f"Transcript text: {transcript_text[:100]}...")  # Print first 100 characters for brevity

Transcript fetched successfully.
Transcript text: the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has...


Step 1b - Indexing (Text Splitting)

In [6]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript_text])

print(f"Number of chunks created: {len(chunks)}")

Number of chunks created: 168


Step 1c and 1d - Indexing (Embedding generation and storing in Vector store)

In [7]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = FAISS.from_documents(chunks, embeddings)

In [8]:
vector_store.save_local("faiss_YT_index")

In [10]:
vector_store.index_to_docstore_id
vector_store.get_by_ids(["5a9d2117-0453-4a48-9bdc-06773b988cca"])

[Document(id='5a9d2117-0453-4a48-9bdc-06773b988cca', metadata={}, page_content='demas establish to support this podcast please check out our sponsors in the description and now let me leave you with some words from edskar dykstra computer science is no more about computers than astronomy is about telescopes thank you for listening and hope to see you next time')]

Step2 - Retrieval

In [12]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001C69FE9A270>, search_kwargs={'k': 4})

In [14]:
retriever.invoke("What is deepmind")

[Document(id='cbd2f40d-00dc-4e86-b1aa-fbf888400752', metadata={}, page_content="the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a bit of a personal qu

Step 3 - Augmentation

In [15]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.4)
prompt = PromptTemplate(
    template="You are a helpful assistant. " \
              "Answer ONLY from the provided transcript context." \
              "If the contect is insufficient, just say you don'f know." \
              "\n\n {context}" \
              "\n\nQuestion: {question}",
    input_variables=["context", "question"]
)

In [24]:
question = "is the topic of bmw cars discussed in the video? if yes then what was discussed?"
retrieved_docs = retriever.invoke(question)
print(f"Retrieved documents: {retrieved_docs}")

# Combine the retrieved documents into a single context string
context = "\n\n".join([doc.page_content for doc in retrieved_docs])
print(f"Context for question: {context[:500]}...")  # Print first 500 characters for brevity


Retrieved documents: [Document(id='cbd2f40d-00dc-4e86-b1aa-fbf888400752', metadata={}, page_content="the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and now dear friends here's demis hassabis let's start with a

In [25]:
#final prompt
final_prompt = prompt.invoke({"context": context, "question": question})
print(f"Final prompt: {final_prompt}") # Print first 500 characters for brevity

Final prompt: text="You are a helpful assistant. Answer ONLY from the provided transcript context.If the contect is insufficient, just say you don'f know.\n\n the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible artificial intelligence systems in the history of computing including alfred zero that learned all by itself to play the game of gold better than any human in the world and alpha fold two that solved protein folding both tasks considered nearly impossible for a very long time demus is widely considered to be one of the most brilliant and impactful humans in the history of artificial intelligence and science and engineering in general this was truly an honor and a pleasure for me to finally sit down with him for this conversation and i'm sure we will talk many times again in the future this is the lex friedman podcast to support it please check out our sponsors in the description and

Step 4 - Generation

In [26]:
answer = llm.invoke(final_prompt)
print(f"Answer: {answer}")  # Print the final answer

Answer: content="I don't know." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 697, 'total_tokens': 701, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-C6OfzJuUoVg8b3ScCkqaw8HFqjUID', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--31c0b75e-73d9-4e52-b606-3b110d86727d-0' usage_metadata={'input_tokens': 697, 'output_tokens': 4, 'total_tokens': 701, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
