# Ask a YouTube video

### Install dependencies

In [90]:
# Uncomment the lines below to install the notebook's dependencies.
#!pip install langchain
#!pip install langchain-community
#!pip install python-dotenv
#!pip install langchain-openai
#!pip install weaviate-client
#!pip install openai
#!pip install youtube_transcript_api

### Load .env variables

In [1]:
from dotenv import load_dotenv
import os

# Pull the variables declared in the .env file into the process environment
load_dotenv()

# Credentials used further down; each one is None when the variable is not set
YOUR_OPENAI_KEY = os.environ.get("YOUR_OPENAI_KEY")
YOUR_WEAVIATE_KEY = os.environ.get("YOUR_WEAVIATE_KEY")
YOUR_WEAVIATE_URL = os.environ.get("YOUR_WEAVIATE_URL")

# 1. Fetch the video transcript

In [8]:
from youtube_transcript_api import YouTubeTranscriptApi

def get_video_id(youtube_link: str) -> str:
    """
    Extract the video id from a YouTube link.

    Handles watch links (``.../watch?v=<id>``) as well as path-based links
    (``youtu.be/<id>``, ``.../live/<id>``, ...). Extra query parameters
    (``&t=42s``, ``?si=...``) and fragments are not part of the id and are
    dropped. A bare video id is also accepted.

    :param youtube_link: the youtube link
    :return: the video id
    """
    # Local import keeps this cell self-contained
    from urllib.parse import parse_qs, urlparse

    link = youtube_link.strip()
    parsed = urlparse(link)

    # Watch-style links carry the id in the "v" query parameter
    ids_in_query = parse_qs(parsed.query).get("v")
    if ids_in_query:
        return ids_in_query[0]

    # Other formats end their path with the id; urlparse has already split
    # the query string and the fragment away from the path
    path_segments = [segment for segment in parsed.path.split("/") if segment]
    if path_segments:
        return path_segments[-1]

    return link

def fetch_transcript(youtube_link: str, languages=("en", "fr")) -> str:
    """
    Fetch the transcript of a YouTube video as a single string.

    :param youtube_link: the youtube link
    :param languages: language codes handed to ``YouTubeTranscriptApi.get_transcript``;
        defaults to English and French
    :return: the transcript of the video, its caption lines joined by single spaces
    """
    video_id = get_video_id(youtube_link)

    # The API receives a list, whatever iterable of codes the caller supplied
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=list(languages))

    return " ".join(line["text"] for line in transcript)

In [9]:
# Videos to index, one URL per entry
video_urls = [
    "https://www.youtube.com/live/RXeOiIDNNek?si=DjyZK7jnuCYbbXg3",  # WWDC 2024 Keynote
]

# One transcript string per video, in the same order as video_urls
video_transcripts = list(map(fetch_transcript, video_urls))

In [10]:
# Display the fetched transcripts (one string per entry of video_urls)
video_transcripts

["big day everyone biggest day of the year who's fired up Beth is fired up doctor Rockwell one more thing just have fun out there okay Phil I'm getting too old for this stuff [Music] go go it's [Music] showtime the yeah [Music] yeah wow that was so cool good morning welcome to Apple Park we're glad you could join us for what promises to be an action-packed and memorable WWDC WWDC marks a moment in the year when we're able to celebrate our Global developer Community developers continue to amaze us with the apps they create for our products apps that are used by over a billion people around the world it's important for us to provide this community with the newest tools and Technologies to do their very best work today we're going to have some incredible updates to our platforms and I'm excited that we'll introduce profound new intelligence capabilities that we hope will inspire developers Delight users and make our platforms even smarter and more useful than ever before we get into our p

# 2. Split transcript into chunks

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks hold at most 800 characters and overlap their neighbour by 100 characters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)

# One list of chunks per transcript, in the same order as video_transcripts
split_transcripts = list(map(text_splitter.split_text, video_transcripts))

# 3. Vector DB Storage

In [12]:
import weaviate
import weaviate.classes as wvc
from langchain.vectorstores import Weaviate

# Connect to Weaviate (v4 client); the OpenAI key is forwarded as a request
# header for the OpenAI vectorizer configured on the collection below
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=YOUR_WEAVIATE_URL,
    auth_credentials=weaviate.auth.AuthApiKey(api_key=YOUR_WEAVIATE_KEY),
    headers={
        "X-OpenAI-Api-Key": YOUR_OPENAI_KEY
    }
)

# Every call that talks to the cluster sits inside the try block so the
# connection is closed even when the readiness check or the cleanup fails
try:
    print("Client is ready: ", client.is_ready())

    # Drop only this notebook's collection from a previous run; any other
    # collection stored on the same cluster is left untouched
    client.collections.delete("Transcript")

    # Create the Transcript collection
    collection = client.collections.create(
        name="Transcript",
        vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
            model="text-embedding-3-small"  # Specify the model
        ),
        properties=[
            wvc.config.Property(
                name="text",
                data_type=wvc.config.DataType.TEXT,  # Data type for the property
                description="Text of the transcript"  # Description of the property
            ),
        ]
    )
finally:
    client.close()

# Deprecated way to store using vectorstore: the LangChain wrapper is given a
# v3-style `weaviate.Client`, which is what triggers the deprecation notice.
# Chunks are stored under the "text" property; "source" is returned as metadata.
client = weaviate.Client(url=YOUR_WEAVIATE_URL, auth_client_secret=weaviate.auth.AuthApiKey(YOUR_WEAVIATE_KEY), additional_headers={"X-OpenAI-Api-Key": YOUR_OPENAI_KEY})
vectorstore = Weaviate(client, "Transcript", "text", attributes=["source"])

Client is ready:  True


            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [13]:
# Load text into the vector store

# Pair every chunk with the URL of the video it was taken from
text_meta_pair = [
    (chunk, {"source": video_url})
    for chunks, video_url in zip(split_transcripts, video_urls)
    for chunk in chunks
]

# Unzip the pairs into parallel sequences of texts and metadata dicts
texts, metas = zip(*text_meta_pair)
vectorstore.add_texts(texts, metas)

['c4cdd1fe-4bcf-4144-891f-f6fa06dbf557',
 '8f8bad77-d521-4aec-8975-e1233c703987',
 'f2d7c33e-655d-4759-8c30-a417552dcd77',
 'fdae768e-42d5-41d9-a61b-309b2cc4959f',
 '09c9c89b-78fc-41df-a2bd-8bb2dd50eb98',
 '993c9e95-173b-4b4a-913e-d78e44d4279a',
 '35f31e4c-c3d7-4ab4-89ff-8472a9754d1d',
 '2ef69472-2839-401e-840e-19a4b788197d',
 'a6a2078f-0760-4386-98d7-d789b63ae96f',
 '95ddf3cc-97ea-488c-b192-b5bc76105f75',
 '2d956691-3811-4d22-b0a3-0d8494b45111',
 'c2c446b5-f543-49e0-aad3-9e32a08a5f44',
 'fdcad8d7-7e64-435b-bfc9-fbeab1451c97',
 '303554da-6f5e-4233-a5cc-a5be409c517c',
 'ea89ed9d-f755-435a-a46c-dd2175253d68',
 '603354f1-4976-4767-8fa8-9bfa28e204c7',
 '58a4bd74-3a9a-4a6f-9755-da57bf8c33fb',
 'f0dd3615-6306-492d-b06a-42e656115906',
 'b194ad67-9325-4471-8e1a-c7c0057e7864',
 '61f6630b-1dab-4f6e-86b6-8b8de3f9b7a4',
 'b760472d-7309-4856-82b2-ea19cb5ae7f3',
 '6eab8068-cb3b-44b5-9619-d769b5daa8cd',
 '8b83b64a-0724-42e6-b0d6-d399feedd5cf',
 'f352ef23-a71b-49f2-a3fb-873e918888ec',
 'a54d775f-fdc0-

# 4. Similarity Search

In [22]:
# query = "What's new on ios18 ? "
# query = "Réponds en Français. Tell me about Apple Intelligence."
# query = "Have you heard about a way to create custom emojis ?"
query = "Do you think that all that AI in our everyday life devices could be harmful ?"

# Retrieve the 10 chunks most related to the query.
# `k` is the keyword that sets the number of results; `top_k` is not a
# parameter of `similarity_search`, so it would leave the default count in place.
vect = vectorstore.similarity_search(query, k=10)
vect

[Document(page_content="wondering if it's going to prevent me from getting to my daughter's play performance on time Apple intelligence can process the relevant personal data to assist me it can understand who my daughter is the play detail she sent several days ago the time and location for my meeting and predicted traffic between my office and the theater understanding this kind of personal context is essential for delivering truly helpful intelligence but it has to be done right you should not have to hand over all the details of your life to be warehoused and analyzed in someone's AI cloud with apple intelligence powerful intelligence goes hand inand with powerful privacy let me tell you more about its architecture and how it's built with privacy at the core the Cornerstone of the personal intelligence", metadata={'source': 'https://www.youtube.com/live/RXeOiIDNNek?si=DjyZK7jnuCYbbXg3'}),
 Document(page_content="and how it's built with privacy at the core the Cornerstone of the per

# 5. My Custom Chatbot

In [23]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Language model that writes the answer (low temperature)
llm = OpenAI(temperature=0.1, openai_api_key=YOUR_OPENAI_KEY)

# Question-answering chain of type "stuff" built on that model
chain = load_qa_chain(llm, chain_type="stuff")

/home/dup/Documents/Pro/InsideBoardAI/AskYourVideo/.venv/lib/python3.12/site-packages/pydantic/main.py:1070: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/


" I don't know. It depends on how it is implemented and used. The context provided suggests that Apple is prioritizing privacy in their AI technology, which could potentially mitigate any potential harm. However, it is important for companies to be transparent and ethical in their use of AI to ensure it is not harmful to individuals or society as a whole."

In [None]:
# Answer the query using the retrieved chunks as input documents
qa_inputs = {"question": query, "input_documents": vect}
answer = chain.run(**qa_inputs)
