In [None]:
%pip install youtube-transcript-api google-generativeai chromadb

In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

import google.generativeai as genai

import chromadb
from chromadb.utils import embedding_functions

import os

  from .autonotebook import tqdm as notebook_tqdm


Gemini API
https://ai.google.dev/pricing
https://github.com/johnnycode8/chromadb_quickstart/blob/main/README.md

In [None]:

# Read the Gemini API key from the GEMINI_API_KEY environment variable.
# The key is never hard-coded in the notebook; export it in the environment
# that launches the kernel.
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

# Fail fast with an actionable message: os.getenv() returns None for an unset
# variable, and passing None on would only surface as an error in a later cell.
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY environment variable is not set; "
        "set it before starting the kernel and re-run this cell."
    )

genai.configure(api_key=GEMINI_API_KEY)


# Instantiate the Gemini model used by the later cells for text generation.
genai_model = genai.GenerativeModel('models/gemini-1.5-flash')

# On-disk Chroma client; its data is kept under the "my_vectordb" path.
chroma_client = chromadb.PersistentClient(path="my_vectordb")

# Embedding function handed to the collection below, configured with the same API key.
gemini_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=GEMINI_API_KEY)

# Open the 'yt_notes' collection, creating it on first use.
chroma_collection = chroma_client.get_or_create_collection(name='yt_notes', embedding_function=gemini_ef)

In [None]:
# Example videos to try (the video ID is the part after "youtu.be/"):
#   https://youtu.be/IdLSZEYlWVo
#   https://youtu.be/tL-wnMVyTQI
#   https://youtu.be/etSdP9CFmko
#   https://youtu.be/rgRIZDsEwCk
#   https://youtu.be/_EA-74yr5D4

# ID of the video processed by the cells below; it is also used as the
# record ID when the notes are stored in the Chroma collection.
yt_video_id: str = "hQH4-5o0BMM"

# Instruction text that is prepended to the transcript when requesting notes.
prompt: str = 'Extract key notes from video transcript: '



In [None]:
# Reference: https://github.com/jdepoix/youtube-transcript-api
# Acceptable transcript languages (per the library docs, listed in priority order).
transcript_languages = ['en', 'en-US', 'en-GB']

# Newer youtube-transcript-api releases expose an instance-level fetch() in
# place of the static get_transcript() helper. Use whichever this installed
# version provides so the cell runs on both old and new releases.
if hasattr(YouTubeTranscriptApi, 'fetch'):
    transcript_segments = YouTubeTranscriptApi().fetch(yt_video_id, languages=transcript_languages)
else:
    transcript_segments = YouTubeTranscriptApi.get_transcript(yt_video_id, languages=transcript_languages)

# Flatten the fetched segments into plain text. The raw segments keep their
# own name so `transcript` is only ever the formatted string.
transcript = TextFormatter().format_transcript(transcript_segments)

# Keep a copy on disk for inspection. The encoding is set explicitly because
# the platform default may be unable to represent every character in a transcript.
with open("temp_transcript.txt", "w", encoding="utf-8") as file:
    file.write(transcript)




In [None]:
# API reference: https://ai.google.dev/api/generate-content
# Send the instruction text followed by the whole transcript as a single,
# non-streaming request.
notes_request = prompt + transcript
response = genai_model.generate_content(notes_request, stream=False)

# Save the generated text to disk so it can be checked by hand.
# NOTE(review): the file is written with the platform default encoding; the
# cell that reads temp_notes.txt back must use the same one.
with open("temp_notes.txt", "w") as notes_file:
    notes_file.write(response.text)

# Review temp_notes.txt, edit if necessary

In [None]:
# Read the notes back from disk (they may have been edited by hand).
with open("temp_notes.txt", "r") as notes_file:
    notes = notes_file.read()

# Store the notes under the video ID: a new record is inserted, an existing
# record with the same ID is updated.
# https://docs.trychroma.com/reference/py-collection#upsert
chroma_collection.upsert(ids=[yt_video_id], documents=[notes])

# Validation: fetch the record that was just written. The bare name on the
# last line makes the notebook display it as the cell output.
result = chroma_collection.get(yt_video_id, include=['documents'])
result

In [None]:
# Question to search the stored notes for, and the maximum number of matches to return.
query_text = "How much beef do I need for the beef ribs recipe?"
n_results = 5

# https://docs.trychroma.com/reference/py-collection#query
results = chroma_collection.query(
    query_texts=[query_text],
    n_results=n_results,
    include=['documents', 'distances', 'metadatas'],
)

# Only one query text was sent, so the matches are at index 0 of each result list.
matched_ids = results['ids'][0]
matched_documents = results['documents'][0]

separator = "************************************************************************"

# `video_id` is used instead of `id` so the builtin id() is not shadowed for
# the rest of the kernel session.
for rank, (video_id, document) in enumerate(zip(matched_ids, matched_documents), start=1):
    print(separator)
    print(f"{rank}.  https://youtu.be/{video_id}")
    print(separator)
    print(document)

In [None]:
# Answer the question using the best-matching stored notes as context.
top_documents = results['documents'][0]

# An empty result list would otherwise fail below with an unexplained IndexError.
if not top_documents:
    raise IndexError(
        "The query returned no documents; store notes in the collection before asking a question."
    )

# Build the prompt under its own name so the notes-extraction `prompt` defined
# in an earlier cell is not overwritten, and put each section on its own
# paragraph so the instruction, question and document do not run together.
answer_prompt = "\n\n".join([
    "Answer the following QUESTION using DOCUMENT as context.",
    f"QUESTION: {query_text}",
    f"DOCUMENT: {top_documents[0]}",
])

response = genai_model.generate_content(answer_prompt, stream=False)
print(response.text)