In [2]:
import os
import logging
import re

from googleapiclient.discovery import build
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from youtube_transcript_api import YouTubeTranscriptApi

# logging.basicConfig(
#     filename='logs/errors.log', 
#     level=logging.INFO,
#     format='%(asctime)s - %(message)s',
#     datefmt='%Y-%m-%d %H:%M:%S'
# )

def retrieve_channel_items(channel_handle:str):
    """Given a YT channel handlen retrieves its stats and videos"""
    youtube = build("youtube", "v3", developerKey=os.getenv("YOUTUBE_API_KEY"))
    
    #Step 1: Get the channel’s Uploads playlist ID
    request = youtube.channels().list(
        part="contentDetails,statistics",
        forHandle=channel_handle,
        )
    response = request.execute()
    uploads_playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
    
    #bonus get the channel statistics
    statistics = response['items'][0]['statistics']
    
    #Step 2: List videos from the uploads playlist
    videos = []
    
    request = youtube.playlistItems().list(
        part="contentDetails,snippet",
        playlistId=uploads_playlist_id,
        maxResults=50
    )
    
    while request:
        response = request.execute()
    
        for item in response["items"]:
            title = item["snippet"]["title"]
            video_id = item["snippet"]["resourceId"]["videoId"]
            url = f"https://www.youtube.com/watch?v={video_id}"
    
            videos.append({"title": title, "url": url})
    
        request = youtube.playlistItems().list_next(request, response)
    
    return(statistics, videos)

def generate_vector_store(data):
    """Generates a FAISS vector store given based on a list of videos titles"""
    title_list = [item["title"] for item in data]
    url_list = [item["url"] for item in data]
    metadatas = [{"url": url} for url in url_list]

    embeddings = OpenAIEmbeddings()

    vectorstore = FAISS.from_texts(title_list, embeddings, metadatas=metadatas)

    return(vectorstore)

def fast_rag(vectorstore, llm, user_query, n_videos=10):
    """Performs similairity search with the vector store and analyzes the result providing an answer"""
    docs = vectorstore.similarity_search(user_query, k=n_videos)
    context_blocks = []
    for i, d in enumerate(docs, 1):
        url = d.metadata.get("url", "")
        content = (d.page_content or "").strip()
        context_blocks.append(f"[video {i}] {content}: {url}".strip())

    context = "\n\n---\n\n".join(context_blocks)

    prompt = f"""Answer the following question using the provided videos from the YouTube channel
    Instructions:
    - Base your answer ONLY on the provided YouTube channel videos.
    - If some videos are not relevant with user question, don't talk about its. 
    - Provide URL for each relevant video

    Question: {user_query}

    YouTube channel videos:
    {context}
    """
    answer = llm.invoke(prompt).content
    return(docs, answer)

def retrieve_video_transcript(url_or_id: str):
    """Extracts video ID and fetches the transcript as a single string."""
    # Extract video ID from URL if necessary
    video_id = url_or_id
    if "youtube.com" in url_or_id or "youtu.be" in url_or_id:
        match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url_or_id)
        if match:
            video_id = match.group(1)
    print(video_id)
    try:
        full_transcript=""
        ytt_api = YouTubeTranscriptApi()
        transcript_list = ytt_api.list(video_id)
        try:
            transcript = transcript_list.find_transcript(['en'])
        except:
            print("No english transcript available ; looking for other languages to translate (fr, es, de)")
            transcript = next(iter(transcript_list)).translate('en')
            # transcript = transcript_list.find_transcript(['fr', 'es', 'de']).translate('en')
        fetched_transcript = transcript.fetch()
        for snippet in fetched_transcript.snippets:
            full_transcript+= snippet.text + " "
        return full_transcript
    except Exception as e:
        logging.exception(f"Error fetching transcript for video ID: {video_id} ; {e}")
        raise Exception(f"Captions are disabled or unavailable for this video ({video_id}).") from e

In [3]:
retrieve_video_transcript("a4WHGod6JG0")

a4WHGod6JG0


ERROR:root:Error fetching transcript for video ID: a4WHGod6JG0 ; 'FetchedTranscriptSnippet' object is not subscriptable
Traceback (most recent call last):
  File "/tmp/ipykernel_54091/1356014411.py", line 111, in retrieve_video_transcript
    full_transcript = " ".join([item['text'] for item in fetched_transcript])
                                ~~~~^^^^^^^^
TypeError: 'FetchedTranscriptSnippet' object is not subscriptable


Exception: Captions are disabled or unavailable for this video (a4WHGod6JG0).

In [30]:
retrieve_video_transcript("https://www.youtube.com/watch?v=bBVum_LXPCw")

bBVum_LXPCw
No english transcript available ; looking for other languages to translate (fr, es, de)


Exception: Captions are disabled or unavailable for this video (bBVum_LXPCw).

In [31]:
ytt_api = YouTubeTranscriptApi()
transcript_list = ytt_api.list("bBVum_LXPCw")

In [32]:
for transcript in transcript_list:
    print(transcript)

fr ("French")[TRANSLATABLE]
fr ("French (auto-generated)")[TRANSLATABLE]


In [34]:
next(iter(transcript_list)).translate('en').fetch()

IpBlocked: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=bBVum_LXPCw! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

Ways to work around this are explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).


If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [25]:
transcript = transcript_list.find_generated_transcripts().translate('en')

AttributeError: 'TranscriptList' object has no attribute 'find_generated_transcripts'

In [24]:
transcript_list.find_transcript(['fr', 'es', 'de']).translate('en').fetch()

IpBlocked: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=bBVum_LXPCw! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

Ways to work around this are explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).


If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [23]:
transcript_list.find_transcript(['fr']).fetch()

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='Bienvenue dans l’Essentiel du Dessous Des Cartes.\xa0', start=6.96, duration=2.0), FetchedTranscriptSnippet(text='Dernière ligne droite avant\xa0\nles élections européennes\xa0', start=8.96, duration=2.64), FetchedTranscriptSnippet(text='Et aujourd’hui on prend la direction de l’Espagne,\nVoici Pedro Sánchez\xa0', start=11.6, duration=4.4), FetchedTranscriptSnippet(text='Premier Ministre depuis 2018 !\nIl a réussi à se maintenir encore\xa0\xa0', start=16.0, duration=4.28), FetchedTranscriptSnippet(text='au pouvoir en faisant alliance avec\xa0\nles Indépendantistes de Catalogne.\xa0', start=20.28, duration=4.32), FetchedTranscriptSnippet(text='En échange de cette alliance :\nPedro Sánchez vient de faire adopter\xa0\xa0', start=24.6, duration=3.6), FetchedTranscriptSnippet(text='une loi qui amnistie ces Catalans qui ont tenté\xa0\nen 2017 de faire sécession du reste de l’Espagne,\xa0\xa0', start=28.2, duration=6.32), FetchedTrans