In [None]:
# -*- coding: utf-8 -*-
"""Video and Audio Summarizer for Colab

This script allows users to summarize YouTube videos, uploaded video files, or audio files.
"""

'Video and Audio Summarizer for Colab\n\nThis script allows users to summarize YouTube videos, uploaded video files, or audio files.\n'

In [2]:
# Install necessary packages
!pip install -q yt-dlp pydub llama-index-readers-file ipywidgets openai
!pip install -q git+https://github.com/openai/whisper.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.1/170.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.5/361.5 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.8/295.8 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import yt_dlp
import os
import subprocess
from pathlib import Path
from google.colab import files
from llama_index.readers.file import VideoAudioReader
from openai import OpenAI
import ipywidgets as widgets
from IPython.display import display
import textwrap
import mimetypes

In [4]:
# Keys of uploads that have already been handled. Both handle_upload and
# on_button_click check this set before processing a file and add to it
# afterwards; delete_uploaded_files clears it.
processed_files = set()

In [5]:
# Set up OpenAI API key
# The key is read from the Colab `userdata` store under the name
# 'OPENAI_API_KEY', so it is not hard-coded in the notebook.
from google.colab import userdata
open_ai_key = userdata.get('OPENAI_API_KEY')
# Also exported through the environment — presumably for libraries that look
# the key up there themselves; confirm whether this is still required.
os.environ["OPENAI_API_KEY"] = open_ai_key
# Shared client used by summarize_text.
client = OpenAI(api_key=open_ai_key)

In [6]:
def download_audio(video_url, output_path='audio.wav'):
    """Download the audio track of a video with yt-dlp and convert it to WAV.

    Args:
        video_url: URL of the video to fetch (passed to ``YoutubeDL.download``).
        output_path: Path the final WAV file should end up at.

    Returns:
        ``output_path``, unchanged.
    """
    # The audio-extraction post-processor writes its result with an extra
    # ".wav" suffix (the recorded run shows "audio.wav" -> "audio.wav.wav"),
    # so the converted file is moved back to ``output_path`` further down.
    converted_path = output_path + '.wav'

    # Remove leftovers from an earlier run first. yt-dlp does not overwrite an
    # existing media file by default, so a stale file at ``output_path`` would
    # be treated as "already downloaded" and the previous video's audio would
    # be summarized again.
    for stale_path in (output_path, converted_path):
        if os.path.isfile(stale_path):
            os.remove(stale_path)

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    # Check if the file was renamed by the post-processor; os.replace also
    # succeeds when the destination already exists.
    if os.path.exists(converted_path):
        os.replace(converted_path, output_path)

    print(f"Debug: Audio file downloaded as {output_path}")
    return output_path

In [7]:
def process_audio(file_path):
    """Transcribe an audio/video file, print the transcript and a summary.

    Transcription is attempted with ``VideoAudioReader`` first; if that raises,
    whisper's "base" model is used directly. Summarization is handled
    separately, so a failure there no longer triggers a second transcription.

    NOTE(review): a later cell in this notebook defines ``process_audio``
    again; when the cells run top to bottom that definition replaces this one.

    Args:
        file_path: Path of the media file to transcribe.

    Returns:
        None. Results and errors are printed.
    """
    print(f"Debug: Processing audio file: {file_path}")

    # Step 1: obtain a transcript, falling back to whisper on any reader error.
    try:
        reader = VideoAudioReader()
        documents = reader.load_data(Path(file_path))
        transcript = documents[0].text
        print("Transcript:")
    except Exception as reader_error:
        print(f"Error processing audio: {str(reader_error)}")
        try:
            import whisper
            model = whisper.load_model("base")
            result = model.transcribe(file_path)
            transcript = result["text"]
            print("Transcript (using whisper directly):")
        except Exception as whisper_error:
            print(f"Error processing audio with whisper: {str(whisper_error)}")
            return
    print(transcript)

    # Step 2: summarize. Errors here are reported as summarization errors
    # instead of being mistaken for a transcription failure.
    try:
        summary = summarize_text(transcript)
    except Exception as summary_error:
        print(f"Error summarizing transcript: {str(summary_error)}")
        return

    print("\nAudio/Video Summary:")
    print(textwrap.fill(summary, width=80))

In [8]:
def determine_input_type(url, uploaded_file):
    """Classify the input as 'youtube', 'video', 'audio' or 'unknown'.

    Args:
        url: YouTube URL string; any truthy value yields 'youtube'.
        uploaded_file: Mapping with a 'name' entry holding the file name,
            or a falsy value when nothing was uploaded.

    Returns:
        'youtube' when a URL is given; otherwise 'video' or 'audio' based on
        the file extension, falling back to the guessed MIME type; 'unknown'
        when nothing matches.
    """
    if url:
        return 'youtube'
    if not uploaded_file:
        return 'unknown'

    file_name = uploaded_file.get('name', '')
    print(f"Debug: File name in determine_input_type: {file_name}")
    if not file_name:
        return 'unknown'

    # First attempt: a fixed table of well-known extensions.
    _, suffix = os.path.splitext(file_name)
    file_extension = suffix.lower()
    print(f"Debug: File extension is {file_extension}")

    known_extensions = {
        'video': ('.mp4', '.avi', '.mov'),
        'audio': ('.mp3', '.wav', '.ogg', '.m4a'),
    }
    for media_kind, extensions in known_extensions.items():
        if file_extension in extensions:
            return media_kind

    # Second attempt: let mimetypes guess from the file name.
    mime_type = mimetypes.guess_type(file_name)[0]
    print(f"Debug: Detected MIME type is {mime_type}")
    if mime_type:
        for media_kind in ('video', 'audio'):
            if mime_type.startswith(media_kind):
                return media_kind

    return 'unknown'

In [14]:
def process_youtube(url):
    """Download the audio of a YouTube video and hand it to process_audio.

    Any exception raised along the way is caught and reported with print.

    Args:
        url: URL of the video to download.
    """
    try:
        audio_file = download_audio(url)

        # Guard: nothing to process when the download produced no file.
        if not os.path.exists(audio_file):
            print(f"Error: Audio file {audio_file} not found after download")
            return

        size_in_bytes = os.path.getsize(audio_file)
        print(f"Debug: Audio file exists: {audio_file}")
        print(f"Debug: File size: {size_in_bytes} bytes")
        process_audio(audio_file)
    except Exception as e:
        print(f"An error occurred while processing YouTube URL: {str(e)}")


In [15]:
def process_video(file_path):
    """Extract the audio track of a video with ffmpeg and summarize it.

    The audio is written to ``audio.wav`` in the working directory (16-bit
    PCM, 44.1 kHz, stereo), replacing any existing file of that name, and is
    then passed to ``process_audio``. If ffmpeg is missing or fails, the error
    is printed and no summary is attempted.

    Args:
        file_path: Path of the video file to convert.
    """
    audio_file = 'audio.wav'
    if os.path.exists(audio_file):
        os.remove(audio_file)

    # Pass the arguments as a list rather than through a shell string: the
    # file name comes from the upload and may contain quotes or other shell
    # metacharacters.
    command = [
        'ffmpeg', '-y',
        '-i', str(file_path),
        '-vn',
        '-acodec', 'pcm_s16le',
        '-ar', '44100',
        '-ac', '2',
        audio_file,
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, errors='replace'
        )
    except FileNotFoundError:
        print("Error: ffmpeg executable was not found.")
        return

    if result.returncode != 0 or not os.path.exists(audio_file):
        print(f"Error: ffmpeg could not extract audio from {file_path}")
        # Show only the tail of ffmpeg's log; the last lines carry the reason.
        error_tail = (result.stderr or '').strip().splitlines()[-5:]
        if error_tail:
            print("\n".join(error_tail))
        return

    process_audio(audio_file)

In [26]:
from IPython.display import display, Markdown
def process_audio(file_path):
    """Transcribe a media file, print the transcript and render a summary.

    The file is loaded through ``VideoAudioReader``; the text of the first
    returned document is used as the transcript, summarized with
    ``summarize_text`` and displayed as Markdown. Errors are caught and
    printed rather than raised.

    Args:
        file_path: Path of the audio (or video) file to process.
    """
    print(f"Debug: Starting to process audio file: {file_path}")
    try:
        reader = VideoAudioReader()
        documents = reader.load_data(Path(file_path))

        # Guard against an empty result or a blank transcript: indexing an
        # empty list would only surface as "list index out of range", and
        # summarizing blank text would produce a summary of nothing.
        transcript = documents[0].text if documents else ""
        if not transcript or not transcript.strip():
            print("No speech could be transcribed from this file; skipping summary.")
            return

        print("Transcript:")
        print(transcript)

        summary = summarize_text(transcript)
        print("\nAudio/Video Summary:")
        display(Markdown(summary))
    except Exception as e:
        print(f"Error processing audio: {str(e)}")

In [17]:
def summarize_text(text, max_tokens=500, model="gpt-4o"):
    """Summarize a transcript with the OpenAI chat completions API.

    Args:
        text: Transcript to summarize; it is embedded verbatim in the prompt.
        max_tokens: Upper bound on the number of tokens in the reply.
        model: Name of the chat model to use. Defaults to "gpt-4o", the value
            that used to be hard-coded.

    Returns:
        The text of the first choice, or an empty string when the reply
        carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that summarizes video transcripts."},
            {"role": "user", "content": f"Please summarize the following video transcript in about 250 words and be concise:\n\n{text}"}
        ],
        max_tokens=max_tokens
    )
    content = response.choices[0].message.content
    # The message content is optional in the SDK; normalize a missing value so
    # callers can always treat the result as a string.
    return content if content is not None else ""

In [18]:
def handle_upload(change):
    """Observer for the upload widget: save each new upload and process it.

    Args:
        change: Trait-change dict. ``change['new']`` is read as a mapping of
            upload key -> ``{'metadata': {'name': ...}, 'content': ...}``.

    Each file not yet in ``processed_files`` is written to the working
    directory, classified with ``determine_input_type`` and passed to
    ``process_video`` or ``process_audio``. A failure on one file is reported
    and the remaining files are still handled.
    """
    global processed_files
    uploads = change['new']
    if not uploads:
        print("No file was uploaded.")
        return

    for filename, file_info in uploads.items():
        if filename in processed_files:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        print(f"Debug: Uploaded file info: {filename}")

        # Keep only the last path component so an upload name containing
        # directory parts cannot write outside the working directory.
        file_name = os.path.basename(file_info['metadata']['name'])
        content = file_info['content']

        print(f"Debug: File name is {file_name}")

        if not content:
            print("The uploaded file is empty.")
            continue

        # Same per-file error handling as on_button_click: report and move on
        # instead of aborting the whole callback.
        try:
            with open(file_name, 'wb') as f:
                f.write(content)

            input_type = determine_input_type('', {'name': file_name})
            print(f"Debug: Determined input type is {input_type}")

            if input_type == 'video':
                process_video(file_name)
            elif input_type == 'audio':
                process_audio(file_name)
            else:
                print(f"Unsupported file type: {file_name}. Please upload an MP4 video or MP3/WAV audio file.")
        except Exception as e:
            print(f"An error occurred while processing {file_name}: {str(e)}")
            continue

        processed_files.add(filename)

In [19]:
def delete_uploaded_files(b):
    """Button callback: delete media files from the working directory.

    Every regular file in the current directory whose name ends in .mp4, .mp3
    or .wav (compared case-insensitively) is removed, and ``processed_files``
    is cleared so the same files can be uploaded and processed again.

    Args:
        b: The clicked button, as passed by ``on_click``; unused.
    """
    global processed_files
    media_suffixes = ('.mp4', '.mp3', '.wav')
    for entry in os.listdir():
        # Lower-case before matching so "CLIP.MP4" is cleaned up as well.
        if not entry.lower().endswith(media_suffixes):
            continue
        # os.remove cannot delete directories; skip anything that is not a file.
        if not os.path.isfile(entry):
            continue
        try:
            os.remove(entry)
        except OSError as e:
            print(f"Could not delete {entry}: {str(e)}")
    processed_files.clear()  # Clear the set of processed files
    print("All uploaded files have been deleted and the processed files list has been cleared.")
    print("Ready for new input. Please enter a YouTube URL or upload new files.")



In [20]:
# Widgets
# Text box for the YouTube video URL; on_button_click reads its value and
# resets it to '' after processing.
url_input = widgets.Text(
    value='',
    placeholder='Enter YouTube video URL',
    description='YouTube URL:',
    disabled=False
)

In [21]:
# File upload widget configured for .mp4/.mp3/.wav with multiple selection.
# Its `value` is observed by handle_upload and read by on_button_click.
upload_button = widgets.FileUpload(
    accept=".mp4,.mp3,.wav",
    multiple=True
)

In [22]:
# Button that starts processing; wired to on_button_click further down.
process_button = widgets.Button(
    description='Process Media',
    disabled=False,
    button_style='',
    tooltip='Click to process the media'
)

In [23]:
# Button that removes media files from the working directory; wired to
# delete_uploaded_files further down.
delete_button = widgets.Button(
    description='Delete Uploaded Files',
    disabled=False,
    button_style='danger',
    tooltip='Click to delete all uploaded files'
)

In [24]:
def on_button_click(b):
    """Callback for the "Process Media" button.

    A non-blank URL in ``url_input`` takes precedence and is passed to
    ``process_youtube``. Otherwise every uploaded file that is not yet in
    ``processed_files`` is classified and passed to ``process_video`` or
    ``process_audio``. Both input widgets are reset afterwards.

    Args:
        b: The clicked button, as passed by ``on_click``; unused.
    """
    global processed_files
    # Strip so a whitespace-only entry counts as "no URL" instead of being
    # handed to the downloader.
    url = url_input.value.strip()
    uploaded_files = upload_button.value

    if not url and not uploaded_files:
        print("Please enter a YouTube video URL or upload a video (MP4) or audio (MP3/WAV) file.")
        return

    if url:
        process_youtube(url)
    else:
        # The guard above guarantees there is at least one upload here, so no
        # further fallback branch is needed.
        for filename, file_info in uploaded_files.items():
            if filename in processed_files:
                print(f"File {filename} has already been processed. Skipping.")
                continue

            file_name = file_info['metadata']['name']
            input_type = determine_input_type('', {'name': file_name})

            try:
                if input_type == 'video':
                    process_video(file_name)
                elif input_type == 'audio':
                    process_audio(file_name)
                else:
                    print(f"Unsupported file type: {file_name}. Please upload an MP4 video or MP3/WAV audio file.")

                processed_files.add(filename)
            except Exception as e:
                print(f"An error occurred while processing {file_name}: {str(e)}")

    # Clear the widgets
    url_input.value = ''
    # NOTE(review): clearing the value in place and resetting the private
    # `_counter` assumes the upload widget keeps its files in a dict — confirm
    # against the installed ipywidgets version.
    upload_button.value.clear()
    upload_button._counter = 0

# Link button clicks to functions
# NOTE(review): re-running this cell registers the handlers again; a freshly
# redefined on_button_click would presumably fire in addition to the old
# one — confirm and restart the kernel if output appears twice.
process_button.on_click(on_button_click)
delete_button.on_click(delete_uploaded_files)

# Display widgets
display(url_input)
display(upload_button)
display(process_button)
display(delete_button)

# Observe file upload
# handle_upload is called on every change of the upload widget's `value`, so
# files are processed on upload, before "Process Media" is clicked.
upload_button.observe(handle_upload, names='value')

# video_url:  "https://youtu.be/IxbR0yTMMY8?si=c5oTph_rRV2Sk_Gq"

Text(value='', description='YouTube URL:', placeholder='Enter YouTube video URL')

FileUpload(value={}, accept='.mp4,.mp3,.wav', description='Upload', multiple=True)

Button(description='Process Media', style=ButtonStyle(), tooltip='Click to process the media')

Button(button_style='danger', description='Delete Uploaded Files', style=ButtonStyle(), tooltip='Click to dele…

[youtube] Extracting URL: https://youtu.be/DpQQi2scsHo?si=_jAuO_fgchVu0Cwt
[youtube] DpQQi2scsHo: Downloading webpage
[youtube] DpQQi2scsHo: Downloading ios player API JSON
[youtube] DpQQi2scsHo: Downloading web creator player API JSON
[youtube] DpQQi2scsHo: Downloading player 53afa3ce
[youtube] DpQQi2scsHo: Downloading m3u8 information
[info] DpQQi2scsHo: Downloading 1 format(s): 251
[download] Destination: audio.wav
[download] 100% of   10.18MiB in 00:00:00 at 23.31MiB/s  
[ExtractAudio] Destination: audio.wav.wav
Deleting original file audio.wav (pass -k to keep)
Debug: Audio file downloaded as audio.wav
Debug: Audio file exists: audio.wav
Debug: File size: 154138970 bytes
Debug: Starting to process audio file: audio.wav


100%|███████████████████████████████████████| 139M/139M [00:03<00:00, 48.2MiB/s]


Transcript:

Audio/Video Summary:
Only four companies in the world have a market value exceeding $2 trillion:
Microsoft, Apple, Alphabet (Google's parent company), and Nvidia, a California-
based company founded in 1993 initially to enhance video game graphics. Nvidia's
stock value surged from $1 trillion to $2 trillion in eight months, driven by
high demand for its advanced technologies crucial for artificial intelligence
(AI).  At Nvidia's annual developer conference in March, CEO and co-founder
Jensen Huang discussed the transformative potential of AI, comparing the current
moment to Apple's iPhone launch. Huang introduced Nvidia's latest graphics
processing unit (GPU), named Blackwell, which is designed in America but
manufactured in Taiwan. The GPU processes numerous calculations simultaneously
and is key to AI development.  Various industries are leveraging Nvidia's AI
technology. At the AI conference, applications showcased included a digital twin
of Earth for rapid weather pred

The video transcript discusses Nvidia's transformation from a video game graphics company to a leading force in artificial intelligence (AI). Founded in 1993, Nvidia reached a $2 trillion valuation in just eight months, driven by its advanced GPU technology crucial for AI development. The transcript features Jensen Huang, Nvidia's co-founder and CEO, who emphasizes AI’s transformative potential.

At Nvidia's recent developer conference, Huang showcased the new “Blackwell” GPU, noted for unprecedented speed and efficiency. The GPU enables massive calculations in parallel, making it foundational for various AI applications. Examples include advancing drug discovery, creating virtual movie sets, and aiding material design.

Huang compares the AI revolution to historical technological leaps, underscoring AI’s role in future advancements. Nvidia's GPUs are also pivotal in AI models for innovative tasks, such as drug development and robotic automation. Companies like Generate Biomedicines and Figure, a robotics startup, use Nvidia’s technology to make significant strides in their fields.

The narrative traces Nvidia's journey from a startup discussed at a Denny’s to a tech giant integral to AI's growth. Despite high demand and substantial achievements, Huang remains humble and focused on innovation. However, the transcript also highlights the duality of AI's promise and potential risk, reflecting broader societal concerns over AI’s future role. Jensen Huang asserts the collaborative potential of humans and AI, ultimately envisioning a future where AI fosters human advancement rather than domination.