In [None]:
!pip install -q requests torch bitsandbytes transformers accelerate gradio sentencepiece yt-dlp datasets[audio]

In [None]:
# Integrating Gradio with a YouTube video summarizer
# We use a Groq-hosted Whisper model (whisper-large-v3-turbo) for audio-to-text
# We use a Groq-hosted Llama 8B chat model for the summary task
# Finally, we integrate everything in a Gradio UI


In [None]:
import math
import os
import re
import shutil
import tempfile

import gradio as gr
import numpy as np
import requests
import torch
import yt_dlp
from datasets import load_dataset
from dotenv import load_dotenv
from groq import Groq
from huggingface_hub import login
from IPython.display import Markdown, display, update_display
from pydub import AudioSegment
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline


In [None]:
# Sign in to Hugging Face.
# load_dotenv() must run BEFORE the token is read, otherwise an HF_TOKEN that
# lives only in the .env file is invisible to os.getenv().
load_dotenv()
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    login(hf_token, add_to_git_credential=True)
else:
    # Without a token login() falls back to an interactive prompt, which stalls
    # a "Restart & Run All"; skip it and tell the user instead.
    print("HF_TOKEN is not set; skipping Hugging Face login.")

In [None]:
# Groq API: read the key from the environment (.env is loaded first) and
# build the client that the summarization step uses.
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
# Passing the key explicitly is equivalent to the no-argument constructor,
# which reads the same GROQ_API_KEY environment variable.
groq = Groq(api_key=groq_api_key)

In [None]:
#Speech transcription using open-source model:  automatic speech recognition (ASR) ==>> https://huggingface.co/openai/whisper-large-v3-turbo for more info
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [None]:
#Extract the audio track from a YouTube URL
def download_audio(url):
    """Download the best available audio stream of `url` and convert it to MP3.

    The download is delegated to yt-dlp with the FFmpegExtractAudio
    post-processor; the output template always writes to ``audio_file.<ext>``
    in the current working directory.

    Args:
        url: Address of the video to fetch (passed to yt-dlp unchanged).

    Returns:
        The fixed path ``'audio_file.mp3'``.
    """
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
    }
    options = {
        'format': 'bestaudio/best',
        'postprocessors': [mp3_extractor],
        'outtmpl': 'audio_file.%(ext)s',  # fixed output filename
    }

    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])

    return 'audio_file.mp3'

In [None]:
#To handle large audio files >30 min we need a different approach
#we need to split the audio in smaller files and process each chunk separately
from pydub import AudioSegment #handle audio splitting
import math 
import tempfile

def split_audio_file(audio_file_path,chunk_length_ms=10*60*1000): #10-minute chunks
    """Split an MP3 file into consecutive chunks of at most `chunk_length_ms`.

    Each chunk is exported as ``chunk_<i>.mp3`` into a freshly created
    temporary directory. The caller owns that directory and is responsible
    for deleting it once the chunks have been processed.

    Args:
        audio_file_path: Path of the MP3 file to split.
        chunk_length_ms: Maximum chunk duration in milliseconds (default 10 minutes).

    Returns:
        A tuple ``(chunks, temp_dir)``: the list of chunk file paths in
        playback order and the temporary directory that contains them.

    Raises:
        ValueError: If `chunk_length_ms` is not strictly positive.
    """
    # Validate up front: 0 would divide by zero below and a negative value
    # would silently produce no chunks at all.
    if chunk_length_ms <= 0:
        raise ValueError(
            f"chunk_length_ms must be a positive number of milliseconds, got {chunk_length_ms}"
        )

    audio = AudioSegment.from_mp3(audio_file_path)

    # Number of chunks needed, rounding up so the trailing remainder gets its own chunk
    total_length_ms = len(audio)
    num_chunks = math.ceil(total_length_ms / chunk_length_ms)

    # Create a temporary directory to store chunks
    temp_dir = tempfile.mkdtemp()
    print(f"Created temporary directory: {temp_dir}")

    chunks = []
    try:
        for i in range(num_chunks):
            start_ms = i * chunk_length_ms
            end_ms = min(start_ms + chunk_length_ms, total_length_ms)  # last chunk may be shorter

            chunk_path = os.path.join(temp_dir, f"chunk_{i}.mp3")
            # export() hands back the output file object; close it right away so
            # the handle is not left open while the chunk is later read/deleted.
            audio[start_ms:end_ms].export(chunk_path, format="mp3").close()
            chunks.append(chunk_path)
    except Exception:
        # Do not leak the directory (and any partially written chunks) when
        # exporting fails; the caller never receives temp_dir in that case.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise

    return chunks, temp_dir


In [None]:
#Function to use Groq's API for transcription for chunks files
def groq_transcribe_chunk(audio_chunk_path, timeout=300):
    """Transcribe one audio chunk through Groq's transcription endpoint.

    The file is uploaded as multipart form data to the
    ``/openai/v1/audio/transcriptions`` endpoint, requesting the
    ``whisper-large-v3-turbo`` model and a plain-text response.

    Args:
        audio_chunk_path: Path of the MP3 chunk to upload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The transcript text returned by the API.

    Raises:
        RuntimeError: If the API key is missing or the API answers with a
            non-200 status code.
        requests.exceptions.RequestException: On network errors or timeout.
    """
    # Fail early with a clear message instead of sending "Bearer None".
    if not groq_api_key:
        raise RuntimeError("GROQ_API_KEY is not set; cannot call the Groq transcription API")

    url = "https://api.groq.com/openai/v1/audio/transcriptions"
    headers = {
        "Authorization": f"Bearer {groq_api_key}"
    }
    data = {
        "model": "whisper-large-v3-turbo",
        "response_format": "text"
    }

    with open(audio_chunk_path, "rb") as audio_file:
        files = {
            "file": ("audio.mp3", audio_file, "audio/mpeg")
        }
        response = requests.post(
            url, headers=headers, files=files, data=data, timeout=timeout
        )

    # The upload handle is released above; only the response is needed from here on.
    if response.status_code != 200:
        raise RuntimeError(f"Groq API Error: {response.status_code} - {response.text}")
    return response.text

In [None]:
#Transcribe and summarize actions
def _remove_quietly(path):
    """Best-effort removal of a single file; a failure is reported but never raised."""
    try:
        os.remove(path)
    except OSError as cleanup_error:
        print(f"Warning: could not remove {path}: {cleanup_error}")


def _transcribe_audio(url):
    """Download the audio of `url`, transcribe it chunk by chunk and return the joined text."""
    audio_path = None
    temp_dir = None
    try:
        #Step 1: Download audio
        print("Downloading audio from YouTube...")
        audio_path = download_audio(url)

        #Step 2: Split the audio into chunks
        print("Splitting audio into chunks...")
        chunks, temp_dir = split_audio_file(audio_path)

        #Step 3: Transcribe each chunk
        print(f"Transcribing {len(chunks)} chunks...")
        chunk_transcripts = []
        for i, chunk_path in enumerate(chunks):
            print(f"Transcribing chunk {i+1}/{len(chunks)}...")
            chunk_transcripts.append(groq_transcribe_chunk(chunk_path))
            # Delete the chunk as soon as it is processed to keep disk usage low
            os.remove(chunk_path)

        return " ".join(chunk_transcripts).strip()
    finally:
        # Runs on success AND on failure, so a download/split/API error no
        # longer leaves chunk files, the temp directory or the mp3 behind.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        if audio_path is not None and os.path.exists(audio_path):
            _remove_quietly(audio_path)


def _summarize_transcript(transcript):
    """Ask the Groq chat model for a structured markdown summary of `transcript`."""
    system_prompt = """
        
        You are a specialized summarization assistant designed to create accurate, informative summaries from audio transcripts. Your primary goal is to extract and organize key information without adding, distorting, or fabricating any content.
        
        ## Core Guidelines
        
        1. **Only use information explicitly present in the transcript**. Do not add interpretations, assumptions, or information not directly stated.
        2. **Maintain factual accuracy** at all times. If something is ambiguous or unclear in the transcript, acknowledge the uncertainty rather than making assumptions.
        3. **Preserve the original meaning and intent** of the speakers.
        4. **Use direct quotes** when appropriate to maintain accuracy.
        
        ## Required Summary Structure
        
        Your summary must include the following sections in this order:
        
        1. **Title**: Extract or derive the title directly from the transcript. If no explicit title is mentioned, create a concise, descriptive title based solely on the main topic discussed.
        
        2. **Main Topic**: A 1-2 sentence description of the central subject being discussed.
        
        3. **Participants**: Only include this section if speakers are clearly identified in the transcript. List all participants mentioned by name or role.
        
        4. **Discussion Points**: Outline the key topics covered in the conversation in chronological order. Use bullet points for clarity.
        
        5. **Highlights**: List 3-5 notable moments, quotes, or insights from the transcript. These should be direct references to content in the transcript.
        
        6. **Action Points**: Only include this section if specific actions, tasks, or next steps are mentioned in the transcript. List each action item with any associated responsibility or deadline if mentioned.
        
        7. **Key Takeaways**: Summarize 3-5 main conclusions or important insights from the discussion. These must be directly derived from the transcript content.
        
        ## Error Prevention Protocol
        
        - If information for any required section is not present in the transcript, explicitly state "No [section name] mentioned in the transcript" rather than fabricating content.
        - If uncertain about any information, indicate this with phrases like "possibly" or "appears to be" rather than stating as fact.
        - Double-check all names, numbers, dates, and technical terms against the transcript.
        
        
        ``
        Remember: Your primary responsibility is to maintain the integrity of the original content. When in doubt, prioritize accuracy over comprehensiveness.
        """

    user_prompt = f"Below is an extract transcript of youtube video. Write summary in markdown of the whole text: \n{transcript}"

    # The previous call reserved max_tokens=8000 on the 8192-token-context
    # "llama3-8b-8192" model. The context window is shared by prompt and
    # completion, so that left under 200 tokens for the system prompt plus the
    # whole transcript. Use the long-context Llama 3.1 8B model and a
    # completion budget sized for a summary instead.
    # NOTE(review): confirm the model ID against Groq's current model list.
    chat_completion = groq.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=2048,
    )

    # Extract the assistant's response
    return chat_completion.choices[0].message.content


def transcribe(url):
    """Produce a markdown summary of the YouTube video at `url`.

    Pipeline: download the audio, split it into chunks, transcribe every
    chunk through Groq, then summarize the full transcript with a Groq chat
    model. Temporary audio files are removed whether or not a step fails.

    Args:
        url: YouTube video URL as typed into the Gradio textbox.

    Returns:
        The summary in markdown on success. On any failure a string starting
        with ``"Error: "`` is returned instead of raising, so the message can
        be rendered directly in the UI.
    """
    # Gradio sends an empty string when the textbox is left blank.
    if not url or not url.strip():
        return "Error: Please enter a YouTube URL."

    try:
        full_transcript = _transcribe_audio(url.strip())

        # Nothing to summarize (e.g. silent or zero-length audio).
        if not full_transcript:
            return "Error: No speech could be transcribed from this video."

        print(f"*** Full transcript here: {full_transcript}")

        #Step 4:Summarize
        return _summarize_transcript(full_transcript)
    except Exception as e:
        return f"Error: {e}"


In [None]:
#Adding Gradio GUI
# Input: a single-line textbox for the video URL.
url_input = gr.Textbox(
    label="Youtube URL",
    placeholder="Enter Youtube video URL here..",
    lines=1,
)

# Output: the summary returned by transcribe(), rendered as markdown.
summary_output = gr.Markdown(label="Video Summary", min_height=60)

# Wire the pipeline into a simple one-input / one-output interface.
demo = gr.Interface(
    fn=transcribe,
    inputs=url_input,
    outputs=summary_output,
    title="YouTube video summary",
    description="Enter a YouTube URL to get the summary. This process may take a few minutes.",
    flagging_mode="never",
)

# Start the app and open it in the default browser.
demo.launch(inbrowser=True)