<a href="https://colab.research.google.com/drive/1n5csfChR4iRIhvaIpuh5hCrgrUiPYXUi?usp=sharing" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# https://github.com/martinopiaggi/summarize



In [None]:
# @markdown ## 🔗 **Source Configuration**

# @markdown **Source Type**
Type_of_source = "YouTube Video"  # @param ["YouTube Video", "Google Drive Video Link", "Dropbox Video Link", "Local File"]

# @markdown **Source URL or Path**
Source = ""  # @param {type:"string"}

# Short aliases read by the later cells: `Type` selects the fetch branch, `URL` is the source location.
Type = Type_of_source
URL = Source

# @markdown **Use YouTube Captions**

# @markdown If the source is a YouTube video, it is recommended to use the available YouTube captions to save transcription time and API usage.

use_Youtube_captions = True  # @param {type:"boolean"}

# @markdown ---
# @markdown ## 🌐 **API Configuration**

# @markdown Summarization uses the `api_key_groq` secret when the **Groq** endpoint is selected, and the `api_key` secret for any other endpoint.
# @markdown Ensure you have set the required environment variables or Colab secrets for your API keys.

api_endpoint = "Groq"  # @param ["Groq", "OpenAI", "Custom"]

# Base URL of the OpenAI-compatible API for each selectable endpoint
endpoints = {
    "Groq": "https://api.groq.com/openai/v1",
    "OpenAI": "https://api.openai.com/v1",
    "Custom": "http://localhost:1234/v1"  # Example custom endpoint
}
base_url = endpoints.get(api_endpoint)

# Model id sent with every summarization request, chosen by endpoint
# NOTE(review): model ids are provider-specific and change over time — confirm they are still offered.
model = {
    "Groq": "llama-3.1-70b-versatile",
    "OpenAI": "gpt-4",
    "Custom": "custom-model-id"  # Placeholder for any custom model
}.get(api_endpoint)

# @markdown ---
# @markdown ## 🎤 **Transcription Settings**
# @markdown These settings apply only when Whisper transcription is used instead of YouTube captions.


# @markdown If you plan to use the Whisper API endpoint (only the **Groq** endpoint is supported for now), you must specify your Groq API key in `api_key_groq`.

# @markdown Why both `api_key_groq` and `api_key`? So that you can use a different provider for summarization (e.g., OpenAI) by putting its key in `api_key`.

# @markdown If using Whisper locally: remember to switch the runtime type in Google Colab to a GPU instance (e.g., T4). Go to **Runtime** > **Change runtime type** and select **GPU** as the hardware accelerator.

# @markdown **Transcription Method**
transcription_method = "Cloud Whisper"  # @param ["Cloud Whisper", "Local Whisper"]

# @markdown **Language** (ISO-639-1 code, e.g., "en" for English; "auto" sends no language hint to Whisper)
language = "auto"  # @param {type:"string"}

# @markdown **Initial Prompt for Whisper** (Optional)
initial_prompt = ""  # @param {type:"string"}

## Libraries and helper functions
Re-run if you change settings in the previous cell

In [None]:
# @markdown ## Libraries and helper functions
# @markdown Re-run if you change settings in the previous cell

import subprocess
import re
import os
!pip install python-dotenv
from dotenv import load_dotenv

# Caption support is installed whenever the captions option is on, whatever the source type.
if use_Youtube_captions:
  !pip install youtube-transcript-api
  from youtube_transcript_api import YouTubeTranscriptApi

# A Whisper backend is only needed when captions will not be used
# (non-YouTube source, or captions disabled).
if (not Type == "YouTube Video") or (not use_Youtube_captions):
  if transcription_method == "Local Whisper":
    # NOTE(review): only the module is imported here; no model is loaded in this cell.
    !pip install openai-whisper
    import whisper
  else:
    !pip install --upgrade groq
    from groq import Groq

# pytubefix is used to download the audio stream of a YouTube video.
if Type == "YouTube Video":
  !pip install pytubefix
  from pytubefix import YouTube

# Drive sources are read from the mounted Drive under /content/drive.
if Type == "Google Drive Video Link":
  from google.colab import drive
  drive.mount('/content/drive')


def get_api_key():
    """Return the API key used for the summarization endpoint.

    When ``api_endpoint`` is "Groq" the Groq key is reused. Otherwise the
    ``api_key`` secret is read from Colab user data; if ``google.colab``
    cannot be imported, a ``.env`` file is loaded and the ``api_key``
    environment variable is read instead.

    Raises:
        ValueError: if the resolved key is missing or empty.
    """
    if api_endpoint == "Groq":
        return get_groq_api_key()

    secret_name = 'api_key'
    try:
        from google.colab import userdata
        key = userdata.get(secret_name)
    except ImportError:
        # Not running inside Colab: fall back to the process environment.
        load_dotenv()
        key = os.getenv(secret_name)

    if key:
        return key
    raise ValueError("API key not found in environment variables or Colab secrets")

def get_groq_api_key():
    """Return the Groq API key stored under the name ``api_key_groq``.

    The key is read from Colab user data; if ``google.colab`` cannot be
    imported, a ``.env`` file is loaded and the environment variable of the
    same name is read instead.

    Raises:
        ValueError: if the resolved key is missing or empty.
    """
    secret_name = 'api_key_groq'
    try:
        from google.colab import userdata
        key = userdata.get(secret_name)
    except ImportError:
        # Not running inside Colab: fall back to the process environment.
        load_dotenv()
        key = os.getenv(secret_name)

    if key:
        return key
    raise ValueError("Groq API key not found in environment variables or Colab secrets")

def process_audio_file(input_path, output_path):
    """Re-encode *input_path* into a much smaller audio file at *output_path*.

    Runs ffmpeg with an 8 kHz sample rate, a single channel and a 16k audio
    bitrate, overwriting *output_path* if it exists. The aim is to keep the
    file within the size limits of the transcription API.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    sample_rate_hz = 8000
    channel_count = 1
    ffmpeg_args = ['ffmpeg', '-y', '-i', input_path]
    ffmpeg_args += ['-ar', str(sample_rate_hz)]
    ffmpeg_args += ['-ac', str(channel_count)]
    ffmpeg_args += ['-b:a', '16k']
    ffmpeg_args.append(output_path)
    subprocess.run(ffmpeg_args, check=True)


# The OpenAI SDK is used for every endpoint; base_url points it at the selected provider.
!pip install openai
import openai
# Built once when this cell runs, so re-run the cell after changing api_endpoint or the stored keys.
client = openai.OpenAI(api_key = get_api_key(), base_url=base_url)

 ## Video fetching
 Re-run cell if you change the source URL

In [None]:
# @markdown ## Video fetching
# @markdown Re-run cell if you change the source URL
# Set to True below once a transcript has been taken from YouTube captions, so Whisper is skipped.
skip_transcription=False
# Timestamped transcript text that the summarization cell consumes.
transcription_text = ""
# NOTE(review): not read anywhere else in the visible notebook.
textTimestamps = ""

def seconds_to_time_format(seconds):
    """Format a duration given in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional parts are truncated rather than rounded, and the hour field
    grows beyond two digits when needed.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, leftover_seconds = divmod(leftover, 60)
    fields = (whole_hours, whole_minutes, leftover_seconds)
    return ":".join(f"{int(field):02d}" for field in fields)

def download_youtube_audio_only(url):
    """Download the audio-only stream of the YouTube video at *url*.

    The file is saved into the current directory; an already existing
    download is not fetched again. Returns the path reported by the
    stream's ``download`` call.
    """
    stream = YouTube(url).streams.get_audio_only()
    return stream.download(mp3=True, output_path=".", skip_existing=True)

def download_youtube_captions(url):
    """Fetch English captions for a YouTube video and save them as Markdown.

    The English transcript is preferred. If it cannot be fetched, the first
    transcript that YouTube marks as translatable is translated to English.
    Each caption becomes one ``HH:MM:SS text`` line, and the result is
    written to ``<video_id>_captions.md`` in the current directory.

    Args:
        url: A YouTube watch, embed or youtu.be URL.

    Returns:
        A ``(transcription_text, transcript_file_name)`` tuple.

    Raises:
        ValueError: if no 11-character video id can be found in *url*.
        RuntimeError: if there is neither an English transcript nor a
            translatable one.
    """
    regex = r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})'
    match = re.search(regex, url)
    if match is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(f"Could not extract a YouTube video id from: {url!r}")
    video_id = match.group(1)

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
    except Exception:
        # Deliberate fallback: any failure to get English captions leads to
        # the translation attempt below rather than aborting.
        transcript = None

    if transcript is None:
        for available_transcript in transcript_list:
            if available_transcript.is_translatable:
                transcript = available_transcript.translate('en').fetch()
                break

    if transcript is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise RuntimeError(
            f"No English or translatable transcript available for video {video_id}"
        )

    transcription_text = "".join(
        f"{seconds_to_time_format(entry['start'])} {entry['text'].strip()}\n"
        for entry in transcript
    )

    transcript_file_name = f"{video_id}_captions.md"

    with open(transcript_file_name, 'w', encoding='utf-8') as f:
        f.write(transcription_text)

    return transcription_text, transcript_file_name

def _extract_compressed_audio(source_path, wav_path):
    """Extract the audio of *source_path* to *wav_path*, then compress it.

    ffmpeg first writes a mono 16 kHz PCM WAV file, which
    ``process_audio_file`` then re-encodes to ``<wav stem>_processed.mp3``.
    Returns the path of that compressed file.
    """
    subprocess.run(['ffmpeg', '-y', '-i', source_path, '-vn', '-acodec', 'pcm_s16le',
                    '-ar', '16000', '-ac', '1', wav_path], check=True)
    compressed_path = os.path.splitext(wav_path)[0] + '_processed.mp3'
    process_audio_file(wav_path, compressed_path)
    return compressed_path


if Type == "YouTube Video":
    # Drop a start-time parameter (&t=123 or &t=123s) from the URL.
    # A raw string is required: '\&' and '\d' are invalid escapes in a normal literal.
    URL = re.sub(r'&t=\d+s?', '', URL)
    if use_Youtube_captions:
        transcription_text, transcript_file_name = download_youtube_captions(URL)
        skip_transcription = True
    else:
        video_path_local = download_youtube_audio_only(URL)
        # Process the audio file to reduce its size
        processed_audio_path = os.path.splitext(video_path_local)[0] + '_processed.mp3'
        process_audio_file(video_path_local, processed_audio_path)
        video_path_local = processed_audio_path  # Update to the processed file path

elif Type == "Google Drive Video Link":
    # URL is a path relative to the root of the mounted Drive.
    video_path_local = _extract_compressed_audio("drive/MyDrive/" + URL, 'gdrive_audio.wav')

elif Type == "Dropbox Video Link":
    subprocess.run(['wget', URL, '-O', 'dropbox_video.mp4'], check=True)
    video_path_local = _extract_compressed_audio('dropbox_video.mp4', 'dropbox_video_audio.wav')

elif Type == "Local File":
    video_path_local = _extract_compressed_audio(Source, 'local_file_audio.wav')

## Transcription using Whisper
***Only run this cell if the source is not YouTube or you decided not to use YouTube captions.***

In [None]:
# @markdown ### Transcription
# @markdown Re-run cell if you change transcription settings
if not skip_transcription:
    if video_path_local:
        # Single file transcription
        audio_files = [video_path_local]
    else:
        # Multiple chunk files
        # NOTE(review): audio_chunks is not defined anywhere in the visible
        # notebook; this branch is only reachable with an empty path.
        audio_files = audio_chunks

    # Create the backend once instead of once per audio file.
    if transcription_method == "Local Whisper":
        if "model_whisper" not in globals():
            # No earlier cell creates the model, so load one on first use.
            # "base" is a choice made here; pick a larger model for accuracy.
            model_whisper = whisper.load_model("base")
    elif transcription_method == "Cloud Whisper":
        groq_client = Groq(api_key=get_groq_api_key())

    # Timestamped segments are collected here and joined once at the end.
    transcript_parts = []

    for audio_file_path in audio_files:
        if transcription_method == "Local Whisper":
            # Local Whisper transcription
            transcription = model_whisper.transcribe(
                audio_file_path,
                beam_size=5,
                language=None if language == "auto" else language,
                task="translate",
                initial_prompt=initial_prompt or None
            )
            segments = transcription["segments"]

        elif transcription_method == "Cloud Whisper":
            # Cloud Whisper using Groq API
            with open(audio_file_path, "rb") as audio_file:
                transcription_response = groq_client.audio.transcriptions.create(
                    file=(os.path.basename(audio_file_path), audio_file.read()),
                    model="distil-whisper-large-v3-en" if language == "en" else "whisper-large-v3",
                    prompt=initial_prompt or None,
                    response_format="verbose_json",
                    language=None if language == "auto" else language,
                    temperature=0.0
                )
            # `segments` is an attribute of the response; each segment is
            # then indexed by key, exactly as for the local result.
            segments = transcription_response.segments

        else:
            segments = []

        for segment in segments:
            start_time = seconds_to_time_format(segment['start'])
            transcript_parts.append(f"{start_time} {segment['text'].strip()} ")

    transcription_text = "".join(transcript_parts)

    # Save the transcription
    transcript_file_name = 'transcription.md'
    with open(transcript_file_name, 'w', encoding='utf-8') as f:
        f.write(transcription_text)
else:
    # transcript_file_name was already set by the fetch cell when the
    # captions were downloaded. Rebuilding it from `video_id` here failed,
    # because that name only exists inside download_youtube_captions.
    print("Using YouTube captions for transcription.")


## Summarization and elaboration

In [None]:
# Key into `prompts` below; selects the system prompt used for every chunk.
prompt_type = "Questions and answers"  # @param ['Summarization', 'Only grammar correction with highlights','Distill Wisdom', 'Questions and answers']

# @markdown Parallel API calls (mind rate limits)
parallel_api_calls = 30 # @param

# @markdown Chunk size (tokens) (mind model context length). Higher = less granular summary.
# @markdown Rule of thumb: 28k for 3h, 10k for 1h, 5k for 30min, 4k for shorter.
# NOTE: process_and_summarize slices the transcript with text[i:i+chunk_size],
# i.e. by characters, so this value and overlap_size are character counts in practice.
chunk_size = 18000 # @param

# @markdown Overlap (tokens) between chunks
overlap_size = 20 # @param

# @markdown Max output tokens of each chunk (mind model limits). Higher = less granular summary.
# @markdown Rule of thumb: 4k, 2k or 1k depending on content density.
max_output_tokens = 2096 # @param

# NOTE(review): process_and_summarize assigns its own local `final_summary`
# without a `global` statement, so this module-level value stays empty.
final_summary = ""

# System prompts by prompt type. The selected text is sent verbatim as the
# system message; no code in this notebook substitutes the `{TITLE}` placeholder.
prompts = {
    'Distill Wisdom': """Analyze the transcript and extract key insights and wisdom including a concise title that reflects the content.

**{TITLE}**

**IDEAS**
- ...

**QUOTES**
- ...

**REFERENCES**
- ...

- **Formatting Guidelines**:
  - **Title**: Start with the title in bold (`**{TITLE}**`).
  - **Categories**: Use bold for category headers (`**IDEAS**`, `**QUOTES**`, `**REFERENCES**`).
  - **Bullet Points**: Use hyphens (`-`) for each bullet point.
  - **Omit Empty Categories**: Do not include a category if there are no relevant items.
  - **No Additional Text**: Do not add any introductory phrases, explanations, or headers using `#`.
  - **Strict Template Adherence**: Follow the template exactly as shown above without deviations.

Here is the text:
""",
    'Summarization': """Summarize the video transcript excerpt including a concise title that reflects the content. Wrap the title with **markdown bold notation**. Write the summary as if you are continuing a conversation without needing to signal a beginning. Here is the transcript: """,
    'Only grammar correction with highlights': """Repeat the following text correcting any grammatical errors and formatting error. Highlight only the important quote (if there are any) with **markdown bold notation**. Focus solely on the essence of the content as if you are continuing a conversation without using any form of introduction like 'Here's the corrected text:'. Here is the text to fix: """,
    'Questions and answers': """Analyze the input text and generate 5 essential questions that, when answered, capture the main points and core meaning of the text. Do not add any introductory phrases, explanations! Just start with the questions and answers. Mark each  question with **bold syntax** and don't number them. 2.) When formulating your questions: a. Address the central theme or argument b. Identify key supporting ideas c. Highlight important facts or evidence d. Reveal the author's purpose or perspective e. Explore any significant implications or conclusions. 3.) Answer all of your generated questions one-by-one in detail.

Here is the text:
"""
   }

# Select the appropriate prompt
summary_prompt = prompts[prompt_type]


def extract_and_clean_timestamps(text_chunks):
    """Strip ``HH:MM:SS`` stamps from every chunk and record each chunk's first stamp.

    Args:
        text_chunks: Iterable of transcript chunk strings.

    Returns:
        A ``(cleaned_texts, timestamp_ranges)`` tuple of parallel lists with
        one entry per chunk: the chunk with its stamps removed and outer
        whitespace stripped, and the first stamp found in it ("" if none).
    """
    stamp_re = re.compile(r'(\d{2}:\d{2}:\d{2})')
    stripped_chunks = []
    first_stamps = []
    for raw_chunk in text_chunks:
        found = stamp_re.findall(raw_chunk)
        first_stamps.append(found[0] if found else "")
        body = raw_chunk
        # Each found stamp is removed everywhere in the chunk, in order of discovery.
        for stamp in found:
            body = body.replace(stamp, "")
        stripped_chunks.append(body.strip())
    return stripped_chunks, first_stamps

def format_timestamp_link(timestamp):
    """Return the heading text for a summary piece that starts at *timestamp*.

    Args:
        timestamp: An ``HH:MM:SS`` string, or "" when the chunk contained
            no timestamp.

    Returns:
        "" for an empty timestamp. For YouTube sources, the timestamp
        followed by the video URL with a ``t=<seconds>`` parameter. For any
        other source, the timestamp unchanged.
    """
    if not timestamp:
        # A chunk without timestamps has nothing to link to; parsing ""
        # would raise ValueError.
        return ""
    if Type != "YouTube Video":
        return f"{timestamp}"

    hours, minutes, seconds = map(int, timestamp.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    # Short links (youtu.be/<id>) carry no query string yet, so the first
    # parameter must be introduced with '?' rather than '&'.
    separator = "&" if "?" in URL else "?"
    return f"{timestamp} - {URL}{separator}t={total_seconds}"

import concurrent.futures
import time

def summarize(prompt):
    """Send one transcript chunk to the chat-completions API and return the reply text.

    The selected ``summary_prompt`` is the system message and *prompt* is the
    user message; the reply is capped at ``max_output_tokens`` tokens.
    """
    conversation = [
        {"role": "system", "content": summary_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=conversation,
        max_tokens=max_output_tokens,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def _summarize_with_retry(chunk, attempts=3, delay_seconds=10):
    """Call ``summarize(chunk)``, pausing and retrying when the call fails.

    The wait happens in the worker thread, so result collection is never
    blocked. The last failure is re-raised after *attempts* tries.
    """
    for attempt in range(1, attempts + 1):
        try:
            return summarize(chunk)
        except Exception as exc:
            if attempt == attempts:
                raise
            print(f'Summarization attempt {attempt}/{attempts} failed: {exc}; retrying')
            time.sleep(delay_seconds)


def process_and_summarize(text):
    """Summarize *text* chunk by chunk in parallel and write the result to disk.

    The text is cut into slices of ``chunk_size`` characters that overlap by
    ``overlap_size`` characters. Timestamps are stripped from each slice, and
    each slice is summarized on a thread pool of ``parallel_api_calls``
    workers. The pieces are joined in their original order, each headed by
    the first timestamp of its slice, and written to the ``*_FINAL.md`` file
    derived from ``transcript_file_name`` (``final_dropbox_video.md`` for
    Dropbox sources).

    A chunk whose summarization still fails after the retries is reported
    and left out of the output.

    Raises:
        ValueError: if ``chunk_size`` is not larger than ``overlap_size``.
    """
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError("chunk_size must be larger than overlap_size")

    texts = [text[i:i+chunk_size] for i in range(0, len(text), step)]
    cleaned_texts, timestamp_ranges = extract_and_clean_timestamps(texts)
    summaries = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_api_calls) as executor:
        # Retries live inside the submitted task: futures added while
        # as_completed() is already iterating would never be yielded by it.
        future_to_chunk = {
            executor.submit(_summarize_with_retry, text_chunk): idx
            for idx, text_chunk in enumerate(cleaned_texts)
        }
        for future in concurrent.futures.as_completed(future_to_chunk):
            idx = future_to_chunk[future]
            try:
                summarized_chunk = future.result()
            except Exception as exc:
                print(f'Chunk {idx} generated an exception: {exc}')
                continue
            timestamp = timestamp_ranges[idx]
            # Chunks without a timestamp get an empty heading.
            heading = format_timestamp_link(timestamp) if timestamp else ""
            summary_piece = heading + "\n\n" + (summarized_chunk or "") + "\n"
            summaries.append((idx, summary_piece))

    summaries.sort()  # Ensure summaries are in the correct order
    final_summary = "\n\n".join([summary for _, summary in summaries])

    # Save the final summary. The literal must match the "Dropbox Video Link"
    # option exactly; the previous lower-case spelling never matched.
    if Type == "Dropbox Video Link":
        final_name = "final_dropbox_video.md"
    else:
        final_name = transcript_file_name.replace(".md", "_FINAL.md")
    with open(final_name, 'w', encoding='utf-8') as f:
        f.write(final_summary)


process_and_summarize(transcription_text)

In [None]:
# @markdown Clean folder
# WARNING: removes every .md file in the working directory, including the
# saved transcript and the *_FINAL.md summary — download them first.
!rm *.md