
https://github.com/martinopiaggi/summarize

In [47]:
# @markdown ## 🔗 **Source Configuration**
# Colab form cell: every `# @param` annotation below is rendered as an input widget.
# The values assigned here are read by the later setup and run cells.

# @markdown **Source Type**
Type_of_source = "Google Drive Video Link"  # @param ["YouTube Video", "Google Drive Video Link", "Dropbox Video Link", "Local File"]

# @markdown **Source URL or Path**
# NOTE(review): the saved value is a YouTube URL while Type_of_source above is
# "Google Drive Video Link" — confirm the two are meant to match before running.
Source = "https://www.youtube.com/watch?v=SYCOdzb8H0U"  # @param {type:"string"}

# Short aliases used by the later cells (`Type` selects the source handling,
# `URL` is the link or path that is passed to the summarizer).
Type = Type_of_source
URL = Source

# @markdown **Use YouTube Captions**
# @markdown If source is a Youtube video, it's recommended to use the available YouTube captions
# @markdown to save on transcription time and API usage.
# Only honored when Type == "YouTube Video"; the run cell forces it to False otherwise.
use_Youtube_captions = True  # @param {type:"boolean"}

# @markdown ## 🎤 **Transcription Settings**
# @markdown Settings applied only if using Whisper transcription, not YouTube Captions

# These three values are forwarded unchanged into CONFIG by the run cell.
transcription_method = "Cloud Whisper"  # @param ["Cloud Whisper", "Local Whisper"]
language = "it"  # @param {type:"string"}
initial_prompt = ""  # @param {type:"string"}

In [55]:
# @markdown ## 🌐 **API Configuration**
# Colab form cell: selects which OpenAI-style endpoint is used for summarization.

predefined_endpoint = "Groq"  # @param ["OpenAI", "Groq", "DeepSeek", "Perplexity", "Google", "Hyperbolic", "Custom"]

# Per-provider settings, keyed by the `predefined_endpoint` choices (except "Custom"):
#   url           - base URL; "/models" is appended to it by fetch_models()
#   default_model - model used when `use_default_model` is checked
#   key_env       - name of the Colab secret / environment variable holding the API key
endpoints = {
    "OpenAI": {"url": "https://api.openai.com/v1", "default_model": "gpt-4o", "key_env": "api_key_openai"},
    "Groq": {"url": "https://api.groq.com/openai/v1", "default_model": "llama-3.3-70b-versatile", "key_env": "api_key_groq"},
    "DeepSeek": {"url": "https://api.deepseek.com/v1", "default_model": "deepseek-chat", "key_env": "api_key_deepseek"},
    "Perplexity": {"url": "https://api.perplexity.ai", "default_model": "sonar-medium-chat", "key_env": "api_key_perplexity"},
    "Google": {"url": "https://generativelanguage.googleapis.com/v1beta/openai", "default_model": "gemini-1.5-pro", "key_env": "api_key_google"},
    "Hyperbolic": {"url": "https://api.hyperbolic.xyz/v1", "default_model": "meta-llama/Llama-3.3-70B-Instruct", "key_env": "api_key_hyperbolic"}
}

# When `use_default_model` is False, `model_name` is used instead of the provider default.
use_default_model = True  # @param {type:"boolean"}
model_name = ""  # @param {type:"string"}
# Only read when `predefined_endpoint` is "Custom".
custom_endpoint_url = ""  # @param {type:"string"}

# Function to get API key (defined here but called later)
def get_api_key():
    """Look up the API key for the selected endpoint.

    Sources are tried in this order:
      1. Colab secrets, under the name stored in the module-level `api_key_env`.
      2. Environment variables (after loading a `.env` file when python-dotenv
         is available): first `api_key_env`, then the generic `api_key`.

    Returns:
        The key string, or None when no source provides one. A short status
        line is printed in either case.
    """
    # Imported locally so the function does not depend on a later cell having
    # already imported `os` / `load_dotenv` into the notebook namespace.
    import os

    try:
        from google.colab import userdata
        api_key = userdata.get(api_key_env)
        if api_key:
            print(f"✅ Found API key in Colab secrets ({api_key_env})")
            return api_key
    except Exception:
        # Not running on Colab, or the secret is missing / access was denied:
        # fall through to the environment lookup. KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        pass

    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is optional here; plain environment variables still work.
        pass
    else:
        load_dotenv()

    api_key = os.getenv(api_key_env) or os.getenv("api_key")
    if api_key:
        print(f"✅ Found API key from environment")
        return api_key

    print("⚠️ No API key found")
    return None

# Function to fetch models
def fetch_models(base_url, api_key):
    """Return the models advertised by an endpoint's `/models` route.

    Args:
        base_url: Endpoint base URL; a trailing slash is tolerated.
        api_key: Sent as a Bearer token in the Authorization header.

    Returns:
        A list of model ids when the response JSON has a "data" list, the raw
        "models" value when it has that key instead, and an empty list on any
        non-200 status, network failure or unexpected payload. Listing models
        is best-effort, so failures never raise.
    """
    import requests

    # Avoid "//models" when the configured base URL already ends with "/".
    models_url = f"{base_url.rstrip('/')}/models"
    try:
        response = requests.get(
            models_url,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=10,
        )
        if response.status_code != 200:
            return []
        data = response.json()
        if "data" in data:
            return [model["id"] for model in data["data"]]
        if "models" in data:
            return data["models"]
        return []
    except (requests.RequestException, ValueError, KeyError, TypeError, AttributeError):
        # Network/HTTP errors, invalid JSON, or a payload with an unexpected shape.
        return []

# Resolve the endpoint URL, its default model, and the name of the secret /
# environment variable that holds the API key for the selected provider.
if predefined_endpoint == "Custom":
    # A custom endpoint has no table entry: its URL and model come from the form.
    base_url = custom_endpoint_url
    default_model = model_name
    api_key_env = "api_key"
else:
    selected_endpoint = endpoints[predefined_endpoint]
    base_url = selected_endpoint["url"]
    default_model = selected_endpoint["default_model"]
    api_key_env = selected_endpoint["key_env"]

# Use the explicit model name only when one was actually entered; a blank
# `model_name` with `use_default_model` unchecked would otherwise send an
# empty model to the API.
requested_model = model_name.strip()
model = default_model if use_default_model or not requested_model else requested_model

In [56]:
# @markdown ## 🛠️ Install Dependencies and Set Up Environment

# Install dependencies silently
import os
import sys
import io
from IPython.display import clear_output

# Capture installation output
def install_with_hidden_output(packages):
    """Install pip requirements quietly, printing output only on failure.

    pip is run with the kernel's own interpreter (`sys.executable -m pip`) so
    packages land in the environment the notebook is executing in. Success is
    decided by pip's exit status rather than by scanning its output for the
    word "error", which pip also prints for harmless resolver notices.

    Args:
        packages: A requirement string (may hold several space-separated
            requirements, as on a pip command line) or a list of such strings.

    Returns:
        True when every pip invocation exits with status 0, False otherwise.
        On failure the captured pip output is printed.
    """
    import shlex
    import subprocess

    requirements = packages if isinstance(packages, list) else [packages]
    failures = []

    for requirement in requirements:
        command = [sys.executable, "-m", "pip", "install", "-q", *shlex.split(requirement)]
        try:
            result = subprocess.run(command, capture_output=True, text=True)
        except OSError as exc:
            # The interpreter could not be launched at all.
            failures.append(str(exc))
            continue
        if result.returncode != 0:
            failures.append(result.stderr or result.stdout)

    if failures:
        print("❌ Installation error:")
        print("\n".join(failures))
        return False
    return True

# Install dependencies silently. Both installs always run, and the combined
# status reflects every one of them (previously the nest_asyncio result was
# discarded).
install_status = all([
    install_with_hidden_output(["nest_asyncio"]),
    install_with_hidden_output("git+https://github.com/martinopiaggi/summarize.git@feature/refactor-backend"),
])

# A failed install is reported but does not stop the cell; the imports below
# will raise if a required package is really missing.
if not install_status:
    print("⚠️ There were issues with installation. Check errors above.")

# Import required modules
import nest_asyncio
from dotenv import load_dotenv
from summarizer import main, CONFIG
import asyncio

# Apply nest_asyncio for Colab compatibility
nest_asyncio.apply()

# Source-specific setup (nothing is installed here):
#   - "Google Drive Video Link": mount the user's Drive at /content/drive
#   - "Local File": import Colab's `files` helper, which the run cell later
#     uses for `files.upload()`
# Both branches import from `google.colab`, so they only work on Colab.
if Type == "Google Drive Video Link":
    from google.colab import drive
    drive.mount('/content/drive')
elif Type == "Local File":
    from google.colab import files

# Get transcription key (always need Groq for Cloud Whisper)
def get_groq_api_key():
    """Return the Groq API key, or None when it is not configured.

    The Colab secret `api_key_groq` is tried first; when Colab is unavailable
    or the secret cannot be read, the `api_key_groq` environment variable is
    used instead.
    """
    try:
        from google.colab import userdata
        groq_api_key = userdata.get('api_key_groq')
        if groq_api_key:
            return groq_api_key
    except Exception:
        # Not on Colab, or the secret is missing / access was denied. Unlike a
        # bare `except:`, this lets KeyboardInterrupt and SystemExit propagate.
        pass
    return os.getenv('api_key_groq')

# Resolve both credentials now that the lookup helpers and their imports exist:
# the key for the selected summarization endpoint, and the Groq key.
api_key = get_api_key()  # prints where the key was found, or a warning
groq_api_key = get_groq_api_key()

# Export the keys through the environment; a missing key becomes "".
os.environ.update({
    'api_key': api_key or "",
    'api_key_groq': groq_api_key or "",
})

# Report the active endpoint/model and list whatever models the endpoint exposes.
if not api_key:
    print("⚠️ No API key available")
else:
    print(f"✅ Using {predefined_endpoint} with model: {model}")
    available_models = fetch_models(base_url, api_key)

    # An empty list (request failed or nothing returned) prints no listing.
    if available_models:
        print("\nAvailable models:")
        for model_item in available_models:
            print(f"- {model_item}")

✅ Found API key in Colab secrets (api_key_groq)
✅ Using Groq with model: llama-3.3-70b-versatile

Available models:
- llama-3.2-11b-vision-preview
- llama-3.2-90b-vision-preview
- qwen-2.5-32b
- gemma2-9b-it
- mistral-saba-24b
- llama-3.2-3b-preview
- llama3-70b-8192
- llama-3.3-70b-versatile
- qwen-2.5-coder-32b
- mixtral-8x7b-32768
- llama3-8b-8192
- distil-whisper-large-v3-en
- llama-3.2-1b-preview
- whisper-large-v3-turbo
- llama-guard-3-8b
- llama-3.1-8b-instant
- whisper-large-v3
- deepseek-r1-distill-qwen-32b
- llama-3.3-70b-specdec
- qwen-qwq-32b
- deepseek-r1-distill-llama-70b


In [50]:
# @markdown ## ⚙️ Configure Summarization Settings
# Colab form cell: the four values below are copied into CONFIG by the run cell
# under the keys "prompt_type", "parallel_api_calls", "chunk_size" and
# "max_output_tokens".
prompt_type = "Summarization"  # @param ['Summarization', 'Only grammar correction with highlights','Distill Wisdom', 'Questions and answers', 'Essay Writing in Paul Graham Style']
parallel_api_calls = 5  # @param {type:"slider", min:1, max:60, step:1}
# NOTE(review): the unit of chunk_size (characters vs. tokens) is not visible here — confirm in the summarizer package.
chunk_size = 16000      # @param {type:"slider", min:2000, max:28000, step:2000}
max_output_tokens = 2048  # @param {type:"slider", min:1024, max:8192, step:1024}

In [51]:
# @markdown ## 🚀 Run Summarization
AutoDownload = False  # @param {type:"boolean"}

# Local-file mode without a path: ask for an upload and use the first uploaded
# file's name as the source.
if Type == "Local File" and not URL:
    print("📁 Please upload your video file...")
    uploaded = files.upload()
    if uploaded:
        URL = next(iter(uploaded))
        print(f"✅ Using uploaded file: {URL}")

# Gather every form value into a single settings mapping and hand it to the
# summarizer's CONFIG in one call.
run_settings = {
    # Source
    "type_of_source": Type,
    "source_url_or_path": URL,
    # Captions apply to YouTube sources only; any other source type gets False.
    "use_youtube_captions": Type == "YouTube Video" and use_Youtube_captions,
    # Transcription
    "transcription_method": transcription_method,
    "language": language,
    "initial_prompt": initial_prompt,
    # Summarization
    "prompt_type": prompt_type,
    "parallel_api_calls": parallel_api_calls,
    "chunk_size": chunk_size,
    "max_output_tokens": max_output_tokens,
    # Endpoint; the API key is passed directly
    "base_url": base_url,
    "model": model,
    "api_key": api_key,
}
CONFIG.update(run_settings)

# Progress indicator
from IPython.display import display, HTML
display(HTML("<div style='padding:10px; background:#e8f4ff; border-radius:5px; margin-bottom:15px;'><b>🚀 Starting summarization...</b></div>"))

try:
    # Fixed: Run in main thread with proper event loop handling
    from concurrent.futures import ThreadPoolExecutor
    import threading

    def run_summarizer(config):
        # Set a new event loop for this thread
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        try:
            return main(config)
        finally:
            new_loop.close()

    # Run in a separate thread with its own event loop
    with ThreadPoolExecutor() as executor:
        future = executor.submit(run_summarizer, CONFIG)
        final_summary = future.result()

    # Save to file with metadata
    filename = "summary.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"# Summary for: {URL}\n\n")
        f.write(f"Generated using: {model} at {base_url}\n\n")
        f.write(final_summary)

    display(HTML(f"<div style='padding:10px; background:#e8fff0; border-radius:5px;'><b>✅ Summary completed!</b> Saved to {filename}</div>"))

    # Download button
    if AutoDownload:
      from google.colab import files
      files.download(filename)

except Exception as e:
    error_message = str(e)
    display(HTML(f"<div style='padding:10px; background:#fff0f0; border-radius:5px;'><b>❌ Error:</b> {error_message}</div>"))

    if "api_key" in error_message.lower():
        display(HTML("<div style='padding:10px; background:#fffde7; border-radius:5px; margin-top:10px;'><b>⚠️</b> Make sure you've set up your API keys correctly!</div>"))
    elif "ffmpeg" in error_message.lower():
        print("\n⚠️ Installing ffmpeg...")
        !apt-get update && apt-get install -y ffmpeg
        print("Please run the cell again.")