# https://github.com/martinopiaggi/summarize



In [None]:
Source = "" #@param {type:"string"}
Type_of_source = "Youtube video or playlist" #@param ['Youtube video or playlist', 'Google Drive video link','Dropbox video link']
# Short aliases used by every later cell.
Type = Type_of_source
URL = Source

#@markdown ---
#@markdown Insert the API key for the endpoint you want to use: OpenAI when `OpenAI_endpoint` is checked, Groq otherwise.
# SECURITY: the key is intentionally blank. Paste it into the form at run time
# and never save or commit the notebook with a real key filled in.
api_key = "" #@param {type:"string"}

OpenAI_endpoint = True  #@param {type:"boolean"}
use_Youtube_captions = True #@param {type:"boolean"}


*Remember to change the runtime type to T4 GPU if you plan to use Faster Whisper instead of YouTube auto-generated captions.*

In [None]:
%%capture
#@markdown ## Installation of libraries
#@markdown Re-run this cell if you change settings in the previous cell.

import subprocess
import re

import torch
from torch.utils.data import Dataset, DataLoader

# Each dependency is installed and imported only when the selected settings
# need it, so this cell must be re-run after changing the settings cell.

# Caption download support (used when captions are taken from YouTube).
if use_Youtube_captions:
  !pip install youtube-transcript-api
  from youtube_transcript_api import YouTubeTranscriptApi

# Faster Whisper is needed for any non-YouTube source, or for YouTube when
# captions are disabled.
if (not Type == "Youtube video or playlist") or (not use_Youtube_captions):
  !pip install faster-whisper
  from faster_whisper import WhisperModel

# Build the chat-completion client for the chosen endpoint; both branches
# bind it to the same name `client`.
if OpenAI_endpoint:
  !pip install openai
  import openai
  client = openai.OpenAI(api_key=api_key)
else:
  !pip install groq
  from groq import Groq
  client = Groq(api_key=api_key)

# pytube provides the `YouTube` class for YouTube sources.
if Type == "Youtube video or playlist":
  !pip install git+https://github.com/pytube/pytube
  from pytube import YouTube

# Mount Google Drive at /gdrive so the file can be read from the local filesystem.
if Type == "Google Drive video link":
  from google.colab import drive
  drive.mount('/gdrive')

# NOTE(review): ffmpeg is installed only for Dropbox, but the Google Drive
# branch of the fetching cell also calls ffmpeg — confirm it is preinstalled.
if Type == ("Dropbox video link"):
  !sudo apt update && sudo apt install ffmpeg


In [None]:
#@markdown ## Video fetching
#@markdown Re-run this cell if you change the source URL.

# Notebook-wide state shared with the cells below.
video_path_local_list = list()  # local paths of audio files to transcribe
skip_transcription = False      # set to True once YouTube captions are fetched

# Transcript buffers start out empty.
Text = TextTimestamps = ""

def seconds_to_time_format(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated; hours are not capped at two digits.
    """
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    fields = (whole_hours, whole_minutes, secs)
    return ":".join(f"{int(field):02d}" for field in fields)


def download_youtube_audio_only(url):
    """Download the audio-only stream of a YouTube video into the current directory.

    An already-downloaded file is not fetched again (``skip_existing=True``).
    Returns the path reported by pytube for the saved file.
    """
    return (
        YouTube(url)
        .streams
        .get_audio_only()
        .download(output_path=".", skip_existing=True)
    )


def download_youtube_captions(url):
    """Fetch English captions for a YouTube video and save them to a file.

    English captions are requested first. If that fails, the first
    translatable transcript is translated to English instead.

    Args:
        url: YouTube video URL (``youtube.com`` or ``youtu.be`` form).

    Returns:
        A ``(text, transcript_file_name)`` tuple. ``text`` holds one
        ``HH:MM:SS caption`` line per entry, and ``transcript_file_name`` is
        the ``<video_id>_captions.md`` file the same text was written to.

    Raises:
        ValueError: if no 11-character video id can be found in ``url``.
        RuntimeError: if no English or translatable transcript exists.
    """
    regex = r'(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})'
    match = re.search(regex, url)
    if match is None:
        raise ValueError(f"Could not extract a YouTube video id from URL: {url!r}")
    video_id = match.group(1)

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
    except Exception:
        # Any API failure falls back to translation, but unlike a bare
        # `except:` this lets KeyboardInterrupt/SystemExit through.
        transcript = None
        for available_transcript in YouTubeTranscriptApi.list_transcripts(video_id):
            if available_transcript.is_translatable:
                transcript = available_transcript.translate('en').fetch()
                break
        if transcript is None:
            raise RuntimeError(
                f"No English or translatable transcript available for video {video_id}"
            )

    text = "".join(
        f"{seconds_to_time_format(entry['start'])} {entry['text'].strip()}\n"
        for entry in transcript
    )

    transcript_file_name = f"{video_id}_captions.md"

    with open(transcript_file_name, 'w', encoding='utf-8') as f:
        # Write the captions just built, not the (still empty) global `Text`.
        f.write(text)

    return text, transcript_file_name

# Fetch the media (or its captions) according to the selected source type.
if Type == "Youtube video or playlist":
    # Strip a "&t=<seconds>" start offset from the URL. The pattern is a raw
    # string so "\d" is not treated as an invalid string escape.
    URL = re.sub(r'&t=\d+s?', '', URL)
    if use_Youtube_captions:
        Text, transcript_file_name = download_youtube_captions(URL)
        # Captions already give the text, so the Whisper cell is skipped.
        skip_transcription = True
    else:
        video_path_local_list.append(download_youtube_audio_only(URL))

elif Type == "Google Drive video link":
    # URL is treated as a path relative to the mounted "My Drive" folder.
    # The audio track is extracted as 16 kHz mono 16-bit PCM WAV.
    subprocess.run(['ffmpeg', '-y', '-i', '/gdrive/My Drive/' + URL, '-vn', '-acodec', 'pcm_s16le',
                    '-ar', '16000', '-ac', '1', 'gdrive_audio.wav'], check=True)
    video_path_local_list.append("gdrive_audio.wav")

elif Type == "Dropbox video link":
    # Download the video first, then extract its audio in the same format.
    subprocess.run(['wget', URL, '-O', 'dropbox_video.mp4'], check=True)
    subprocess.run(['ffmpeg', '-y', '-i', 'dropbox_video.mp4', '-vn', '-acodec', 'pcm_s16le',
                    '-ar', '16000', '-ac', '1', 'dropbox_video_audio.wav'], check=True)
    video_path_local_list.append("dropbox_video_audio.wav")


In [None]:
%%capture
# @markdown ## Transcription using Faster Whisper
# @markdown ***Only run this cell if the source is not YouTube or you decided not to use YouTube captions.***

# @markdown You can specify the language and initial prompt to increase speed

if not skip_transcription:
  import os  # local import: only this cell needs it

  language = "" # @param {type:"string"}
  initial_prompt = "" # @param {type:"string"}

  # Only the first fetched file is transcribed.
  video_path_local = str(video_path_local_list[0])

  model = WhisperModel('small', device="cuda", compute_type='int8')
  # An empty field or "auto" both mean "detect the language": an empty string
  # is not a language code, so it must be passed as None. An empty prompt is
  # likewise passed as None.
  segments, info = model.transcribe(video_path_local, beam_size=5,
                                    language=language if language not in ("", "auto") else None,
                                    task="translate",
                                    initial_prompt=initial_prompt or None)

  # Swap whatever extension the media has for ".md". A plain
  # replace(".mp4", ".md") left ".wav" names unchanged, so the transcript
  # would have overwritten the audio file itself.
  transcript_file_name = os.path.splitext(video_path_local)[0] + ".md"

  with open(transcript_file_name, 'w', encoding='utf-8') as f:
    for segment in segments:
      start_time = seconds_to_time_format(segment.start)
      Text += f"[{start_time}] {segment.text.strip()} "

    f.write(Text)


In [None]:
# @markdown ## Summarization and elaboration
prompt_type = "Summarization"  # @param ['Summarization', 'Only grammar correction with highlights']
# @markdown Set the number of parallel API calls (be mindful of usage rate limits)
parallel_api_calls = 1 # @param


# System prompts, keyed by the choices offered for `prompt_type` above.
prompts = dict()
prompts['Summarization'] = """Summarize the video transcript excerpt including a concise title that reflects the content. Wrap the title with **markdown bold notation**. Write the summary as if you are continuing a conversation without needing to signal a beginning. Here is the transcript: """
prompts['Only grammar correction with highlights'] = """Repeat the following text correcting any grammatical errors and formatting error. Highlight only the important quote (if there are any) with **markdown bold notation**. Focus solely on the essence of the content as if you are continuing a conversation without using any form of introduction like 'Here's the corrected text:'. Here is the text to fix: """

# The prompt actually sent as the system message for every chunk.
summary_prompt = prompts[prompt_type]


def extract_and_clean_timestamps(text_chunks):
    """Strip ``HH:MM:SS`` timestamps from each chunk and record where it starts.

    Timestamps may be bare (``00:01:02``) or bracketed (``[00:01:02]``). The
    brackets are removed with the timestamp, so no empty ``[]`` is left in
    the cleaned text.

    Args:
        text_chunks: iterable of transcript chunks (strings).

    Returns:
        A ``(cleaned_texts, timestamp_ranges)`` tuple of equal-length lists.
        ``cleaned_texts[i]`` is the chunk without timestamps, stripped of
        surrounding whitespace. ``timestamp_ranges[i]`` is the first
        timestamp found in the chunk, or ``""`` if it had none.
    """
    # The capture group excludes the optional brackets, so findall() yields
    # plain "HH:MM:SS" strings while sub() removes the brackets too.
    timestamp_pattern = re.compile(r'\[?(\d{2}:\d{2}:\d{2})\]?')
    cleaned_texts = []
    timestamp_ranges = []
    for chunk in text_chunks:
        timestamps = timestamp_pattern.findall(chunk)
        timestamp_ranges.append(timestamps[0] if timestamps else "")
        cleaned_texts.append(timestamp_pattern.sub("", chunk).strip())
    return cleaned_texts, timestamp_ranges

def format_timestamp_link(timestamp):
    """Render a timestamp, as a deep link into the video for YouTube sources.

    Args:
        timestamp: an ``HH:MM:SS`` string, or ``""`` when the chunk had no
            timestamp.

    Returns:
        ``""`` for an empty timestamp. For YouTube sources,
        ``"<timestamp> - <URL with t=<seconds>>"``. Otherwise the timestamp
        itself.
    """
    # A chunk without a timestamp has nothing to link to; parsing "" would
    # raise ValueError from int('').
    if not timestamp:
        return ""
    if Type != "Youtube video or playlist":
        return f"{timestamp}"

    hours, minutes, seconds = map(int, timestamp.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    # Start the query string with "?" when the URL has none (e.g. youtu.be/<id>).
    separator = '&' if '?' in URL else '?'
    return f"{timestamp} - {URL}{separator}t={total_seconds}"

import concurrent.futures
import time

def summarize(prompt):
    """Send one transcript chunk to the chat-completions endpoint.

    The module-level ``summary_prompt`` is the system message and ``prompt``
    is the user message. Returns the text of the first completion choice.
    """
    # The model name depends on which endpoint the notebook is configured for.
    model = "gpt-3.5-turbo" if OpenAI_endpoint else "llama3-8b-8192"
    messages = [
        {"role": "system", "content": summary_prompt},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=4096,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def process_and_summarize(text, max_attempts=3, retry_delay=10):
    """Summarize a transcript chunk by chunk and write the result to disk.

    The text is cut into 4096-character chunks that overlap by 20 characters.
    Timestamps are stripped from each chunk, and the chunks are summarized by
    up to ``parallel_api_calls`` worker threads. Each summary is prefixed
    with the first timestamp of its chunk and the pieces are joined in
    transcript order.

    Args:
        text: full transcript, optionally containing ``HH:MM:SS`` timestamps.
        max_attempts: how many times each chunk is tried before giving up.
        retry_delay: seconds to wait between attempts for the same chunk.

    Returns:
        None. The summary is written to ``<transcript name>_FINAL.md``, or to
        ``final_dropbox_video.md`` for Dropbox sources.
    """
    chunk_size, overlap_size = 4096, 20
    stride = chunk_size - overlap_size
    texts = [text[i:i + chunk_size] for i in range(0, len(text), stride)]
    cleaned_texts, timestamp_ranges = extract_and_clean_timestamps(texts)

    def _summarize_with_retry(idx):
        # Retry inside the worker so the result is always collected. Futures
        # submitted after as_completed() has started are never waited on.
        for attempt in range(1, max_attempts + 1):
            try:
                return summarize(cleaned_texts[idx])
            except Exception as exc:
                print(f'Chunk {idx} generated an exception: {exc}')
                if attempt < max_attempts:
                    time.sleep(retry_delay)
        return None  # best effort: the chunk is skipped after the last attempt

    with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_api_calls) as executor:
        # map() yields results in submission order, so no sorting is needed.
        results = list(executor.map(_summarize_with_retry, range(len(cleaned_texts))))

    summaries = []
    for idx, summarized_chunk in enumerate(results):
        if summarized_chunk is None:
            continue
        timestamp = timestamp_ranges[idx]
        # Chunks without a timestamp get no link prefix.
        prefix = format_timestamp_link(timestamp) if timestamp else ""
        summaries.append(prefix + " " + summarized_chunk + "\n")

    final_summary = "\n\n".join(summaries)

    # Save the final summary
    final_name = transcript_file_name.replace(".md", "_FINAL.md") if Type != "Dropbox video link" else "final_dropbox_video.md"
    with open(final_name, 'w', encoding='utf-8') as f:
        f.write(final_summary)


# Summarize the transcript held in the global `Text` and write the result to disk.
process_and_summarize(Text)