<a href="https://colab.research.google.com/github/martinopiaggi/summarize/blob/main/Martino_Summarize_videos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summarization notebook with AIs

Repository: https://github.com/martinopiaggi/summarize

In [None]:
#@markdown # Source of the summary
#@markdown ## **Type**
Type = "Dropbox video link" #@param ['Text', 'Text from Google Drive','Youtube video or playlist', 'Videos on Google Drive folder','Dropbox video link']
#@markdown (*Run this cell again if you change the source*)

#@markdown ---
#@markdown #### **Text**
#@markdown (*only if type is text*)
Text = "" #@param {type:"string"}
#@markdown #### **Youtube video or playlist**
#@markdown (*only if type is yt videos*)
URL = "https://www.youtube.com/watch?v=VqnF1TTkKV0" #@param {type:"string"}
#@markdown #### **Google Drive video**
#@markdown *audio (mp4, wav), or folder containing video and/or audio files*
#@markdown (*only if type is from Google Drive*)
video_path = "Colab Notebooks/transcription/my_video.mp4" #@param {type:"string"}
#@markdown #### **Dropbox link video**
#@markdown *The video share link which allows anyone to view it*
dropbox_URL = "" #@param {type:"string"}
#@markdown ---
#@markdown #### If source is video, you want timestamps in final summary?
Timestamps = True #@param {type:"boolean"}

if Type is ("Text" or "Text from Google Drive"):
  Timestamps = False

In [None]:
!wget -O dropbox_video.mp4


In [None]:
!sudo apt update && sudo apt install ffmpeg
!ffmpeg -i dropbox_video.mp4 -vn -acodec copy dropbox_video_audio.wav

In [None]:
#@markdown ---
#@markdown # Install libraries
#@markdown This cell will take a little while to download several libraries

#@markdown ---
!pip install transformers
!pip install tensorflow
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn",device=0)

import re
import math

if Type == ("Youtube video or playlist" or 'Videos on Google Drive folder'):

  video_path_local_list = []
  ! pip install faster-whisper
  from faster_whisper import WhisperModel
  from pathlib import Path
  import subprocess
  import torch
  import shutil
  import numpy as np

  model = WhisperModel('small', device="cuda", compute_type='int8')


if Type == "Youtube video or playlist":
  !pip install yt-dlp
  from pathlib import Path
  import yt_dlp

  ydl_opts = {
  'format': 'm4a/bestaudio/best',
  'outtmpl': '%(id)s.%(ext)s',
  # ℹ️ See help(yt_dlp.postprocessor) for a list of available Postprocessors and their arguments
  'postprocessors': [{  # Extract audio using ffmpeg
  'key': 'FFmpegExtractAudio',
  'preferredcodec': 'wav',
  }]
  }

  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    error_code = ydl.download([URL])
    list_video_info = [ydl.extract_info(URL, download=False)]

  for video_info in list_video_info:
    video_path_local_list.append(Path(f"{video_info['id']}.wav"))

  for video_path_local in video_path_local_list:
    if video_path_local.suffix == ".mp4":
        video_path_local = video_path_local.with_suffix(".wav")
    result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])


In [None]:
#@markdown ---


# Transcribe the prepared audio file for every source that is not plain text.
# Relies on `model`, `video_path_local`, `shutil` and `Timestamps` from earlier cells.
if Type not in ("Text", "Text from Google Drive"):

  # Markdown is not imported anywhere else in the notebook; without this import
  # the status messages at the end of the cell raise NameError.
  from IPython.display import Markdown, display

  def seconds_to_time_format(s):
      """Format a non-negative number of seconds as HH:MM:SS.

      Fractional seconds are truncated, not rounded.
      """
      hours, remainder = divmod(int(s), 3600)
      minutes, seconds = divmod(remainder, 60)
      return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


  #@markdown # Transcription
  #@markdown Transcription of videos (if needed)
  language = "auto" #@param ["auto", "en", "zh", "ja", "fr", "de"] {allow-input: true}
  initial_prompt = "Here are some English words you may need: OneDrive" #@param {type:"string"}

  # "auto" is passed to the model as language=None.
  segments, info = model.transcribe(str(video_path_local), beam_size=5,
                                    language=None if language == "auto" else language,
                                    initial_prompt=initial_prompt,
                                    vad_filter=True, #voice activity detection
                                    vad_parameters=dict(min_silence_duration_ms=50))

  # One line per segment: "HH:MM:SS -> HH:MM:SS text" (or just the text when
  # Timestamps is off). The summarization cell parses this exact layout.
  ext_name = ".srt"
  transcript_file_name = video_path_local.stem + ext_name
  sentence_idx = 1
  with open(transcript_file_name, 'w') as f:
    for segment in segments:
      if Timestamps:
        ts_start = seconds_to_time_format(segment.start)
        ts_end = seconds_to_time_format(segment.end)
        f.write(f"{ts_start} -> {ts_end} ")
      f.write(f"{segment.text.strip()}\n")
      sentence_idx = sentence_idx + 1

  # Best effort: copy the transcript to `drive_whisper_path` when that name exists
  # (it is not defined in the cells visible here); otherwise report the local file.
  try:
    shutil.copy(video_path_local.parent / transcript_file_name,
              drive_whisper_path / transcript_file_name
    )
    display(Markdown(f"**Transcript file created: {drive_whisper_path / transcript_file_name}**"))
  except Exception:
    display(Markdown(f"**Transcript file created: {video_path_local.parent / transcript_file_name}**"))



In [None]:
#@markdown ---
#@markdown # Summarization
#@markdown Using https://huggingface.co/facebook/bart-large-cnn


def _split_into_chunks(token_ids, max_chunk_tokens=512, merge_limit=1024):
    """Split a token list into near-equal chunks of at most ~max_chunk_tokens tokens.

    The last two chunks are merged, in their original order, when together they
    stay below merge_limit. Returns an empty list for empty input.
    """
    if not token_ids:
        return []
    n_chunks = math.ceil(len(token_ids) / max_chunk_tokens)
    chunk_size = len(token_ids) // n_chunks
    pieces = [token_ids[i:i + chunk_size] for i in range(0, len(token_ids), chunk_size)]
    if len(pieces) > 1 and len(pieces[-1]) + len(pieces[-2]) < merge_limit:
        # Append the tail to its predecessor; popping both and adding them
        # would put the tail in front and scramble the text order.
        tail = pieces.pop()
        pieces[-1] = pieces[-1] + tail
    return pieces


# `summarizer` was already built by the install cell, which is also the cell that
# imports `pipeline`; loading the model a second time here is unnecessary.
tokenizer = summarizer.tokenizer

if Type not in ("Text", "Text from Google Drive"):
  with open(transcript_file_name, "r") as transcript_file:
    Text = transcript_file.read()

Text = re.sub(r'\n', ' ', Text)

# Special tokens are left out so chunk boundaries and decoded text contain only content.
tokens = tokenizer.encode(Text.strip(), add_special_tokens=False)
chunks = _split_into_chunks(tokens)

summary = ''

for chunk in chunks:
    # Always decode: the summarizer needs the text whether or not timestamps are on.
    chunkText = tokenizer.decode(chunk, skip_special_tokens=True)
    init_ts = ''
    end_ts = ''
    if Timestamps:
      # First start stamp and last end stamp of the chunk; a chunk boundary can
      # cut through a stamp, so either list may be empty.
      start_stamps = re.findall(r"\d{2}:\d{2}:\d{2} -", chunkText)
      end_stamps = re.findall(r"> \d{2}:\d{2}:\d{2} ", chunkText)
      if start_stamps:
        init_ts = start_stamps[0]
      if end_stamps:
        end_ts = end_stamps[-1]
      chunkText = re.sub(r"(\d{2}:?)* -> (\d{2}:?)*", '', chunkText)

    if not chunkText.strip():
      continue

    # Set max_length and min_length based on token count (max_length must stay positive)
    max_length = max(len(chunk) // 3, 1)
    min_length = len(chunk) // 5

    # Generate summary for each chunk without sampling, so re-runs give the same
    # text; truncation guards against inputs longer than the model accepts.
    summary_chunk = summarizer(chunkText, max_length=max_length, min_length=min_length,
                               do_sample=False, truncation=True)
    if Timestamps and (init_ts or end_ts):
      summary += init_ts
      summary += end_ts + ' '
    summary += summary_chunk[0]['summary_text'] + "\n"

print(summary)