In [60]:
# Put this in terminal
# pip install transformers 
# pip install ipywidgets
# pip install python-dotenv
 
import subprocess #lets us download captions
import os
import re # Detect lines that are just numbers and skip them
import glob #let's python search for files matching a pattern
import time
from transformers import pipeline

files = glob.glob("*.vtt")
print(files)

['bO5nvE289ec.en.vtt']


In [None]:
video_url = "https://www.youtube.com/watch?v=bO5nvE289ec"

In [75]:
def get_transcript(video_url: str, lang = "en", cookies_file = "cookies (1).txt") -> str:
  # First, get information about available captions
  info_result = subprocess.run([
      "yt-dlp",
      "--list-subs",
      video_url
  ], check=False, text=True, capture_output=True)

  print("Available captions:")
  print(info_result.stdout)
  print(info_result.stderr)

  match = re.search(r"v=([a-zA-z0-9_-]+)",video_url)
  if not match:
      raise ValueError("Invalid YouTube URL")
  else:
    video_id = match.group(1)
  
  # This is a list so we have to get the first element which is our file
  caption_file = glob.glob(f"{video_id}.*")
  # If we already have the caption file, read it and return the transcript
  if not caption_file:
    print("No creator captions, trying auto captions")
    other_result = subprocess.run([
    "yt-dlp", #download captions
    "--write-auto-sub", #auto subtitles if possible
    "--write-sub", #Also can use creator captions if they have it
    "--sub-langs", f"{lang}", #english language
    "--skip-download", #prevents actual video from being downloaded
    "--output", "%(id)s.%(ext)s", #forces file names to be VIDEOID.ext
    "--verbose", # Add verbose flag for debugging
    video_url #enter actual link
    ], check = False, text = True, capture_output = True)
    caption_file = glob.glob(f"{video_id}.vtt")
    if not caption_file:
      raise ValueError("No captions found")

  file = caption_file[0]
# Get rid of empty spaces and arrows
  print(f"Using caption file {file}")
  transcript = ""
  with open(file, "r", encoding = "utf-8") as f:
    for line in f:
      line = line.strip()
      if not line:
        continue
      if "-->" in line:
        continue
      if re.match(r"^\d+$", line):
        continue
      transcript += line + " "
    cleaned_transcript = re.sub(r"<.*?>", " ", transcript)
  return cleaned_transcript.split()

In [76]:
text = get_transcript(video_url)
print(text[:100])

Available captions:
[youtube] Extracting URL: https://www.youtube.com/watch?v=bO5nvE289ec
[youtube] bO5nvE289ec: Downloading webpage
[youtube] bO5nvE289ec: Downloading tv client config
[youtube] bO5nvE289ec: Downloading tv player API JSON
[youtube] bO5nvE289ec: Downloading web safari player API JSON
[youtube] bO5nvE289ec: Downloading m3u8 information
[info] Available automatic captions for bO5nvE289ec:
Language      Name                                               Formats
en-US                                                            vtt
ab            Abkhazian                                          vtt, srt, ttml, srv3, srv2, srv1, json3
aa            Afar                                               vtt, srt, ttml, srv3, srv2, srv1, json3
af            Afrikaans                                          vtt, srt, ttml, srv3, srv2, srv1, json3
ak            Akan                                               vtt, srt, ttml, srv3, srv2, srv1, json3
sq            Albanian          

In [77]:
import textwrap
from typing import List, Optional

def _normalize_token(t: str) -> str:
    # remove angle-tags, lower-case, strip surrounding punctuation
    t = re.sub(r"<.*?>", "", t)
    t = t.lower()
    t = re.sub(r"^[^\w']+|[^\w']+$", "", t)  # strip leading/trailing punctuation (keep apostrophes)
    return t

def remove_consecutive_duplicate_phrases(words: List[str],
                                        max_seq_len: Optional[int] = None,
                                        normalize: bool = False) -> List[str]:
    """
    Collapse consecutive duplicated phrases in a token list.
      - words: list of word tokens (strings)
      - max_seq_len: maximum phrase length to consider (None -> n//2)
      - normalize: if True, compare using a normalized version of tokens (lowercase, strip tags/punct)
    Returns a cleaned list of tokens with duplicates collapsed (one copy kept).
    """
    n = len(words)
    if n == 0:
        return []

    if max_seq_len is None:
        max_seq_len = n // 2
    max_seq_len = max(1, min(max_seq_len, n // 2))

    # optionally build normalized view for comparisons
    if normalize:
        norm_words = [_normalize_token(w) for w in words]
    else:
        norm_words = words

    cleaned: List[str] = []
    i = 0
    while i < n:
        found = False
        # largest possible chunk at i that can repeat at least once
        max_L = min(max_seq_len, (n - i) // 2)
        # try long -> short so we prefer whole-sentence matches
        for L in range(max_L, 0, -1):
            a_norm = norm_words[i:i+L]
            b_norm = norm_words[i+L:i+2*L]
            if a_norm == b_norm:
                # count how many consecutive copies of this chunk exist
                count = 1
                while i + (count+1)*L <= n and norm_words[i + count*L : i + (count+1)*L] == a_norm:
                    count += 1
                # keep ONE copy (the original tokens, not normalized)
                cleaned.extend(words[i:i+L])
                # skip all copies
                i += count * L
                found = True
                break
        if not found:
            # no repeated chunk starting at i -> keep single token
            cleaned.append(words[i])
            i += 1

    return str(cleaned)

# ----- quick tests -----
long_text = remove_consecutive_duplicate_phrases(text)  # your video transcript

# Wrap text to 80 characters per line
joined_text = ''
for word in long_text:
  joined_text += word

replace = ['"',"'",","]
for punc in replace:
  joined_text = joined_text.replace(punc,"")
new_text = joined_text.strip("[]")
lister = new_text.split()
newer_text = lister[5:-1]
newest_text = " ".join(newer_text)

wrapped_text = textwrap.fill(newest_text, width=110)
print(wrapped_text)

This video is about how to find good parameters for a machine learning model. The search for good parameters
is known as optimization and the tool we use is known as an optimizer. For a long time the atom optimizer has
been the default choice. But now theres a new exciting challenger the muon optimizer. The muon optimizer is
getting increasingly more attention in the machine learning community. Its delivering impressive results on
small language models and is about twice as computationally efficient as AdamW. In other words you can train
faster use less memory and still get great results. Lets first revisit Adam. In standard supervised learning
we have a model that makes predictions based on the input data. At first these predictions are just random
guesses. Since the models parameters are initialized randomly we use the training data to compute the gradient
of the loss with respect to each parameter. The gradient acts like a guide showing us which direction the
parameters should move 

In [85]:
# Our text is too long! We gotta chunk it up
# The model has a max token limit (1024 for BART)
# 1 token about 0.75 words on average
# So let's chunk text into 500 word chunks just for safety
def chunk_text(text, word_limit = 700):
    text = text.split()
    text_length = len(text)
    chunks = []
    num_chunks = (text_length // word_limit) + 1 
    for i in range(num_chunks):  
        if i == num_chunks - 1: # last chunk
            # join to the end
            chunks.append(" ".join(text[i*word_limit:]))
        else:
            # join for every 500 chunks
            chunks.append(" ".join(text[i*word_limit:(i+1)*word_limit]))
    return chunks

chunks = chunk_text(newest_text)
overall_summary = ""
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
for chunk in chunks:
    # Now you can use the summarizer
    summary = summarizer(chunk, max_length=250, min_length=30, do_sample=False)
    overall_summary += summary[0]['summary_text'] + " "
wrapped_summary = textwrap.fill(overall_summary, width = 110)
print(wrapped_summary)


Device set to use cpu


The muon optimizer is getting increasingly more attention in the machine learning community. Its delivering
impressive results on small language models and is about twice as computationally efficient as AdamW.
Mathematically this process is called singular value decomposition or SVD. It allows us to express any 2D
matrix as the product of three special matrices U S and VRpose. By computing the SVD of our momentum matrix
then setting all the singular values in S to one we obtain the osono matrix we want. Researchers have observed
that as trending continues the attention logics can grow larger and larger. This may cause the trending
process to become unstable. To resolve this issue we add the weight decay mechanism as used in Atom W. We
adjust the learning rate by taking account the size of the 2D matrix. Multi-head latent attention MLA proposed
by DC. MLA compress the query key and value representations into a low rank space to reduce the size of the KB
cache. But this low rank key valu