### All Again

In [30]:
# Install necessary packages
!pip install -U openai-whisper
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
!pip install ffmpeg-python

# Install FFmpeg (if not already installed in Colab)
!apt-get install ffmpeg

# Install other necessary packages
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu117
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [31]:
import os
import torch
import whisper
import nltk
import ffmpeg
import subprocess
import re
from google.colab import files
from IPython.display import Video, display

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [32]:
print("Is GPU available:", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Is GPU available: True


In [33]:
video_filename = list(uploaded.keys())[0]
print(f"Uploaded video file: {video_filename}")

Uploaded video file: How do SSDs Work_ How to fit 3 WEEKS of TV in a microchip the size of a dime!! Explained in 3min..mp4


In [34]:
# Extract audio from the video using FFmpeg
audio_filename = "extracted_audio.wav"

command = [
    'ffmpeg',
    '-i', video_filename,
    '-f', 'wav',
    '-ar', '16000',  # Sample rate
    '-ac', '1',      # Mono channel
    audio_filename
]

subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
print(f"Audio extracted and saved as {audio_filename}")

Audio extracted and saved as extracted_audio.wav


In [None]:
# Load the Whisper model
model = whisper.load_model("base", device=device)

100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 149MiB/s]
  checkpoint = torch.load(fp, map_location=device)



In [35]:
# Transcribe the audio with timestamps
print("Starting transcription...")
result = model.transcribe(audio_filename, word_timestamps=True)
print("Transcription completed.")

Starting transcription...
Transcription completed.


In [36]:
print(result)

{'text': " Most smartphones can store 128GB and this solid-state drive can store one terabyte of data and it all happens inside this microchip right here. If this one terabyte solid-state drive were full of movies and TV shows it would take about three weeks of non-stop binging to watch them all. So how can this incredibly small microchip fit such an insane amount of content? To understand that we've got to zoom into a nanoscopic view of the insides. In here we can see an individual memory cell called charge trap flash. This memory cell stores three bits of information by trapping different levels of electrons on a charge trap. Very few extra electrons are a 1 1 1 while a lot of electrons are a 0 0 0 and the other levels of trapped electrons have other three bit designations. Measuring this value doesn't change the amount of electrons and once electrons are placed on the charge trap they stay trapped there for years. However when the memory cell is erased the electrons are forcibly rem

In [37]:
# Get the transcription segments
segments = result['segments']

# Prepare lists to hold sentences and their timestamps
sentence_texts = []
sentence_times = []

current_sentence = ''
current_start = None

# Regular expression to detect sentence-ending punctuation
sentence_endings = re.compile(r'[.!?]')

for segment in segments:
    words = segment['words']
    for word_info in words:
        word = word_info['word']
        word_start = word_info['start']
        word_end = word_info['end']

        if current_start is None:
            current_start = word_start

        current_sentence += word

        if sentence_endings.search(word):
            # End of sentence detected
            current_end = word_end
            sentence_texts.append(current_sentence.strip())
            sentence_times.append((current_start, current_end, current_sentence.strip()))
            # Reset for next sentence
            current_sentence = ''
            current_start = None
        else:
            current_sentence += ' '

# Handle any remaining sentence
if current_sentence:
    current_end = word_end
    sentence_texts.append(current_sentence.strip())
    sentence_times.append((current_start, current_end, current_sentence.strip()))

In [38]:
print(sentence_texts)

['Most  smartphones  can  store  128GB  and  this  solid -state  drive  can  store  one  terabyte  of  data  and  it  all  happens  inside  this  microchip  right  here.', 'If  this  one  terabyte  solid -state  drive  were  full  of  movies  and  TV  shows  it  would  take  about  three  weeks  of  non -stop  binging  to  watch  them  all.', 'So  how  can  this  incredibly  small  microchip  fit  such  an  insane  amount  of  content?', "To  understand  that  we've  got  to  zoom  into  a  nanoscopic  view  of  the  insides.", 'In  here  we  can  see  an  individual  memory  cell  called  charge  trap  flash.', 'This  memory  cell  stores  three  bits  of  information  by  trapping  different  levels  of  electrons  on  a  charge  trap.', 'Very  few  extra  electrons  are  a  1  1  1  while  a  lot  of  electrons  are  a  0  0  0  and  the  other  levels  of  trapped  electrons  have  other  three  bit  designations.', "Measuring  this  value  doesn't  change  the  amount  of  electro

In [39]:
print(sentence_times)

[(0.0, 12.6, 'Most  smartphones  can  store  128GB  and  this  solid -state  drive  can  store  one  terabyte  of  data  and  it  all  happens  inside  this  microchip  right  here.'), (12.9, 22.1, 'If  this  one  terabyte  solid -state  drive  were  full  of  movies  and  TV  shows  it  would  take  about  three  weeks  of  non -stop  binging  to  watch  them  all.'), (22.46, 28.12, 'So  how  can  this  incredibly  small  microchip  fit  such  an  insane  amount  of  content?'), (28.56, 33.24, "To  understand  that  we've  got  to  zoom  into  a  nanoscopic  view  of  the  insides."), (33.88, 39.08, 'In  here  we  can  see  an  individual  memory  cell  called  charge  trap  flash.'), (39.6, 45.56, 'This  memory  cell  stores  three  bits  of  information  by  trapping  different  levels  of  electrons  on  a  charge  trap.'), (45.9, 58.34, 'Very  few  extra  electrons  are  a  1  1  1  while  a  lot  of  electrons  are  a  0  0  0  and  the  other  levels  of  trapped  electrons  hav

In [45]:
technical_keywords = {
    'gpu_computing': [
        'gpu', 'cuda', 'parallel computing', 'graphics processing', 'shader',
        'compute shader', 'opencl', 'vulkan', 'graphics pipeline', 'rendering',
        'texture', 'buffer', 'compute unit', 'thread block', 'warp', 'kernel'
    ],
    'computer_architecture': [
        'instruction set', 'isa', 'pipeline', 'branch prediction', 'cache hierarchy',
        'memory hierarchy', 'von neumann', 'harvard architecture', 'superscalar',
        'out of order execution', 'speculative execution', 'microarchitecture',
        'fetch', 'decode', 'execute', 'writeback', 'forwarding', 'hazard',
        'stall', 'microcode', 'microinstruction'
    ],
    'assembly_programming': [
        'assembly', 'assembler', 'mnemonic', 'opcode', 'operand', 'risc',
        'cisc', 'arm assembly', 'x86 assembly', 'riscv', 'risc-v', 'instruction set',
        'register file', 'immediate value', 'addressing mode', 'branch instruction',
        'jump instruction', 'load store', 'arithmetic instruction', 'logical instruction'
    ],
    'low_level': [
        'memory', 'pointer', 'address', 'register', 'cache', 'assembly',
        'instruction', 'binary', 'bit', 'byte', 'stack', 'heap', 'allocation',
        'memory mapping', 'virtual memory', 'physical memory', 'page table',
        'segmentation', 'protection ring', 'privilege level'
    ],
    'system_programming': [
        'operating system', 'driver', 'interrupt', 'system call', 'process',
        'thread', 'scheduling', 'synchronization', 'mutex', 'semaphore',
        'context switch', 'privilege level', 'kernel mode', 'user mode'
    ]
}

# Flatten the list of keywords
all_keywords = set()
for keywords in technical_keywords.values():
    all_keywords.update(keywords)

In [41]:
def sentence_contains_keyword(sentence, keywords):
    sentence_lower = sentence.lower()
    for keyword in keywords:
        if re.search(r'\b' + re.escape(keyword.lower()) + r'\b', sentence_lower):
            return True
    return False

# Identify technical sentences
technical_sentences = []
for start, end, sentence in sentence_times:
    if sentence_contains_keyword(sentence, all_keywords):
        technical_sentences.append((start, end, sentence))

print("Technical sentences identified:")
for s in technical_sentences:
    print(s)

Technical sentences identified:
(33.88, 39.08, 'In  here  we  can  see  an  individual  memory  cell  called  charge  trap  flash.')
(39.6, 45.56, 'This  memory  cell  stores  three  bits  of  information  by  trapping  different  levels  of  electrons  on  a  charge  trap.')
(45.9, 58.34, 'Very  few  extra  electrons  are  a  1  1  1  while  a  lot  of  electrons  are  a  0  0  0  and  the  other  levels  of  trapped  electrons  have  other  three  bit  designations.')
(68.62, 73.5, 'However  when  the  memory  cell  is  erased  the  electrons  are  forcibly  removed.')
(74.02, 82.4, "To  reach  a  terabyte  of  storage  capacity  in  a  single  chip  this  memory  cell  is  copied  and  it's  copied  a  lot.")
(82.88, 94.58, 'First  these  memory  cells  are  stacked  100  layers  tall  and  then  these  stacks  of  cells  are  copied  40 ,000  columns  across  which  is  then  copied  50 ,000  rows  down.')
(113.5, 124.74, 'In  order  to  isolate  and  determine  which  row  and  la

In [42]:
# Define minimum and maximum clip durations
min_clip_duration = 30  # Minimum 30 seconds
max_clip_duration = 75  # Maximum 75 seconds

clips = []
current_clip_sentences = []
current_clip_start = None
current_clip_end = None

idx = 0
sentence_count = len(sentence_times)

while idx < sentence_count:
    start, end, sentence = sentence_times[idx]
    is_technical = sentence_contains_keyword(sentence, all_keywords)

    if is_technical:
        # Start new clip
        current_clip_start = start
        current_clip_end = end
        current_clip_sentences.append((start, end, sentence))

        # Expand clip to meet minimum duration
        clip_duration = current_clip_end - current_clip_start
        idx += 1
        while clip_duration < min_clip_duration and idx < sentence_count:
            next_start, next_end, next_sentence = sentence_times[idx]
            current_clip_end = next_end
            current_clip_sentences.append((next_start, next_end, next_sentence))
            clip_duration = current_clip_end - current_clip_start
            idx += 1

        # Trim clip if it exceeds maximum duration
        if clip_duration > max_clip_duration:
            current_clip_end = current_clip_start + max_clip_duration
            clip_duration = max_clip_duration
            # Remove sentences that exceed the maximum duration
            adjusted_sentences = []
            for s_start, s_end, s_sentence in current_clip_sentences:
                if s_end <= current_clip_end:
                    adjusted_sentences.append((s_start, s_end, s_sentence))
                else:
                    break
            current_clip_sentences = adjusted_sentences

        # Save the clip
        clips.append((current_clip_start, current_clip_end, current_clip_sentences.copy()))

        # Reset for next clip
        current_clip_sentences = []
        current_clip_start = None
        current_clip_end = None
    else:
        idx += 1  # Move to the next sentence

print("Clips to be extracted:")
for idx, (start_time, end_time, sentences_in_clip) in enumerate(clips):
    duration = end_time - start_time
    print(f"\nClip {idx+1}: Start - {start_time:.2f}s, End - {end_time:.2f}s, Duration - {duration:.2f}s")
    print("Sentences in clip:")
    for s in sentences_in_clip:
        print(f" - {s[2]}")

Clips to be extracted:

Clip 1: Start - 33.88s, End - 68.12s, Duration - 34.24s
Sentences in clip:
 - In  here  we  can  see  an  individual  memory  cell  called  charge  trap  flash.
 - This  memory  cell  stores  three  bits  of  information  by  trapping  different  levels  of  electrons  on  a  charge  trap.
 - Very  few  extra  electrons  are  a  1  1  1  while  a  lot  of  electrons  are  a  0  0  0  and  the  other  levels  of  trapped  electrons  have  other  three  bit  designations.
 - Measuring  this  value  doesn't  change  the  amount  of  electrons  and  once  electrons  are  placed  on  the  charge  trap  they  stay  trapped  there  for  years.

Clip 2: Start - 68.62s, End - 112.42s, Duration - 43.80s
Sentences in clip:
 - However  when  the  memory  cell  is  erased  the  electrons  are  forcibly  removed.
 - To  reach  a  terabyte  of  storage  capacity  in  a  single  chip  this  memory  cell  is  copied  and  it's  copied  a  lot.
 - First  these  memory  cells  are

In [43]:
if not os.path.exists(video_filename):
    print(f"Video file {video_filename} not found.")
else:
    os.makedirs('clips', exist_ok=True)
    clip_filenames = []
    for idx, (start, end, _) in enumerate(clips):
        output_path = f'clips/clip_{idx+1}.mp4'
        command = [
            'ffmpeg',
            '-y',
            '-i', video_filename,
            '-ss', str(start),
            '-to', str(end),
            '-c', 'copy',
            output_path
        ]
        subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        clip_filenames.append(output_path)
    print("Video clips extracted:")
    for fname in clip_filenames:
        print(fname)

Video clips extracted:
clips/clip_1.mp4
clips/clip_2.mp4
clips/clip_3.mp4


In [44]:
for fname in clip_filenames:
    print(f"\nDisplaying {fname}:")
    # display(Video(fname, embed=True))
    # Uncomment the following line if you wish to download the clips
    files.download(fname)


Displaying clips/clip_1.mp4:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Displaying clips/clip_2.mp4:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Displaying clips/clip_3.mp4:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>