### Imports

In [None]:
from IPython.display import Audio
import os
import pandas as pd
import librosa
import soundfile as sf
import numpy as np
import pytube
import moviepy.editor as mp
import math
import subprocess

### Clip extraction

* Takes a youtube url as input,
* Downloads the video locally
* Splits the video into X (default 11) segments 
* Gets the highest onset value and its timestamp from each segments
* Outputs 1 second audio files for each segment

# INSERT YOUTUBE URL HERE AND RUN CELL

In [None]:
url = 'https://www.youtube.com/watch?v=dQw4w9WgXcQ'

In [11]:
def download_video(url: str) -> str:
    """
    Download the video from the provided YouTube URL and return the path of the downloaded video file.
    """
    yt = pytube.YouTube(url)
    stream = yt.streams.get_by_itag(22)    
    video_title = yt.title  # get the title of the video
    video_path = stream.download()
    return video_path,video_title



In [27]:
def extract_audio(video_path: str) -> str:
    """
    Extract the audio from the video file and return the path of the extracted audio file.
    """
    audio_path = 'audio_files/audio.wav'
    clip = mp.VideoFileClip(video_path)
    clip.audio.write_audiofile(audio_path)
    
    
    y, sr = librosa.load(audio_path, sr=44100)
    audio_length = len(y) / sr
    audio_length_rounded = np.floor(audio_length / 11) * 11
    y = y[:int(audio_length_rounded * sr)]

    clip.audio.write_audiofile(audio_path)   
    print ("Length of entire audio clip: ", librosa.get_duration(y=y, sr=sr))
    return audio_path



MoviePy - Writing audio in audio_files/audio.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in audio_files/audio.wav


                                                                      

MoviePy - Done.
Length of entire audio clip:  209.0


In [26]:
def extract_loudest_audio(audio_path, num_segments=11, hop_length=512):
    y, sr = librosa.load(audio_path, sr=44100)
    duration = len(y) / sr
    samples_per_segment = int(len(y) / num_segments)
    output_files = []
    
    for i in range(num_segments):
        start_idx = i * samples_per_segment
        end_idx = start_idx + samples_per_segment
        
        if end_idx > len(y):
            print(f"Warning: audio length ({duration}s) is not evenly divisible by num_segments ({num_segments}). Truncating audio.")
            break
        
        segment = y[start_idx:end_idx]
        onset_env = librosa.onset.onset_strength(y=segment, sr=sr, hop_length=hop_length, n_fft=2048, fmax=8000)
        
        if not onset_env.any():
            print(f"Warning: onset batch {i+1} is empty. Skipping segment.")
            continue
        
        loudest_onset_idx = onset_env.argmax()
        start_time = (start_idx + loudest_onset_idx) / sr
        print ("Start time: " ,start_time)
        end_time = start_time + 1 if start_time + 1 < duration else duration
        print ("End time: " ,end_time)
        output_path = f"{i}.mp4"
        command = f'ffmpeg -y -ss {start_time:.2f} -i "{video_path}" -t 1 -c:v libx264 -b:v 1M -c:a aac -strict experimental -threads 1 -loglevel error {output_path}'


        subprocess.call(command, shell=True)
        output_files.append(output_path)

    return output_files



Start time:  0.00029478458049886624
End time:  1.0002947845804988
Start time:  19.282834467120182
End time:  20.282834467120182
Start time:  38.55916099773243
End time:  39.55916099773243
Start time:  57.85922902494331
End time:  58.85922902494331
Start time:  77.11907029478458
End time:  78.11907029478458
Start time:  96.40986394557824
End time:  97.40986394557824
Start time:  115.66816326530612
End time:  116.66816326530612
Start time:  134.96253968253967
End time:  135.96253968253967
Start time:  154.22238095238095
End time:  155.22238095238095
Start time:  173.49666666666667
End time:  174.49666666666667
Start time:  192.7890022675737
End time:  193.7890022675737


In [28]:
video_path, video_title = download_video(url)
audio_path = extract_audio(video_path)
#num_segments is how many clips you want
output_files = extract_loudest_audio(audio_path, num_segments=3)

MoviePy - Writing audio in audio_files/audio.wav


                                                                      

MoviePy - Done.
MoviePy - Writing audio in audio_files/audio.wav


                                                                      

MoviePy - Done.
Length of entire audio clip:  209.0
Start time:  0.00029478458049886624
End time:  1.0002947845804988
Start time:  70.77791383219954
End time:  71.77791383219954
Start time:  141.37244897959184
End time:  142.37244897959184


In [None]:
len(y)