In [1]:
# Run this to activate venv for the terminal instance: .venv\Scripts\activate

import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import ffmpeg
import torch
import torchaudio
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB

# Expected versions: 2.8.0 for both torch and torchaudio (and 20250625 for whisper)
print(f"torch version: {torch.__version__}")
print(f"torchaudio version: {torchaudio.__version__}")

# Use the GPU (CUDA) when one is available; otherwise fall back to the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")


torch version: 2.8.0+cu129
torchaudio version: 2.8.0+cu129
Using device: cuda


**STEP 1**

*"For each video, ffmpeg, a command-line multimedia framework, is used to isolate the audio stream from the video file and set it in two formats: the first as a 16kHz mono .wav file, and the second as a 32kHz mono .wav file."*

ffmpeg parameters reference:
1. out16 = output file path
2. format/-f = formats the container, we want wav
3. acodec = audio codec, this is usually pcm_s16le -> signed 16-bit little-endian
4. ac = number of audio channels, we want mono aka 1
5. ar = sample rate, we want 16kHz/32kHz aka 16000/32000

NOTE: overwrite will be set to true for now while testing for single video.

NOTE: I referred to the ffmpeg documentation [here](https://ffmpeg.org/documentation.html)


In [2]:
def _extract_mono_wav(video_path: str, out_path: str, sample_rate: int, overwrite: bool) -> None:
    """Write the audio of `video_path` to `out_path` as a mono 16-bit PCM .wav at `sample_rate` Hz."""
    label = f"{sample_rate // 1000}kHz"

    # Without the overwrite flag ffmpeg will not silently replace an existing file
    # (it asks for confirmation or aborts), which cannot be handled from a quiet
    # notebook run -- so an existing output is simply kept.
    if not overwrite and Path(out_path).exists():
        print(f"Skipped {label} (already exists)", out_path)
        return

    job = ffmpeg.input(video_path).output(
        out_path, format='wav', acodec='pcm_s16le', ac=1, ar=sample_rate
    )
    if overwrite:
        job = job.overwrite_output()

    try:
        job.run(quiet=True)
    except ffmpeg.Error as err:
        # quiet=True captures ffmpeg's stderr instead of showing it, so surface
        # the captured log before letting the original exception propagate.
        if err.stderr:
            print(err.stderr.decode("utf-8", errors="replace"))
        raise

    print(f"Wrote {label}", out_path)


def extract_audio_to_wavs(video_path: str, out16: str, out32: str, overwrite: bool=True):
    """Extract the audio track of a video into two mono 16-bit PCM .wav files.

    Args:
        video_path: path of the input video.
        out16: output path for the 16 kHz .wav.
        out32: output path for the 32 kHz .wav.
        overwrite: when True, existing output files are replaced; when False,
            an output that already exists is left untouched and skipped.

    Raises:
        ffmpeg.Error: if ffmpeg exits with an error (its stderr is printed first).
    """
    # 16kHz mono
    _extract_mono_wav(video_path, out16, 16000, overwrite)

    # 32kHz mono
    _extract_mono_wav(video_path, out32, 32000, overwrite)


**STEP 2**

*"Then, the Python library torchaudio loads the files, and both waveforms are then converted into log-Mel spectrogram form."*

Fourier Transform reference:
1. win_length = number of samples per frame
2. hop_length = how far the window moves each step (in samples)
3. n_fft = FFT size in samples (each windowed frame is zero-padded up to this length); it yields n_fft/2 + 1 frequency bins and is usually chosen as a power of 2

NOTE: we will likely need to modify parameters so that the log-Mel will be plug-and-play when feeding it to CNN14 later on.

NOTE: it turns out there is a small inaccuracy in the paper: we only need to convert the 32kHz wav into log-Mel, since Whisper converts the 16kHz one automatically.

NOTE: I referred to the torchaudio documentation [here](https://docs.pytorch.org/audio/main/)

In [3]:
def convert_to_log_mel(wav_path: str, sample_rate: int=32000, n_mels: int=80, win_ms: float=25.0,
                       hop_ms: float=10.0, n_fft: int=None, device: str="cpu"):
    """Load a .wav file and convert it to a log-Mel spectrogram in decibels.

    Args:
        wav_path: path of the .wav file to load.
        sample_rate: sample rate the file is required to have, in Hz.
        n_mels: number of Mel bins.
        win_ms: analysis window length in milliseconds.
        hop_ms: hop between consecutive windows in milliseconds.
        n_fft: FFT size; when None, the smallest power of two that is
            >= the window length in samples is used.
        device: torch device the transforms run on.

    Returns:
        The log-Mel spectrogram as a CPU tensor of shape (n_mels, frames).

    Raises:
        ValueError: if the file's sample rate differs from `sample_rate`, if
            the window/hop/FFT sizes are inconsistent, or if the audio is too
            short to be analysed.
    """
    waveform, sr = torchaudio.load(wav_path)

    # Make sure we're passing out32 and not out16
    if sr != sample_rate:
        raise ValueError(f"Expected {sample_rate} Hz .wav but got {sr} Hz")

    # Make sure we're passing mono; multi-channel audio is downmixed by averaging
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # STFT parameters, converted from milliseconds to samples
    win_length = int(round((win_ms/1000.0)*sample_rate))
    hop_length = int(round((hop_ms/1000.0)*sample_rate))
    if win_length < 1 or hop_length < 1:
        raise ValueError(
            f"win_ms={win_ms} and hop_ms={hop_ms} must each cover at least one sample "
            f"at {sample_rate} Hz (got win_length={win_length}, hop_length={hop_length})"
        )

    # FFT size: default to the next power of two at or above the window length
    if n_fft is None:
        n_fft = 1 << (win_length-1).bit_length()
    elif n_fft < win_length:
        # The STFT requires win_length <= n_fft; fail early with a readable message
        raise ValueError(f"n_fft ({n_fft}) must be >= win_length ({win_length})")

    # MelSpectrogram reflect-pads n_fft // 2 samples on each side by default,
    # which only works when the signal is longer than the padding.
    num_samples = waveform.shape[-1]
    if num_samples <= n_fft // 2:
        raise ValueError(
            f"Audio in {wav_path} is too short ({num_samples} samples) for n_fft={n_fft}"
        )

    # Heaviest parts, we assign to the requested device
    # Outputs the power spectrogram on the Mel scale
    mel_spec_transform = MelSpectrogram(sample_rate=sample_rate, n_fft=n_fft, win_length=win_length,
                                        hop_length=hop_length, n_mels=n_mels, power=2.0).to(device)
    # Converts spectrogram power values into decibels
    to_decibel = AmplitudeToDB(stype="power").to(device)

    waveform = waveform.to(device)
    mel_spec = mel_spec_transform(waveform)

    # Convert to log scale, i.e. decibels
    log_mel = to_decibel(mel_spec)

    # Drop the channel dimension and hand the result back on the CPU
    return log_mel.squeeze(0).cpu()

def plot_log_mel(log_mel, sample_rate, title="Log-Mel Spectogram"):
    """Show a log-Mel spectrogram as a heat map using pyplot.

    Args:
        log_mel: tensor to draw; it is detached, moved to the CPU and converted
            to a NumPy array before plotting.
        sample_rate: accepted for the caller's convenience; not used by the plot.
        title: title placed above the figure.
    """
    spec = log_mel.detach().cpu().numpy()

    fig, ax = plt.subplots(figsize=(10, 4))
    # origin="lower" puts the first Mel bin at the bottom of the image
    image = ax.imshow(spec, origin="lower", aspect="auto", interpolation="nearest")
    fig.colorbar(image, ax=ax, format="%+2.f dB")

    ax.set_title(title)
    ax.set_xlabel("Frames")
    ax.set_ylabel("Mel bins")

    fig.tight_layout()
    plt.show()

**STEP 3**

*"The spectrogram belonging to the 16kHz file is fed into the Whisper model for transcription."*

NOTE: I referred to the Whisper documentation [here](https://pypi.org/project/openai-whisper/)

In [4]:
# def transcribe_with_whisper(wav_16k_path: str, model_size: str="small", task: str="transcribe"):
#     print("Loading the Whisper model: ", model_size)
#     model = whisper.load_model(model_size)
#     result = model.transcribe(wav_16k_path, task=task)
#     return result

**SAMPLE RUN ON A TIKTOK VIDEO**

In [5]:
def process_video(video_path: str, model_size: str="small", out_dir: str ="proc_out", device: str=None):
    """Extract the audio of one video and save the log-Mel spectrogram of its 32 kHz track.

    Args:
        video_path: path of the input video.
        model_size: Whisper model size; only used by the (currently disabled)
            transcription step.
        out_dir: directory for the 16 kHz .wav and the log-Mel .npy file. The
            32 kHz .wav goes to a sibling directory named `<out_dir>_32kHz`.
        device: torch device for the spectrogram computation; when None, CUDA is
            used if available, otherwise the CPU.
    """
    # Previously the spectrogram always ran on the CPU default, even with a GPU active
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True) # 16kHz, log-Mel, and transcript go to out_dir
    audio_dir = out_dir.parent / (out_dir.name + "_32kHz")
    audio_dir.mkdir(parents=True, exist_ok=True) # 32kHz goes to audio_dir

    video = Path(video_path)
    out16 = out_dir / (video.stem + "_16k.wav") # 16kHz wav output
    out32 = audio_dir / (video.stem + "_32k.wav") # 32kHz output
    npy = out_dir / (video.stem + "_32k_logmel.npy") # log-Mel output

    # Extract audio
    extract_audio_to_wavs(str(video), str(out16), str(out32))

    # Convert to log-Mel spectrogram
    log_mel = convert_to_log_mel(str(out32), sample_rate=32000, n_mels=80, device=device)
    np.save(str(npy), log_mel.numpy())
    print("Saved log-Mel:", npy, log_mel.shape)
    # plot_log_mel(log_mel, 32000, "32kHz Log-Mel Spectogram") # if you wanna see the spectogram

    # # Whisper transcription, can disable since jersey is doing Whisper as well
    # txt = out_dir / (video.stem + "_transcript.txt") # whisper transcript
    # res = transcribe_with_whisper(str(out16), model_size=model_size)
    # with open(txt, "w", encoding="utf-8") as f:
    #     f.write(res["text"])
    # print("Transcript saved:", txt)


In [6]:
# video_path = "media/testvid2.mp4"
# process_video(video_path)

media_dir = Path("media")
# Path.glob yields matches in a filesystem-dependent order; sort so that every
# run processes the videos in the same order.
videos = sorted(media_dir.glob("*.mp4"))
print(f"{len(videos)} videos found!")

for video in videos:
    print(f"\nProcessing: {video.name}")
    # process_video is annotated to take the path as a str
    process_video(str(video))

6 videos found!

Processing: trend3vid6.mp4
Wrote 16kHz proc_out\trend3vid6_16k.wav
Wrote 32kHz proc_out_32kHz\trend3vid6_32k.wav
Saved log-Mel: proc_out\trend3vid6_32k_logmel.npy torch.Size([80, 704])

Processing: trend3vid7.mp4
Wrote 16kHz proc_out\trend3vid7_16k.wav
Wrote 32kHz proc_out_32kHz\trend3vid7_32k.wav
Saved log-Mel: proc_out\trend3vid7_32k_logmel.npy torch.Size([80, 1039])

Processing: trend3vid8.mp4
Wrote 16kHz proc_out\trend3vid8_16k.wav
Wrote 32kHz proc_out_32kHz\trend3vid8_32k.wav
Saved log-Mel: proc_out\trend3vid8_32k_logmel.npy torch.Size([80, 681])

Processing: trend5vid2.mp4
Wrote 16kHz proc_out\trend5vid2_16k.wav
Wrote 32kHz proc_out_32kHz\trend5vid2_32k.wav
Saved log-Mel: proc_out\trend5vid2_32k_logmel.npy torch.Size([80, 546])

Processing: trend5vid3.mp4
Wrote 16kHz proc_out\trend5vid3_16k.wav
Wrote 32kHz proc_out_32kHz\trend5vid3_32k.wav
Saved log-Mel: proc_out\trend5vid3_32k_logmel.npy torch.Size([80, 560])

Processing: trend5vid4.mp4
Wrote 16kHz proc_out\trend5vid4_16k.wav
Wrote 32kHz proc_out_32kHz\trend5vid4_32k.wav
Saved log-Mel: proc_out\trend5vid4_32k_logmel.npy torch.Size([80, 551])
