In [None]:
# from google.colab import drive
# import pandas as pd

# drive.mount('/content/drive')

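# df = pd.read_csv('/content/drive/My Drive/CS590/vggsound.csv', header=None)  # the CSV has no header row; without header=None the first clip is consumed as column names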

# print(df.head())

Mounted at /content/drive
   ---g-f_I2yQ    1        people marching   test
0  --0PQM4-hqg   30     waterfall burbling  train
1  --56QUhyDQM  185         playing tennis  train
2  --5OkAjCI7g   40  people belly laughing  train
3  --8puiAGLhs   30    car engine starting  train
4  --96EN9NUQM  242    alarm clock ringing  train


In [None]:
import os
import csv
import subprocess
import librosa
import matplotlib.pyplot as plt
import numpy as np
import cv2

# --- Input / output locations (all under the mounted Google Drive) ---------
csv_file = "/content/drive/My Drive/CS590/vggsound.csv"
output_dir = "/content/drive/My Drive/CS590/dataset"
# Downloaded clips, extracted frames and spectrogram images each get their
# own sub-folder of output_dir.
video_dir, frame_dir, spec_dir = (
    os.path.join(output_dir, sub) for sub in ("videos", "images", "specs")
)

# --- Tunables --------------------------------------------------------------
duration = 10        # seconds; note download_clip() carries its own default
sample_count = 10    # number of CSV rows the processing loop will consume
sr = 22050           # sample rate handed to librosa for loading and mel analysis
img_size = 512       # extracted frames are resized to img_size x img_size

# Create the output folders up front; exist_ok makes re-running the cell safe.
for _out_folder in (video_dir, frame_dir, spec_dir):
    os.makedirs(_out_folder, exist_ok=True)

def download_clip(youtube_id, start_time, duration=10):
    """Download one section of a YouTube video with yt-dlp.

    Args:
        youtube_id: YouTube video id; inserted into the watch URL and used as
            the output file name.
        start_time: Start of the section, in seconds.
        duration: Length of the section in seconds (default 10).

    Returns:
        The target path ``<video_dir>/<youtube_id>.mp4``. The path is returned
        whether or not the download succeeded, so callers must check that the
        file actually exists before using it.
    """
    url = f"https://www.youtube.com/watch?v={youtube_id}"
    out_path = os.path.join(video_dir, f"{youtube_id}.mp4")
    # Pass arguments as a list (no shell): the id comes from a CSV and the
    # output path contains spaces, so neither should go through shell parsing.
    cmd = [
        "yt-dlp",
        "--quiet",
        "--no-warnings",
        "--download-sections", f"*{start_time}-{start_time + duration}",
        "-f", "mp4",
        "-o", out_path,
        url,
    ]
    try:
        result = subprocess.run(cmd)
    except OSError as exc:
        # e.g. yt-dlp is not installed; keep the best-effort contract and let
        # the caller discover the missing file.
        print(f"Could not run yt-dlp for {youtube_id}: {exc}")
        return out_path
    if result.returncode != 0:
        print(f"yt-dlp exited with code {result.returncode} for {youtube_id}")
    return out_path

def extract_middle_frame(video_path, out_path):
    """Save the middle frame of a video as an img_size x img_size image.

    Args:
        video_path: Path of the video file to read.
        out_path: Path the resized frame is written to.

    Returns:
        True if a frame was read and successfully written to ``out_path``;
        False if the video could not be opened, no frame could be read, or
        the image could not be written.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return False
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp to 0 so a non-positive frame count never produces a negative
        # seek position.
        cap.set(cv2.CAP_PROP_POS_FRAMES, max(total // 2, 0))
        success, frame = cap.read()
        if not success:
            return False
        frame = cv2.resize(frame, (img_size, img_size))
        # imwrite reports failure through its return value, not an exception.
        return bool(cv2.imwrite(out_path, frame))
    finally:
        # Always free the capture handle, even if resize/imwrite raises.
        cap.release()

def save_mel_spectrogram(video_path, out_path):
    """Render the log-mel spectrogram of a file's audio to an image file.

    The audio is loaded with librosa at the module-level sample rate ``sr``,
    converted to a 128-band mel power spectrogram, scaled to dB relative to
    its maximum, and drawn without axes using the 'magma' colormap.

    Args:
        video_path: Path of the media file whose audio is loaded.
        out_path: Path the spectrogram image is saved to.

    Raises:
        Whatever librosa or matplotlib raise on unreadable input or an
        unwritable output path; the figure is closed in either case.
    """
    y, _ = librosa.load(video_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    log_mel = librosa.power_to_db(mel, ref=np.max)

    fig = plt.figure(figsize=(5.12, 5.12), dpi=100)
    try:
        ax = fig.add_subplot(111)
        ax.axis('off')
        # NOTE(review): imshow is called with its default origin, so row 0 of
        # log_mel is drawn at the top of the image — confirm this orientation
        # is intended before changing it, since it affects the saved dataset.
        ax.imshow(log_mel, cmap='magma', aspect='auto')
        fig.savefig(out_path, bbox_inches='tight', pad_inches=0)
    finally:
        # Close even when drawing/saving fails so figures do not pile up
        # when this is called in a loop.
        plt.close(fig)

# Process the first `sample_count` rows of the CSV: download the clip, save
# its middle frame and its mel spectrogram, then delete the downloaded video.
# Each row is handled best-effort; a failure is reported and the loop moves on.
with open(csv_file, newline="") as f:  # newline="" is what the csv module expects
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i >= sample_count:
            break

        video_path = None  # lets the cleanup below know whether a download was attempted
        try:
            ytid = row[0]
            start = int(float(row[1]))
            # Not used below; retained from the original cell.
            label = row[2] if len(row) > 2 else "unknown"

            print(f"Processing {ytid}...")

            video_path = download_clip(ytid, start)
            frame_path = os.path.join(frame_dir, f"{ytid}.jpg")
            spec_path = os.path.join(spec_dir, f"{ytid}.png")

            # download_clip returns the target path even when nothing was
            # downloaded, so distinguish that case from a decoding failure.
            if not os.path.exists(video_path):
                print(f"Download failed: {ytid}")
                continue

            success = extract_middle_frame(video_path, frame_path)
            if success:
                save_mel_spectrogram(video_path, spec_path)
                print(f"Done: {ytid}")
            else:
                print(f"Frame extraction failed: {ytid}")
        except Exception as e:
            print(f"Error processing row {i}: {e}")
        finally:
            # Remove the temporary clip on every path (success, extraction
            # failure, or exception) so failed rows do not leave videos behind.
            if video_path is not None and os.path.exists(video_path):
                try:
                    os.remove(video_path)
                except OSError as cleanup_error:
                    print(f"Could not remove {video_path}: {cleanup_error}")

Processing ---g-f_I2yQ...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: ---g-f_I2yQ
Processing --0PQM4-hqg...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --0PQM4-hqg
Processing --56QUhyDQM...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --56QUhyDQM
Processing --5OkAjCI7g...
❌ Frame extraction failed: --5OkAjCI7g
Processing --8puiAGLhs...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --8puiAGLhs
Processing --96EN9NUQM...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --96EN9NUQM
Processing --9O4XZOge4...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --9O4XZOge4
Processing --Aa4M484QM...
❌ Frame extraction failed: --Aa4M484QM
Processing --Bu2xe4OSo...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --Bu2xe4OSo
Processing --CC5pH97q4...


  y, _ = librosa.load(video_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


✔️ Done: --CC5pH97q4


In [None]:
# !pip install -U yt-dlp
# !apt-get install -y ffmpeg
!pip install soundfile

