In [1]:
!apt-get update -qq
!apt-get install -y -qq ffmpeg

!pip install --upgrade -q yt-dlp pysrt tqdm

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/174.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.4/104.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pysrt (setup.py) ... [?25l[?25hdone


In [2]:
from pathlib import Path

# Colab 내 로컬 워크스페이스에 dataset 폴더 생성
base = Path('/content/dataset')
dirs = ['videos', 'subtitles', 'audio', 'segments']
for d in dirs:
    (base / d).mkdir(parents=True, exist_ok=True)

print("폴더 구조:")
!find /content/dataset -maxdepth 2 -type d

폴더 구조:
/content/dataset
/content/dataset/segments
/content/dataset/audio
/content/dataset/videos
/content/dataset/subtitles


In [3]:
video_ids = [
    "3S4cNfl0YF0",
    "cQntMUMQyRw",
    "CG4ihzTaGdM",
    "bJ1nqEC3i0A",
    "Tpc39Bv3YJ8",
    "Z4iE3aaNeTM",
    "O91G3Likq3w"
]

# Colab 세션에 저장 (나중에 bash에서 불러쓰기 편하게)
with open('/content/mit_ocw_ids.txt', 'w') as f:
    for vid in video_ids:
        f.write(vid + '\n')

In [4]:
%%bash
COOKIES=/content/cookies.txt

for VID in $(cat /content/mit_ocw_ids.txt); do
  # 1) 비디오(mp4) 다운로드
  yt-dlp --cookies "$COOKIES" -f best \
    -o "/content/dataset/videos/${VID}.%(ext)s" \
    "https://www.youtube.com/watch?v=${VID}" >/dev/null

  # 2) 공식 자막 다운로드
  yt-dlp --cookies "$COOKIES" --all-subs --sub-lang en --skip-download \
    -o "/content/dataset/subtitles/${VID}.%(ext)s" \
    "https://www.youtube.com/watch?v=${VID}" >/dev/null

  # 3) 오디오 추출
  ffmpeg -y -i "/content/dataset/videos/${VID}.mp4" \
    -ar 16000 -ac 1 "/content/dataset/audio/${VID}.wav" \
    -loglevel error
done

         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.
         To let yt-dlp download and merge the best available formats, simply do not pass any format selection.


In [5]:
import pysrt
from pathlib import Path
import subprocess
import json
import re

MIN_DUR = 10.0   # 최소 10초
MAX_DUR = 30.0   # 최대 30초

def clean_text(text: str) -> str:
    # 1) [Music], [Applause] 같은 대괄호 안 설명 제거
    text = re.sub(r"\[.*?\]", "", text)
    # 2) (laughs), (cough) 등 소괄호 안 설명 제거
    # text = re.sub(r"\(.*?\)", "", text)
    # 3) “SPEAKER:” 형태로 붙어있는 발표자 이름 레이블 제거
    text = re.sub(r"^[A-Za-z ]+:\s*", "", text)
    # 4) 연속된 공백 하나로 축소, 앞뒤 공백 제거
    text = re.sub(r"\s+", " ", text).strip()
    return text

def load_subs(srt_path: Path):
    subs = pysrt.open(str(srt_path))
    items = []
    for sub in subs:
        start = sub.start.ordinal / 1000.0
        end   = sub.end.ordinal   / 1000.0
        raw   = sub.text.replace('\n', ' ')
        # 여기서 clean_text 적용
        text  = clean_text(raw)
        if text:  # 빈 문자열인 경우 건너뛰기
            items.append((start, end, text))
    return items

def group_subs(sub_items):
    """인접한 자막을 묶어서 MAX_DUR 이내 구간 생성"""
    segments = []
    cur_start, cur_end, cur_text = sub_items[0]
    for start, end, text in sub_items[1:]:
        if end - cur_start <= MAX_DUR:
            cur_end  = end
            cur_text = cur_text + ' ' + text
        else:
            segments.append((cur_start, cur_end, cur_text))
            cur_start, cur_end, cur_text = start, end, text
    segments.append((cur_start, cur_end, cur_text))
    return segments

def split_long_seg(start, end, text):
    """30초 초과 구간은 균등 분할"""
    import math
    total = end - start
    n_chunks = math.ceil(total / MAX_DUR)
    words = text.split()
    per = math.ceil(len(words) / n_chunks)
    out = []
    for i in range(n_chunks):
        t0 = start + i * (total / n_chunks)
        t1 = start + min((i+1) * (total / n_chunks), total + start)
        txt_chunk = ' '.join(words[i*per:(i+1)*per])
        out.append((t0, t1, txt_chunk))
    return out

def refine_segments(raw_segments):
    """MIN_DUR 미만 제거, MAX_DUR 초과는 split_long_seg 적용"""
    final = []
    for s, e, t in raw_segments:
        dur = e - s
        if dur < MIN_DUR:
            continue
        elif dur <= MAX_DUR:
            final.append((s, e, t))
        else:
            final.extend(split_long_seg(s, e, t))
    return final

In [11]:
def cut_audio_and_make_meta(vid_id, max_samples=1000):
    audio_dir = Path('/content/dataset/audio')
    seg_dir   = Path('/content/dataset/segments')
    seg_dir.mkdir(exist_ok=True)
    meta_path = Path('/content/dataset/metadata.jsonl')

    # 1) 자막 불러오기
    raw = load_subs(Path(f'/content/dataset/subtitles/{vid_id}.en.vtt'))
    # 2) 묶기 → 3) 정제
    grouped = group_subs(raw)
    segs    = refine_segments(grouped)

    meta = []
    sid = 0
    for start, end, text in segs:
        if len(meta) >= max_samples:
            break
        duration = end - start
        out_name = f"{vid_id}_{sid}.wav"
        out_path = seg_dir / out_name

        # ffmpeg로 실제 자르기
        subprocess.run([
            'ffmpeg', '-y',
            '-ss', str(start),
            '-t',  str(duration),
            '-i',  str(audio_dir / f'{vid_id}.wav'),
            '-ar', '16000', '-ac', '1',
            str(out_path)
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        meta.append({
            "audio_filepath": str(out_path),
            "text": text
        })
        sid += 1

    # JSONL로 저장
    with open(meta_path, 'a', encoding='utf-8') as f:
        for entry in meta:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
    print(f"[{vid_id}] 생성된 세그먼트: {len(meta)}개")

#  예시: 모든 비디오에 대해 반복 실행
for vid in video_ids:
    cut_audio_and_make_meta(vid, max_samples=300)  # 비디오당 최대 200개

[3S4cNfl0YF0] 생성된 세그먼트: 158개
[cQntMUMQyRw] 생성된 세그먼트: 156개
[CG4ihzTaGdM] 생성된 세그먼트: 149개
[bJ1nqEC3i0A] 생성된 세그먼트: 107개
[Tpc39Bv3YJ8] 생성된 세그먼트: 98개
[Z4iE3aaNeTM] 생성된 세그먼트: 107개
[O91G3Likq3w] 생성된 세그먼트: 106개


In [10]:
#!rm -rf /content/dataset/segments/*

In [7]:
!pip install -q datasets[audio] huggingface_hub

In [8]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
!pip install -q datasets[audio] huggingface_hub pandas

from datasets import Dataset, DatasetDict, Audio
import pandas as pd

# 2) metadata.jsonl 을 pandas로 로드
df = pd.read_json("/content/dataset/metadata.jsonl", lines=True)

# 3) Dataset 객체 생성
ds = Dataset.from_pandas(df, split="train")

# 4) 컬럼 이름 변경 (audio_filepath → audio)
ds = ds.rename_column("audio_filepath", "audio")

# 5) Audio 타입으로 캐스팅
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))

# 6) Hugging Face Hub 로그인 (토큰 입력)
from huggingface_hub import notebook_login
notebook_login()

# 7) DatasetDict 로 래핑 & push
repo_id = "yongjune2002/MITOCW-whisper"  # 본인 ID로 바꾸세요
dataset_dict = DatasetDict({"train": ds})

dataset_dict.push_to_hub(
    repo_id=repo_id,
    private=False,   # 공개하려면 False
    token=True
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Map:   0%|          | 0/494 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Map:   0%|          | 0/493 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]