# Speech to Text with Whisper OpenAI

### Import Dependencies

In [1]:
from pydub import AudioSegment
import whisper
from tqdm.notebook import tqdm
import os
import warnings
from datetime import timedelta, datetime
import textwrap
from pydub.utils import mediainfo
import platform
import psutil
import time
import json
import hashlib

In [2]:
# --- Siapkan path file unik ---
def get_unique_filename(base_path):
    if not os.path.exists(base_path):
        return base_path
    name, ext = os.path.splitext(base_path)
    counter = 1
    while True:
        new_path = f"{name}_{counter}{ext}"
        if not os.path.exists(new_path):
            return new_path
        counter += 1

### Load Model & Estimate

In [5]:
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# --- Load Whisper model ---
print("🔧 Load Whisper model...")
model = whisper.load_model("turbo") 
print("✅ Whisper model loaded.")

# --- Load audio file ---
print("🎧 Load audio file...")
audio_path = "assets/audio/FGD peranan unit transformasi organisasi untuk mendukung perubahan disruptif sesi 2.mp3"
audio = AudioSegment.from_file(audio_path)


# --- Extract and display metadata ---
info = mediainfo(audio_path)
duration_sec = float(info.get("duration", 0))
duration_str = str(timedelta(seconds=int(duration_sec)))

print("ℹ️  Metadata Audio:")
print(f"    • File name    : {audio_path.split('/')[-1]}")
print(f"    • Duration     : {duration_str}")
print(f"    • Codec        : {info.get('codec_name', 'N/A')}")
print(f"    • Sample rate  : {info.get('sample_rate', 'N/A')} Hz")
print(f"    • Channels     : {info.get('channels', 'N/A')}")
print(f"    • Bitrate      : {info.get('bit_rate', 'N/A')} bps")
print(f"    • Format       : {info.get('format_name', 'N/A')}")

print("✅ Audio loaded.")

# --- Split audio into 1-minute chunks ---
chunk_duration_ms = 60 * 1000  # 60 detik
chunks = [audio[i:i + chunk_duration_ms] for i in range(0, len(audio), chunk_duration_ms)]
print(f"📦 Total chunks: {len(chunks)} (durasi: 1 menit per chunk)")

# Info spek sistem
print("\n💻 Spesifikasi Sistem:")
print(f"    • Platform        : {platform.system()} {platform.release()}")
print(f"    • Processor       : {platform.processor()}")
print(f"    • CPU cores       : {psutil.cpu_count(logical=False)} fisik / {psutil.cpu_count(logical=True)} logical")
print(f"    • RAM             : {round(psutil.virtual_memory().total / (1024**3), 2)} GB")

# --- Estimasi waktu adaptif sebelum proses chunk ---
print("⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...")

# Ambil 1 menit pertama dari audio (atau kurang kalau audio-nya pendek)
sample_duration_ms = min(60 * 1000, len(audio))  # Maks 1 menit
sample_chunk = audio[:sample_duration_ms]
temp_sample_path = "temp_sample_chunk.wav"
sample_chunk.export(temp_sample_path, format="wav")

# Hitung waktu transkripsi
start_time = time.time()
result = model.transcribe(temp_sample_path, language="id")
chunk_text = result["text"].strip()
elapsed = time.time() - start_time

# Estimasi total
estimated_per_chunk = round(elapsed, 2)
estimated_chunks = int(len(audio) / sample_duration_ms)
estimated_total_time = elapsed * estimated_chunks
estimated_total_fmt = str(timedelta(seconds=int(estimated_total_time)))


print(f"    🧮 Estimasi waktu per chunk   : {estimated_per_chunk} detik")
print(f"    🧮 Estimasi total transkripsi : {estimated_total_fmt} untuk ~{estimated_chunks} chunks")

# Hapus file sementara
if os.path.exists(temp_sample_path):
    os.remove(temp_sample_path)

print("✅ Estimasi selesai.\n")
print(f"📄 Contoh transkripsi chunk sample:\n{textwrap.fill(chunk_text, width=100)}\n")

🔧 Load Whisper model...
✅ Whisper model loaded.
🎧 Load audio file...
ℹ️  Metadata Audio:
    • File name    : FGD peranan unit transformasi organisasi untuk mendukung perubahan disruptif sesi 2.mp3
    • Duration     : 3:29:09
    • Codec        : mp3
    • Sample rate  : 44100 Hz
    • Channels     : 2
    • Bitrate      : 192002 bps
    • Format       : mp3
✅ Audio loaded.
📦 Total chunks: 210 (durasi: 1 menit per chunk)

💻 Spesifikasi Sistem:
    • Platform        : Darwin 24.5.0
    • Processor       : arm
    • CPU cores       : 8 fisik / 8 logical
    • RAM             : 8.0 GB
⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...
    🧮 Estimasi waktu per chunk   : 49.54 detik
    🧮 Estimasi total transkripsi : 2:52:33 untuk ~209 chunks
✅ Estimasi selesai.

📄 Contoh transkripsi chunk sample:
Bismillahirrahmanirrahim. Kita mulai lagi ya, sesi 3 nih. Assalamu'alaikum warahmatullahi
wabarakatuh. Waalaikumsalam. Kita siang ini akan dapat sharing dari BRI terkait dengan
Transfor

### Transcribe

In [7]:
def calculate_file_hash(filepath):
    hash_md5 = hashlib.md5()
    with open(filepath, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

# --- Buat folder output jika belum ada ---
output_dir = "output_transkrip"
os.makedirs(output_dir, exist_ok=True)

# --- Cek checkpoint dan validasi file audio ---
checkpoint_file = os.path.join(output_dir, "checkpoint.json")
audio_hash = calculate_file_hash(audio_path)
start_chunk = 0

if os.path.exists(checkpoint_file):
    with open(checkpoint_file, "r") as f:
        checkpoint_data = json.load(f)
        saved_hash = checkpoint_data.get("audio_hash")
        if saved_hash == audio_hash:
            start_chunk = checkpoint_data.get("last_processed_chunk", 0)
            output_txt_path = checkpoint_data.get("output_txt_path")
            output_vtt_path = checkpoint_data.get("output_vtt_path")
            print(f"⏸️ Melanjutkan dari chunk ke-{start_chunk + 1}")
        else:
            print("⚠️ File audio berbeda. Mulai dari awal.")
            start_chunk = 0
            today_str = datetime.today().strftime("%Y%m%d")
            output_txt_path = get_unique_filename(os.path.join(output_dir, f"transkrip_output_{today_str}.txt"))
            output_vtt_path = get_unique_filename(os.path.join(output_dir, f"transkrip_output_{today_str}.vtt"))
else:
    print("🚀 Memulai proses baru dari awal...")
    start_chunk = 0
    today_str = datetime.today().strftime("%Y%m%d")
    output_txt_path = get_unique_filename(os.path.join(output_dir, f"transkrip_output_{today_str}.txt"))
    output_vtt_path = get_unique_filename(os.path.join(output_dir, f"transkrip_output_{today_str}.vtt"))

# --- Format waktu ---
def format_time(ms):
    return str(timedelta(milliseconds=ms)).split(".")[0]

def format_time_vtt(ms):
    td = timedelta(milliseconds=ms)
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    milliseconds = ms % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"

# --- Siapkan path file unik (jika diperlukan) ---
def get_unique_filename(base_path):
    if not os.path.exists(base_path):
        return base_path
    name, ext = os.path.splitext(base_path)
    counter = 1
    while True:
        new_path = f"{name}_{counter}{ext}"
        if not os.path.exists(new_path):
            return new_path
        counter += 1

# --- Buka file untuk tulis awal jika dari awal, atau append jika melanjutkan ---
txt_mode = "w" if start_chunk == 0 else "a"
vtt_mode = "w" if start_chunk == 0 else "a"

with open(output_txt_path, txt_mode, encoding="utf-8") as txt_file, \
     open(output_vtt_path, vtt_mode, encoding="utf-8") as vtt_file:

    if start_chunk == 0:
        vtt_file.write("WEBVTT\n\n")  # Header hanya ditulis saat awal

    full_transcript = ""

    progress_bar = tqdm(
        total=len(chunks),
        initial=start_chunk,
        desc="🔄 Transcribing",
        unit="chunk",
        leave=True
    )

    for i in range(start_chunk, len(chunks)):
        start_ms = i * chunk_duration_ms
        end_ms = min((i + 1) * chunk_duration_ms, len(audio))
        chunk_path = f"temp_chunk_{i}.wav"
        chunks[i].export(chunk_path, format="wav")

        try:
            result = model.transcribe(chunk_path, language="id")
            chunk_text = result["text"].strip()

            if len(chunk_text.split()) < 3:
                progress_bar.update(1)
                continue

            start_display = format_time(start_ms)
            end_display = format_time(end_ms)
            start_vtt = format_time_vtt(start_ms)
            end_vtt = format_time_vtt(end_ms)

            # --- Tulis ke file ---
            txt_file.write(f"[{start_display} - {end_display}]\n{chunk_text}\n\n")
            vtt_file.write(f"{i+1}\n{start_vtt} --> {end_vtt}\n{chunk_text}\n\n")

            # --- Tampilkan ke console ---
            wrapped_text = textwrap.fill(chunk_text, width=100)
            tqdm.write(f"{i+1}\n{start_vtt} --> {end_vtt}\n{wrapped_text}\n")

            full_transcript += chunk_text + " "

            # ✅ Update checkpoint
            with open(checkpoint_file, "w") as f:
                json.dump({
                    "last_processed_chunk": i + 1,
                    "audio_hash": audio_hash,
                    "output_txt_path": output_txt_path,
                    "output_vtt_path": output_vtt_path
                }, f)

        finally:
            if os.path.exists(chunk_path):
                os.remove(chunk_path)
            progress_bar.update(1)

    progress_bar.close()

# --- Ringkasan ---
print(f"\n💾 Transkrip lengkap disimpan ke file: {output_txt_path}")
print(f"💾 Subtitle disimpan ke file: {output_vtt_path}")

print("\n📄 Transkrip Ringkas (tanpa timestamp):\n")
print(full_transcript.strip())

⏸️ Melanjutkan dari chunk ke-62


🔄 Transcribing:  29%|##9       | 61/210 [00:00<?, ?chunk/s]

62
01:01:00.000 --> 01:02:00.000
Kita proposal-pil mau bottlenecking ya? Sama approval Pak. Approval. Kalau misalnya kita punya ada
proposal tertentu, kita sekalian approval di situ Pak. Jadi approval, proposal-pil bottlenecking,
semua hal kita eskalasi ke skripot itu. Langsung kita lempar ke yang tertinggi ini. Langsung ke
board of director Pak. Pertanyaan saya, kalau itu saya hilangkan, jalan nggak? Kalau itu dihilangkan
itu pertanyaan bagus Pak. Kalau itu dihilangkan, maksudnya kedisiplinan yang mingguan itu
dihilangkan ya? Ya step-up saya hilangkan. Step-up saya hilangkan, saya cuma adakan approval aja.
Kalau menurut saya ya, sebenarnya kan kalau dilihat di bank ya Pak, mekanisme organisasi kan
sebenarnya sudah cukup rigid ya. Beberapa level putusan itu kan ada beberapa jalan pintu masuk ya.
Misalnya rapat direksi, kemudian beberapa komiti, kemudian sampai bahkan radir komentar. Que más.

63
01:02:00.000 --> 01:03:00.000
dengan komisaris, itu sebenarnya ada levelnya, sebenarnya ken

## Using Faster Whisper (More faster but less accuracy)

In [None]:
!pip install faster-whisper

In [8]:
from pydub import AudioSegment
from faster_whisper import WhisperModel
from tqdm.notebook import tqdm
import os
import warnings
from datetime import timedelta
import textwrap
from pydub.utils import mediainfo
import platform
import psutil
import time
import numpy as np

warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# --- Load Whisper model ---
print("🔧 Load Whisper model...")
# Ganti "turbo" ke model valid seperti "base", "small", "medium", atau "large"
model = WhisperModel("turbo", device="cpu", compute_type="int8")
print("✅ Whisper model loaded.")

# --- Load audio file ---
print("🎧 Load audio file...")
audio_path = "assets/audio/RFOJK20250603_Risalah Rapat Taksonomi Data.m4a"
audio = AudioSegment.from_file(audio_path)

# --- Extract and display metadata ---
info = mediainfo(audio_path)
duration_sec = float(info.get("duration", 0))
duration_str = str(timedelta(seconds=int(duration_sec)))

print("ℹ️  Metadata Audio:")
print(f"    • File name    : {os.path.basename(audio_path)}")
print(f"    • Duration     : {duration_str}")
print(f"    • Codec        : {info.get('codec_name', 'N/A')}")
print(f"    • Sample rate  : {info.get('sample_rate', 'N/A')} Hz")
print(f"    • Channels     : {info.get('channels', 'N/A')}")
print(f"    • Bitrate      : {info.get('bit_rate', 'N/A')} bps")
print(f"    • Format       : {info.get('format_name', 'N/A')}")
print("✅ Audio loaded.")

# --- Split audio into 1-minute chunks ---
chunk_duration_ms = 60 * 1000  # 60 detik
chunks = [audio[i:i + chunk_duration_ms] for i in range(0, len(audio), chunk_duration_ms)]
print(f"📦 Total chunks: {len(chunks)} (durasi: 1 menit per chunk)")

# --- System Info ---
print("\n💻 Spesifikasi Sistem:")
print(f"    • Platform        : {platform.system()} {platform.release()}")
print(f"    • Processor       : {platform.processor()}")
print(f"    • CPU cores       : {psutil.cpu_count(logical=False)} fisik / {psutil.cpu_count(logical=True)} logical")
print(f"    • RAM             : {round(psutil.virtual_memory().total / (1024**3), 2)} GB")

# --- Estimasi waktu adaptif sebelum proses chunk ---
print("⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...")

# Ambil 1 menit pertama dari audio (atau kurang kalau audio-nya pendek)
sample_duration_ms = min(60 * 1000, len(audio))  # Maks 1 menit
sample_chunk = audio[:sample_duration_ms]
temp_sample_path = "temp_sample_chunk.wav"
sample_chunk.export(temp_sample_path, format="wav")

# Hitung waktu transkripsi
start_time = time.time()
segments, _ = model.transcribe(temp_sample_path, language="id")
chunk_text = " ".join([seg.text.strip() for seg in segments]).strip()
elapsed = time.time() - start_time

estimated_per_chunk = round(elapsed, 2)
estimated_chunks = int(len(audio) / sample_duration_ms)
estimated_total_time = elapsed * estimated_chunks
estimated_total_fmt = str(timedelta(seconds=int(estimated_total_time)))

print(f"    🧮 Estimasi waktu per chunk   : {estimated_per_chunk} detik")
print(f"    🧮 Estimasi total transkripsi : {estimated_total_fmt} untuk ~{estimated_chunks} chunks")

# Hapus file sementara
if os.path.exists(temp_sample_path):
    os.remove(temp_sample_path)

print("✅ Estimasi selesai.\n")
print(f"📄 Contoh transkripsi chunk sample:\n{textwrap.fill(chunk_text, width=100)}\n")

🔧 Load Whisper model...
✅ Whisper model loaded.
🎧 Load audio file...
ℹ️  Metadata Audio:
    • File name    : RFOJK20250603_Risalah Rapat Taksonomi Data.m4a
    • Duration     : 1:46:19
    • Codec        : aac
    • Sample rate  : 48000 Hz
    • Channels     : 1
    • Bitrate      : 67341 bps
    • Format       : mov,mp4,m4a,3gp,3g2,mj2
✅ Audio loaded.
📦 Total chunks: 107 (durasi: 1 menit per chunk)

💻 Spesifikasi Sistem:
    • Platform        : Darwin 24.5.0
    • Processor       : arm
    • CPU cores       : 8 fisik / 8 logical
    • RAM             : 8.0 GB
⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...
    🧮 Estimasi waktu per chunk   : 26.86 detik
    🧮 Estimasi total transkripsi : 0:47:27 untuk ~106 chunks
✅ Estimasi selesai.

📄 Contoh transkripsi chunk sample:
Seperti itu, supaya nanti pada saat collecting agregasi di BUM juga tidak mebebani dari MDM yang
cukup tinggi untuk melakukan agregasi Karena dari sisi elemen datanya sama, kemudian dari si Putri
juga, si Pu

🧠 Mulai proses transkripsi...



🔄 Transcribing:   0%|          | 0/107 [00:00<?, ?it/s]

1
00:00:00.000 --> 00:01:00.000
Seperti itu, supaya nanti pada saat collecting agregasi di BUM juga tidak mebebani dari MDM yang
cukup tinggi untuk melakukan agregasi  Karena dari sisi elemen datanya sama, kemudian dari si Putri
juga, si Putri kan juga melakukan pengumpulan data sanksi  Ibaratnya nanti dari masing-masing bidang
yang belum punya tools untuk pencatatan sanksi akan masuk ke si Putri nanti  Itu harapannya nanti di
elemen data ini juga bisa menjadi acuan di seluruh bidang  Dan mungkin ini lebih tepatnya, nanti ini
perlu ditapkan dalam bentuk apa ini?  Apakah perlu dituangkan nanti di dalam EDM, enterprise data
model yang sudah dimiliki DPDES, supaya nanti jadi acuan bagus atau seperti apa?  Karena selama ini
ketika ditanya Tasolomi, kita kirim cuma Excel-Excel ini  Dan izin, sorry, dan Excel ini sebenarnya
kita  Apa?

2
00:01:00.000 --> 00:02:00.000
Dari draft awal, kita compile-compile nih dari masukan dari PPDP, PFML, Perbankan  Tapi untuk kayak
mereview apakah ini udah t

KeyboardInterrupt: 