# Speech to Text with Whisper OpenAI

### Import Dependencies

In [None]:
from pydub import AudioSegment
import whisper
from tqdm.notebook import tqdm
import os
import warnings
from datetime import timedelta, datetime
import textwrap
from pydub.utils import mediainfo
import platform
import psutil
import time

### Load Model & Estimate

In [None]:
warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# --- Load Whisper model ---
print("🔧 Load Whisper model...")
model = whisper.load_model("turbo") 
print("✅ Whisper model loaded.")

# --- Load audio file ---
print("🎧 Load audio file...")
audio_path = "/Users/naufalnashif/Desktop/automation-risalah-rapat-with-ai/assets/audio/RFOJK20250603_Risalah Rapat Taksonomi Data.m4a"
audio = AudioSegment.from_file(audio_path)


# --- Extract and display metadata ---
info = mediainfo(audio_path)
duration_sec = float(info.get("duration", 0))
duration_str = str(timedelta(seconds=int(duration_sec)))

print("ℹ️  Metadata Audio:")
print(f"    • File name    : {audio_path.split('/')[-1]}")
print(f"    • Duration     : {duration_str}")
print(f"    • Codec        : {info.get('codec_name', 'N/A')}")
print(f"    • Sample rate  : {info.get('sample_rate', 'N/A')} Hz")
print(f"    • Channels     : {info.get('channels', 'N/A')}")
print(f"    • Bitrate      : {info.get('bit_rate', 'N/A')} bps")
print(f"    • Format       : {info.get('format_name', 'N/A')}")

print("✅ Audio loaded.")

# --- Split audio into 1-minute chunks ---
chunk_duration_ms = 60 * 1000  # 60 detik
chunks = [audio[i:i + chunk_duration_ms] for i in range(0, len(audio), chunk_duration_ms)]
print(f"📦 Total chunks: {len(chunks)} (durasi: 1 menit per chunk)")

# Info spek sistem
print("\n💻 Spesifikasi Sistem:")
print(f"    • Platform        : {platform.system()} {platform.release()}")
print(f"    • Processor       : {platform.processor()}")
print(f"    • CPU cores       : {psutil.cpu_count(logical=False)} fisik / {psutil.cpu_count(logical=True)} logical")
print(f"    • RAM             : {round(psutil.virtual_memory().total / (1024**3), 2)} GB")

# --- Estimasi waktu adaptif sebelum proses chunk ---
print("⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...")

# Ambil 1 menit pertama dari audio (atau kurang kalau audio-nya pendek)
sample_duration_ms = min(60 * 1000, len(audio))  # Maks 1 menit
sample_chunk = audio[:sample_duration_ms]
temp_sample_path = "temp_sample_chunk.wav"
sample_chunk.export(temp_sample_path, format="wav")

# Hitung waktu transkripsi
start_time = time.time()
result = model.transcribe(temp_sample_path, language="id")
chunk_text = result["text"].strip()
elapsed = time.time() - start_time

# Estimasi total
estimated_per_chunk = round(elapsed, 2)
estimated_chunks = int(len(audio) / sample_duration_ms)
estimated_total_time = elapsed * estimated_chunks
estimated_total_fmt = str(timedelta(seconds=int(estimated_total_time)))


print(f"    🧮 Estimasi waktu per chunk   : {estimated_per_chunk} detik")
print(f"    🧮 Estimasi total transkripsi : {estimated_total_fmt} untuk ~{estimated_chunks} chunks")

# Hapus file sementara
if os.path.exists(temp_sample_path):
    os.remove(temp_sample_path)

print("✅ Estimasi selesai.\n")
print(f"📄 Contoh transkripsi chunk sample:\n{textwrap.fill(chunk_text, width=100)}\n")

🔧 Load Whisper model...
✅ Whisper model loaded.
🎧 Load audio file...
ℹ️  Metadata Audio:
    • File name    : RFOJK20250603_Risalah Rapat Taksonomi Data.m4a
    • Duration     : 1:46:19
    • Codec        : aac
    • Sample rate  : 48000 Hz
    • Channels     : 1
    • Bitrate      : 67341 bps
    • Format       : mov,mp4,m4a,3gp,3g2,mj2
✅ Audio loaded.
📦 Total chunks: 107 (durasi: 1 menit per chunk)

💻 Spesifikasi Sistem:
    • Platform        : Darwin 24.5.0
    • Processor       : arm
    • CPU cores       : 8 fisik / 8 logical
    • RAM             : 8.0 GB
⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...
    🧮 Estimasi waktu per chunk   : 27.07 detik
    🧮 Estimasi total transkripsi : 0:47:49 untuk ~106 chunks
✅ Estimasi selesai.

📄 Contoh transkripsi chunk sample:
Seperti itu, supaya nanti pada saat collecting agregasi di MDM juga tidak mebebani dari MDM yang
cukup tinggi untuk melakukan agregasi Karena dari sisi elemen datanya sama, kemudian dari si Putri
juga, si Pu

### Transcribe

In [None]:
# --- Transkripsi per chunk ---
print("🧠 Mulai proses transkripsi...\n")
full_transcript = ""
formatted_chunks = []  # Untuk display di Notebook
text_for_file = ""      # Untuk .txt
vtt_chunks = []         # Untuk .vtt

def format_time(ms):
    """Format untuk display dan txt: HH:MM:SS"""
    return str(timedelta(milliseconds=ms)).split(".")[0]

def format_time_vtt(ms):
    """Format untuk VTT: HH:MM:SS.mmm"""
    td = timedelta(milliseconds=ms)
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    milliseconds = ms % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"

for i, chunk in enumerate(tqdm(chunks, desc="🔄 Transcribing")):
    start_ms = i * chunk_duration_ms
    end_ms = min((i + 1) * chunk_duration_ms, len(audio))

    chunk_path = f"temp_chunk_{i}.wav"
    chunk.export(chunk_path, format="wav")

    try:
        result = model.transcribe(chunk_path, language="id")
        chunk_text = result["text"].strip()

        if len(chunk_text.split()) < 3:
            continue  # Skip jika chunk terlalu pendek

        start_display = format_time(start_ms)
        end_display = format_time(end_ms)
        start_vtt = format_time_vtt(start_ms)
        end_vtt = format_time_vtt(end_ms)

        # Format untuk notebook display
        formatted_chunk = f"## 🎧 Chunk {i+1}/{len(chunks)}\n🕒 {start_display} - {end_display}\n{chunk_text}\n"
        formatted_chunks.append(formatted_chunk)

        # Format untuk file .txt
        text_for_file += f"[{start_display} - {end_display}]\n{chunk_text}\n\n"

        # Format untuk .vtt
        vtt_chunks.append(f"{i+1}\n{start_vtt} --> {end_vtt}\n{chunk_text}\n")
        
        wrapped_text = textwrap.fill(chunk_text, width=100)
        print(f"{i+1}\n{start_vtt} --> {end_vtt}\n{wrapped_text}\n")
        # print(f"{chunk_text}")
        # Untuk ringkasan
        full_transcript += chunk_text + " "

    finally:
        if os.path.exists(chunk_path):
            os.remove(chunk_path)


# Buat folder output jika belum ada
output_dir = "output_transkrip"
os.makedirs(output_dir, exist_ok=True)

# Format nama file dengan tanggal hari ini
today_str = datetime.today().strftime("%Y%m%d")
base_txt_name = f"transkrip_output_{today_str}.txt"
base_vtt_name = f"transkrip_output_{today_str}.vtt"

# Fungsi untuk menghasilkan nama file unik
def get_unique_filename(base_path):
    if not os.path.exists(base_path):
        return base_path
    name, ext = os.path.splitext(base_path)
    counter = 1
    while True:
        new_path = f"{name}_{counter}{ext}"
        if not os.path.exists(new_path):
            return new_path
        counter += 1

# Path akhir yang aman dari overwrite
output_txt_path = get_unique_filename(os.path.join(output_dir, base_txt_name))
output_vtt_path = get_unique_filename(os.path.join(output_dir, base_vtt_name))

# --- Simpan ke file .txt ---
with open(output_txt_path, "w", encoding="utf-8") as f:
    f.write(text_for_file.strip())
print(f"💾 Transkrip lengkap disimpan ke file: {output_txt_path}")

# --- Simpan ke file .vtt ---
with open(output_vtt_path, "w", encoding="utf-8") as f:
    f.write("WEBVTT\n\n")
    f.write("\n".join(vtt_chunks))
print(f"💾 Subtitle disimpan ke file: {output_vtt_path}")

# --- Ringkasan akhir ---
print("\n📄 Transkrip Ringkas (tanpa timestamp):\n")
print(full_transcript.strip())

## Using Faster Whisper (More faster but less accuracy)

In [None]:
!pip install faster-whisper

In [8]:
from pydub import AudioSegment
from faster_whisper import WhisperModel
from tqdm.notebook import tqdm
import os
import warnings
from datetime import timedelta
import textwrap
from pydub.utils import mediainfo
import platform
import psutil
import time
import numpy as np

warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")

# --- Load Whisper model ---
print("🔧 Load Whisper model...")
# Ganti "turbo" ke model valid seperti "base", "small", "medium", atau "large"
model = WhisperModel("turbo", device="cpu", compute_type="int8")
print("✅ Whisper model loaded.")

# --- Load audio file ---
print("🎧 Load audio file...")
audio_path = "assets/audio/RFOJK20250603_Risalah Rapat Taksonomi Data.m4a"
audio = AudioSegment.from_file(audio_path)

# --- Extract and display metadata ---
info = mediainfo(audio_path)
duration_sec = float(info.get("duration", 0))
duration_str = str(timedelta(seconds=int(duration_sec)))

print("ℹ️  Metadata Audio:")
print(f"    • File name    : {os.path.basename(audio_path)}")
print(f"    • Duration     : {duration_str}")
print(f"    • Codec        : {info.get('codec_name', 'N/A')}")
print(f"    • Sample rate  : {info.get('sample_rate', 'N/A')} Hz")
print(f"    • Channels     : {info.get('channels', 'N/A')}")
print(f"    • Bitrate      : {info.get('bit_rate', 'N/A')} bps")
print(f"    • Format       : {info.get('format_name', 'N/A')}")
print("✅ Audio loaded.")

# --- Split audio into 1-minute chunks ---
chunk_duration_ms = 60 * 1000  # 60 detik
chunks = [audio[i:i + chunk_duration_ms] for i in range(0, len(audio), chunk_duration_ms)]
print(f"📦 Total chunks: {len(chunks)} (durasi: 1 menit per chunk)")

# --- System Info ---
print("\n💻 Spesifikasi Sistem:")
print(f"    • Platform        : {platform.system()} {platform.release()}")
print(f"    • Processor       : {platform.processor()}")
print(f"    • CPU cores       : {psutil.cpu_count(logical=False)} fisik / {psutil.cpu_count(logical=True)} logical")
print(f"    • RAM             : {round(psutil.virtual_memory().total / (1024**3), 2)} GB")

# --- Estimasi waktu adaptif sebelum proses chunk ---
print("⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...")

# Ambil 1 menit pertama dari audio (atau kurang kalau audio-nya pendek)
sample_duration_ms = min(60 * 1000, len(audio))  # Maks 1 menit
sample_chunk = audio[:sample_duration_ms]
temp_sample_path = "temp_sample_chunk.wav"
sample_chunk.export(temp_sample_path, format="wav")

# Hitung waktu transkripsi
start_time = time.time()
segments, _ = model.transcribe(temp_sample_path, language="id")
chunk_text = " ".join([seg.text.strip() for seg in segments]).strip()
elapsed = time.time() - start_time

estimated_per_chunk = round(elapsed, 2)
estimated_chunks = int(len(audio) / sample_duration_ms)
estimated_total_time = elapsed * estimated_chunks
estimated_total_fmt = str(timedelta(seconds=int(estimated_total_time)))

print(f"    🧮 Estimasi waktu per chunk   : {estimated_per_chunk} detik")
print(f"    🧮 Estimasi total transkripsi : {estimated_total_fmt} untuk ~{estimated_chunks} chunks")

# Hapus file sementara
if os.path.exists(temp_sample_path):
    os.remove(temp_sample_path)

print("✅ Estimasi selesai.\n")
print(f"📄 Contoh transkripsi chunk sample:\n{textwrap.fill(chunk_text, width=100)}\n")

🔧 Load Whisper model...
✅ Whisper model loaded.
🎧 Load audio file...
ℹ️  Metadata Audio:
    • File name    : RFOJK20250603_Risalah Rapat Taksonomi Data.m4a
    • Duration     : 1:46:19
    • Codec        : aac
    • Sample rate  : 48000 Hz
    • Channels     : 1
    • Bitrate      : 67341 bps
    • Format       : mov,mp4,m4a,3gp,3g2,mj2
✅ Audio loaded.
📦 Total chunks: 107 (durasi: 1 menit per chunk)

💻 Spesifikasi Sistem:
    • Platform        : Darwin 24.5.0
    • Processor       : arm
    • CPU cores       : 8 fisik / 8 logical
    • RAM             : 8.0 GB
⏳ Menghitung estimasi waktu transkripsi adaptif (1 menit)...
    🧮 Estimasi waktu per chunk   : 26.86 detik
    🧮 Estimasi total transkripsi : 0:47:27 untuk ~106 chunks
✅ Estimasi selesai.

📄 Contoh transkripsi chunk sample:
Seperti itu, supaya nanti pada saat collecting agregasi di BUM juga tidak mebebani dari MDM yang
cukup tinggi untuk melakukan agregasi Karena dari sisi elemen datanya sama, kemudian dari si Putri
juga, si Pu

In [None]:
# --- Transkripsi per chunk ---
print("🧠 Mulai proses transkripsi...\n")
full_transcript = ""
formatted_chunks = []  # Untuk display di Notebook
text_for_file = ""      # Untuk .txt
vtt_chunks = []         # Untuk .vtt

def format_time(ms):
    """Format untuk display dan txt: HH:MM:SS"""
    return str(timedelta(milliseconds=ms)).split(".")[0]

def format_time_vtt(ms):
    """Format untuk VTT: HH:MM:SS.mmm"""
    td = timedelta(milliseconds=ms)
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    seconds = total_seconds % 60
    milliseconds = ms % 1000
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"

for i, chunk in enumerate(tqdm(chunks, desc="🔄 Transcribing")):
    start_ms = i * chunk_duration_ms
    end_ms = min((i + 1) * chunk_duration_ms, len(audio))

    chunk_path = f"temp_chunk_{i}.wav"
    chunk.export(chunk_path, format="wav")

    try:
        segments, _ = model.transcribe(chunk_path, language="id")

        # Ambil teks dari setiap segment:
        chunk_text = " ".join([seg.text for seg in segments]).strip()

        if len(chunk_text.split()) < 3:
            continue  # Skip jika chunk terlalu pendek

        start_display = format_time(start_ms)
        end_display = format_time(end_ms)
        start_vtt = format_time_vtt(start_ms)
        end_vtt = format_time_vtt(end_ms)

        # Format untuk notebook display
        formatted_chunk = f"## 🎧 Chunk {i+1}/{len(chunks)}\n🕒 {start_display} - {end_display}\n{chunk_text}\n"
        formatted_chunks.append(formatted_chunk)

        # Format untuk file .txt
        text_for_file += f"[{start_display} - {end_display}]\n{chunk_text}\n\n"

        # Format untuk .vtt
        vtt_chunks.append(f"{i+1}\n{start_vtt} --> {end_vtt}\n{chunk_text}\n")
        
        wrapped_text = textwrap.fill(chunk_text, width=100)
        print(f"{i+1}\n{start_vtt} --> {end_vtt}\n{wrapped_text}\n")
        # print(f"{chunk_text}")
        # Untuk ringkasan
        full_transcript += chunk_text + " "

    finally:
        if os.path.exists(chunk_path):
            os.remove(chunk_path)


# Buat folder output jika belum ada
output_dir = "output_transkrip"
os.makedirs(output_dir, exist_ok=True)

# Format nama file dengan tanggal hari ini
today_str = datetime.today().strftime("%Y%m%d")
base_txt_name = f"transkrip_output_{today_str}.txt"
base_vtt_name = f"transkrip_output_{today_str}.vtt"

# Fungsi untuk menghasilkan nama file unik
def get_unique_filename(base_path):
    if not os.path.exists(base_path):
        return base_path
    name, ext = os.path.splitext(base_path)
    counter = 1
    while True:
        new_path = f"{name}_{counter}{ext}"
        if not os.path.exists(new_path):
            return new_path
        counter += 1

# Path akhir yang aman dari overwrite
output_txt_path = get_unique_filename(os.path.join(output_dir, base_txt_name))
output_vtt_path = get_unique_filename(os.path.join(output_dir, base_vtt_name))

# --- Simpan ke file .txt ---
with open(output_txt_path, "w", encoding="utf-8") as f:
    f.write(text_for_file.strip())
print(f"💾 Transkrip lengkap disimpan ke file: {output_txt_path}")

# --- Simpan ke file .vtt ---
with open(output_vtt_path, "w", encoding="utf-8") as f:
    f.write("WEBVTT\n\n")
    f.write("\n".join(vtt_chunks))
print(f"💾 Subtitle disimpan ke file: {output_vtt_path}")

# --- Ringkasan akhir ---
print("\n📄 Transkrip Ringkas (tanpa timestamp):\n")
print(full_transcript.strip())

🧠 Mulai proses transkripsi...



🔄 Transcribing:   0%|          | 0/107 [00:00<?, ?it/s]

1
00:00:00.000 --> 00:01:00.000
Seperti itu, supaya nanti pada saat collecting agregasi di BUM juga tidak mebebani dari MDM yang
cukup tinggi untuk melakukan agregasi  Karena dari sisi elemen datanya sama, kemudian dari si Putri
juga, si Putri kan juga melakukan pengumpulan data sanksi  Ibaratnya nanti dari masing-masing bidang
yang belum punya tools untuk pencatatan sanksi akan masuk ke si Putri nanti  Itu harapannya nanti di
elemen data ini juga bisa menjadi acuan di seluruh bidang  Dan mungkin ini lebih tepatnya, nanti ini
perlu ditapkan dalam bentuk apa ini?  Apakah perlu dituangkan nanti di dalam EDM, enterprise data
model yang sudah dimiliki DPDES, supaya nanti jadi acuan bagus atau seperti apa?  Karena selama ini
ketika ditanya Tasolomi, kita kirim cuma Excel-Excel ini  Dan izin, sorry, dan Excel ini sebenarnya
kita  Apa?

2
00:01:00.000 --> 00:02:00.000
Dari draft awal, kita compile-compile nih dari masukan dari PPDP, PFML, Perbankan  Tapi untuk kayak
mereview apakah ini udah t

KeyboardInterrupt: 