In [None]:
# 6_audio_to_text_preparation_whisper.ipynb
"""
Prepare audio-to-text training data from Indigenous language recordings using OpenAI's Whisper.
Includes segmentation, transcription, and export to plain text for LLM fine-tuning.
"""

In [None]:
# 📦 Step 1: Install dependencies (run once)
!pip install git+https://github.com/openai/whisper.git
!pip install ffmpeg-python pandas

In [None]:

# 🧠 Step 2: Import libraries
import whisper
import os
import pandas as pd
from pathlib import Path
from datetime import datetime
import ffmpeg

In [None]:

# 📂 Step 3: Choose where recordings are read from and where transcripts go
audio_dir = "../datasets/audio/"  # input recordings (.mp3, .wav, etc.)
output_dir = "../datasets/audio_transcripts/"  # generated CSV / text files
# Create the output folder together with any missing parents; does nothing
# when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:

# 📦 Step 4: Load the Whisper checkpoint used for transcription
# Available sizes: tiny, base, small, medium, large (bigger models are more accurate)
model_size = "small"
model = whisper.load_model(name=model_size)
print(f"✅ Whisper {model_size} model loaded.")

In [None]:

# 🎧 Step 5: Transcribe audio files
def transcribe_audio_file(filepath, language=None):
    """Transcribe a single audio file with the module-level Whisper ``model``.

    Args:
        filepath: ``pathlib.Path`` (or any object with ``.name`` that converts
            to a path via ``str()``) pointing at the recording.
        language: Optional language to force, forwarded to ``model.transcribe``.
            ``None`` (the default) lets Whisper detect the language itself.

    Returns:
        A dict with the keys ``"filename"``, ``"text"`` (stripped transcript)
        and ``"language"`` (as reported by Whisper, ``"unknown"`` if absent),
        or ``None`` when transcription failed. Failures are printed, not raised,
        so one bad file does not stop a batch run.
    """
    try:
        print(f"🎙️ Transcribing {filepath.name}...")
        # Whisper has no "auto" language value: passing it raises
        # "Unsupported language: auto", which the handler below would swallow
        # for every file. Auto-detection is requested with language=None.
        result = model.transcribe(str(filepath), language=language)
        return {
            "filename": filepath.name,
            "text": result["text"].strip(),
            "language": result.get("language", "unknown")
        }
    except Exception as e:
        print(f"⚠️ Failed on {filepath.name}: {e}")
        return None

# Extensions treated as audio input. The previous glob "*.[mw][ap][34]"
# required the extension to end in "3" or "4", so ".wav" files were never
# picked up; matching on the suffix fixes that and ignores letter case.
AUDIO_EXTENSIONS = {".mp3", ".mp4", ".wav"}

results = []

# sorted() gives a deterministic processing order, and therefore a
# deterministic row order in the exported files.
for audio_file in sorted(Path(audio_dir).glob("*")):
    if not audio_file.is_file() or audio_file.suffix.lower() not in AUDIO_EXTENSIONS:
        continue
    result = transcribe_audio_file(audio_file)
    if result:  # None signals a failed transcription; skip it
        results.append(result)

In [None]:

# 📄 Step 6: Save as CSV and plain text
timestamp = datetime.now().strftime("%Y%m%d_%H%M")
# Join with Path so the files land inside output_dir whether or not the
# configured directory string ends with a path separator.
csv_path = str(Path(output_dir) / f"transcripts_{timestamp}.csv")
txt_path = str(Path(output_dir) / f"corpus_text_{timestamp}.txt")

# Name the columns explicitly so the CSV keeps its header row even when no
# file was transcribed (an empty list would otherwise give a column-less frame).
df = pd.DataFrame(results, columns=["filename", "text", "language"])
df.to_csv(csv_path, index=False)

# Export only transcript text to plain .txt file, one transcript per line
with open(txt_path, "w", encoding="utf-8") as f:
    f.writelines(row["text"] + "\n" for row in results)

print("✅ Transcription complete.")
print(f"📝 CSV saved to: {csv_path}")
print(f"📄 Plain text saved to: {txt_path}")


🔤 What This Notebook Produces
`transcripts_YYYYMMDD_HHMM.csv`
→ A CSV file with one row per recording and the columns `filename`, `text` (the transcript), and `language` (the language reported by Whisper).

`corpus_text_YYYYMMDD_HHMM.txt`
→ A clean plain-text file with one transcript per line, ready for inclusion in LLM fine-tuning datasets.

🧾 Recommendations for Ethical Audio Use
Add a consent_form_template.md in ethics-protocols/ for contributors.

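Use the `language` column to filter outputs for only Kanien'kéha or your target language. Whisper can only report languages from its own supported list, so check that your target language is on that list before relying on this filter.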

Encourage Elders or fluent speakers to review transcripts before inclusion in training sets.

