In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI
import json

# Load API keys
load_dotenv()

# Initialize clients
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Paths
DATA_DIR = Path("../data/sample_audio")
REFERENCE_FILE = Path("../data/reference_phrases.json")

In [2]:
def transcribe_whisper(audio_path, language="zh"):
    """
    Transcribe audio using OpenAI Whisper
    
    Args:
        audio_path: Path to audio file
        language: Language code (zh for Mandarin)
    
    Returns:
        dict with transcription and metadata
    """
    with open(audio_path, "rb") as audio_file:
        response = openai_client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            language=language,
            response_format="verbose_json"  # Get timestamps too
        )
    
    return {
        'text': response.text,
        'language': response.language,
        'duration': response.duration,
        'segments': response.segments if hasattr(response, 'segments') else None
    }

Test it out

In [3]:
test_file = DATA_DIR / "ni_hao_01.wav"

if test_file.exists():
    print(f"Transcribing: {test_file.name}")
    result = transcribe_whisper(test_file)
    
    print(f"\nTranscription: {result['text']}")
    print(f"Duration: {result['duration']:.2f}s")
    print(f"Language detected: {result['language']}")
else:
    print(f"File not found: {test_file}")

File not found: ../data/sample_audio/ni_hao_01.wav


In [None]:
# Incorrectly transcribes as `ma1`
# Doesn't expect someone to just say ma4 on its own?
test_file = DATA_DIR / "ma_scold.wav" 

if test_file.exists():
    print(f"Transcribing: {test_file.name}")
    result = transcribe_whisper(test_file)
    
    print(f"\nTranscription: {result['text']}")
    print(f"Duration: {result['duration']:.2f}s")
    print(f"Language detected: {result['language']}")
else:
    print(f"File not found: {test_file}")

Transcribing: ma_scold.wav

Transcription: 妈
Duration: 3.00s
Language detected: chinese


### Batch transcribe all recordings using Whisper

In [None]:
import pandas as pd
from opencc import OpenCC


def transcribe_all_recordings():
    """Transcribe all WAV files and store results"""
    
    results = []
    wav_files = sorted(DATA_DIR.glob("*.wav"))
    cc = OpenCC('t2s')  # t2s = traditional to simplified
    
    print(f"Found {len(wav_files)} audio files\n")
    
    for audio_file in wav_files:
        print(f"Processing: {audio_file.name}...", end=" ")
        
        try:
            result = transcribe_whisper(audio_file)
            results.append({
                'filename': audio_file.name,
                # sometimes Whisper returns traditional characters
                # in the transcription; we convert to simplified
                'transcription': cc.convert(result['text']),
                'duration': result['duration'],
                'language': result['language']
            })
            print("✓")
            
        except Exception as e:
            print(f"✗ Error: {e}")
            results.append({
                'filename': audio_file.name,
                'transcription': None,
                'error': str(e)
            })
    
    return pd.DataFrame(results)

# Run batch transcription
transcriptions_df = transcribe_all_recordings()
transcriptions_df

Found 10 audio files

Processing: chi_fan.wav... ✓
Processing: jin_tian_tian_qi_hen_hao.wav... ✓
Processing: ma_horse.wav... ✓
Processing: ma_ma.wav... ✓
Processing: ma_scold.wav... ✓
Processing: ni_hao.wav... ✓
Processing: ni_hao_incorrect.wav... ✓
Processing: wo_zai_xue_zhong_wen.wav... ✓
Processing: xie_xie.wav... ✓
Processing: zhi_dao.wav... ✓


Unnamed: 0,filename,transcription,duration,language
0,chi_fan.wav,吃饭,3.0,chinese
1,jin_tian_tian_qi_hen_hao.wav,今天天气很好,3.0,chinese
2,ma_horse.wav,马,3.0,chinese
3,ma_ma.wav,妈妈,3.0,chinese
4,ma_scold.wav,妈,3.0,chinese
5,ni_hao.wav,你好,3.0,chinese
6,ni_hao_incorrect.wav,Ni Hao,3.0,chinese
7,wo_zai_xue_zhong_wen.wav,我在学中文,3.0,chinese
8,xie_xie.wav,谢谢,3.0,chinese
9,zhi_dao.wav,知道,3.0,chinese


## Compare with correct transcriptions

In [11]:
import pandas as pd

# Load reference phrases
with open(REFERENCE_FILE, 'r', encoding='utf-8') as f:
    reference_data = json.load(f)

# Create DataFrame from reference
reference_df = pd.DataFrame(reference_data['phrases'])

# Extract base filename (without .wav extension) from transcriptions
transcriptions_df['base_filename'] = transcriptions_df['filename'].str.replace('.wav', '')

# Merge on filename
comparison_df = transcriptions_df.merge(
    reference_df[['filename', 'chinese', 'pinyin', 'english', 'notes']],
    left_on='base_filename',
    right_on='filename',
    how='left',
    suffixes=('', '_expected')
)

# Rename for clarity
comparison_df = comparison_df.rename(columns={
    'chinese': 'expected',
    'transcription': 'actual'
})

# Check match
comparison_df['match'] = comparison_df['actual'] == comparison_df['expected']

print("Transcription Accuracy:")
display(comparison_df[['filename', 'actual', 'expected', 'pinyin', 'match']])

# Summary
accuracy = comparison_df['match'].sum() / len(comparison_df) * 100
print(f"\nOverall Accuracy: {accuracy:.1f}%")
print(f"Correct: {comparison_df['match'].sum()}/{len(comparison_df)}")

Transcription Accuracy:


Unnamed: 0,filename,actual,expected,pinyin,match
0,chi_fan.wav,吃饭,吃饭,chī fàn,True
1,jin_tian_tian_qi_hen_hao.wav,今天天气很好,今天天气很好,jīn tiān tiān qì hěn hǎo,True
2,ma_horse.wav,马,马,mǎ,True
3,ma_ma.wav,妈妈,妈妈,mā ma,True
4,ma_scold.wav,妈,骂,mà,False
5,ni_hao.wav,你好,你好,nǐ hǎo,True
6,ni_hao_incorrect.wav,Ni Hao,你好,nǐ hǎo (incorrect tones),False
7,wo_zai_xue_zhong_wen.wav,我在学中文,我在学中文,wǒ zài xué zhōng wén,True
8,xie_xie.wav,谢谢,谢谢,xiè xiè,True
9,zhi_dao.wav,知道,知道,zhī dào,True



Overall Accuracy: 80.0%
Correct: 8/10


In [14]:
transcriptions_df.to_csv("../data/transcriptions_df.csv", index=False)

### Calculate API costs

In [12]:
# Whisper pricing: $0.006 per minute
total_duration = transcriptions_df['duration'].sum()
total_cost = (total_duration / 60) * 0.006

print(f"Total audio processed: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")
print(f"Estimated cost: ${total_cost:.4f}")
print(f"\nCost per recording: ${total_cost/len(transcriptions_df):.4f}")

Total audio processed: 30.00 seconds (0.50 minutes)
Estimated cost: $0.0030

Cost per recording: $0.0003
