<a href="https://colab.research.google.com/github/nattaran/health-tequity-case-nasrin/blob/main/VoicePipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

# Mount Google Drine

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Required Packages

In [None]:
!pip install -r "/content/drive/MyDrive/health-tequity-case/requirements.txt"



Collecting git+https://github.com/openai/whisper.git (from -r /content/drive/MyDrive/health-tequity-case/requirements.txt (line 15))
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-98zsrgsx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-98zsrgsx
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python>=0.2.0 (from -r /content/drive/MyDrive/health-tequity-case/requirements.txt (line 18))
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting jiwer>=3.0.3 (from -r /content/drive/MyDrive/health-tequity-case/requirements.txt (line 24))
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting python-Levenshtein>=0.25.0 (from -r /co

# Load OpenAI API Key (From secretes)

In [None]:
from google.colab import userdata
api_key = userdata.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Add OPENAI_API_KEY in the Secrets panel (left sidebar, key icon).")

In [None]:
!find /content/drive/MyDrive/health-tequity-case -name "synthetic_bp_one_person.csv"


/content/drive/MyDrive/health-tequity-case/Data/BloodPressure/synthetic_bp_one_person.csv


In [None]:
import os
if not os.path.exists("/content/vosk_models/vosk-model-small-es-0.42"):
    !mkdir -p /content/vosk_models
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip -O /content/vosk_models/vosk-model-small-es.zip
    !unzip -q /content/vosk_models/vosk-model-small-es.zip -d /content/vosk_models/

# Define Paths and Create Folders

In [None]:
import json, re, pandas as pd, whisper, Levenshtein
from openai import OpenAI
from jiwer import wer, mer, wil, process_words
import warnings
warnings.filterwarnings("ignore")
import os

BASE_PATH = "/content/drive/MyDrive/health-tequity-case"

# --- Define key folders ---
AUDIO_INPUT_FOLDER = os.path.join(BASE_PATH, "Input_Audio_Files")        # Spanish question audio files
AUDIO_OUTPUT_FOLDER = os.path.join(BASE_PATH, "Data", "audio_out")       # Spanish TTS answers
CSV_OUTPUT_FOLDER = os.path.join(BASE_PATH, "Data", "csv_results")       # WER, CER, SER + pipeline outputs
BP_DATA_FOLDER = os.path.join(BASE_PATH, "Data", "BloodPressure")        # Blood pressure dataset

# --- Create required folders if they don‚Äôt exist ---
for folder in [AUDIO_OUTPUT_FOLDER, CSV_OUTPUT_FOLDER, BP_DATA_FOLDER]:
    os.makedirs(folder, exist_ok=True)

# --- Validate Input Audio Folder ---
if not os.path.exists(AUDIO_INPUT_FOLDER):
    raise FileNotFoundError(f"‚ùå Input folder not found: {AUDIO_INPUT_FOLDER}")

# --- Collect available audio files ---
audio_files = [f for f in os.listdir(AUDIO_INPUT_FOLDER) if f.lower().endswith(('.wav', '.mp3', '.m4a'))]
if not audio_files:
    raise ValueError(f"‚ùå No audio files found in {AUDIO_INPUT_FOLDER}")

print(f"‚úÖ Found {len(audio_files)} Spanish audio file(s): {audio_files}")

# --- Blood Pressure dataset check ---
csv_path = os.path.join(BP_DATA_FOLDER, "synthetic_bp_one_person.csv")

if not os.path.exists(csv_path):
    print(f"‚ö†Ô∏è Blood pressure dataset not found at:\n   {csv_path}")
    print("üëâ Please upload your synthetic_bp_one_person.csv to this folder before running the pipeline.")
else:
    print(f"‚úÖ Found blood pressure dataset: {csv_path}")

# --- Initialize OpenAI client ---
client = OpenAI(api_key=api_key)
print("‚úÖ OpenAI client initialized successfully.")


‚úÖ Found 6 Spanish audio file(s): ['q2_es.wav', 'q1_es.wav', 'q4_es.wav', 'q3_es.wav', 'q6_es.wav', 'q5_es.wav']
‚úÖ Found blood pressure dataset: /content/drive/MyDrive/health-tequity-case/Data/BloodPressure/synthetic_bp_one_person.csv
‚úÖ OpenAI client initialized successfully.


# ASR Transcrioption Generation Using openAI Whisper Model
**Audio -> Transcription -> English Transcription**

In [None]:
def transcribe_spanish_audio(model, audio_path):
    print(f"üéß Transcribing: {audio_path}")
    result = model.transcribe(audio_path, language="spanish", task="transcribe", verbose=False)
    return result["text"].strip(), result["language"]

def translate_spanish_to_english(spanish_text: str) -> str:
    """Translate Spanish transcription to English."""
    prompt = f"Translate the following Spanish medical question into clear English:\n\n{spanish_text}"
    result = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    return result.choices[0].message.content.strip()

def process_and_translate_audio(audio_folder, audio_files, output_csv):
    model = whisper.load_model("base")
    all_results = []

    print("\nüéØ STARTING SPANISH TRANSCRIPTION + TRANSLATION\n" + "="*60)
    for i, audio_file in enumerate(audio_files, 1):
        audio_path = os.path.join(audio_folder, audio_file)
        if not os.path.exists(audio_path):
            print(f"‚ö†Ô∏è {audio_file} not found, skipping...")
            continue

        spanish_text, detected_lang = transcribe_spanish_audio(model, audio_path)
        english_text = translate_spanish_to_english(spanish_text)

        all_results.append({
            "audio_file": audio_file,
            "spanish_transcription": spanish_text,
            "english_translation": english_text,
            "language_detected": detected_lang
        })

        print(f"\n[{i}] {audio_file}")
        print(f"üá™üá∏ {spanish_text}")
        print(f"üá¨üáß {english_text}")

    df = pd.DataFrame(all_results)
    df.to_csv(output_csv, index=False)
    print(f"\n‚úÖ Transcriptions + translations saved to {output_csv}")
    return df


# ASR Evaluation (WER, CER, SER)


In [None]:
def compute_cer(reference: str, hypothesis: str) -> float:
    reference, hypothesis = reference.strip(), hypothesis.strip()
    if not reference:
        return 1.0 if hypothesis else 0.0
    return Levenshtein.distance(reference, hypothesis) / len(reference)

def compute_sentence_error(reference: str, hypothesis: str) -> int:
    return 0 if reference.strip() == hypothesis.strip() else 1

def evaluate_asr_performance(ground_truth_csv, transcribed_csv, output_csv):
    gt_df = pd.read_csv(ground_truth_csv)
    tr_df = pd.read_csv(transcribed_csv)
    gt_df.columns = [c.lower().strip() for c in gt_df.columns]
    tr_df.columns = [c.lower().strip() for c in tr_df.columns]
    df = pd.merge(gt_df, tr_df, on="audio_file", how="inner")

    results = []
    print(f"\nüéØ Evaluating {len(df)} files for ASR performance...\n")
    for _, row in df.iterrows():
        ref, hyp = str(row["ground_truth"]), str(row["spanish_transcription"])
        m = process_words(ref, hyp)
        wer_score = round(m.wer, 4)
        subs, dels, ins = m.substitutions, m.deletions, m.insertions
        cer = round(compute_cer(ref, hyp), 4)
        ser = compute_sentence_error(ref, hyp)
        results.append({
            "audio_file": row["audio_file"],
            "WER": wer_score, "Substitutions": subs,
            "Deletions": dels, "Insertions": ins,
            "CER": cer, "SER": ser
        })
        print(f"üéß {row['audio_file']} ‚Üí WER: {wer_score}, CER: {cer}, SER: {ser}")

    out_df = pd.DataFrame(results)
    out_df.to_csv(output_csv, index=False)
    print(f"\n‚úÖ ASR metrics saved to: {output_csv}")
    return out_df

# *GPT Data Analysis + Translation + TTS*

In [None]:
# ================================================================
# 3Ô∏è‚É£ GPT DATA ANALYSIS + TRANSLATION + TTS
# ================================================================
SYSTEM = """
You are a careful data analyst.
You receive a synthetic blood pressure dataset with columns: date, age, sex, systolic, diastolic.
Do ALL analysis yourself using ONLY the CSV provided.
Answer questions like: daily readings, averages, trends, comparisons, etc.
Return JSON:
{ "answer": "<English answer>", "computed_fields": { "numeric values used" } }
"""

def ask_gpt(question_en, csv_block):
    user = f"CSV data:\n{csv_block}\n\nQUESTION:\n{question_en}"
    resp = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[{"role": "system", "content": SYSTEM}, {"role": "user", "content": user}]
    ).choices[0].message.content
    clean = re.sub(r"^```json|```$", "", resp.strip(), flags=re.M | re.I)
    start, end = clean.find("{"), clean.rfind("}")
    return json.loads(clean[start:end+1])

def translate_to_spanish(english_text):
    prompt = f"Translate this English medical answer into clear, neutral Spanish:\n{english_text}"
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content.strip()

def text_to_speech_spanish(text, filename, voice="alloy"):
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts", voice=voice, input=text
    ) as response:
        response.stream_to_file(filename)
    print(f"üîä Saved Spanish audio: {filename}")
    return filename


# Main PIPELINE

In [None]:
# ================================================================
# 4Ô∏è‚É£ MAIN PIPELINE
# ================================================================
def run_full_pipeline(csv_path, audio_folder, audio_files):
    # Step 1 ‚Äî Transcribe and Translate Spanish Audio
    trans_csv = os.path.join(CSV_OUTPUT_FOLDER, "audio_translations.csv")
    trans_df = process_and_translate_audio(audio_folder, audio_files, trans_csv)

    # Step 2 ‚Äî Evaluate ASR (WER, CER, SER)
    gt_csv = os.path.join(audio_folder, "ground_truth.csv")
    asr_csv = os.path.join(CSV_OUTPUT_FOLDER, "asr_metrics.csv")
    asr_df = evaluate_asr_performance(gt_csv, trans_csv, asr_csv)

    # Step 3 ‚Äî Load Blood Pressure Data
    df_bp = pd.read_csv(csv_path)
    csv_block = df_bp.to_csv(index=False)

    results = []
    for i, row in trans_df.iterrows():
        q_num = i + 1
        q_en = row["english_translation"]
        print(f"\nüîπ Q{q_num}: {q_en}")

        try:
            ans = ask_gpt(q_en, csv_block)
            ans_en = ans.get("answer", "").strip()
            ans_es = translate_to_spanish(ans_en)

            audio_file = os.path.join(AUDIO_OUTPUT_FOLDER, f"answer_{q_num}_es.wav")
            text_to_speech_spanish(ans_es, audio_file)

            results.append({
                "question_number": q_num,
                "audio_file_in": row["audio_file"],
                "spanish_question": row["spanish_transcription"],
                "english_question": q_en,
                "english_answer": ans_en,
                "spanish_answer": ans_es,
                "audio_answer_file": audio_file,
                "computed_fields": json.dumps(ans.get("computed_fields", {}))
            })
            print(f"‚úÖ Completed Q{q_num}")

        except Exception as e:
            print(f"‚ùå Error Q{q_num}: {e}")

    # Step 4 ‚Äî Save Final Results
    final_csv = os.path.join(CSV_OUTPUT_FOLDER, "final_pipeline_results.csv")
    pd.DataFrame(results).to_csv(final_csv, index=False)
    print(f"\n‚úÖ All results saved to {final_csv}")
    return results

In [None]:
# ================================================================
# 5Ô∏è‚É£ RUN
# ================================================================
csv_path = os.path.join(BASE_PATH, BP_DATA_FOLDER, "synthetic_bp_one_person.csv")
run_full_pipeline(csv_path, AUDIO_INPUT_FOLDER, audio_files)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 139M/139M [00:13<00:00, 10.6MiB/s]



üéØ STARTING SPANISH TRANSCRIPTION + TRANSLATION
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q2_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 470/470 [00:03<00:00, 138.64frames/s]



[1] q2_es.wav
üá™üá∏ ¬øCu√°les fueron mis valores de presi√≥n arterial durante la √∫ltima semana?
üá¨üáß "What were my blood pressure values over the past week?"
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q1_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 458/458 [00:03<00:00, 145.44frames/s]



[2] q1_es.wav
üá™üá∏ ¬øCu√°les son mis presiones arteriales hist√≥lica y diast√≥lica hoy?
üá¨üáß "What are my systolic and diastolic blood pressures today?"
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q4_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 398/398 [00:02<00:00, 134.59frames/s]



[3] q4_es.wav
üá™üá∏ ¬øCu√°les son los rango normales para una persona como yo?
üá¨üáß What are the normal ranges for someone like me?
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q3_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 369/369 [00:02<00:00, 136.59frames/s]



[4] q3_es.wav
üá™üá∏ ¬øCu√°l es la tendencia de mis valores de presi√≥n arterial?
üá¨üáß What is the trend of my blood pressure values?
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q6_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1574/1574 [00:05<00:00, 300.83frames/s]



[5] q6_es.wav
üá™üá∏ ¬øEn qu√© d√≠a mi presi√≥n arterial excedi√≥ los niveles normales? Compare mi presi√≥n arterial promedio en la primera semana y la √∫ltima semana de este mes. ¬øCu√°l fue mi presi√≥n arterial diast√≥lica m√°s baja este mes?
üá¨üáß On what day did my blood pressure exceed normal levels? Compare my average blood pressure in the first week and the last week of this month. What was my lowest diastolic blood pressure this month?
üéß Transcribing: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/q5_es.wav


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 328/328 [00:02<00:00, 120.66frames/s]



[6] q5_es.wav
üá™üá∏ ¬øCu√°l era mi presi√≥n arterial el 10 de octubre?
üá¨üáß What was my blood pressure on October 10?

‚úÖ Transcriptions + translations saved to /content/drive/MyDrive/health-tequity-case/Data/csv_results/audio_translations.csv

üéØ Evaluating 6 files for ASR performance...

üéß q1_es.wav ‚Üí WER: 0.1111, CER: 0.0156, SER: 1
üéß q2_es.wav ‚Üí WER: 0.0, CER: 0.0, SER: 0
üéß q3_es.wav ‚Üí WER: 0.0, CER: 0.0, SER: 0
üéß q4_es.wav ‚Üí WER: 0.1, CER: 0.0175, SER: 1
üéß q5_es.wav ‚Üí WER: 0.0, CER: 0.0, SER: 0
üéß q6_es.wav ‚Üí WER: 0.0, CER: 0.0, SER: 0

‚úÖ ASR metrics saved to: /content/drive/MyDrive/health-tequity-case/Data/csv_results/asr_metrics.csv

üîπ Q1: "What were my blood pressure values over the past week?"
üîä Saved Spanish audio: /content/drive/MyDrive/health-tequity-case/Data/audio_out/answer_1_es.wav
‚úÖ Completed Q1

üîπ Q2: "What are my systolic and diastolic blood pressures today?"
üîä Saved Spanish audio: /content/drive/MyDrive/health-

[{'question_number': 1,
  'audio_file_in': 'q2_es.wav',
  'spanish_question': '¬øCu√°les fueron mis valores de presi√≥n arterial durante la √∫ltima semana?',
  'english_question': '"What were my blood pressure values over the past week?"',
  'english_answer': 'Over the past week, your blood pressure readings were as follows: On 2025-10-09, your systolic was 160 mmHg and diastolic was 102 mmHg. On 2025-10-10, your systolic was 160 mmHg and diastolic was 101 mmHg. On 2025-10-11, your systolic was 152 mmHg and diastolic was 94 mmHg. On 2025-10-12, your systolic was 157 mmHg and diastolic was 98 mmHg. On 2025-10-13, your systolic was 144 mmHg and diastolic was 100 mmHg. On 2025-10-14, your systolic was 145 mmHg and diastolic was 91 mmHg. On 2025-10-15, your systolic was 124 mmHg and diastolic was 81 mmHg.',
  'spanish_answer': 'Durante la semana pasada, sus lecturas de presi√≥n arterial fueron las siguientes: El 9 de octubre de 2025, su presi√≥n sist√≥lica fue de 160 mmHg y su presi√≥n dia

In [None]:
# !pip install vosk pydub
# !apt-get install ffmpeg


In [None]:
!mkdir -p /content/vosk_models
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-es-0.42.zip -O /content/vosk_models/vosk-model-small-es.zip
!unzip -q /content/vosk_models/vosk-model-small-es.zip -d /content/vosk_models/


In [None]:
# ================================================================
# 6Ô∏è‚É£ OUTPUT AUDIO (TTS) ASR EVALUATION USING VOSK
# ================================================================
import os, json, wave
import pandas as pd
from vosk import Model, KaldiRecognizer
from jiwer import process_words
import Levenshtein
from pydub import AudioSegment

# ================================================================
# üîß Audio Conversion Helper
# ================================================================
def convert_to_wav(input_path, output_path, target_sr=16000):
    """
    Converts any audio file (MP3, M4A, WAV) to mono 16kHz RIFF WAV for Vosk.
    """
    try:
        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(target_sr).set_channels(1)
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"‚ùå Failed to convert {input_path}: {e}")
        return None

# ================================================================
# üîä Vosk Transcription
# ================================================================
def transcribe_with_vosk(audio_path, model_path="/content/vosk_models/vosk-model-small-es-0.42"):
    """
    Transcribes a Spanish audio file using Vosk offline ASR model.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError("‚ùå Vosk model not found. Please download and unzip it first.")

    model = Model(model_path)
    wf = wave.open(audio_path, "rb")
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() not in [16000, 22050, 44100]:
        raise ValueError(f"‚ö†Ô∏è Unsupported audio format in {audio_path}. Convert to mono 16kHz WAV first.")

    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)

    result_text = ""
    while True:
        data = wf.readframes(4000)
        if len(data) == 0:
            break
        if rec.AcceptWaveform(data):
            part = json.loads(rec.Result())
            result_text += part.get("text", "") + " "
    part = json.loads(rec.FinalResult())
    result_text += part.get("text", "")
    wf.close()

    return result_text.strip()

# ================================================================
# üßÆ Evaluate TTS ‚Üí Text using Vosk ASR
# ================================================================
def evaluate_output_asr(
    tts_csv,
    output_csv=os.path.join(CSV_OUTPUT_FOLDER, "output_asr_metrics.csv"),
    model_path="/content/vosk_models/vosk-model-small-es-0.42"
):
    """
    Evaluates TTS Spanish audio output using Vosk ASR model.
    Compares transcribed text vs. ground truth Spanish answers.
    """
    if not os.path.exists(tts_csv):
        raise FileNotFoundError(f"‚ùå Missing final results CSV: {tts_csv}")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"‚ùå Vosk model not found at {model_path}. Download before running.")

    df = pd.read_csv(tts_csv)
    results = []

    print("\nüéØ Evaluating TTS ‚Üí Spanish ASR transcription quality\n" + "="*60)
    for i, row in df.iterrows():
        gt = str(row["spanish_answer"])
        audio_file = row["audio_answer_file"]
        if not os.path.exists(audio_file):
            print(f"‚ö†Ô∏è Missing audio: {audio_file}")
            continue

        try:
            # Convert to proper WAV
            tmp_wav = os.path.join(AUDIO_OUTPUT_FOLDER, f"tmp_{i}.wav")
            converted_path = convert_to_wav(audio_file, tmp_wav)
            if not converted_path:
                print(f"‚ö†Ô∏è Could not convert {audio_file}, skipping...")
                continue

            # Transcribe with Vosk
            hyp = transcribe_with_vosk(converted_path, model_path)

            # Compute metrics
            measures = process_words(gt, hyp)
            wer_score = round(measures.wer, 4)
            subs, dels, ins = measures.substitutions, measures.deletions, measures.insertions
            cer = round(Levenshtein.distance(gt, hyp) / max(len(gt), 1), 4)
            ser = 0 if gt.strip() == hyp.strip() else 1

            results.append({
                "audio_file": os.path.basename(audio_file),
                "ground_truth": gt,
                "vosk_transcription": hyp,
                "WER": wer_score,
                "Substitutions": subs,
                "Deletions": dels,
                "Insertions": ins,
                "CER": cer,
                "SER": ser
            })

            print(f"üéß {os.path.basename(audio_file)} ‚Üí WER={wer_score}, CER={cer}, SER={ser}")

            # Clean up temp file
            os.remove(converted_path)

        except Exception as e:
            print(f"‚ùå Error processing {audio_file}: {e}")

    out_df = pd.DataFrame(results)
    out_df.to_csv(output_csv, index=False)
    print(f"\n‚úÖ Output ASR evaluation saved to: {output_csv}")
    return out_df


In [None]:
# Evaluate the Spanish TTS outputs using Vosk ASR
final_results_csv = os.path.join(CSV_OUTPUT_FOLDER, "final_pipeline_results.csv")
output_asr_metrics = evaluate_output_asr(final_results_csv)
display(output_asr_metrics.head())




üéØ Evaluating TTS ‚Üí Spanish ASR transcription quality
üéß answer_1_es.wav ‚Üí WER=1.0629, CER=0.9241, SER=1
üéß answer_2_es.wav ‚Üí WER=0.8889, CER=0.7604, SER=1
üéß answer_3_es.wav ‚Üí WER=0.5, CER=0.3179, SER=1
üéß answer_4_es.wav ‚Üí WER=0.3462, CER=0.1712, SER=1
üéß answer_5_es.wav ‚Üí WER=0.7843, CER=0.7692, SER=1
üéß answer_6_es.wav ‚Üí WER=0.9231, CER=1.0455, SER=1

‚úÖ Output ASR evaluation saved to: /content/drive/MyDrive/health-tequity-case/Data/csv_results/output_asr_metrics.csv


Unnamed: 0,audio_file,ground_truth,vosk_transcription,WER,Substitutions,Deletions,Insertions,CER,SER
0,answer_1_es.wav,"Durante la semana pasada, sus lecturas de pres...",durante la semana pasada sus lecturas de presi...,1.0629,92,1,76,0.9241,1
1,answer_2_es.wav,Su presi√≥n arterial sist√≥lica hoy es de 110 mm...,su presi√≥n arterial hist√≥rica hoy es de ciento...,0.8889,7,0,9,0.7604,1
2,answer_3_es.wav,"Para una mujer de 68 a√±os, se considera que la...",para una mujer de sesenta y ocho a√±os se consi...,0.5,15,0,9,0.3179,1
3,answer_4_es.wav,Los valores de la presi√≥n arterial muestran un...,los valores de la presi√≥n arterial muestran un...,0.3462,31,0,5,0.1712,1
4,answer_5_es.wav,Su presi√≥n arterial super√≥ los niveles normale...,su presi√≥n arterial super√≥ los niveles normale...,0.7843,39,0,41,0.7692,1


In [None]:
# ================================================================
# 6Ô∏è‚É£ OUTPUT AUDIO (TTS) ASR EVALUATION USING WHISPER
# ================================================================
import whisper, re, unicodedata, Levenshtein, pandas as pd
from jiwer import process_words

def normalize_text(text):
    """Lowercase, strip accents, remove punctuation for fair WER/CER."""
    text = text.lower()
    text = ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def evaluate_output_asr_whisper(
    tts_csv,
    output_csv=os.path.join(CSV_OUTPUT_FOLDER, "output_asr_metrics_whisper.csv"),
    model_size="base"
):
    """
    Evaluate the Spanish TTS audios using Whisper.
    Computes WER, CER, and SER against ground truth Spanish answers.
    """
    if not os.path.exists(tts_csv):
        raise FileNotFoundError(f"‚ùå Missing final results CSV: {tts_csv}")

    print(f"üéØ Loading Whisper ({model_size}) model ...")
    model = whisper.load_model(model_size)

    df = pd.read_csv(tts_csv)
    results = []

    print("\nüéß Evaluating Spanish TTS output audios\n" + "="*60)
    for i, row in df.iterrows():
        gt = str(row["spanish_answer"]).strip()
        audio_file = row["audio_answer_file"]
        if not os.path.exists(audio_file):
            print(f"‚ö†Ô∏è Missing audio file: {audio_file}")
            continue

        try:
            # Transcribe with Whisper
            result = model.transcribe(audio_file, language="es", task="transcribe", verbose=False)
            hyp = result["text"].strip()

            # Normalize both texts
            gt_norm = normalize_text(gt)
            hyp_norm = normalize_text(hyp)

            # Compute metrics
            measures = process_words(gt_norm, hyp_norm)
            wer_score = round(measures.wer, 4)
            subs, dels, ins = measures.substitutions, measures.deletions, measures.insertions
            cer = round(Levenshtein.distance(gt_norm, hyp_norm) / max(len(gt_norm), 1), 4)
            ser = 0 if gt_norm == hyp_norm else 1

            results.append({
                "audio_file": os.path.basename(audio_file),
                "ground_truth": gt,
                "whisper_transcription": hyp,
                "WER": wer_score,
                "Substitutions": subs,
                "Deletions": dels,
                "Insertions": ins,
                "CER": cer,
                "SER": ser
            })

            print(f"‚úÖ {os.path.basename(audio_file)} ‚Üí WER={wer_score}, CER={cer}, SER={ser}")

        except Exception as e:
            print(f"‚ùå Error processing {audio_file}: {e}")

    out_df = pd.DataFrame(results)
    out_df.to_csv(output_csv, index=False)
    print(f"\n‚úÖ Whisper ASR evaluation saved to: {output_csv}")
    return out_df


In [21]:
final_results_csv = os.path.join(CSV_OUTPUT_FOLDER, "final_pipeline_results.csv")
output_asr_metrics = evaluate_output_asr_whisper(final_results_csv, model_size="base")
display(output_asr_metrics.head())


üéØ Loading Whisper (base) model ...

üéß Evaluating Spanish TTS output audios


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7260/7260 [01:54<00:00, 63.34frames/s]


‚úÖ answer_1_es.wav ‚Üí WER=0.327, CER=0.2259, SER=1


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 885/885 [00:03<00:00, 229.36frames/s]


‚úÖ answer_2_es.wav ‚Üí WER=0.6111, CER=0.4526, SER=1


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2112/2112 [00:07<00:00, 299.71frames/s]


‚úÖ answer_3_es.wav ‚Üí WER=0.1875, CER=0.1569, SER=1


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4701/4701 [00:14<00:00, 323.16frames/s]


‚úÖ answer_4_es.wav ‚Üí WER=0.1058, CER=0.023, SER=1


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5145/5145 [00:15<00:00, 334.20frames/s]


‚úÖ answer_5_es.wav ‚Üí WER=0.1863, CER=0.2265, SER=1


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 780/780 [00:03<00:00, 244.29frames/s]

‚úÖ answer_6_es.wav ‚Üí WER=0.3846, CER=0.3333, SER=1

‚úÖ Whisper ASR evaluation saved to: /content/drive/MyDrive/health-tequity-case/Data/csv_results/output_asr_metrics_whisper.csv





Unnamed: 0,audio_file,ground_truth,whisper_transcription,WER,Substitutions,Deletions,Insertions,CER,SER
0,answer_1_es.wav,"Durante la semana pasada, sus lecturas de pres...","Durante la semana pasada, sus lecturas de pres...",0.327,26,0,26,0.2259,1
1,answer_2_es.wav,Su presi√≥n arterial sist√≥lica hoy es de 110 mm...,Supresi√≥n arterial cist√≥lica hoy es de 110 mil...,0.6111,5,2,4,0.4526,1
2,answer_3_es.wav,"Para una mujer de 68 a√±os, se considera que la...","Para una mijer de 68 a√±os, se considera que la...",0.1875,4,0,5,0.1569,1
3,answer_4_es.wav,Los valores de la presi√≥n arterial muestran un...,Los valores de la presi√≥n arterial muestran un...,0.1058,6,0,5,0.023,1
4,answer_5_es.wav,Su presi√≥n arterial super√≥ los niveles normale...,Supresi√≥n arterial super√≥ los niveles normales...,0.1863,11,1,7,0.2265,1
