<a href="https://colab.research.google.com/github/nattaran/HealthTequity-LLM/blob/main/Generate_Spanish_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎧 Generate Spanish Audio for Medical Questions  
### 🩺 HealthTequity Case Study – Synthetic Dataset Creation

This notebook supports the **HealthTequity LLM pipeline** by:

- ✅ Translating **English medical questions** into **Spanish**
- 🔊 Generating **spoken Spanish audio** (`.wav`) for each question using `gTTS`
- 🗂 Saving the Spanish transcription and audio filename in a CSV (`ground_truth.csv`) for later **ASR evaluation**

---

### 📁 Folder Structure

All files are stored under your project root at:
```plaintext
/content/drive/MyDrive/HealthTequity-LLM/
└── data/
    ├── Spanish_audio/   # generated audio files
    └── synthetic_csv/   # ground_truth.csv
```


# **📘 Colab Notebook Title: Generate_Spanish_Audio.ipynb**

In [None]:
# 📦 Install dependencies (run once per session).
# gTTS provides the text-to-speech class, deep-translator provides
# GoogleTranslator, and pandas is used to write the ground-truth CSV.
# `-q` keeps pip's output quiet.
!pip install -q gTTS deep-translator pandas


In [None]:
# 📂 Mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive/ are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')



In [None]:
# ============================================================
# 🎧 Generate Spanish Audio for HealthTequity Case Study
# ============================================================
# Author: Nasrin Attaran
# Description:
#   Translates English questions into Spanish, generates Spanish
#   audio (.wav), and saves a single ground_truth.csv file in
#   /data/synthetic_csv for ASR evaluation.
# ============================================================

# --- Required libraries (gTTS, deep-translator, pandas) are installed in the first cell ---


# --- Imports ---
from gtts import gTTS
from deep_translator import GoogleTranslator
import os, pandas as pd

# ------------------------------------------------------------
# 📁 Project Folders
# ------------------------------------------------------------
# Root of the project on the mounted Google Drive.
PROJECT_ROOT = "/content/drive/MyDrive/HealthTequity-LLM"
# Destination for the generated Spanish audio files.
AUDIO_OUTPUT_FOLDER = os.path.join(PROJECT_ROOT, "data/Spanish_audio")
# Destination for ground_truth.csv (audio filename + Spanish transcription).
GROUNDTRUTH_FOLDER  = os.path.join(PROJECT_ROOT, "data/synthetic_csv")

# ------------------------------------------------------------
# ⚙️ Function Definition
# ------------------------------------------------------------
def generate_spanish_audio_from_english(
    english_questions: list[str],
    audio_output_folder: str,
    groundtruth_folder: str,
    prefix: str = "q"
) -> pd.DataFrame:
    """
    Translate English questions into Spanish, synthesize Spanish audio,
    and save 'ground_truth.csv' (no generated_questions.csv).

    Existing '.wav' files in ``audio_output_folder`` are deleted before
    generation starts. Questions are processed best-effort: a question
    whose translation or synthesis fails is reported on stdout and skipped,
    and any audio file left behind by the failed attempt is removed so the
    folder only contains files that have a ground-truth row.

    Args:
        english_questions: English question strings.
        audio_output_folder: Folder to save generated audio files.
        groundtruth_folder: Folder to save ground_truth.csv.
        prefix: Prefix for generated audio files (default 'q'). Files are
            named '<prefix><1-based index>_es.wav'.

    Returns:
        DataFrame with columns 'audio_file' and 'ground_truth' (the Spanish
        transcription), one row per successfully generated file. The
        columns are present even when no file was generated.
    """
    # Fixed schema so that the CSV always carries a header row, even when
    # every question failed or the input was empty.
    columns = ["audio_file", "ground_truth"]

    # ============================================================
    # 🧹 Step 1 – Prepare output folders
    # ============================================================
    os.makedirs(audio_output_folder, exist_ok=True)
    os.makedirs(groundtruth_folder, exist_ok=True)

    # Clean only .wav files in audio folder
    for name in os.listdir(audio_output_folder):
        if name.endswith(".wav"):
            os.remove(os.path.join(audio_output_folder, name))
    print(f"📁 Audio folder ready: {audio_output_folder}")
    print(f"📁 Ground truth folder ready: {groundtruth_folder}")

    # ============================================================
    # 🌍 Step 2 – Translate English → Spanish & generate audio
    # ============================================================
    translator = None  # created on first use; nothing to build for empty input
    results = []

    for i, q_en in enumerate(english_questions, 1):
        if translator is None:
            # Outside the try block on purpose: a failure to construct the
            # translator is not a per-question error and should propagate.
            translator = GoogleTranslator(source="en", target="es")

        audio_file = f"{prefix}{i}_es.wav"
        audio_path = os.path.join(audio_output_folder, audio_file)

        try:
            q_es = translator.translate(q_en)
            # NOTE(review): gTTS is documented to write MP3-encoded audio;
            # the '.wav' extension does not convert it. Confirm downstream
            # readers detect the real format, or add a conversion step.
            gTTS(text=q_es, lang="es").save(audio_path)
        except Exception as e:
            # Deliberate best-effort: report and move on to the next question.
            print(f"⚠️ Error generating audio for question {i}: {e}")
            # Step 1 removed every .wav, so a file at this path can only come
            # from the failed attempt above; drop it to keep the folder in
            # sync with the ground truth.
            try:
                os.remove(audio_path)
            except OSError:
                pass  # nothing was written, or it cannot be removed
        else:
            results.append({"audio_file": audio_file, "ground_truth": q_es})
            print(f"🎧 {audio_file} → {q_es}")

    # ============================================================
    # 🧾 Step 3 – Save Ground Truth Only
    # ============================================================
    gt_csv = os.path.join(groundtruth_folder, "ground_truth.csv")
    ground_truth = pd.DataFrame(results, columns=columns)
    # utf-8-sig writes a BOM at the start of the file.
    ground_truth.to_csv(gt_csv, index=False, encoding="utf-8-sig")

    # ============================================================
    # ✅ Step 4 – Summary
    # ============================================================
    print(f"\n✅ {len(results)} Spanish audio files generated.")
    print(f"📄 Ground truth CSV saved to: {gt_csv}")

    return ground_truth


# ------------------------------------------------------------
# 📋 Sample English Questions
# ------------------------------------------------------------
# Sample blood-pressure questions in English. Each entry is translated to
# Spanish and synthesized to one audio file; audio filenames use the 1-based
# position in this list.
english_questions = [
    "What are my systolic and diastolic blood pressures today?",
    "What were my blood pressure readings over the last week?",
    "What is the overall trend of my blood pressure this month?",
    "What are the normal blood pressure ranges for someone my age?",
    "What was my blood pressure on October 10th?",
    "What was my systolic blood pressure on October 12th?",
    "What was my diastolic blood pressure on September 30th?",
    "On which day did my systolic pressure exceed 140 mm Hg?",
    "Compare my average blood pressure from the first week to the last week of this month.",
    "What was the lowest diastolic blood pressure recorded this month?",
    "Show me the highest and lowest systolic values recorded so far.",
    "What is the max and min of diastolic and systolic blood pressure this month?"
]

# ------------------------------------------------------------
# 🚀 Run the Audio Generation Pipeline
# ------------------------------------------------------------
# Run the pipeline on the sample questions, writing audio and the CSV to the
# project folders configured above; the function returns the ground-truth rows.
df_gt = generate_spanish_audio_from_english(
    english_questions=english_questions,
    audio_output_folder=AUDIO_OUTPUT_FOLDER,
    groundtruth_folder=GROUNDTRUTH_FOLDER,
)

# Last expression of the cell: the notebook renders the first rows.
df_gt.head()
