<a href="https://colab.research.google.com/github/nattaran/health-tequity-case-nasrin/blob/main/Generate_Spanish_Audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
 !pip install gtts

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/98.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gtts
  Attempting uninstall: click
    Found existing installation: click 8.3.0
    Uninstalling click-8.3.0:
      Successfully uninstalled click-8.3.0
Successfully installed click-8.1.8 gtts-2.5.4


In [7]:
!pip install -q deep-translator

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

#Text -> Spanish Audio Generator (Clean Folder Each Run)


In [8]:
# ================================================================
# 🗣️  ENGLISH → SPANISH AUDIO GENERATION MODULE (Colab + Drive)
# ================================================================

from gtts import gTTS
from deep_translator import GoogleTranslator
import os, pandas as pd

def generate_spanish_audio_from_english(
    english_questions: list[str],
    output_folder: str,
    prefix: str = "q"
):
    """
    Translates each English question into Spanish, generates new Spanish audio files,
    and saves both 'generated_questions.csv' and 'ground_truth.csv' for later pipeline steps.

    Args:
        english_questions: list of English question strings
        output_folder: path to save generated .wav files
        prefix: filename prefix for generated audio files (default 'q')

    Returns:
        DataFrame containing English text, Spanish translation, and audio filenames
    """

    # ============================================================
    # 🧹 Step 1: Create or clean the output folder
    # ============================================================
    if os.path.exists(output_folder):
        print(f"🧹 Cleaning old files in {output_folder} ...")
        for f in os.listdir(output_folder):
            fp = os.path.join(output_folder, f)
            if os.path.isfile(fp):
                os.remove(fp)
    else:
        os.makedirs(output_folder, exist_ok=True)
        print(f"📁 Created new folder: {output_folder}")

    # ============================================================
    # 🌍 Step 2: Translate English → Spanish and generate audio
    # ============================================================
    translator = GoogleTranslator(source="en", target="es")
    results = []

    for i, question_en in enumerate(english_questions, 1):
        try:
            # Translate English → Spanish
            question_es = translator.translate(question_en)

            # Generate audio (Spanish)
            tts = gTTS(text=question_es, lang="es")
            audio_filename = f"{prefix}{i}_es.wav"
            audio_path = os.path.join(output_folder, audio_filename)
            tts.save(audio_path)

            results.append({
                "question_number": i,
                "english_text": question_en,
                "spanish_text": question_es,
                "audio_file": audio_filename
            })

            print(f"🎧 {audio_filename} generated → {question_es}")

        except Exception as e:
            print(f"⚠️ Error generating audio for question {i}: {e}")

    # ============================================================
    # 🧾 Step 3: Save CSV files
    # ============================================================

    # Summary of generated questions
    summary_csv = os.path.join(output_folder, "generated_questions.csv")
    pd.DataFrame(results).to_csv(summary_csv, index=False, encoding="utf-8-sig")

    # Ground truth file for WER evaluation
    gt_df = pd.DataFrame({
        "audio_file": [r["audio_file"] for r in results],
        "ground_truth": [r["spanish_text"] for r in results]
    })
    gt_csv = os.path.join(output_folder, "ground_truth.csv")
    gt_df.to_csv(gt_csv, index=False, encoding="utf-8-sig")

    # ============================================================
    # ✅ Step 4: Display Summary
    # ============================================================
    print(f"\n✅ {len(results)} Spanish audio files generated and saved to: {output_folder}")
    print(f"📄 Summary saved to: {summary_csv}")
    print(f"📄 Ground truth saved to: {gt_csv}")

    return pd.DataFrame(results)


In [10]:
# ============================================================
# 1️⃣ Mount Google Drive
# ============================================================
from google.colab import drive
drive.mount('/content/drive')

# ============================================================
# 2️⃣ Define your folder path in Drive
# ============================================================
AUDIO_INPUT_FOLDER = "/content/drive/MyDrive/health-tequity-case/Input_Audio_Files"

# ============================================================
# 3️⃣ Define your English questions
# ============================================================
english_questions = [
    "What are my systolic and diastolic blood pressures today?",
    "What were my blood pressure values over the last week?",
    "What is the trend of my blood pressure values?",
    "What are the normal ranges for a person like me?",
    "What was my blood pressure on October 10th?",
    "On which day did my blood pressure exceed normal levels?"
    "Compare my average blood pressure in the first week and last week of this month."
    "What was my lowest diastolic blood pressure this month?"


]

# ============================================================
# 4️⃣ Generate Spanish audio files and CSVs
# ============================================================
audio_summary_df = generate_spanish_audio_from_english(
    english_questions,
    output_folder=AUDIO_INPUT_FOLDER
)

display(audio_summary_df)


Mounted at /content/drive
🧹 Cleaning old files in /content/drive/MyDrive/health-tequity-case/Input_Audio_Files ...
🎧 q1_es.wav generated → ¿Cuáles son mis presiones arteriales sistólica y diastólica hoy?
🎧 q2_es.wav generated → ¿Cuáles fueron mis valores de presión arterial durante la última semana?
🎧 q3_es.wav generated → ¿Cuál es la tendencia de mis valores de presión arterial?
🎧 q4_es.wav generated → ¿Cuáles son los rangos normales para una persona como yo?
🎧 q5_es.wav generated → ¿Cuál era mi presión arterial el 10 de octubre?
🎧 q6_es.wav generated → ¿En qué día mi presión arterial excedió los niveles normales? Compare mi presión arterial promedio en la primera semana y la última semana de este mes. ¿Cuál fue mi presión arterial diastólica más baja este mes?

✅ 6 Spanish audio files generated and saved to: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files
📄 Summary saved to: /content/drive/MyDrive/health-tequity-case/Input_Audio_Files/generated_questions.csv
📄 Ground tru

Unnamed: 0,question_number,english_text,spanish_text,audio_file
0,1,What are my systolic and diastolic blood press...,¿Cuáles son mis presiones arteriales sistólica...,q1_es.wav
1,2,What were my blood pressure values over the la...,¿Cuáles fueron mis valores de presión arterial...,q2_es.wav
2,3,What is the trend of my blood pressure values?,¿Cuál es la tendencia de mis valores de presió...,q3_es.wav
3,4,What are the normal ranges for a person like me?,¿Cuáles son los rangos normales para una perso...,q4_es.wav
4,5,What was my blood pressure on October 10th?,¿Cuál era mi presión arterial el 10 de octubre?,q5_es.wav
5,6,On which day did my blood pressure exceed norm...,¿En qué día mi presión arterial excedió los ni...,q6_es.wav
