In [None]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch

# German wav2vec2 checkpoint: the processor and the CTC model are both
# instantiated from the same model id, processor first.
model_name = "facebook/wav2vec2-large-xlsr-53-german"
processor, model = (
    loader.from_pretrained(model_name)
    for loader in (Wav2Vec2Processor, Wav2Vec2ForCTC)
)

In [2]:
!pip install gTTS

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4


In [3]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [4]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.metadata (4.2 kB)
Collecting semant

In [5]:
from gtts import gTTS
from pydub import AudioSegment
import os

os.makedirs("samples", exist_ok=True)

# Reference phrases mapped to the WAV file each one is synthesised into.
PHRASES = {
    "Wie geht es Ihnen?": "samples/wie_geht_es_ihnen.wav",
    "Können Sie langsamer sprechen?": "samples/koennen_sie_langsamer.wav",
    "Wo haben Sie Schmerzen?": "samples/wo_haben_sie_schmerzen.wav",
    "Ich verstehe nicht. Können Sie das wiederholen?": "samples/ich_verstehe_nicht.wav",
    "Danke, dass Sie gewartet haben.": "samples/danke_dass_sie_gewartet_haben.wav"
}

for text, file_path in PHRASES.items():
    temp_mp3 = file_path.replace(".wav", ".mp3")

    try:
        # gTTS only produces MP3, so synthesise to a temporary MP3 first
        tts = gTTS(text=text, lang='de')
        tts.save(temp_mp3)

        # Convert to WAV
        sound = AudioSegment.from_mp3(temp_mp3)
        sound.export(file_path, format="wav")
    finally:
        # Remove the intermediate MP3 even when synthesis or conversion
        # raised, so a failed run does not leave stray files in samples/.
        if os.path.exists(temp_mp3):
            os.remove(temp_mp3)

    print(f"Saved: {file_path}")

Saved: samples/wie_geht_es_ihnen.wav
Saved: samples/koennen_sie_langsamer.wav
Saved: samples/wo_haben_sie_schmerzen.wav
Saved: samples/ich_verstehe_nicht.wav
Saved: samples/danke_dass_sie_gewartet_haben.wav


In [6]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (161 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

In [7]:
!pip install --upgrade gradio



In [8]:
def transcribe_audio(file_path):
    """Transcribe an audio file with the module-level wav2vec2 `processor`/`model`.

    Args:
        file_path: Path to an audio file readable by `torchaudio.load`.

    Returns:
        The greedy (argmax) CTC transcription, lower-cased.
    """
    speech, rate = torchaudio.load(file_path)

    # torchaudio returns (channels, frames). Down-mix to a single channel:
    # a 2-D array handed to the processor is interpreted as a batch of
    # separate utterances, so only the first channel would be decoded.
    if speech.dim() > 1 and speech.size(0) > 1:
        speech = speech.mean(dim=0, keepdim=True)

    if rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=16000)
        speech = resampler(speech)

    # Pass the sampling rate explicitly so the feature extractor can verify
    # that it matches the rate the model expects.
    input_values = processor(
        speech.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
    ).input_values

    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription.lower()

In [None]:
import gradio as gr
import os
import numpy as np
import soundfile as sf
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import Levenshtein
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import tempfile
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# --- Global Config ---
SAMPLE_RATE = 16000  # Hz; sample rate audio is resampled to before transcription
MAX_RECORDING_TIME = 5  # seconds
# Reference phrase -> path of its synthesised reference recording.
# The paths must match the files written by the sample-generation cell
# (ASCII file names under samples/); otherwise get_valid_audio_file()
# cannot find the reference audio and the player stays empty.
PHRASES = {
    "Wie geht es Ihnen?": "samples/wie_geht_es_ihnen.wav",
    "Können Sie langsamer sprechen?": "samples/koennen_sie_langsamer.wav",
    "Wo haben Sie Schmerzen?": "samples/wo_haben_sie_schmerzen.wav",
    "Ich verstehe nicht. Können Sie das wiederholen?": "samples/ich_verstehe_nicht.wav",
    "Danke, dass Sie gewartet haben.": "samples/danke_dass_sie_gewartet_haben.wav"
}
# Average score of every submitted attempt, in submission order.
ATTEMPTS_HISTORY = []

# --- Load Models ---
# Speech-to-text (wav2vec2) and sentence-embedding models used for scoring.
try:
    stt_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-german")
    stt_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-german")
    stt_model = stt_model.to("cpu")  # Use CPU for compatibility
    embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
except Exception as e:
    print(f"Error loading models: {str(e)}")
    # Re-raise instead of calling exit(1): inside a notebook, exit() asks the
    # kernel to shut down rather than reporting the failure. Raising aborts
    # the cell with a full traceback, so nothing below runs without models.
    raise

# --- Helper Functions ---

def transcribe_audio(file_path):
    """Transcribe an audio file with the module-level `stt_processor`/`stt_model`.

    Args:
        file_path: Path to an audio file readable by `torchaudio.load`.

    Returns:
        The greedy (argmax) CTC transcription, lower-cased. On any failure
        a string starting with "Transcription error" is returned instead of
        raising; callers test for that prefix.
    """
    try:
        speech, rate = torchaudio.load(file_path)

        # torchaudio returns (channels, frames). Down-mix to a single channel:
        # a 2-D array handed to the processor is interpreted as a batch of
        # separate utterances, so only the first channel would be decoded.
        if speech.dim() > 1 and speech.size(0) > 1:
            speech = speech.mean(dim=0, keepdim=True)

        if rate != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=rate, new_freq=SAMPLE_RATE)
            speech = resampler(speech)

        # Pass the sampling rate explicitly so the feature extractor can
        # verify that it matches the rate the model expects.
        input_values = stt_processor(
            speech.squeeze().numpy(),
            sampling_rate=SAMPLE_RATE,
            return_tensors="pt",
            padding="longest",
        ).input_values

        with torch.no_grad():
            logits = stt_model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = stt_processor.batch_decode(predicted_ids)[0]
        return transcription.lower()
    except Exception as e:
        return f"Transcription error: {str(e)}"

def calculate_levenshtein(reference, hypothesis):
    """Return the case-insensitive Levenshtein ratio of two strings as a percentage.

    Args:
        reference: The expected text.
        hypothesis: The text to compare against the reference.

    Returns:
        `Levenshtein.ratio` of the lower-cased strings scaled to 0-100 and
        rounded to two decimals, or 0.0 if the score cannot be computed
        (for example when an argument is not a string).
    """
    try:
        return round(Levenshtein.ratio(reference.lower(), hypothesis.lower()) * 100, 2)
    except Exception:
        # Best-effort scoring: fall back to 0.0 on errors, but unlike a bare
        # `except:` let KeyboardInterrupt/SystemExit propagate.
        return 0.0

def calculate_cosine_similarity(reference, hypothesis):
    """Return the cosine similarity of two texts' sentence embeddings as a percentage.

    Both texts are encoded with the module-level `embedding_model`.

    Args:
        reference: The expected text.
        hypothesis: The text to compare against the reference.

    Returns:
        The cosine similarity scaled by 100 and rounded to two decimals,
        or 0.0 if the embeddings cannot be computed.
    """
    try:
        embeddings = embedding_model.encode([reference, hypothesis], convert_to_tensor=True)
        cos_sim = util.cos_sim(embeddings[0], embeddings[1]).item()
        return round(cos_sim * 100, 2)
    except Exception:
        # Best-effort scoring: fall back to 0.0 on errors, but unlike a bare
        # `except:` let KeyboardInterrupt/SystemExit propagate.
        return 0.0

def save_recording(audio_data):
    """Write a recorded clip to a temporary mono WAV file at SAMPLE_RATE.

    Args:
        audio_data: Either a `(sample_rate, samples)` tuple as delivered by
            a Gradio Audio component with type="numpy", or a bare array of
            samples that is assumed to already be at SAMPLE_RATE.

    Returns:
        The path of the WAV file that was written. The file is created with
        delete=False, so the caller is responsible for removing it. On
        failure a string starting with "Error saving audio" is returned
        instead of raising; callers test for the "Error" prefix.
    """
    try:
        # Gradio Audio returns (sample_rate, audio_data)
        if isinstance(audio_data, tuple) and len(audio_data) == 2:
            input_sample_rate, audio_array = audio_data
        else:
            audio_array = audio_data
            input_sample_rate = SAMPLE_RATE

        audio_array = np.asarray(audio_array)
        if audio_array.size == 0:
            return "Error saving audio: recording is empty"

        # Integer PCM (Gradio delivers int16) must be scaled into [-1, 1]
        # before it is written as float data; unscaled values are far outside
        # that range and would be clipped to full scale in the WAV file.
        if np.issubdtype(audio_array.dtype, np.signedinteger):
            full_scale = float(np.iinfo(audio_array.dtype).max)
            audio_array = audio_array.astype(np.float32) / full_scale
        else:
            audio_array = audio_array.astype(np.float32)

        # Collapse (samples, channels) to mono. This also flattens a
        # single-channel (N, 1) array so that the resampler below operates
        # along the time axis rather than the channel axis.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        # Resample if input sample rate differs from SAMPLE_RATE
        if input_sample_rate != SAMPLE_RATE:
            audio_tensor = torch.from_numpy(audio_array).float()
            resampler = torchaudio.transforms.Resample(orig_freq=input_sample_rate, new_freq=SAMPLE_RATE)
            audio_array = resampler(audio_tensor).numpy()

        # Reserve a temp file name, close the handle, then write by name so
        # the file is not opened twice at once.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            wav_path = tmpfile.name
        sf.write(wav_path, audio_array, SAMPLE_RATE)
        return wav_path
    except Exception as e:
        return f"Error saving audio: {str(e)}"

# --- Gradio Function ---
def process_audio(selected_phrase, audio_data):
    """Score one recorded attempt against the selected reference phrase.

    The recording is saved to a temporary WAV file, transcribed, and compared
    with the lower-cased phrase using the Levenshtein and cosine-similarity
    scores. The mean of both scores is appended to ATTEMPTS_HISTORY and the
    history is plotted.

    Args:
        selected_phrase: The reference phrase chosen in the UI.
        audio_data: The recording as delivered by the Gradio Audio input.

    Returns:
        A 5-tuple `(transcription, levenshtein, cosine, feedback, figure)`.
        When input is missing or a step fails, the first element carries the
        message, the three middle elements are empty strings and the figure
        is None.
    """
    if not selected_phrase or audio_data is None:
        return "Please select a phrase and record audio.", "", "", "", None

    reference_text = selected_phrase.lower()
    file_path = save_recording(audio_data)
    if isinstance(file_path, str) and file_path.startswith("Error"):
        return file_path, "", "", "", None

    try:
        user_text = transcribe_audio(file_path)
    finally:
        # save_recording() creates the file with delete=False; remove it once
        # it has been transcribed so attempts do not pile up in the temp dir.
        try:
            os.remove(file_path)
        except OSError:
            pass

    if user_text.startswith("Transcription error"):
        return user_text, "", "", "", None

    levenshtein_score = calculate_levenshtein(reference_text, user_text)
    cosine_score = calculate_cosine_similarity(reference_text, user_text)

    avg_score = (levenshtein_score + cosine_score) / 2
    ATTEMPTS_HISTORY.append(avg_score)

    # Plot improvement; attempts are numbered from 1 to match the axis label
    attempt_numbers = range(1, len(ATTEMPTS_HISTORY) + 1)
    fig, ax = plt.subplots()
    ax.plot(attempt_numbers, ATTEMPTS_HISTORY, marker='o', linestyle='-', color='green')
    ax.set_title("Improvement Over Time")
    ax.set_xlabel("Attempt Number")
    ax.set_ylabel("Score (%)")
    ax.grid(True)
    # Detach the figure from pyplot's registry so figures do not accumulate
    # across submissions; the Figure object itself can still be rendered.
    plt.close(fig)

    if avg_score > 85:
        feedback = "Excellent pronunciation!"
    elif avg_score > 65:
        feedback = "Good effort. Try again to improve!"
    else:
        feedback = "Keep practicing!"

    return (
        user_text,
        f"{levenshtein_score}%",
        f"{cosine_score}%",
        feedback,
        fig
    )

def reset_attempts():
    """Forget all recorded attempts and return five component updates.

    The first three updates and the last one set a value of None; the
    fourth sets an empty string.
    """
    global ATTEMPTS_HISTORY
    ATTEMPTS_HISTORY = []
    cleared_to_none = [gr.update(value=None) for _ in range(3)]
    return (*cleared_to_none, gr.update(value=""), gr.update(value=None))

# --- Validate Audio Files ---
def get_valid_audio_file(phrase):
    """Return the reference audio path for `phrase`, or None if it is unavailable.

    Unknown phrases map to an empty path, which never exists on disk.
    """
    candidate = PHRASES.get(phrase, "")
    return candidate if os.path.exists(candidate) else None

# --- Gradio Interface ---
with gr.Blocks(title="NurseSpeak - German Pronunciation Practice") as demo:
    gr.Markdown("# 🎯 NurseSpeak - Improve Your German Pronunciation\n### Choose a phrase, record your voice, and get instant feedback!")

    # Phrase picker next to a player for the reference recording
    with gr.Row():
        phrase_dropdown = gr.Dropdown(choices=list(PHRASES.keys()), label="Select a Phrase", value=list(PHRASES.keys())[0])
        play_button = gr.Audio(label="Original Audio", value=get_valid_audio_file(list(PHRASES.keys())[0]))

    # Swap the reference audio whenever another phrase is selected
    phrase_dropdown.change(fn=get_valid_audio_file, inputs=phrase_dropdown, outputs=play_button)

    with gr.Row():
        audio_input = gr.Audio(type="numpy", label="Record Your Voice")
        submit_btn = gr.Button("Submit Recording")

    with gr.Row():
        output_text = gr.Textbox(label="Your Transcription")
        lev_score = gr.Textbox(label="Levenshtein Match (%)")
        cos_score = gr.Textbox(label="Cosine Similarity (%)")
        feedback = gr.Textbox(label="Feedback")

    plot_output = gr.Plot(label="Performance Over Time")

    submit_btn.click(
        fn=process_audio,
        inputs=[phrase_dropdown, audio_input],
        outputs=[output_text, lev_score, cos_score, feedback, plot_output]
    )

    def _reset_all_fields():
        """Reset the attempt history and clear every result field."""
        # reset_attempts() yields five updates (recording, transcription,
        # Levenshtein score, feedback, plot); add a sixth so the cosine
        # similarity box is cleared as well instead of showing a stale score.
        return (*reset_attempts(), gr.update(value=None))

    gr.Button("Reset Progress").click(fn=_reset_all_fields, outputs=[
        audio_input, output_text, lev_score, feedback, plot_output, cos_score
    ])

# Start the app without a fixed port so Gradio picks a free one; if the
# launch fails, report the error and a hint instead of raising.
try:
    demo.launch(debug=True)
except Exception as launch_error:
    print(f"Error launching Gradio interface: {launch_error!s}")
    print(
        "Try running the script again or specifying a different port "
        "with 'demo.launch(server_port=7861)'"
    )

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2e8033248eee9d73cd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
