<a href="https://colab.research.google.com/github/oeam96/EasyTranscriptApp/blob/main/Transcript_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ======================================================
# 1.  Runtime deps  (‚âà40 s on a fresh Colab VM)
# ======================================================
!apt-get update && apt-get install -y ffmpeg
!pip install -q --upgrade "gradio>=4" "faster-whisper>=1.0" ffmpeg-python
!nvidia-smi

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

In [None]:
# ======================================================
# 1.  Imports & helpers
# ======================================================
import os, subprocess, time, gc, tempfile, pathlib, re, shlex
import gradio as gr, torch
from faster_whisper import WhisperModel

# ---------- ffmpeg: any-format -> AAC .m4a -------------------------------
def convert_to_m4a(inp: str, audio_bitrate: str = "192k") -> str:
    """
    Return path of an .m4a (AAC) copy of *inp*, written next to the input.

    Tries GPU video decode for inputs with a known video extension; falls
    back to a plain CPU ffmpeg run exactly once if that attempt fails.

    Args:
        inp: Path of the source audio/video file.
        audio_bitrate: Value passed to ffmpeg's ``-b:a`` option.

    Returns:
        *inp* with its extension replaced by ``.m4a``.

    Raises:
        subprocess.CalledProcessError: If the final (CPU) ffmpeg run exits
            with a non-zero status; its stderr is printed first.
    """
    base, ext = os.path.splitext(inp)
    out = base + ".m4a"
    ext_video = {"mp4", "mov", "mkv", "avi", "flv", "webm"}
    use_gpu = ext.lstrip(".").lower() in ext_video

    # Arguments shared by both attempts: drop video (-vn), encode audio as AAC.
    tail = ["-i", inp, "-vn", "-c:a", "aac", "-b:a", audio_bitrate, out]

    if use_gpu:
        gpu_cmd = ["ffmpeg", "-y", "-hwaccel", "cuda", "-c:v", "h264_cuvid"] + tail
        try:
            subprocess.run(gpu_cmd, check=True, capture_output=True, text=True)
            return out
        except subprocess.CalledProcessError:
            # Fall through to the CPU command below. Re-calling this function
            # would rebuild the same GPU command and fail again, forever.
            print("GPU decode failed ‚Üí retrying on CPU")

    cpu_cmd = ["ffmpeg", "-y"] + tail
    try:
        subprocess.run(cpu_cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print("FFmpeg stderr:\n", e.stderr)
        raise
    return out
# ------------------------------------------------------------------------

# ---------- strip timestamps -------------------------------------------
_PAT = [
    re.compile(r"^\s*\d+(?:\.\d+)?s\s*-->\s*\d+(?:\.\d+)?s:\s*(.*)$"),
    re.compile(r"^\s*\d+(?:\.\d+)?s\s*‚Üí\s*\d+(?:\.\d+)?s\s*\|\s*(.*)$"),
]
def _clean_line(l: str) -> str:
    for p in _PAT:
        m = p.match(l)
        if m:
            return m.group(1).strip()
    return l.strip()
# ------------------------------------------------------------------------

# ---------- Whisper model ----------------------------------------------
# Use the GPU when torch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# One shared "small" Whisper model; float16 on CUDA, float32 on CPU.
MODEL = WhisperModel(
    "small",
    device=DEVICE,
    compute_type={"cuda": "float16", "cpu": "float32"}[DEVICE],
)
# ------------------------------------------------------------------------

# ======================================================
# 2.  Streaming worker (always 6 outputs)
# ======================================================
def _out(status="", elapsed="", words="", length="", transcript="", fpath=None):
    """Always return exactly six values."""
    return status, elapsed, words, length, transcript, fpath

def transcribe_stream(audio_file: str):
    """
    Transcribe *audio_file* with the module-level ``MODEL``, streaming progress.

    This is a generator. Every yielded item is the 6-tuple built by ``_out``:
    (status, elapsed, words, length in minutes, transcript, download path).

    Steps:
      1. Input that is not already ``.m4a`` is converted with ``convert_to_m4a``.
      2. Segments from ``MODEL.transcribe`` are consumed one at a time; after
         each one the timestamped transcript so far is yielded.
      3. The timestamp-free text is written to a temporary UTF-8 ``.txt`` file
         whose path is included in the final yield.

    Args:
        audio_file: Path of the uploaded file. ``None`` or an empty string
            (nothing uploaded) produces a single error item instead of a crash.

    Yields:
        6-tuples as described above. The download path is ``None`` in every
        item except the final one.
    """
    # Guard against a click with no upload; calling .endswith on None would raise.
    if not audio_file:
        yield _out("no input file",
                   transcript="Please upload an audio or video file first.")
        return

    yield _out("‚è≥ preparing input‚Ä¶")

    # -------- ensure .m4a ----------------------------------------------
    # Case-insensitive so that e.g. "x.M4A" is not needlessly re-encoded.
    if not audio_file.lower().endswith(".m4a"):
        yield _out("üöÄ converting with FFmpeg‚Ä¶")
        t_conv = time.perf_counter()
        try:
            audio_file = convert_to_m4a(audio_file)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing/unlaunchable ffmpeg binary.
            yield _out("‚ùå FFmpeg failed", transcript=f"Conversion error:\n{e}")
            return
        yield _out(f"‚úÖ FFmpeg done in {time.perf_counter()-t_conv:0.1f}s")
    else:
        yield _out("üéß audio already .m4a")

    # -------- Whisper streaming ---------------------------------------
    t0 = time.perf_counter()
    lines, texts = [], []          # timestamped display lines / plain segment text
    n_words, last_end = 0, 0
    status = "üìù decoding‚Ä¶"

    segs, _ = MODEL.transcribe(
        audio_file,
        beam_size=1,
        language="es",
        best_of=5,
        temperature=(0.0, 0.2, 0.4, 0.6),
        compression_ratio_threshold=2.4,
        repetition_penalty=1.1,
        condition_on_previous_text=False,
        vad_filter=True
    )

    for s in segs:
        last_end = s.end
        lines.append(f"{s.start:7.2f}s ‚Üí {s.end:7.2f}s | {s.text}")
        # Keep the plain text alongside the display line and count words
        # incrementally, instead of re-parsing every accumulated line per segment.
        text = s.text.strip()
        texts.append(text)
        n_words += len(text.split())
        elapsed  = f"{time.perf_counter() - t0:0.1f} s"
        length_m = f"{last_end/60:0.2f}"
        yield _out(status, elapsed, str(n_words), length_m, "\n".join(lines))

    # -------- create .txt ---------------------------------------------
    clean = " ".join(texts)
    # The context manager closes the handle; delete=False keeps the file on
    # disk so its path can be handed to the download component.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt",
                                     encoding="utf-8") as fh:
        fh.write(clean)
        tmp = fh.name

    elapsed  = f"{time.perf_counter() - t0:0.1f} s"
    length_m = f"{last_end/60:0.2f}"
    yield _out("üéâ finished", elapsed, str(n_words), length_m, "\n".join(lines), tmp)
    # Runs only once the consumer resumes the generator past the final yield.
    gc.collect()

# ======================================================
# 3.  GUI
# ======================================================
# Stylesheet handed to gr.Blocks: one font stack for every element, plus a
# bordered box style for the element class "grp-box".
css_rule = """
* {font-family:'Segoe UI','Helvetica Neue',Arial,sans-serif !important;}
.grp-box {border:1px solid var(--block-border-color); padding:8px; border-radius:4px;}
"""


def _info_box(label):
    """Create a one-line, read-only textbox carrying *label*."""
    return gr.Textbox(label=label, lines=1, interactive=False)


demo = gr.Blocks(
    title="Audio-to-Text",
    theme=gr.themes.Monochrome(),
    css=css_rule,
)

with demo:
    # ----- heading and external link -----
    gr.Markdown("### üéôÔ∏è Audio-to-Text Transcriber")
    gr.Markdown(
        "To create a Memory Aid Document visit: "
        "[chatgpt Memory-Aid](https://chatgpt.com/g/g-68498babc37c8191bb25104819f9862e-ayudasmemoria-rdc)"
    )

    # ----- first row: upload widget, then the grouped status fields -----
    with gr.Row(equal_height=True):
        file_in = gr.File(
            label="Upload audio / video",
            file_types=["audio", "video"],
            type="filepath",
        )

        with gr.Group(elem_classes="grp-box"):
            with gr.Row(equal_height=True):
                status_tx = _info_box("Status")
                time_tx = _info_box("Processing time")
            with gr.Row(equal_height=True):
                words_tx = _info_box("Words count")
                len_tx = _info_box("Transcription length (min)")

    # ----- action button, transcript view, download slot -----
    run_btn = gr.Button("Transcribe ‚ñ∂Ô∏é", variant="primary", size="lg")
    transcript_tx = gr.Textbox(label="Transcript", lines=13, interactive=False)
    download_bt = gr.File(label="Download clean .txt")

    # The six output components are listed in the same order as the tuple
    # returned by _out(), which transcribe_stream yields.
    run_btn.click(
        fn=transcribe_stream,
        inputs=file_in,
        outputs=[
            status_tx,
            time_tx,
            words_tx,
            len_tx,
            transcript_tx,
            download_bt,
        ],
    )

# ======================================================
# 4.  Launch
# ======================================================
# share=True serves the app on a public gradio.live URL; debug=True keeps this
# cell running so errors and logs stay visible (interrupt the cell to stop it).
demo.launch(
    share=True,
    inbrowser=True,
    height=1300,
    debug=True,
)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://6f14ae57af9a719f75.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7874 <> https://6f14ae57af9a719f75.gradio.live


