In [1]:
# Mount Google Drive inside the Colab runtime; later cells read and write
# under /content/drive/MyDrive.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [2]:
import os

DRIVE = '/content/drive/MyDrive/SignBridge_Data'
os.makedirs(f"{DRIVE}/speech/raw_audio", exist_ok=True)
os.makedirs(f"{DRIVE}/speech/transcripts", exist_ok=True)

os.makedirs("nlp/stt", exist_ok=True)

!pip install openai-whisper librosa ffmpeg-python --quiet



[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/803.2 kB[0m [31m27.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


In [3]:
# Report the installed PyTorch build and whether a CUDA device is visible.
import torch

print(f"PyTorch version: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Index 0 is the first visible CUDA device.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

print("\nEnvironment setup complete!")

PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA device: Tesla T4

Environment setup complete!


In [4]:
import whisper
import time

# Whisper checkpoint to load; the global `model` is used by the transcription cell.
model_size = "medium"

# perf_counter() is a monotonic clock meant for measuring elapsed time; it is
# not affected by system clock adjustments the way time.time() is.
# The measured time includes any checkpoint download that load_model performs.
start_time = time.perf_counter()

model = whisper.load_model(model_size)

load_time = time.perf_counter() - start_time
print(f"Model '{model_size}' loaded in {load_time:.2f} seconds")


print("\nModel Properties")
print(f"- Size: {model_size}")
print(f"- Parameters: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")
# Derive this from the loaded checkpoint instead of hard-coding "Yes":
# English-only checkpoints (e.g. "medium.en") cannot detect other languages.
print(f"- Language detection: {'Yes' if model.is_multilingual else 'No'}")

100%|█████████████████████████████████████| 1.42G/1.42G [00:44<00:00, 34.2MiB/s]


Model 'medium' loaded in 59.12 seconds

Model Properties
- Size: medium
- Parameters: 762.3M
- Language detection: Yes


In [35]:
!pip install sounddevice scipy --quiet
# Install librosa if not already installed
!pip install librosa --quiet

In [5]:
# Step 3: Audio recording with voice activity detection
from IPython.display import Javascript, display, HTML
from google.colab import output
from base64 import b64decode
import time
import os
import numpy as np

def record_with_vad(max_seconds=30, silence_threshold=2):
    """
    Record audio in the browser with voice activity detection and save it as WAV.

    A small recorder widget (button, status line, timer, level display) is
    injected into the cell output. Recording stops when the user clicks Stop,
    when `max_seconds` is reached, or when `silence_threshold` seconds of
    silence follow detected speech (silence before any speech never stops the
    recording). The browser's WebM/Opus capture is written to
    `{DRIVE}/speech/raw_audio/` and converted with ffmpeg to 16 kHz mono
    16-bit PCM WAV.

    Args:
        max_seconds: Maximum recording duration in seconds
        silence_threshold: Number of seconds of silence to trigger stop

    Returns:
        Path to the WAV file, or None if the recording failed, was empty, or
        could not be converted. On a conversion failure the original .webm
        file is kept so the recording is not lost.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    import subprocess

    print(f"Recording up to {max_seconds} seconds. Will stop after {silence_threshold}s of silence.")
    print("Speak clearly and press Start Recording when ready.")

    # Fixed HTML with better contrast - dark text on light background
    display(HTML("""
    <div style="background-color:#f8f9fa;padding:12px;border-radius:6px;margin-bottom:12px;border:1px solid #ddd;">
        <p style="font-weight:bold;color:#202124;font-size:16px;margin-bottom:8px;">📣 Recording Tips:</p>
        <ul style="color:#202124;font-size:14px;margin-left:20px;">
            <li>Recording will automatically stop after a few seconds of silence</li>
            <li>You can also click Stop when you're done speaking</li>
            <li>Speak clearly at a normal pace</li>
        </ul>
    </div>
    """))

    # The two Python values (silence window in ms, max duration in s) are
    # spliced into the script text by string concatenation below.
    js = Javascript("""
    async function recordAudio() {
        const div = document.createElement('div');
        div.style = 'padding: 10px; border: 1px solid #ddd; border-radius: 8px;';

        const audio = document.createElement('audio');
        const recordButton = document.createElement('button');
        recordButton.textContent = '🎙️ Start Recording';
        recordButton.style = 'background-color:#4CAF50;color:white;border:none;padding:10px 20px;border-radius:4px;font-size:16px;cursor:pointer;';

        const statusLabel = document.createElement('p');
        statusLabel.textContent = 'Ready to record';

        const timerLabel = document.createElement('p');
        timerLabel.textContent = '00:00';

        const visualizer = document.createElement('canvas');
        visualizer.width = 300;
        visualizer.height = 60;
        visualizer.style = 'width:100%;background-color:#f5f5f5;border-radius:4px;';

        div.appendChild(recordButton);
        div.appendChild(statusLabel);
        div.appendChild(timerLabel);
        div.appendChild(visualizer);
        div.appendChild(audio);
        document.body.appendChild(div);

        try {
            // Request high-quality audio
            const stream = await navigator.mediaDevices.getUserMedia({
                audio: {
                    echoCancellation: true,
                    noiseSuppression: true,
                    autoGainControl: true,
                    sampleRate: 48000
                }
            });

            // Set up audio processing for voice activity detection
            const audioContext = new AudioContext();
            const source = audioContext.createMediaStreamSource(stream);
            const analyser = audioContext.createAnalyser();
            analyser.fftSize = 2048;
            source.connect(analyser);

            // For visualizer
            const canvasCtx = visualizer.getContext('2d');
            const bufferLength = analyser.frequencyBinCount;
            const dataArray = new Uint8Array(bufferLength);

            // Voice activity detection variables
            let silenceStart = null;
            let isSpeaking = false;
            const silenceThreshold = 15; // Threshold for detecting speech vs silence

            // Set up recorder
            const mediaRecorder = new MediaRecorder(stream, {
                mimeType: 'audio/webm;codecs=opus',
                audioBitsPerSecond: 128000
            });

            const chunks = [];
            let startTime;
            let timerInterval;

            mediaRecorder.addEventListener('dataavailable', event => {
                chunks.push(event.data);
            });

            // Function to visualize audio and detect silence
            function visualize() {
                if (!mediaRecorder || mediaRecorder.state !== 'recording') return;

                analyser.getByteFrequencyData(dataArray);

                // Calculate volume
                let sum = 0;
                for(let i = 0; i < bufferLength; i++) {
                    sum += dataArray[i];
                }
                let average = sum / bufferLength;

                // Detect speech/silence
                if (average > silenceThreshold) {
                    isSpeaking = true;
                    silenceStart = null;
                    statusLabel.textContent = 'Recording: Speech detected';
                } else if (isSpeaking) {
                    if (!silenceStart) {
                        silenceStart = Date.now();
                        statusLabel.textContent = 'Recording: Silence detected';
                    } else if ((Date.now() - silenceStart) > """ + str(silence_threshold * 1000) + """) {
                        statusLabel.textContent = 'Recording stopped: Silence detected';
                        stop();
                        return;
                    }
                }

                // Clear canvas
                canvasCtx.fillStyle = '#f5f5f5';
                canvasCtx.fillRect(0, 0, visualizer.width, visualizer.height);

                // Draw visualization
                canvasCtx.lineWidth = 2;
                canvasCtx.strokeStyle = isSpeaking ? '#4CAF50' : '#999';
                canvasCtx.beginPath();

                const sliceWidth = visualizer.width * 1.0 / bufferLength;
                let x = 0;

                for(let i = 0; i < bufferLength; i++) {
                    const v = dataArray[i] / 128.0;
                    const y = v * visualizer.height/2;

                    if(i === 0) {
                        canvasCtx.moveTo(x, y);
                    } else {
                        canvasCtx.lineTo(x, y);
                    }

                    x += sliceWidth;
                }

                canvasCtx.lineTo(visualizer.width, visualizer.height/2);
                canvasCtx.stroke();

                requestAnimationFrame(visualize);
            }

            const updateTimer = () => {
                const now = Date.now();
                const diff = now - startTime;
                const seconds = Math.floor(diff / 1000);
                const minutes = Math.floor(seconds / 60);
                const remainingSeconds = seconds % 60;
                timerLabel.textContent = `${minutes.toString().padStart(2, '0')}:${remainingSeconds.toString().padStart(2, '0')}`;

                // Check max duration
                if (seconds >= """ + str(max_seconds) + """) {
                    stop();
                }
            };

            const start = () => {
                chunks.length = 0;
                mediaRecorder.start(100); // Get data every 100ms for smoother updates
                startTime = Date.now();
                timerInterval = setInterval(updateTimer, 100);
                visualize();

                recordButton.textContent = '⏹️ Stop Recording';
                recordButton.style.backgroundColor = '#f44336';
                statusLabel.textContent = 'Recording...';
                recordButton.onclick = stop;
                isSpeaking = false;
            };

            const stop = () => {
                if (mediaRecorder.state === 'inactive') return;

                mediaRecorder.stop();
                clearInterval(timerInterval);

                stream.getTracks().forEach(track => track.stop());
                audioContext.close();

                recordButton.textContent = '✅ Processing...';
                recordButton.disabled = true;
                recordButton.style.backgroundColor = '#9e9e9e';
            };

            recordButton.onclick = start;

            const recording = new Promise(resolve => {
                mediaRecorder.addEventListener('stop', () => {
                    const blob = new Blob(chunks, {'type': 'audio/webm'});
                    audio.src = URL.createObjectURL(blob);
                    audio.controls = true;
                    statusLabel.textContent = 'Recording complete! You can play it below:';

                    // Convert to base64
                    const reader = new FileReader();
                    reader.onload = () => {
                        resolve(reader.result.split(',')[1]);
                    };
                    reader.readAsDataURL(blob);
                });
            });

            return await recording;
        } catch (err) {
            statusLabel.textContent = 'Error: ' + err.message;
            console.error('Recording error:', err);
            return null;
        }
    }
    """)

    display(js)
    # Blocks until the JS promise resolves with the base64 payload (or null).
    s = output.eval_js('recordAudio()')

    # None means the JS side hit an error; an empty string means no audio
    # data was captured. Neither can be converted.
    if not s:
        print("Recording failed or was canceled.")
        return None

    # Save as webm first (original format from browser)
    binary = b64decode(s)
    timestamp = int(time.time())
    webm_path = f"{DRIVE}/speech/raw_audio/recording_{timestamp}.webm"

    with open(webm_path, 'wb') as f:
        f.write(binary)

    # Convert to WAV using ffmpeg (better for whisper)
    print("Converting audio to WAV format...")

    wav_path = f"{DRIVE}/speech/raw_audio/recording_{timestamp}.wav"
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",                    # never block on an interactive overwrite prompt
        "-i", webm_path,
        "-ar", "16000",          # 16 kHz sample rate
        "-ac", "1",              # mono
        "-c:a", "pcm_s16le",     # 16-bit PCM
        "-hide_banner",
        "-loglevel", "error",
        wav_path,
    ]
    try:
        # Argument list (no shell), so paths need no quoting or escaping.
        conversion = subprocess.run(
            ffmpeg_cmd,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
        )
    except OSError as err:
        # Typically ffmpeg is not installed or not on PATH.
        print(f"Error: could not run ffmpeg ({err}). Original recording kept at {webm_path}")
        return None

    if conversion.returncode != 0 or not os.path.exists(wav_path):
        print(f"Error: ffmpeg conversion failed. Original recording kept at {webm_path}")
        if conversion.stderr:
            print(conversion.stderr.strip())
        # Do not leave a truncated WAV behind for later cells to pick up.
        if os.path.exists(wav_path):
            os.remove(wav_path)
        return None

    # Remove the webm file only once the WAV is known to exist
    os.remove(webm_path)

    print(f"Recording saved to {wav_path}")
    return wav_path

In [6]:
# Step 4: Audio transcription with improved options
import librosa

def transcribe_with_options(audio_path):
    """
    Transcribe an audio file with the global Whisper `model` and report timing.

    The audio is first loaded with librosa purely to measure its duration
    (best effort - a failure there only produces a warning). The transcript
    text is written to `{DRIVE}/speech/transcripts/<audio stem>_transcript.txt`.

    Args:
        audio_path: Path to the audio file

    Returns:
        The Whisper result dictionary with an added "duration" entry
        (seconds; 0 when the duration could not be determined), or None if
        `audio_path` does not exist.
    """
    if not os.path.exists(audio_path):
        print(f"Error: Audio file not found at {audio_path}")
        return None

    # Verify the audio file and get its duration
    try:
        # Load audio (resampled to 16 kHz) and get duration
        y, sr = librosa.load(audio_path, sr=16000)
        audio_duration = librosa.get_duration(y=y, sr=sr)
        print(f"Audio loaded successfully. Duration: {audio_duration:.2f} seconds")

        # If audio is too short or appears to be silent, there might be an issue
        if audio_duration < 0.1:
            print("Warning: Audio file is extremely short or might be empty.")
    except Exception as e:
        # Deliberately best-effort: Whisper decodes the file itself, so a
        # librosa failure should not prevent transcription.
        print(f"Warning: Could not verify audio with librosa: {e}")
        audio_duration = None

    print("\nTranscribing with Whisper...")
    start_time = time.time()

    # Perform transcription with specific options for better results
    result = model.transcribe(
        audio_path,
        language="en",  # Treat the audio as English (skips language detection)
        fp16=False      # Use full precision (may help with quality)
    )

    # Calculate metrics
    transcription_time = time.time() - start_time

    # Fill in "duration" from librosa unless the result already carries a
    # usable positive value; `or 0` also covers an explicit None.
    if (result.get("duration") or 0) <= 0:
        result["duration"] = audio_duration if audio_duration is not None else 0

    # RTF is only meaningful with a known, positive duration. Use None rather
    # than 0 so an unknown duration is not reported as "0.00x".
    duration = result["duration"]
    rtf = transcription_time / duration if duration > 0 else None

    # Display results
    print("\n--- Transcription Results ---")
    print(f"Text: {result['text']}")
    print(f"Time taken: {transcription_time:.2f} seconds")
    if rtf is None:
        print("Audio duration: unknown")
        print("Real-time factor (RTF): n/a (audio duration unknown)")
    else:
        print(f"Audio duration: {duration:.2f} seconds")
        print(f"Real-time factor (RTF): {rtf:.2f}x")
    print("----------------------------")

    # Save transcription to file; create the folder so this function does not
    # depend on the setup cell having been run first.
    transcript_dir = f"{DRIVE}/speech/transcripts"
    os.makedirs(transcript_dir, exist_ok=True)
    audio_stem = os.path.splitext(os.path.basename(audio_path))[0]
    transcript_path = f"{transcript_dir}/{audio_stem}_transcript.txt"
    with open(transcript_path, 'w', encoding='utf-8') as f:
        f.write(result['text'])

    print(f"Transcript saved to {transcript_path}")

    return result

In [9]:
#step 5: test regular transcription
def test_regular_transcription():
  print("Testing regular transcription...")
  audio_path = record_with_vad(max_seconds=30, silence_threshold=2)
  if audio_path:
    transcription_result = transcribe_with_options(audio_path)
    return audio_path, transcription_result
  return None, None

# Run the end-to-end test. Calling this opens the recorder widget; both names
# are None when no recording was captured.
test_audio_path, test_result = test_regular_transcription()

Testing regular transcription...
Recording up to 30 seconds. Will stop after 2s of silence.
Speak clearly and press Start Recording when ready.


<IPython.core.display.Javascript object>

Converting audio to WAV format...
Recording saved to /content/drive/MyDrive/SignBridge_Data/speech/raw_audio/recording_1761152859.wav
Audio loaded successfully. Duration: 3.78 seconds

Transcribing with Whisper...

--- Transcription Results ---
Text:  What's up bro, how are you?
Time taken: 0.81 seconds
Audio duration: 3.78 seconds
Real-time factor (RTF): 0.21x
----------------------------
Transcript saved to /content/drive/MyDrive/SignBridge_Data/speech/transcripts/recording_1761152859_transcript.txt
