# **Transcription Test**
This notebook is designed to test speech-to-text **without dependencies** on FastAPI (`app.py`) and the database (`session.py`).
- Uses **Wav2Vec2** for transcription.
- Simulates **real-time audio buffering**.
- Processes **dummy audio data**.
- **No database or NLP dependencies.**

In [None]:
import numpy as np
import pyaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os
from pyctcdecode import build_ctcdecoder

In [None]:
# Mock functions to remove dependency on FastAPI and database
def process_text(text):
    return text  # Just return raw text for standalone execution

class AudioCollector:
    """
    A class to accumulate audio data and process it using Wav2Vec2 for speech-to-text.
    """
    _instance = None
    
    def __new__(cls):
        
        '''
        facebook/wav2vec2-base-960h - Base model, trained on 960 hrs of English speech
        facebook/wav2vec2-large-960h - Large model, trained on 960 hrs of English speech
        facebook/wav2vec2-large-xlsr-53 - Large model, trained on 53 languages (multilingual)
        '''
        MODEL_ID = "facebook/wav2vec2-base-960h" # {facebook/wav2vec2-large-960h, facebook/wav2vec2-large-xlsr-53}
        
        if cls._instance is None:
            print("Initializing AudioCollector...")
            cls._instance = super(AudioCollector, cls).__new__(cls)
            cls._instance.audio_buffer = bytearray()
            #cls._instance.session_audio = bytearray()       # accumulates full session audio for later processing

            # cls._instance.model = whisper.load_model("medium")  # or "base" for faster processing
            cls._instance.processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
            cls._instance.model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
            cls._instance.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            cls._instance.model.to(cls._instance.device)
        return cls._instance
    
    def add_chunk(self, chunk: bytes):
        """Add new PCM data to both the real-time and session buffers."""
        self.audio_buffer.extend(chunk)
        #self.session_audio.extend(chunk)

    def reset_buffer(self):
        """Reset only the real-time audio buffer (keeping session audio intact)."""
        self.audio_buffer = bytearray()

    def transcribe_audio_segment(self):
        """
        Preprocesses the audio and runs inference using the XLS-R model.
        Returns the transcription as text.
        """
        
        """Convert the buffer to text using Wav2Vec2."""
        if len(self.audio_buffer) == 0:
            return "No audio data to transcribe."

        sample_rate = 16000
        sample_width = 2  # bytes per sample
        #frame_duration_ms = 20
        #frame_size = int(sample_rate * (frame_duration_ms / 1000.0) * sample_width)  # ~640 bytes

        samples = np.frombuffer(self.audio_buffer, dtype=np.int16).astype(np.float32) / 32768.0

        if sample_rate != 16000:
                print(f"Warning: Sample rate is {sample_rate}Hz. Expected 16000Hz.")

        input_values = self.processor(samples, sampling_rate=sample_rate, return_tensors="pt").input_values
        input_values = input_values.to(self.device)
        
        with torch.no_grad():
            logits = self.model(input_values).logits

        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]

        self.reset_buffer()
        return transcription


In [4]:
collector = AudioCollector()

# Simulate a chunk of raw PCM audio (normally this would be captured from a microphone)
dummy_audio_chunk = np.random.randint(-32768, 32767, 16000, dtype=np.int16).tobytes()

collector.add_chunk(dummy_audio_chunk)
transcription = collector.transcribe_audio_segment()
print("Transcription result:", transcription)

Transcription result: 


# **Transcription Test with Different Models**

Models we are using are
- **facebook/wav2vec2-base-960h** - Base model, trained on 960 hrs of English speech
- **facebook/wav2vec2-large-960h** - Large model, trained on 960 hrs of English speech
- **facebook/wav2vec2-large-xlsr-53** - Large model, trained on 53 languages (multilingual)

In [30]:
import torch
import numpy as np
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


'''
facebook/wav2vec2-base-960h - Base model, trained on 960 hrs of English speech
facebook/wav2vec2-large-960h - Large model, trained on 960 hrs of English speech
facebook/wav2vec2-large-xlsr-53 - Large model, trained on 53 languages (multilingual)
'''

MODEL_ID = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def transcribe_audio(file_path):
    print(f"Loading file: {file_path}")

    audio_samples, sample_rate = sf.read(file_path)

    if sample_rate != 16000:
        print(f"Warning: Sample rate is {sample_rate}Hz. Expected 16000Hz.")

    input_values = processor(audio_samples, sampling_rate=16000, return_tensors="pt").input_values
    input_values = input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    return transcription

if __name__ == "__main__":
    import soundfile as sf

    file_path = "/Users/munhuikim/Desktop/med-sync-be/test_audios/day1_consultation01_doctor.wav"
    if os.path.exists(file_path):
        print("File found! Trying to open it...")
        try:
            audio_samples, sample_rate = sf.read(file_path)
            print("File opened successfully! Sample rate:", sample_rate)
        except Exception as e:
            print("Error reading the file:", e)
    else:
        print("File not found!")

    transcription_result = transcribe_audio(file_path)
    print("Transcription Result:", transcription_result)


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


File found! Trying to open it...
File opened successfully! Sample rate: 16000
Loading file: /Users/munhuikim/Desktop/med-sync-be/test_audios/day1_consultation01_doctor.wav
Transcription Result: HELI I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN SO IT I HER THAT AND AND AND WE YOU STIN DIME IN AR WHATH YOU MEEN BY DIRAG YOU MEA NOU GANTS A TALLIN MOR OFTEN OR ARE YOUR STOOLS MOR LOOSE  CN AND HAVD ME TIMES TA DAY AR YOU GOIN I SA OR LAST COUPLOF DAYS SAIX SOME TIMES A DAN AND YOU MAN MENTIONIS  MAIN WAR TREE HAVEN NAIN YO THE INGS LIK BLOOD IN YOUR STOOLS ICAN AND YOU MENTIN YOV HAS SOME PAIN YOUR TOMY AS WELL WWHERE ABAUT AS A PAIN EXACTLY ONCE I AN WHAT SIDE IS THAT NET SI AGAN AND CAN YOU DESCRIME THE PAINTOME ICA AND IS TA PAIN IS THAT IS E TER ALL THE TIME TAT AS A COMING GO AA IS TA PAIN MOV ANYWHERE ELSE WITH ON PETWASE YOUR BACK ACAN NE AND YOU MENTION O E PENN QY WEE CAN SHAKEHERS WELL WOULD YOUOU BY SHAKE ID YOU MEAN YOU BEEN HAVING N AN YOU BEVENING FEVERISH FERSOMP

In [15]:
import os

file_path = "/Users/munhuikim/Desktop/med-sync-be/test_audios/day1_consultation01_doctor.wav"
print("File exists:", os.path.exists(file_path))


File exists: True


**Post processing**
1. Clean up the text (remove extra spaces, eliminating repeated words, fixing punctuation)
2. Check spelling + auto correction
3. Check grammar
4. Use LLM (gpt) - not sure

In [20]:
import re

def clean_transcription(text):
    """Basic cleaning: removes extra spaces, duplicates, and noise."""
    text = re.sub(r'\s+', ' ', text)  # rm extra spaces
    text = re.sub(r'([.,!?])\1+', r'\1', text)  # rm repeated punctuation
    text = re.sub(r'\b(\w+)\s+\1\b', r'\1', text)  # rm duplicate words
    text = text.strip()
    return text

raw_text = "HELI I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN SO IT I HER THAT AND AND AND WE YOU STIN DIME IN AR WHATH YOU MEEN BY DIRAG YOU MEA NOU GANTS A TALLIN MOR OFTEN OR ARE YOUR STOOLS MOR LOOSE  CN AND HAVD ME TIMES TA DAY AR YOU GOIN I SA OR LAST COUPLOF DAYS SAIX SOME TIMES A DAN AND YOU MAN MENTIONIS  MAIN WAR TREE HAVEN NAIN YO THE INGS LIK BLOOD IN YOUR STOOLS ICAN AND YOU MENTIN YOV HAS SOME PAIN YOUR TOMY AS WELL WWHERE ABAUT AS A PAIN EXACTLY ONCE I AN WHAT SIDE IS THAT NET SI AGAN AND CAN YOU DESCRIME THE PAINTOME ICA AND IS TA PAIN IS THAT IS E TER ALL THE TIME TAT AS A COMING GO AA IS TA PAIN MOV ANYWHERE ELSE WITH ON PETWASE YOUR BACK ACAN NE AND YOU MENTION O E PENN QY WEE CAN SHAKEHERS WELL WOULD YOUOU BY SHAKE ID YOU MEAN YOU BEEN HAVING N AN YOU BEVENING FEVERISH FERSOMPLE  DOU MEATURE YOUR TEMPERTOR THEN AGA AC AND YOU OTHE SIMPE LIKE SWETTING OR A NIGHT"
cleaned_text = clean_transcription(raw_text)
print(cleaned_text)  
# Output: "HELLO I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN"


HELI I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN SO IT I HER THAT AND AND WE YOU STIN DIME IN AR WHATH YOU MEEN BY DIRAG YOU MEA NOU GANTS A TALLIN MOR OFTEN OR ARE YOUR STOOLS MOR LOOSE CN AND HAVD ME TIMES TA DAY AR YOU GOIN I SA OR LAST COUPLOF DAYS SAIX SOME TIMES A DAN AND YOU MAN MENTIONIS MAIN WAR TREE HAVEN NAIN YO THE INGS LIK BLOOD IN YOUR STOOLS ICAN AND YOU MENTIN YOV HAS SOME PAIN YOUR TOMY AS WELL WWHERE ABAUT AS A PAIN EXACTLY ONCE I AN WHAT SIDE IS THAT NET SI AGAN AND CAN YOU DESCRIME THE PAINTOME ICA AND IS TA PAIN IS THAT IS E TER ALL THE TIME TAT AS A COMING GO AA IS TA PAIN MOV ANYWHERE ELSE WITH ON PETWASE YOUR BACK ACAN NE AND YOU MENTION O E PENN QY WEE CAN SHAKEHERS WELL WOULD YOUOU BY SHAKE ID YOU MEAN YOU BEEN HAVING N AN YOU BEVENING FEVERISH FERSOMPLE DOU MEATURE YOUR TEMPERTOR THEN AGA AC AND YOU OTHE SIMPE LIKE SWETTING OR A NIGHT


In [23]:
'''
2. Spell checking + correction
- Use SymSpell library to correct spelling mistakes in the transcription
- Test whether it fixes mispelled words in the transcription

'''
# Install SymSpell if not installed
# pip install symspellpy

from symspellpy import SymSpell, Verbosity
import requests
import os

dictionary_path = "frequency_dictionary_en_82_765.txt"
if not os.path.exists(dictionary_path):
    url = "https://raw.githubusercontent.com/wolfgarbe/SymSpell/master/src/SymSpell/frequency_dictionary_en_82_765.txt"
    response = requests.get(url)
    with open(dictionary_path, "wb") as f:
        f.write(response.content)

sym_spell = SymSpell(max_dictionary_edit_distance=2)
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def correct_spelling(text):
    words = text.split()
    corrected_words = []
    
    for word in words:
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        if suggestions:
            corrected_words.append(suggestions[0].term)  # Take the most probable correction
        else:
            corrected_words.append(word)  # Keep original word if no suggestion is found

    return ' '.join(corrected_words)

raw_text = "HELPI TISS MORNINE YAN"
fixed_text = correct_spelling(raw_text)
print("Corrected Text:", fixed_text)  
# Expected Output: "HELP THIS MORNING YAN"


Corrected Text: HELPI TISS MORNINE YAN


In [24]:
'''
3. Simple grammar correction
Grammar correction model is HuggingFace
It fixes bad sentence structures and improves readability automatically.
'''

from transformers import pipeline

corrector = pipeline("text2text-generation", model="facebook/bart-large-cnn")

def correct_grammar(text):
    corrected_text = corrector(text, max_length=200)[0]['generated_text']
    return corrected_text

raw_text = "HELI I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN SO IT I HER THAT AND AND AND WE YOU STIN DIME IN AR WHATH YOU MEEN BY DIRAG YOU MEA NOU GANTS A TALLIN MOR OFTEN OR ARE YOUR STOOLS MOR LOOSE  CN AND HAVD ME TIMES TA DAY AR YOU GOIN I SA OR LAST COUPLOF DAYS SAIX SOME TIMES A DAN AND YOU MAN MENTIONIS  MAIN WAR TREE HAVEN NAIN YO THE INGS LIK BLOOD IN YOUR STOOLS ICAN AND YOU MENTIN YOV HAS SOME PAIN YOUR TOMY AS WELL WWHERE ABAUT AS A PAIN EXACTLY ONCE I AN WHAT SIDE IS THAT NET SI AGAN AND CAN YOU DESCRIME THE PAINTOME ICA AND IS TA PAIN IS THAT IS E TER ALL THE TIME TAT AS A COMING GO AA IS TA PAIN MOV ANYWHERE ELSE WITH ON PETWASE YOUR BACK ACAN NE AND YOU MENTION O E PENN QY WEE CAN SHAKEHERS WELL WOULD YOUOU BY SHAKE ID YOU MEAN YOU BEEN HAVING N AN YOU BEVENING FEVERISH FERSOMPLE  DOU MEATURE YOUR TEMPERTOR THEN AGA AC AND YOU OTHE SIMPE LIKE SWETTING OR A NIGHT"
grammar_fixed_text = correct_grammar(raw_text)
print(grammar_fixed_text)


Device set to use cpu


"I've Got A Question For You" is a weekly, offbeat look at some of our most popular questions. This week, we ask you to explain the difference between pain, painkillers, and painkillers. We also ask you if you have ever felt a pain in your back, or if you've ever had a sore throat? And what do you think the best painkillers are?


In [28]:
'''
4. Use GPT to rewrite the transcription in proper English
Since the transcription is completely unreadable, 
ask llm to rewrite it into proper English.

limitation: privacy concerns, as the transcription may contain sensitive information.
'''

'\n4. Use GPT to rewrite the transcription in proper English\nSince the transcription is completely unreadable, \nask llm to rewrite it into proper English.\n\nlimitation: privacy concerns, as the transcription may contain sensitive information.\n'

In [29]:
def improve_transcription(text):
    text = clean_transcription(text)
    text = correct_spelling(text)
    text = correct_grammar(text)
    return text

raw_transcription = "HELI I AND FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN SO IT I HER THAT AND AND AND WE YOU STIN DIME IN AR WHATH YOU MEEN BY DIRAG YOU MEA NOU GANTS A TALLIN MOR OFTEN OR ARE YOUR STOOLS MOR LOOSE  CN AND HAVD ME TIMES TA DAY AR YOU GOIN I SA OR LAST COUPLOF DAYS SAIX SOME TIMES A DAN AND YOU MAN MENTIONIS  MAIN WAR TREE HAVEN NAIN YO THE INGS LIK BLOOD IN YOUR STOOLS ICAN AND YOU MENTIN YOV HAS SOME PAIN YOUR TOMY AS WELL WWHERE ABAUT AS A PAIN EXACTLY ONCE I AN WHAT SIDE IS THAT NET SI AGAN AND CAN YOU DESCRIME THE PAINTOME ICA AND IS TA PAIN IS THAT IS E TER ALL THE TIME TAT AS A COMING GO AA IS TA PAIN MOV ANYWHERE ELSE WITH ON PETWASE YOUR BACK ACAN NE AND YOU MENTION O E PENN QY WEE CAN SHAKEHERS WELL WOULD YOUOU BY SHAKE ID YOU MEAN YOU BEEN HAVING N AN YOU BEVENING FEVERISH FERSOMPLE  DOU MEATURE YOUR TEMPERTOR THEN AGA AC AND YOU OTHE SIMPE LIKE SWETTING OR A NIGHT"
better_transcription = improve_transcription(raw_transcription)
print("Improved Transcription:", better_transcription)


Improved Transcription: "I've Got A Problem" is a weekly, offbeat look at the world through the eyes of a young man. This week's episode focuses on the relationship between the male and female characters in the book. The characters are called Heli I and FRU STO YET A AN W A HAPPI HELPY TISS MORNINE YAN. The book is published by Simon & Schuster, publisher of Harper Collins.
