In [15]:
# pip install PyAudioWPatch

import pyaudiowpatch as pyaudio
import time
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

import wave

def save_wave(audio_frames, idx):
    # Save the recorded audio to a WAV file
    wav_file = f"temp_audio_{idx}.wav"
    with wave.open(wav_file, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(audio_frames))

device='cuda'
# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
# forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")#for french to french
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")#for french to english
# forced_decoder_ids = None # for english to english


# Initialize PyAudio
p = pyaudio.PyAudio()

    # Get default WASAPI info
wasapi_info = p.get_host_api_info_by_type(pyaudio.paWASAPI)
# Get default WASAPI speakers
default_speakers = p.get_device_info_by_index(wasapi_info["defaultOutputDevice"])
if not default_speakers["isLoopbackDevice"]:
    for loopback in p.get_loopback_device_info_generator():
        """
        Try to find loopback device with same name(and [Loopback suffix]).
        Unfortunately, this is the most adequate way at the moment.
        """
        if default_speakers["name"] in loopback["name"]:
            default_speakers = loopback
            break


# Settings for recording audio
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # Whisper expects 16kHz input
CHUNK = 1024  # Number of frames per buffer
TRANSCRIPTION_INTERVAL = 20  # Interval for transcription in seconds

# Open a stream to record audio

stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK, input_device_index=default_speakers["index"])

print("Listening for audio... Speak now.")

audio_buffer = np.array([], dtype=np.float32)  # Buffer to store accumulated audio
last_transcription_time = time.time()  # Initialize the last transcription time
audio_frames = []  # Store raw audio frames


while True:
    try:

        # Read a chunk of audio
        data = stream.read(CHUNK, exception_on_overflow=False)  # Read a chunk of audio (1024 samples per chunk
        audio_frames.append(data)  # Save raw audio data for MP3 conversion
        audio_chunk = np.frombuffer(data, np.int16).flatten().astype(np.float32) / 32768.0
        
        

        audio_buffer = np.append(audio_buffer, audio_chunk)  # Accumulate sound in the buffer

        # Check if it's time to perform transcription
        current_time = time.time()
        if current_time - last_transcription_time >= TRANSCRIPTION_INTERVAL:
            if audio_buffer.size > 0:  # Ensure there's audio to transcribe

                start_translation_time = time.time()

                input_features = processor(audio_buffer, sampling_rate=RATE, return_tensors="pt").input_features

                # Generate token ids
                predicted_ids = model.generate(input_features.to(device), forced_decoder_ids=forced_decoder_ids)

                # Decode token ids to text
                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

                translation_duration = time.time() - start_translation_time



                # Print the transcription
                print(f"Transcription ({translation_duration:0.3f}s): {transcription[0]}")

                # Clear buffer after transcription
                audio_buffer = np.array([], dtype=np.float32)

            last_transcription_time = current_time  # Update last transcription time

    except KeyboardInterrupt:
        print("Stopped listening.")
        save_wave(audio_frames, idx=1)
        break

# Stop and close the stream
stream.stop_stream()
stream.close()
p.terminate()



Exception ignored in: <function Wave_write.__del__ at 0x000001F15B8D2430>
Traceback (most recent call last):
  File "c:\Users\Asus\miniconda3\lib\wave.py", line 326, in __del__
    self.close()
  File "c:\Users\Asus\miniconda3\lib\wave.py", line 444, in close
    self._ensure_header_written(0)
  File "c:\Users\Asus\miniconda3\lib\wave.py", line 462, in _ensure_header_written
    raise Error('# channels not specified')
wave.Error: # channels not specified
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Listening for audio... Speak now.


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Transcription (4.233s):  I'm going to buy a croissant and a baguette. I would like a croissant, please. Hello! I'm going to take a croissant and a chocolate bread, please. And with this...
Stopped listening.


In [3]:
import pyaudiowpatch as pyaudio
import numpy as np
# Settings for recording audio
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000  # Whisper expects 16kHz input
CHUNK = 1024  # Number of frames per buffer
TRANSCRIPTION_INTERVAL = 20  # Interval for transcription in seconds

# Initialize PyAudio
p = pyaudio.PyAudio()

for i in range(p.get_device_count()):
   dev = p.get_device_info_by_index(i)
   if dev['maxInputChannels']>0:
      stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK, input_device_index=i)
      for _ in range(10):
         data = stream.read(CHUNK, exception_on_overflow=False)  # Read a chunk of audio (1024 samples per chunk
         audio_chunk = np.frombuffer(data, np.int16).flatten().astype(np.float32) / 32768.0

         print(f'device-{i}, audio_chunk:{audio_chunk.max()}')



p.terminate()

device-0, audio_chunk:3.0517578125e-05
device-0, audio_chunk:0.00128173828125
device-0, audio_chunk:0.00152587890625
device-0, audio_chunk:0.00213623046875
device-0, audio_chunk:0.002166748046875
device-0, audio_chunk:0.0025634765625
device-0, audio_chunk:0.00213623046875
device-0, audio_chunk:0.002471923828125
device-0, audio_chunk:0.002777099609375
device-0, audio_chunk:0.002227783203125
device-1, audio_chunk:0.00146484375
device-1, audio_chunk:0.001678466796875
device-1, audio_chunk:0.00201416015625
device-1, audio_chunk:0.002593994140625
device-1, audio_chunk:0.002777099609375
device-1, audio_chunk:0.002288818359375
device-1, audio_chunk:0.00262451171875
device-1, audio_chunk:0.00201416015625
device-1, audio_chunk:0.0029296875
device-1, audio_chunk:0.0018310546875
device-2, audio_chunk:0.000152587890625
device-2, audio_chunk:0.000152587890625
device-2, audio_chunk:0.000152587890625
device-2, audio_chunk:0.00018310546875
device-2, audio_chunk:0.000152587890625
device-2, audio_chunk:

OSError: [Errno -9997] Invalid sample rate

In [13]:
"""A simple example of recording from speakers ('What you hear') using the WASAPI loopback device"""


# Spinner is a helper class that is in the same examples folder.
# It is optional, you can safely delete the code associated with it.

import pyaudiowpatch as pyaudio
import time
import wave

DURATION = 5.0
CHUNK_SIZE = 512

filename = "loopback_record.wav"
    
    
p = pyaudio.PyAudio()






wave_file = wave.open(filename, 'wb')
wave_file.setnchannels(1)
wave_file.setsampwidth(pyaudio.get_sample_size(pyaudio.paInt16))
wave_file.setframerate(int(default_speakers["defaultSampleRate"]))

def callback(in_data, frame_count, time_info, status):
    """Write frames and return PA flag"""
    wave_file.writeframes(in_data)
    return (in_data, pyaudio.paContinue)

with p.open(format=pyaudio.paInt16,
        channels=default_speakers["maxInputChannels"],
        rate=int(default_speakers["defaultSampleRate"]),
        frames_per_buffer=CHUNK_SIZE,
        input=True,
        input_device_index=default_speakers["index"],
        stream_callback=callback
) as stream:
    """
    Opena PA stream via context manager.
    After leaving the context, everything will
    be correctly closed(Stream, PyAudio manager)            
    """
    time.sleep(DURATION) # Blocking execution while playing

wave_file.close()

# stream.stop_stream()
# stream.close()
p.terminate()

In [14]:
default_speakers

{'index': 28,
 'structVersion': 2,
 'name': 'Headset (realme Buds Wireless 3 Hands-Free AG Audio) [Loopback]',
 'hostApi': 2,
 'maxInputChannels': 1,
 'maxOutputChannels': 0,
 'defaultLowInputLatency': 0.003,
 'defaultLowOutputLatency': 0.0,
 'defaultHighInputLatency': 0.01,
 'defaultHighOutputLatency': 0.0,
 'defaultSampleRate': 16000.0,
 'isLoopbackDevice': True}