In [4]:
# If using Windows
# pip install PyAudioWPatch soundcard soundfile

# if using Linux
# sudo apt install libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0
# sudo apt install ffmpeg libav-tools
# sudo pip install pyaudio


from scipy.signal import resample
import pyaudio # import pyaudiowpatch as pyaudio (windows)
import time
import numpy as np
from datetime import datetime
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import wave
import matplotlib.pyplot as plt

def save_wave(audio_frames, idx):
    # Save the recorded audio to a WAV file
    wav_file = f"temp_audio_{idx}.wav"
    with wave.open(wav_file, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(audio_frames))

In [11]:


MIXED_PRECISION = True
INPUT_DEVICE = {1:'pc_micro_phone', 2:'pc_speaker', 3:'bluetooth_speaker', 4:'bluetooth_microphone'}
""" for pc_speaker, go to the sound icon and right click-> choose sounds -> recordings -> choose streo Mix as default  """
INPUT_DEVICE_IDX = 2


device='cuda' if torch.cuda.is_available() else 'cpu'
# Optimize PyTorch for CPU
# torch.set_num_threads(6)  # Adjust to the number of CPU cores available
# Load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)

for params in model.parameters():
    params.requires_grad = False
    
model.eval()

# forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")#for french to french
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="translate")#for french to english
# forced_decoder_ids = None # for english to english



# Initialize PyAudio
p = pyaudio.PyAudio()


default_speakers =  p.get_default_input_device_info()


print(f'The loopback device is {default_speakers}')

# Settings for recording audio
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = int(default_speakers['defaultSampleRate'] ) 
WISPER_RATE = 16000# Whisper expects 16kHz input
CHUNK = 1024  # Number of frames per buffer
TRANSCRIPTION_INTERVAL = 10  # Interval for transcription in seconds

# Open a stream to record audio

stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK, input_device_index=default_speakers["index"])

print("Listening for audio... Speak now.")

audio_buffer = np.array([], dtype=np.float32)  # Buffer to store accumulated audio
last_transcription_time = time.time()  # Initialize the last transcription time
audio_frames = []  # Store raw audio frames

# Open a text file to save the transcriptions
transcription_file = open("transcriptions_cpu.txt", "a")


while True:
    try:

        # Read a chunk of audio
        data = stream.read(CHUNK, exception_on_overflow=False)  # Read a chunk of audio (1024 samples per chunk
        audio_frames.append(data)  # Save raw audio data for MP3 conversion
        audio_chunk = np.frombuffer(data, np.int16).flatten().astype(np.float32) / 32768.0
        downsampled_chunk = resample(audio_chunk, int(len(audio_chunk) * WISPER_RATE / RATE))  # Resample to 16kHz

        
        if np.abs(audio_chunk).mean() > 0.01:
            audio_buffer = np.append(audio_buffer, downsampled_chunk)
            
        print(np.abs(audio_chunk).mean())

        # Check if it's time to perform transcription
        current_time = time.time()
        if current_time - last_transcription_time >= TRANSCRIPTION_INTERVAL:
            if audio_buffer.size > 0:  # Ensure there's audio to transcribe

                start_translation_time = time.time()

                input_features = processor(audio_buffer, sampling_rate=WISPER_RATE, return_tensors="pt").input_features
                # Generate token ids
                with torch.no_grad():
                    predicted_ids = model.generate(input_features.to(device), forced_decoder_ids=forced_decoder_ids)
                # Decode token ids to text
                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
                transcription_text = transcription[0]

                translation_duration = time.time() - start_translation_time

                # Save the transcription with a timestamp to the file
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                transcription_file.write(f"[{timestamp}] {transcription_text}\n")
                transcription_file.flush()  # Ensure it's written to the file immediately


                # Print the transcription
                print(f"Transcription ({translation_duration:0.3f}s / audio_len({audio_buffer.size})): {transcription_text}")

                # Clear buffer after transcription
                audio_buffer = np.array([], dtype=np.float32)

            last_transcription_time = current_time  # Update last transcription time

    except KeyboardInterrupt:
        print("Stopped listening.")
        # save_wave(audio_frames, idx=1)
        break

# Stop and close the stream
stream.stop_stream()
stream.close()
p.terminate()



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


The loopback device is {'index': 2, 'structVersion': 2, 'name': 'default', 'hostApi': 0, 'maxInputChannels': 32, 'maxOutputChannels': 32, 'defaultLowInputLatency': 0.008684807256235827, 'defaultLowOutputLatency': 0.008684807256235827, 'defaultHighInputLatency': 0.034807256235827665, 'defaultHighOutputLatency': 0.034807256235827665, 'defaultSampleRate': 44100.0}
Listening for audio... Speak now.
0.0017241538
0.0015336275
0.0016931891
0.0017182529
0.0017878115
0.001507014
0.0016484559
0.0013958216
0.0017108321
0.0017400384
0.0014868975
0.0024459958
0.0016168058
0.0012611449
0.00206092
0.0017777979
0.002091229
0.0024159849
0.0015548766
0.0021038651
0.0016075373
0.0021899045
0.0016685128
0.0014237761
0.0013531446
0.0017706156
0.0013680756
0.0017315149
0.0018932223
0.0020320117
0.0014073849
0.0015192032
0.0014170408
0.0022553802
0.0020716786
0.0018748939
0.0047809184
0.0031222403
0.0023206472
0.0020730197
0.0019168854
0.0018161535
0.002676338
0.0021605492
0.0019904375
0.0019566715
0.0019472

2024-10-08 17:14:16.063115: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-08 17:14:16.085940: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Transcription (8.254s / audio_len(106477)):  Hello everyone and welcome back to my channel. Today we are going to talk about how we can use blah blah blah for blah blah blah in the blah blah blah.
0.010469615
0.008298993
0.007018566
0.006121218
0.0035159886
0.0039303005
0.003566295
0.002532959
0.0022655725
0.0028819442
0.0032390654
0.04323587
0.06375724
0.05258605
0.03024596
0.02040118
0.022981882
0.0472323
0.07997754
0.091682285
0.08007887
0.05085808
0.04566881
0.07547021
0.095885545
0.10231084
0.089874
0.029761195
0.041471213
0.07279208
0.08417165
0.12116334
0.101495
0.063239634
0.073292196
0.045088857
0.034614384
0.016103059
0.023236066
0.0109297335
0.014644891
0.008585602
0.010432273
0.005670756
0.0060833097
0.0045537353
0.0038855672
0.0036033094
0.019996166
0.023992121
0.026272535
0.03474301
0.033054143
0.035141855
0.03932509
0.030156404
0.009057641
0.017849147
0.010588288
0.02771023
0.020217419
0.013631165
0.0054695904
0.0055033267
0.0085555315
0.010076284
0.050402254
0.027581483

In [None]:
import pyaudio # import pyaudiowpatch as pyaudio (windows)
p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i))

{'index': 0, 'structVersion': 2, 'name': 'sof-hda-dsp: - (hw:0,7)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.016, 'defaultLowOutputLatency': -1.0, 'defaultHighInputLatency': 0.096, 'defaultHighOutputLatency': -1.0, 'defaultSampleRate': 16000.0}
{'index': 1, 'structVersion': 2, 'name': 'pulse', 'hostApi': 0, 'maxInputChannels': 32, 'maxOutputChannels': 32, 'defaultLowInputLatency': 0.008684807256235827, 'defaultLowOutputLatency': 0.008684807256235827, 'defaultHighInputLatency': 0.034807256235827665, 'defaultHighOutputLatency': 0.034807256235827665, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'default', 'hostApi': 0, 'maxInputChannels': 32, 'maxOutputChannels': 32, 'defaultLowInputLatency': 0.008684807256235827, 'defaultLowOutputLatency': 0.008684807256235827, 'defaultHighInputLatency': 0.034807256235827665, 'defaultHighOutputLatency': 0.034807256235827665, 'defaultSampleRate': 44100.0}


In [9]:
# pip install PyAudioWPatch

# import time
# from transformers import  pipeline
# import torch

# device = 'cuda' if torch.cuda.is_available() else 'cpu'


# pipe  = pipeline("automatic-speech-recognition",
#                     "openai/whisper-small", 
#                     chunk_length_s=30,
#                     stride_length_s=5,
#                     return_timestamps=True,
#                     device=device, 
#                     generate_kwargs = {"language": 'fr', "task": "translate"}) # if you don't have GPU, r

for params in pipe.model.parameters():
    params.requires_grad = False
    
pipe.model.eval()
with torch.no_grad():
    transcription = pipe("temp_audio_1.wav" )



formatted_lyrics = ""
for line in transcription['chunks']:
    text = line["text"]
    formatted_lyrics += f"{text}\n"

print(formatted_lyrics.strip())

Hello everyone and welcome back to my channel. Today we are going to talk about
 Hiking Face and how we can use it for multitude of things.
 In this video we have multiple things such as aloo, matter I'm going to do it again. Hello everyone and welcome to a new video on Learn to French.
 My name is Eden and I teach French.
 to another video on learn to French. If you are new here, my name is Eden and I teach French. Today we will work on your oral comprehension. This video is for the learners of level B1.
 So this video is for B1 learners. However, if you are A2 or even A1, you can always participate.
 It's still great content for you to practice your listening comprehension skills.
 So, how are we going to get this video?
 First, you're going to listen to the texts.
 So first, you're going to listen to the texts.
 I will be reading them for you.
 Then, you're going to answer a few questions. Then, you're Then you will answer a few questions.
 Then you are going to answer a few compreh