In [90]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoProcessor, AutoModelForCTC

import onnxruntime
from onnxruntime.quantization import quantize_dynamic, QuantType
import numpy as np

In [91]:
import sounddevice as sd
import numpy as np
import wave

# Parameters
samplerate = 44100  # Standard sampling rate (44.1kHz)
duration = 10  # Duration in seconds to record
filename = "recorded_audio.wav"  # Output file name

# Record audio
print("Recording...")
audio_data = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=2, dtype='int16')
sd.wait()  # Wait until recording is finished
print("Recording finished.")

# Save the recorded audio to a .wav file
with wave.open(filename, 'wb') as wf:
    wf.setnchannels(2)  # Stereo
    wf.setsampwidth(2)  # 2 bytes for 'int16'
    wf.setframerate(samplerate)
    wf.writeframes(audio_data.tobytes())

print(f"Audio saved as {filename}")


Recording...
Recording finished.
Audio saved as recorded_audio.wav


In [92]:
processor = AutoProcessor.from_pretrained("arbml/wav2vec2-large-xlsr-53-arabic-egyptian")
model = AutoModelForCTC.from_pretrained("arbml/wav2vec2-large-xlsr-53-arabic-egyptian")

In [93]:
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

audio_path = "recorded_audio.wav"
speech_array, sampling_rate = torchaudio.load(audio_path)

# Check if audio is stereo (2 channels) and convert to mono if necessary
if speech_array.shape[0] > 1:  # If there are more than 1 channel (stereo)
    speech_array = torch.mean(speech_array, dim=0)  # Convert to mono by averaging channels


# Resample the audio to 16 kHz if it's not already at that sample rate
resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
speech_array = resampler(speech_array).squeeze().numpy()  # Convert to 1D array (mono)

# Check the length of the audio
audio_length = len(speech_array)
print(f"Original audio length (samples): {audio_length}")

Original audio length (samples): 160000


In [94]:
# Define the desired duration in seconds and calculate expected length
desired_duration_sec = 10  # Example: 10 seconds
expected_length = 16_000 * desired_duration_sec  # 16,000 samples per second at 16 kHz

# If the audio is shorter than the expected length, pad with zeros
if len(speech_array) < expected_length:
    speech_array = np.pad(speech_array, (0, expected_length - len(speech_array)), 'constant')
# If the audio is longer than the expected length, truncate it
elif len(speech_array) > expected_length:
    speech_array = speech_array[:expected_length]

# Ensure it's a 1D array and convert to a tensor
speech_array = torch.tensor(speech_array).unsqueeze(0)

In [95]:
# Process the audio
input_values = processor(speech_array.squeeze().numpy(), return_tensors="pt").input_values

# Perform speech-to-text (inference)
with torch.no_grad():
    logits = model(input_values).logits

# Decode the logits to text
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.decode(predicted_ids[0])

print("Transcription:", transcription)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Transcription: خل بالك إجريمين صرعة بشرعة ده هيضرابك هياء اله مط من عصلوت اع ال هو راية وهتلسكين أمان نو عالموج
