In [1]:
%pip install sounddevice torchaudio soundfile ipywidgets transformers sentencepiece

Note: you may need to restart the kernel to use updated packages.


## Recording Stage

In [2]:
import sounddevice as sd
import soundfile as sf
import ipywidgets as widgets
from IPython.display import display
import numpy as np
import torchaudio
import torch

def record_audio_torchaudio_compatible(duration=5, samplerate=16_000, filename='recording.wav'):
    """
    Record audio in a Jupyter notebook and save it as a WAV file.
    Output format is compatible with `torchaudio.load`.
    
    Parameters:
    duration (float): Recording duration in seconds
    samplerate (int): Sample rate in Hz
    filename (str): Output filename (must end in .wav)
    
    Returns:
    torch.Tensor: Recorded audio data in shape [channels, samples]
    int: Original sample rate
    """
    
    # Create widgets
    record_button = widgets.Button(description="Start Recording")
    status_label = widgets.Label(value="Press button to start recording")
    audio_player_widget = widgets.Output()

    recording_data = {'audio': None}

    def on_button_click(b):
        # Disable button and update status
        record_button.disabled = True
        status_label.value = f"Recording for {duration} seconds..."
        
        # Record audio
        recording = sd.rec(
            int(samplerate * duration),
            samplerate=samplerate,
            channels=1,
            dtype='float32'  # Match `torchaudio` float32 format
        )
        sd.wait()  # Wait until recording is finished
        
        # Reshape to match `torchaudio` format [channels, samples]
        recording = torch.tensor(recording.T)  # Transpose for [1, samples]
        
        # Save the recording
        sf.write(filename, recording.numpy().T, samplerate)  # Convert back for saving
        
        # Store recording data
        recording_data['audio'] = recording
        
        # Update status
        status_label.value = f"Recording saved to {filename}"
        
        # Display the audio player
        with audio_player_widget:
            display(f"Audio saved: {filename}")
        
        # Enable button again
        record_button.disabled = False

    # Attach the button click event
    record_button.on_click(on_button_click)
    
    # Display widgets
    display(widgets.VBox([record_button, status_label, audio_player_widget]))
    
    # Return audio data and sample rate after recording
    def get_audio():
        return recording_data['audio'], samplerate

    return get_audio

# Example usage:
"""
get_audio = record_audio_torchaudio_compatible(duration=5, filename='my_recording.wav')

# After recording, fetch audio and play
audio, samplerate = get_audio()
if audio is not None:
    print("Audio shape:", audio.shape)
    print("Sample rate:", samplerate)
"""

get_audio = record_audio_torchaudio_compatible(duration=5, filename='backup.wav')

VBox(children=(Button(description='Start Recording', style=ButtonStyle()), Label(value='Press button to start …

## Translation Stage

In [3]:
from transformers import AutoProcessor, SeamlessM4Tv2Model
import torchaudio

audio, sample_rate = get_audio()


processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")

# audio =  torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000) # must be a 16 kHz waveform array
audio_inputs = processor(audios=audio, return_tensors="pt")

## translate the audio
output_tokens = model.generate(**audio_inputs, tgt_lang="spa", generate_speech=False)
translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
print(f"Translation from audio: {translated_text_from_audio}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Translation from audio: Hola, hola, mi nombre es Matthew.


## Llama Fact-Check?
## our model is fairly big ~10GB

In [None]:
# we can do a quick inference and see if there are any red flags

## protect family & friends from scams
## key parts -> smaller & multi-lingual

# maybe use llama stack here??