In [1]:
%%capture
# Install the notebook's runtime dependencies: fairseq2, the pydub and
# sentencepiece helpers, and seamless_communication straight from GitHub.
# The %%capture magic on the first line hides the installer output.
!pip install fairseq2
!pip install pydub sentencepiece
!pip install git+https://github.com/facebookresearch/seamless_communication.git

In [2]:
import io
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import mmap
import numpy
import soundfile
import tqdm
import pandas as pd
import torchaudio
import torch

from collections import defaultdict
from IPython.display import Audio, display
from pathlib import Path
from pydub import AudioSegment

from seamless_communication.inference import Translator
from seamless_communication.streaming.dataloaders.s2tt import SileroVADSilenceRemover

In [20]:
# Checkpoint to load. The v2 large model is paired with the v2 vocoder; every
# other checkpoint name falls back to the 36-language vocoder.
model_name = "seamlessM4T_medium"
vocoder_name = "vocoder_v2" if model_name == "seamlessM4T_v2_large" else "vocoder_36langs"

# "\n" (not "/n") so the header and the value are printed on separate lines.
print("====Vocoder Name====\n", vocoder_name)

# Run in half precision on the first GPU when one is present; otherwise fall
# back to CPU with float32 so the notebook still runs on machines without CUDA.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    dtype = torch.float16
else:
    device = torch.device("cpu")
    dtype = torch.float32

translator = Translator(
    model_name,
    vocoder_name,
    device=device,
    dtype=dtype,
)

====Vocoder Name====/n vocoder_36langs


Using the cached checkpoint of seamlessM4T_medium. Set `force` to `True` to download again.
Using the cached tokenizer of seamlessM4T_medium. Set `force` to `True` to download again.
Using the cached checkpoint of vocoder_36langs. Set `force` to `True` to download again.


In [21]:
# Load the dataset
# The datasets `Audio` feature is imported under an alias so it does not
# overwrite the `Audio` name already imported from IPython.display.
from datasets import Audio as DatasetAudio, load_dataset

ds = load_dataset('sartifyllc/Sartify_ITU_Zindi_Testdataset', split="test")
# decode=False leaves each clip undecoded; later cells read the raw
# item['audio']['bytes'] payload themselves.
ds = ds.cast_column("audio", DatasetAudio(decode=False))
ds


Dataset({
    features: ['audio', 'filename', 'record_id'],
    num_rows: 4089
})

In [None]:
import os
import soundfile as sf
import numpy as np

# Folder that receives the normalized WAV files; nothing happens if it is
# already present.
Path("waves").mkdir(parents=True, exist_ok=True)

# Paths of the WAV files written to disk, in the order they are produced.
audio_path = list()

def normalize_audio(audio, mode="rms", sr=16000, target_rms=0.1):
    """Scale a waveform to a reference level.

    Args:
        audio: Array of samples. Integer arrays are accepted: the level
            statistics are computed on a float64 copy so squaring or taking
            the absolute value cannot overflow the integer type.
        mode: "peak" scales so the largest absolute sample becomes 1.0;
            "rms" scales so the root-mean-square level equals ``target_rms``.
            Any other value leaves the audio untouched.
        sr: Sample rate. Kept for call compatibility; it is not used.
        target_rms: Level targeted in "rms" mode (adjust based on model
            training).

    Returns:
        The scaled samples, or ``audio`` itself when it is empty, when its
        measured level is not positive (e.g. silence), or when ``mode`` is
        not recognised.
    """
    data = np.asarray(audio)
    if data.size == 0:
        # np.max would raise on an empty array; there is nothing to scale.
        return audio

    # Measure the level in float64 regardless of the input dtype.
    samples = data.astype(np.float64)

    if mode == "peak":
        # Plain Python floats keep the result dtype driven by the input array.
        peak = float(np.max(np.abs(samples)))
        if peak > 0:
            return data / peak
    elif mode == "rms":
        rms = float(np.sqrt(np.mean(samples ** 2)))
        if rms > 0:
            return data * (target_rms / rms)
    return audio

def _write_normalized_wav(record):
    """Decode one dataset record, normalize it, and save it under waves/.

    Returns the path of the WAV file that was written.
    """
    # The record carries the encoded clip as raw bytes; soundfile decodes it
    # from an in-memory buffer and reports the clip's own sample rate.
    samples, rate = sf.read(io.BytesIO(record['audio']['bytes']))
    leveled = normalize_audio(samples)

    # One file per record, named after its record_id.
    destination = f"waves/{record['record_id']}.wav"
    sf.write(destination, leveled, rate)
    return destination


# Export every clip and remember where it was written.
for record in ds:
    audio_path.append(_write_normalized_wav(record))

In [23]:
import torchaudio

# Sample rate the waveform is brought to before inference.
# NOTE(review): Translator.predict is understood to treat tensor input as
# 16 kHz unless told otherwise - confirm against the installed version.
TARGET_SAMPLE_RATE = 16000

predictions = []
batch_size=8
# The progress bar still advances once per group of `batch_size` files, but
# each clip is loaded and transcribed on its own, so only one waveform is held
# in memory at a time.
for i in tqdm.tqdm(range(0, len(audio_path), batch_size)):
    for file in audio_path[i:i+batch_size]:
        waveform, sr = torchaudio.load(file)

        # Previously the file's sample rate was read and then ignored; clips
        # recorded at another rate are now resampled before inference.
        if sr != TARGET_SAMPLE_RATE:
            waveform = torchaudio.functional.resample(waveform, sr, TARGET_SAMPLE_RATE)

        pred, _ = translator.predict(waveform, "asr", "swh")
        # Keep the string form of the prediction; the extraction step later in
        # the notebook parses this representation.
        predictions.append(str(pred))


100%|██████████| 512/512 [19:46<00:00,  2.32s/it]


In [26]:
import re
def extract_from_cstring(text):
    """Pull the quoted payload out of a ``CString('...')`` representation.

    Args:
        text: String that may contain a ``CString('...')`` fragment.

    Returns:
        The text between the quotes of the first ``CString('...')`` found,
        which is the empty string for ``CString('')``, or None when ``text``
        contains no such fragment.
    """
    # `.*?` (rather than `.+?`) lets an empty payload match, so an empty
    # transcription comes back as "" instead of being reported as "no match".
    match = re.search(r"CString\('(.*?)'\)", text)
    return match.group(1) if match else None


In [27]:
# Every dataset row needs exactly one prediction; fail loudly instead of
# pairing file names with the wrong transcriptions.
assert len(predictions) == len(ds), "predictions and dataset rows are out of step"

# Read the "filename" column once instead of indexing ds[i], which would fetch
# the whole row (audio bytes included) for every single lookup.
result = [
    {
        "filename": filename,
        "text": extract_from_cstring(prediction),
    }
    for filename, prediction in zip(ds["filename"], predictions)
]

In [28]:
import numpy as np

# One row per clip: file name plus extracted transcription.
sub = pd.DataFrame(result)

# Replace blank transcriptions with a single space. Missing values (None/NaN,
# which the extractor yields when nothing matches) are treated as blank too,
# so no row is written with an empty text field.
is_blank = sub['text'].isna() | (sub['text'] == "")
sub['text'] = np.where(is_blank, " ", sub['text'])


sub.to_csv('facebook_submission_.csv', index=False)

sub

Unnamed: 0,filename,text
0,451f6d89-9b85-46c3-ad8d-bfcb1c9a4e8f.wav,upana baina ya mita sitini na nne na sabini na...
1,507e10f8-0b2b-4bc0-9b69-94f96d907fb6.wav,Mwanadamu vibilia si na hivi sabato ikifika kw...
2,576fe4af-849e-4354-9320-368678330425.wav,Mke wangu amebadilika sana tabia zimebadilika ...
3,afc27663-229a-42b8-b47d-72ac6dfda574.wav,Kuna wataalamu wengi wa sayansi hivyo bado tun...
4,fbb7dd49-c974-4c57-ae67-1270b20859e2.wav,Zamani ziliitwa mataifa ya kati kwa upande mmoja.
...,...,...
4084,aec51d0c-0fed-4425-826f-5ac14e9931f5.wav,makumbusho ya Jimbo la Sutton huonyesha mikuta...
4085,24f9de28-612c-411c-a851-120e02ddd170.wav,ambazo jumla yake ni mia tisa arobaini na tisa
4086,da4f005f-aac4-4724-b7a7-cb0ba925f282.wav,tanzania imebarikiwa kuwa na vivutio vizuri vy...
4087,3931cc72-c9bb-45ce-8953-17b00dc4d800.wav,na hapo ndipo watu hao walianza kupata majina ...
