In [3]:
import os
import pickle
import re

import librosa
import numpy as np

In [4]:
# Hinglish dict: lookup keyed by Hindi transcription text.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('hinglish_dict.pkl', 'rb') as f:
    hinglish_dict = pickle.load(f)

# Hindi transcriptions (unformatted): one stripped entry per line of the file.
# The context manager closes the handle (the original left it open), and the
# encoding is explicit so non-ASCII text decodes the same on every platform.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('test/transcription.txt', 'r', encoding='utf-8') as transcription:
    lines = transcription.readlines()
unformatted_transcripts = [line.strip() for line in lines]

In [7]:
# Directory holding the audio files.
audio_path = 'test/audio'
# os.listdir returns entries in arbitrary order; sort them so repeated runs
# (and different machines) see the same sequence.
audio_files = sorted(os.listdir(audio_path))

In [8]:
# Number of entries listed in `audio_path`.
len(audio_files)

3843

In [9]:
# Remove every "<digits>_<digits>" identifier from each raw line and trim the
# surrounding whitespace, keeping only the transcript text.
formatted_transcripts = [
    re.sub(r'\d+_\d+', '', raw_line).strip()
    for raw_line in unformatted_transcripts
]
formatted_transcripts_set = set(formatted_transcripts)

# Total number of transcripts, then the number of distinct ones.
print(len(formatted_transcripts))
print(len(formatted_transcripts_set))

3843
386


In [10]:
# Map each audio ID ("<digits>_<digits>") to its transcript text with runs of
# whitespace collapsed to single spaces.
transcription_dict = {}

for raw_line in unformatted_transcripts:
    # Blank lines (e.g. a trailing newline in the file) carry no ID; skip them
    # instead of failing with an IndexError.
    if not raw_line.strip():
        continue

    id_match = re.search(r'\d+_\d+', raw_line)
    if id_match is None:
        # Fail loudly with the offending line rather than a bare IndexError.
        raise ValueError(f'no audio ID found in transcript line: {raw_line!r}')

    audio_id = id_match.group(0)
    trans = re.sub(r'\d+_\d+', '', raw_line)
    trans = ' '.join(trans.split())
    transcription_dict[audio_id] = trans

In [15]:
import torch
from torch.utils.data import Dataset

class AudioDataset(Dataset):
    """Dataset of audio clips paired with speaker ID and transcriptions.

    Each item is the tuple ``(audio_dict, speaker_id, hindi_transcription,
    hinglish_transcription)``, where ``audio_dict`` holds the waveform under
    ``'data'`` and its sampling rate under ``'sr'``.

    Args:
        audio_files_path: Directory containing the audio files. Only entries
            whose name contains an ID of the form ``<digits>_<digits>`` are
            used; anything else in the directory is ignored.
        transcription_dict: Maps an audio ID to its Hindi transcription.
        unformatted_transcripts: Stored on the instance; not read by this class.
        formatted_transcripts: Stored on the instance; not read by this class.
        hinglish_dict: Maps a Hindi transcription to its Hinglish transcription.

    Raises:
        KeyError: From ``__getitem__`` when the audio ID has no entry in
            ``transcription_dict`` or the Hindi text has no entry in
            ``hinglish_dict``.
    """

    # Audio ID embedded in a file name; group 2 is used as the speaker ID.
    _AUDIO_ID_PATTERN = re.compile(r'(\d+)_(\d+)')

    def __init__(self, audio_files_path, transcription_dict, unformatted_transcripts, formatted_transcripts, hinglish_dict):
        self.audio_files_path = audio_files_path
        # os.listdir order is arbitrary and may include stray entries without
        # an ID. Filtering avoids an IndexError at access time; sorting makes
        # the index -> file mapping reproducible.
        self.audio_files = sorted(
            name
            for name in os.listdir(audio_files_path)
            if self._AUDIO_ID_PATTERN.search(name)
        )
        self.transcription_dict = transcription_dict
        self.unformatted_transcripts = unformatted_transcripts
        self.formatted_transcripts = formatted_transcripts
        self.hinglish_dict = hinglish_dict

    def __len__(self):
        """Return the number of usable audio files."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load the audio at ``idx`` and look up its transcriptions."""
        audio_file = self.audio_files[idx]

        # One regex pass yields both the full ID and the speaker part.
        id_match = self._AUDIO_ID_PATTERN.search(audio_file)
        audio_num = id_match.group(0)
        speaker_id = id_match.group(2)

        hindi_transcription = self.transcription_dict[audio_num]
        hinglish_transcription = self.hinglish_dict[hindi_transcription]

        # sr=None asks librosa to keep the file's native sampling rate.
        y, sr = librosa.load(os.path.join(self.audio_files_path, audio_file), sr=None)

        audio_dict = {
            'data': y,
            'sr': sr
        }

        return audio_dict, speaker_id, hindi_transcription, hinglish_transcription

In [16]:
# Build the dataset over the audio directory 'sr_8000' using the lookups
# prepared in the earlier cells.
dataset = AudioDataset(
    audio_files_path='sr_8000',
    transcription_dict=transcription_dict,
    unformatted_transcripts=unformatted_transcripts,
    formatted_transcripts=formatted_transcripts,
    hinglish_dict=hinglish_dict,
)

In [17]:
from torch.utils.data import DataLoader
# One sample per batch, drawn in shuffled order.
trainloader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=True,
)

In [18]:
# Pull a single batch from the loader and unpack its four fields.
first_batch = next(iter(trainloader))
audio_dict, speaker_id, hindi_transcription, hinglish_transcription = first_batch

# Speaker Embeddings

In [19]:
# speaker embedding
import os
import torch
import speechbrain
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Identifier of the pretrained SpeechBrain x-vector speaker model.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Model files are stored under /tmp/<model name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)

  torch.load(path, map_location=device), strict=False
  stats = torch.load(path, map_location=device)


In [20]:
def create_speaker_embedding(waveform):
    """Compute an L2-normalised speaker embedding for a waveform.

    Args:
        waveform: Audio samples as a torch tensor, or anything
            ``torch.as_tensor`` accepts (e.g. the NumPy array returned by
            ``librosa.load``).

    Returns:
        numpy.ndarray: The output of ``speaker_model.encode_batch``,
        normalised along dim 2, with singleton dimensions squeezed out and
        moved to the CPU.
    """
    # encode_batch is documented to take a torch tensor; as_tensor leaves
    # tensors untouched and converts array-like input, so callers may pass
    # NumPy waveforms directly.
    waveform = torch.as_tensor(waveform)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(waveform)
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

In [None]:
def padding_speechT5():
    """Placeholder: the padding logic has not been written yet.

    The original cell held only an unfinished ``def`` line (no colon, no
    body), which is a SyntaxError. This stub keeps the name defined and makes
    the missing implementation explicit.

    Raises:
        NotImplementedError: Always.
    """
    # TODO: implement; parameters and return value are still undecided.
    raise NotImplementedError("padding_speechT5 is not implemented yet")

In [None]:
def prepare_dataset(audio_dict, hinglish_transcript, sample_rate):
    """Build one processor example with text, target audio and speaker embedding.

    Args:
        audio_dict: Mapping whose ``"data"`` entry holds the waveform.
        hinglish_transcript: Transcript text, given either as a plain string
            or as a sequence whose first element is the string (presumably
            the batch-of-one form yielded by the DataLoader — confirm).
        sample_rate: Sampling rate forwarded to the processor.

    Returns:
        The processor output with an added ``"speaker_embeddings"`` entry.
    """
    # NOTE(review): `processor` is not defined in the visible cells; it must
    # be created before this function is called.

    # Indexing a plain string with [0] would silently keep only its first
    # character, so only index when given a sequence of strings.
    if isinstance(hinglish_transcript, str):
        text = hinglish_transcript
    else:
        text = hinglish_transcript[0]

    # Squeeze once and reuse for both the processor and the speaker model.
    waveform = np.squeeze(audio_dict["data"])

    example = processor(
        text=text,
        audio_target=waveform,
        sampling_rate=sample_rate,
        return_attention_mask=False,
    )

    # use SpeechBrain to obtain x-vector
    example["speaker_embeddings"] = create_speaker_embedding(waveform)

    return example