### Preprocessing for VSASV

In [None]:
import torch
import torchaudio
import pandas as pd
import os

class Parquet2Audio_VSASV:
    """Export audio stored in VSASV parquet files as per-speaker WAV files.

    Each row is read for ``label`` (used as the speaker folder name),
    ``file`` (source of the output file name) and ``audio`` (a mapping with
    ``array`` and ``sampling_rate`` entries).  When the frame has an
    ``utt_type`` column, only rows whose value is ``'bonafide'`` are
    exported; every other row is skipped.
    """

    def __init__(self, save_dir="audio_data"):
        """Store the output root directory and create it if it is missing."""
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

    def process_parquet_folder(self, parquet_folder_path):
        """Walk ``parquet_folder_path`` recursively and convert every ``.parquet`` file."""
        for root, _dirs, files in os.walk(parquet_folder_path):
            for file in files:
                if not file.endswith(".parquet"):
                    continue
                full_path = os.path.join(root, file)
                print(f"üìÇ ƒêang x·ª≠ l√Ω: {full_path}")
                self.process_single_parquet(full_path)

    def process_single_parquet(self, parquet_path):
        """Convert one parquet file, writing ``<save_dir>/<label>/<name>.wav`` per row.

        Failures are reported with ``print`` rather than raised: a file that
        cannot be read is skipped entirely, and a row that fails is skipped
        while the remaining rows are still processed.
        """
        # Only the read is guarded here, so the "read error" message is never
        # printed for an unrelated failure; rows have their own handler below.
        try:
            df = pd.read_parquet(parquet_path)
        except Exception as e:
            # Include the path so a failure during a folder walk can be traced
            # back to the offending file.
            print(f"‚ùå L·ªói ƒë·ªçc file parquet {parquet_path}: {e}")
            return

        # The column set is the same for every row, so check it once.
        has_utt_type = 'utt_type' in df.columns

        for index, row in df.iterrows():
            try:
                self._export_row(index, row, has_utt_type)
            except Exception as e:
                print(f"‚ùå L·ªói d√≤ng {index}: {e}")

        print(f"‚úÖ ƒê√£ x·ª≠ l√Ω xong file: {parquet_path}")

    def _export_row(self, index, row, has_utt_type):
        """Write the audio of a single row to disk, or skip it with a message."""
        # Keep only genuine (bonafide) utterances; spoofed ones are skipped.
        if has_utt_type and row['utt_type'] != 'bonafide':
            return

        speaker_id = str(row['label'])

        # Output name: base name of the original path.  Names that do not end
        # in ".wav" get their dots replaced by "_" and ".wav" appended.
        filename = os.path.basename(str(row['file']))
        if not filename.endswith('.wav'):
            filename = filename.replace('.', '_') + '.wav'

        audio_dict = row['audio']
        waveform = torch.tensor(audio_dict['array'], dtype=torch.float32)

        # Add a leading dimension to 1-D input: (Time,) -> (1, Time).
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        if waveform.numel() == 0:
            print(f"‚ö†Ô∏è Audio r·ªóng t·∫°i d√≤ng {index}, b·ªè qua.")
            return

        # The original sampling rate travels with the audio mapping.
        orig_sr = audio_dict.get('sampling_rate')
        if orig_sr is None:
            print(f"‚ö†Ô∏è C·∫£nh b√°o: Kh√¥ng t√¨m th·∫•y sampling_rate ·ªü d√≤ng {index}, b·ªè qua.")
            return

        # Clip samples to [-1.0, 1.0] before writing.
        waveform = waveform.clamp(-1.0, 1.0)

        spk_dir = os.path.join(self.save_dir, speaker_id)
        os.makedirs(spk_dir, exist_ok=True)
        save_path = os.path.join(spk_dir, filename)

        torchaudio.save(save_path, waveform, orig_sr)


if __name__ == "__main__":
    # Root folder that is walked recursively for *.parquet files.
    # It is an empty string in the source; presumably meant to be filled in
    # before running.
    parquet_folder = r""
    # Extracted audio is written below the "VSASV_wav" directory.
    cleaner = Parquet2Audio_VSASV("VSASV_wav")
    cleaner.process_parquet_folder(parquet_folder)

### Preprocessing for VoxVietnam

In [None]:
class Parquet2Audio_Vox:
    """Turn VoxVietnam parquet files into WAV files grouped by speaker.

    Each row is read for ``speaker`` (folder name and file-name prefix) and
    ``audio`` (a mapping with ``array`` and ``sampling_rate`` entries).
    Output files are named ``<speaker>_<parquet name>_<row index>.wav``.
    """

    def __init__(self, save_dir="audio_data"):
        """Store the output root directory and create it if it is missing."""
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

    def process_parquet_folder(self, parquet_folder_path):
        """Convert every ``.parquet`` file found under ``parquet_folder_path``."""
        for root, _subdirs, files in os.walk(parquet_folder_path):
            for file in files:
                if not file.endswith(".parquet"):
                    continue
                full_path = os.path.join(root, file)
                print(f"üìÇ ƒêang x·ª≠ l√Ω file: {file}")
                self.process_single_parquet(full_path)

    def process_single_parquet(self, parquet_path):
        """Export all rows of one parquet file; problems are printed, not raised."""
        try:
            df = pd.read_parquet(parquet_path)

            # The parquet file name (without extension) is embedded in every
            # output file name.
            base_name = os.path.basename(parquet_path)
            parquet_name, _ext = os.path.splitext(base_name)

            for index, row in df.iterrows():
                try:
                    self._write_row(index, row, parquet_name)
                except Exception as e:
                    # A failing row is reported and the loop moves on.
                    print(f"‚ùå L·ªói d√≤ng {index}: {str(e)}")

            print(f"‚úÖ Xong file parquet: {parquet_path}")

        except Exception as e:
            print(f"‚ùå L·ªói ƒë·ªçc file parquet {parquet_path}: {str(e)}")

    def _write_row(self, index, row, parquet_name):
        """Save the audio of one row as a WAV file, or skip it with a message."""
        speaker_id = str(row['speaker'])

        # File name pattern: speakerID_parquetName_index.wav
        filename = f"{speaker_id}_{parquet_name}_{index}.wav"

        audio_dict = row['audio']
        samples = audio_dict['array']
        waveform = torch.tensor(samples, dtype=torch.float32)

        # 1-D input gets a leading dimension: (Time,) -> (1, Time).
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        if waveform.numel() == 0:
            print(f"‚ö†Ô∏è Audio r·ªóng t·∫°i d√≤ng {index}, b·ªè qua.")
            return

        orig_sr = audio_dict.get('sampling_rate')
        if orig_sr is None:
            print(f"‚ö†Ô∏è C·∫£nh b√°o: Kh√¥ng t√¨m th·∫•y sampling_rate ·ªü d√≤ng {index}, b·ªè qua.")
            return

        # Clip samples to [-1.0, 1.0] before writing.
        waveform = torch.clamp(waveform, min=-1.0, max=1.0)

        spk_dir = os.path.join(self.save_dir, speaker_id)
        os.makedirs(spk_dir, exist_ok=True)
        torchaudio.save(os.path.join(spk_dir, filename), waveform, orig_sr)


if __name__ == "__main__":
    # Root folder that is walked recursively for *.parquet files.
    # It is an empty string in the source; presumably meant to be filled in
    # before running.
    parquet_folder_path = r""
    # Extracted audio is written below the "Vox_train_wav" directory.
    cleaner = Parquet2Audio_Vox("Vox_train_wav")
    cleaner.process_parquet_folder(parquet_folder_path)