# In [None]:
import os
from pydub import AudioSegment
import pandas as pd
from datasets import Dataset, Audio, DatasetDict

# Paths (placeholders — point these at real locations before running)
excel_path = "path/to/excel_file.xlsx"  # spreadsheet loaded into `df` below
audio_dir = "path/to/audio_directory"  # directory searched for "<term>.mp3" files
output_audio_dir = "path/to/output_audio_directory"  # where converted WAV files are written


# Load the spreadsheet; the 'term' and 'phonetic' columns are read from it later.
df = pd.read_excel(excel_path)


# Make sure the output directory exists; exist_ok=True avoids an error when it
# is already there.
os.makedirs(output_audio_dir, exist_ok=True)


def convert_mp3_to_wav(mp3_path, wav_path):
    """Decode an MP3 file and write its audio out as a WAV file.

    Args:
        mp3_path: Path of the MP3 file to read.
        wav_path: Path the WAV file is written to.
    """
    audio = AudioSegment.from_mp3(mp3_path)
    # export() hands back the open output file handle. Close it explicitly
    # instead of leaving it to the garbage collector, since this function is
    # called once per spreadsheet row.
    audio.export(wav_path, format="wav").close()


# Collect one record per spreadsheet row whose MP3 file is present, converting
# that file to WAV along the way. Rows without audio are reported and skipped.
dataset_records = []
for _idx, row in df.iterrows():
    term, phonetic = row['term'], row['phonetic']
    mp3_path = os.path.join(audio_dir, f"{term}.mp3")

    # Guard clause: no source audio means no record for this row.
    if not os.path.exists(mp3_path):
        print(f"Audio file for term '{term}' not found in {audio_dir}.")
        continue

    wav_path = os.path.join(output_audio_dir, f"{term}.wav")
    convert_mp3_to_wav(mp3_path, wav_path)

    record = {"term": term, "phonetic": phonetic, "audio": wav_path}
    dataset_records.append(record)

# Create a Hugging Face Dataset.
# dataset_records is a list of per-row dicts, so it has to be loaded with
# Dataset.from_list(); Dataset.from_dict() expects a single dict mapping each
# column name to a list of values and fails when handed a list.
if not dataset_records:
    # Without any records there is no "audio" column to cast and nothing
    # worth uploading, so stop with a clear message.
    raise ValueError("No audio files were converted; there is nothing to upload.")
dataset = Dataset.from_list(dataset_records)
dataset = dataset.cast_column("audio", Audio())  # Cast audio file paths to the Audio feature

# Wrap dataset in DatasetDict if you want train/test split (optional)
dataset_dict = DatasetDict({"train": dataset})

# Upload to the Hugging Face Hub under the repository id below (this pushes to
# the Hub; it does not save a local copy).
dataset_dict.push_to_hub("your-username/your-audio-dataset")
