it's generally recommended to convert MP3 files to WAV format, preferably with a consistent sampling rate (e.g., 16kHz, mono) to ensure compatibility with training models

**Steps for Data Preprocessing:**

---

1. Convert MP3 files to WAV (16kHz, mono)

2. Load and inspect the dataset (TSV files)

3. Align audio files with text transcriptions

4. Remove invalid or low-quality samples

5. Normalize and clean text data

6. Split the train data into Train/Validation



---



In [2]:
import os
import librosa
import soundfile as sf
from tqdm import tqdm
import pandas as pd
import re
from sklearn.model_selection import train_test_split

**Mount Google Drive & Set Paths**

In [3]:
from google.colab import drive

#Mount Google Drive
drive.mount('/content/drive')

#Define dataset paths
DATASET_PATH = "/content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM"
CLIPS_PATH = os.path.join(DATASET_PATH, "clips")
TSV_FILES = [f for f in os.listdir(DATASET_PATH) if f.endswith('.tsv')]

print("Dataset Path:", DATASET_PATH)
print("Clips Path:", CLIPS_PATH)
print("TSV Files:", TSV_FILES)

Mounted at /content/drive
Dataset Path: /content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM
Clips Path: /content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM/clips
TSV Files: ['clip_durations.tsv', 'invalidated.tsv', 'test.tsv', 'validated.tsv', 'other.tsv', 'train.tsv', 'dev.tsv', 'reported.tsv', 'validated_sentences.tsv', 'unvalidated_sentences.tsv']


**Convert MP3 to WAV**

In [6]:
OUTPUT_WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")
os.makedirs(OUTPUT_WAV_PATH, exist_ok=True)

#convert MP3 to WAV
def convert_mp3_to_wav(mp3_file, wav_file, sr=16000):
    try:
        y, _ = librosa.load(mp3_file, sr=sr)
        sf.write(wav_file, y, sr)  #Save as WAV
    except Exception as e:
        print(f"Error processing {mp3_file}: {e}")

#List MP3 and WAV files
mp3_files = set([f for f in os.listdir(CLIPS_PATH) if f.endswith(".mp3")])
wav_files = set([f.replace(".wav", ".mp3") for f in os.listdir(OUTPUT_WAV_PATH) if f.endswith(".wav")])

#Find missing conversions
missing_files = mp3_files - wav_files

if missing_files:
    print(f"Resuming conversion for {len(missing_files)} missing files...")
    for file in tqdm(missing_files):
        mp3_path = os.path.join(CLIPS_PATH, file)
        wav_path = os.path.join(OUTPUT_WAV_PATH, file.replace(".mp3", ".wav"))
        convert_mp3_to_wav(mp3_path, wav_path)
    print("Conversion of missing files completed!")
else:
    print("All files are already converted!")


All files are already converted!


In [7]:
#Count WAV files to check if everything is converted

wav_files = [f for f in os.listdir(OUTPUT_WAV_PATH) if f.endswith(".wav")]
wav_count = len(wav_files)

print(f"Total WAV files: {wav_count}")

Total WAV files: 37632



**Load and inspect the *train.tsv* file**

In [8]:
TSV_PATH = os.path.join(DATASET_PATH, "train.tsv")
df = pd.read_csv(TSV_PATH, sep="\t")

#Display first few rows
print("First 5 Rows of train.tsv:")
print(df.head())

#Show column names
print("\nColumn Names in Dataset:")
print(df.columns)

#Check for missing values
print("\nMissing Values in Dataset:")
print(df.isnull().sum())


First 5 Rows of train.tsv:
                                           client_id  \
0  f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182...   
1  f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182...   
2  f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182...   
3  f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182...   
4  f0aba38a8ab8705a40d05d96829ded5738a7eec7a9a182...   

                              path  \
0  common_voice_hy-AM_26078953.mp3   
1  common_voice_hy-AM_26078954.mp3   
2  common_voice_hy-AM_26078955.mp3   
3  common_voice_hy-AM_26078956.mp3   
4  common_voice_hy-AM_26078957.mp3   

                                         sentence_id  \
0  00014f9fed6163512d57623235a957a437359074b4fb76...   
1  007874c957da34754fbbea93f069b5acdc65d1a31c5731...   
2  007d6389e52bd5bec1b7de98468981aac7cbd063b79222...   
3  00404cb33e31b3b8a5c1694596efbaf09797c10db3dd9b...   
4  005f7b60d6d50c3294a90bb34d8c93d627521103cc7659...   

                                            sentence  sentence_domain 

**Check Unique Samples & Distribution**

In [9]:
#total number of samples
print(f"\nTotal Samples: {len(df)}")

#unique sentences (sometimes duplicates exist)
print(f"Unique Transcriptions: {df['sentence'].nunique()}")

#dataset statistics
print("\nDataset Summary:")
print(df.describe())


Total Samples: 9270
Unique Transcriptions: 9270

Dataset Summary:
       sentence_domain     up_votes   down_votes  variant  segment
count              0.0  9270.000000  9270.000000      0.0      0.0
mean               NaN     2.792880     0.109061      NaN      NaN
std                NaN     1.326767     0.427345      NaN      NaN
min                NaN     2.000000     0.000000      NaN      NaN
25%                NaN     2.000000     0.000000      NaN      NaN
50%                NaN     2.000000     0.000000      NaN      NaN
75%                NaN     4.000000     0.000000      NaN      NaN
max                NaN    12.000000     6.000000      NaN      NaN


All 9270 samples are unique -> No duplicate sentences, which is good for training.

**Verify Audio-Text Alignment**

In [10]:
WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")

#available WAV files
wav_files = set(os.listdir(WAV_PATH))

#check how many audio files match the dataset
matched = df["path"].apply(lambda x: x.replace(".mp3", ".wav") in wav_files)
print(f"\nMatched Audio Files: {matched.sum()} / {len(df)}")

#rows where audio files are missing
missing_audio = df[~matched]
print(f"\nMissing Audio Files: {len(missing_audio)}")
print(missing_audio.head())

#missing files report
missing_audio.to_csv(os.path.join(DATASET_PATH, "missing_audio_files.csv"), index=False)


Matched Audio Files: 9270 / 9270

Missing Audio Files: 0
Empty DataFrame
Columns: [client_id, path, sentence_id, sentence, sentence_domain, up_votes, down_votes, age, gender, accents, variant, locale, segment]
Index: []


All 9270 files are found -> No missing WAV files.

**Keep Only Relevant Columns**

In [11]:
#only useful columns
filtered_df = df[["path", "sentence"]].copy()

#replace MP3 with WAV filenames
filtered_df["path"] = filtered_df["path"].apply(lambda x: x.replace(".mp3", ".wav"))

filtered_df.to_csv(os.path.join(DATASET_PATH, "filtered_train.csv"), index=False)
print(f"Filtered dataset saved with {len(filtered_df)} samples")

Filtered dataset saved with 9270 samples


**Clean and Filter**

In [12]:
#keeping the uppercase and punctuation in the text
def clean_text(text):
    text = re.sub(r"\s+", " ", text).strip()  #normalize whitespace
    return text

filtered_df["sentence"] = filtered_df["sentence"].apply(clean_text)
filtered_df.to_csv(os.path.join(DATASET_PATH, "cleaned_train.csv"), index=False)
print("Cleaned text saved")

Cleaned text saved


**Filter Out Bad Audio Samples**

In [13]:
WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")
valid_data = []
corrupt_files = []

for _, row in tqdm(filtered_df.iterrows(), total=len(filtered_df)):
    wav_path = os.path.join(WAV_PATH, row["path"])
    if os.path.exists(wav_path):
        try:
            y, sr = librosa.load(wav_path, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)
            if 0.5 <= duration <= 15.0:
                valid_data.append((row["path"], row["sentence"], duration))
        except Exception as e:
            corrupt_files.append(wav_path)

final_df = pd.DataFrame(valid_data, columns=["wav_path", "transcript", "duration"])
final_df.to_csv(os.path.join(DATASET_PATH, "final_train.csv"), index=False)

  y, sr = librosa.load(wav_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
100%|██████████| 9270/9270 [2:14:30<00:00,  1.15it/s]


**Split Final Dataset into Train/Validation**

In [14]:
final_df = pd.read_csv(os.path.join(DATASET_PATH, "final_train.csv"))

#Train/validation split
train_df, new_val_df = train_test_split(final_df, test_size=0.1, random_state=42)

dev_df = pd.read_csv(os.path.join(DATASET_PATH, "dev.tsv"), sep="\t")[["path", "sentence"]]
dev_df["path"] = dev_df["path"].apply(lambda x: x.replace(".mp3", ".wav"))
dev_df["duration"] = -1  #optional placeholder
dev_df.columns = ["wav_path", "transcript", "duration"]

final_val_df = pd.concat([new_val_df, dev_df])

train_df.to_csv(os.path.join(DATASET_PATH, "train.csv"), index=False)
final_val_df.to_csv(os.path.join(DATASET_PATH, "validation.csv"), index=False)