It is generally recommended to convert the MP3 files to WAV format with a consistent sampling rate and channel layout (e.g., 16 kHz, mono), so that every clip is compatible with the training models.

**Steps for Data Preprocessing:**

---

1. Convert MP3 files to WAV (16kHz, mono)

2. Load and inspect the dataset (TSV files)

3. Align audio files with text transcriptions

4. Remove invalid or low-quality samples

5. Normalize and clean text data

6. Split the train data into Train/Validation



---



**Mount Google Drive & Set Paths**

In [None]:
from google.colab import drive
import os

# Make the Drive contents available under /content/drive.
drive.mount('/content/drive')

# Dataset root, the folder holding the MP3 clips, and the names of the
# TSV metadata files found directly inside the dataset root.
DATASET_PATH = "/content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM"
CLIPS_PATH = os.path.join(DATASET_PATH, "clips")
TSV_FILES = [name for name in os.listdir(DATASET_PATH) if name.endswith('.tsv')]

# Echo the resolved locations so a wrong mount or path is noticed immediately.
for label, value in (
    ("Dataset Path:", DATASET_PATH),
    ("Clips Path:", CLIPS_PATH),
    ("TSV Files:", TSV_FILES),
):
    print(label, value)

Mounted at /content/drive
Dataset Path: /content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM
Clips Path: /content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM/clips
TSV Files: ['clip_durations.tsv', 'invalidated.tsv', 'test.tsv', 'validated.tsv', 'other.tsv', 'train.tsv', 'dev.tsv', 'reported.tsv', 'validated_sentences.tsv', 'unvalidated_sentences.tsv']


**Convert MP3 to WAV**

In [None]:
import os
import librosa
import soundfile as sf
from tqdm import tqdm

OUTPUT_WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")

# Create directory for WAV files
os.makedirs(OUTPUT_WAV_PATH, exist_ok=True)


def convert_mp3_to_wav(mp3_file, wav_file, sr=16000):
    """Decode one MP3 clip and save it as a mono WAV file at ``sr`` Hz.

    The audio is first written to a temporary ``.part`` file and renamed to
    ``wav_file`` only after the write has finished, so an interrupted run
    does not leave a truncated file under the final ``.wav`` name.

    Args:
        mp3_file: Path of the source MP3 file.
        wav_file: Path of the WAV file to create (replaced if it exists).
        sr: Target sampling rate in Hz.

    Raises:
        Exception: Whatever ``librosa.load``, ``sf.write`` or ``os.replace``
            raises for an unreadable input or a failed write.
    """
    y, _ = librosa.load(mp3_file, sr=sr, mono=True)  # Resample to `sr`, downmix to mono
    tmp_file = wav_file + ".part"
    try:
        # The temporary name does not end in .wav, so the container format
        # has to be passed explicitly instead of being inferred.
        sf.write(tmp_file, y, sr, format="WAV")
        os.replace(tmp_file, wav_file)
    finally:
        # Only present if the write or rename did not complete.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


# Convert all the MP3 files. Clips that already have a WAV are skipped so the
# cell can be re-run after an interruption, and a clip that fails to convert
# is reported instead of aborting the whole batch.
# NOTE(review): the skip assumes existing WAVs are complete; files written by
# an older, non-atomic version of this cell may need to be deleted first.
already_converted = set(os.listdir(OUTPUT_WAV_PATH))
failed_files = []

for file in tqdm(os.listdir(CLIPS_PATH)):
    if not file.endswith(".mp3"):
        continue

    # Swap only the extension; str.replace would rewrite every ".mp3" in the name.
    wav_name = os.path.splitext(file)[0] + ".wav"
    if wav_name in already_converted:
        continue

    mp3_path = os.path.join(CLIPS_PATH, file)
    wav_path = os.path.join(OUTPUT_WAV_PATH, wav_name)
    try:
        convert_mp3_to_wav(mp3_path, wav_path)
    except Exception as e:  # per-file boundary: report and keep going
        print(f"Error processing {mp3_path}: {type(e).__name__}: {e}")
        failed_files.append(file)

if failed_files:
    print(f"MP3 to WAV conversion finished with {len(failed_files)} failed files.")
else:
    print("MP3 to WAV conversion completed!")


 99%|█████████▉| 37303/37632 [3:24:26<02:01,  2.71it/s]

In [None]:
import os
import librosa
import soundfile as sf
from tqdm import tqdm

# Folder that receives the converted clips; created on first use.
OUTPUT_WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")
os.makedirs(OUTPUT_WAV_PATH, exist_ok=True)


#convert MP3 to WAV
def convert_mp3_to_wav(mp3_file, wav_file, sr=16000):
    """Decode one MP3 clip and save it as a WAV file at ``sr`` Hz.

    Errors are reported with ``print`` and swallowed so that a batch loop
    calling this function keeps going. The audio is written to a temporary
    ``.part`` file and renamed to ``wav_file`` only after the write has
    finished; a failed or interrupted conversion therefore never leaves a
    truncated file under the final ``.wav`` name, where a resume check based
    on file names would mistake it for a finished conversion.

    Args:
        mp3_file: Path of the source MP3 file.
        wav_file: Path of the WAV file to create (replaced if it exists).
        sr: Target sampling rate in Hz.

    Returns:
        None, whether or not the conversion succeeded.
    """
    tmp_file = wav_file + ".part"
    try:
        y, _ = librosa.load(mp3_file, sr=sr)
        # The temporary name does not end in .wav, so the container format
        # has to be passed explicitly instead of being inferred.
        sf.write(tmp_file, y, sr, format="WAV")  #Save as WAV
        os.replace(tmp_file, wav_file)
    except Exception as e:
        print(f"Error processing {mp3_file}: {e}")
    finally:
        # Only present if the write or rename did not complete.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

# Source clips, and the clips whose WAV already exists. WAV names are mapped
# back to their .mp3 form so the two sets can be compared directly.
mp3_files = {name for name in os.listdir(CLIPS_PATH) if name.endswith(".mp3")}
wav_files = {
    name.replace(".wav", ".mp3")
    for name in os.listdir(OUTPUT_WAV_PATH)
    if name.endswith(".wav")
}

# Clips that have no converted counterpart yet.
missing_files = mp3_files.difference(wav_files)

if not missing_files:
    print("All files are already converted!")
else:
    print(f"Resuming conversion for {len(missing_files)} missing files...")
    for file in tqdm(missing_files):
        convert_mp3_to_wav(
            os.path.join(CLIPS_PATH, file),
            os.path.join(OUTPUT_WAV_PATH, file.replace(".mp3", ".wav")),
        )
    print("Conversion of missing files completed!")


All files are already converted!


In [None]:
# Disabled sanity check: the code below sits inside a bare string literal, so
# it is not executed. When enabled, it counts the .wav files in
# OUTPUT_WAV_PATH and prints the total.
'''

#Count WAV files to check if everything is converted

wav_files = [f for f in os.listdir(OUTPUT_WAV_PATH) if f.endswith(".wav")]
wav_count = len(wav_files)

print(f"Total WAV files: {wav_count}")

'''

Total WAV files: 37632



**Load and inspect the *train.tsv* file**

In [None]:
import pandas as pd
import os

# Read the tab-separated training split into a DataFrame.
TSV_PATH = os.path.join(DATASET_PATH, "train.tsv")
df = pd.read_csv(TSV_PATH, sep="\t")

# Preview of the first rows.
print("First 5 Rows of train.tsv:")
print(df.head())

# Column index, then the number of null cells per column.
for heading, summary in (
    ("\nColumn Names in Dataset:", df.columns),
    ("\nMissing Values in Dataset:", df.isnull().sum()),
):
    print(heading)
    print(summary)


**Check Unique Samples & Distribution**

In [None]:
# Row count of the loaded split.
n_samples = len(df)
print(f"\nTotal Samples: {n_samples}")

# Distinct transcriptions; lower than the row count when sentences repeat.
n_unique_sentences = df['sentence'].nunique()
print(f"Unique Transcriptions: {n_unique_sentences}")

# Descriptive statistics as reported by DataFrame.describe().
summary_stats = df.describe()
print("\nDataset Summary:")
print(summary_stats)


Total Samples: 9270
Unique Transcriptions: 9270

Dataset Summary:
       sentence_domain     up_votes   down_votes  variant  segment
count              0.0  9270.000000  9270.000000      0.0      0.0
mean               NaN     2.792880     0.109061      NaN      NaN
std                NaN     1.326767     0.427345      NaN      NaN
min                NaN     2.000000     0.000000      NaN      NaN
25%                NaN     2.000000     0.000000      NaN      NaN
50%                NaN     2.000000     0.000000      NaN      NaN
75%                NaN     4.000000     0.000000      NaN      NaN
max                NaN    12.000000     6.000000      NaN      NaN


All 9270 samples are unique → No duplicate sentences, which is good for training.

**Verify Audio-Text Alignment**

In [None]:
import os

WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")

# Names of everything currently in the WAV folder, held in a set for fast lookups.
wav_files = set(os.listdir(WAV_PATH))


def _has_wav(mp3_name):
    """Return True when the .wav counterpart of ``mp3_name`` is in ``wav_files``."""
    return mp3_name.replace(".mp3", ".wav") in wav_files


# Boolean mask over the dataset rows: True where the converted clip exists.
matched = df["path"].apply(_has_wav)
n_matched = matched.sum()
print(f"\nMatched Audio Files: {n_matched} / {len(df)}")

# Rows whose audio could not be found.
missing_audio = df[~matched]
print(f"\nMissing Audio Files: {len(missing_audio)}")
print(missing_audio.head())

# Persist the unmatched rows as a report next to the dataset.
report_path = os.path.join(DATASET_PATH, "missing_audio_files.csv")
missing_audio.to_csv(report_path, index=False)


Matched Audio Files: 9270 / 9270

Missing Audio Files: 0
Empty DataFrame
Columns: [client_id, path, sentence_id, sentence, sentence_domain, up_votes, down_votes, age, gender, accents, variant, locale, segment]
Index: []


All 9270 files are found → No missing WAV files.

**Keep Only Relevant Columns**

In [None]:
# Keep just the audio file name and its transcription.
filtered_df = df.loc[:, ["path", "sentence"]].copy()

# Point each row at the converted clip instead of the original MP3.
filtered_df["path"] = filtered_df["path"].map(
    lambda mp3_name: mp3_name.replace(".mp3", ".wav")
)

filtered_csv = os.path.join(DATASET_PATH, "filtered_train.csv")
filtered_df.to_csv(filtered_csv, index=False)
print(f"Filtered dataset saved with {len(filtered_df)} samples!")

Filtered dataset saved with 9270 samples!


**Clean the Transcriptions**

In [None]:
import re

def clean_text(text):
    """Normalize a transcription string.

    Lowercases the text, removes every character that is not an Armenian
    letter, a Latin letter, a digit or whitespace, and collapses runs of
    whitespace into single spaces.

    Args:
        text: Raw transcription string.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    text = text.lower()  #Convert to lowercase
    # The range ա-ֆ spans U+0561..U+0586 only. The ligature և (U+0587) is a
    # separate code point right after it, so it is listed explicitly;
    # otherwise it is stripped from the text.
    text = re.sub(r"[^ա-ֆևԱ-Ֆa-zA-Z0-9\s]", "", text)  #Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  #Remove extra spaces
    return text

# Normalize every transcription in place, then store the cleaned table.
filtered_df["sentence"] = filtered_df["sentence"].map(clean_text)

cleaned_csv = os.path.join(DATASET_PATH, "cleaned_train.csv")
filtered_df.to_csv(cleaned_csv, index=False)

print("Text cleaning completed!")


Text cleaning completed!


**Filter Out Bad Audio Samples**

In [None]:
import librosa
from tqdm import tqdm

WAV_PATH = os.path.join(DATASET_PATH, "wav_clips")

min_duration = 0.5  # Min valid duration (seconds)
max_duration = 15.0  # Max valid duration (seconds)

valid_data = []     # (file name, transcript, duration) for every kept clip
corrupt_files = []  # full paths of WAV files that could not be loaded

# Counters for rows dropped for reasons other than a load error, so the final
# report accounts for every row that did not make it into the dataset.
missing_count = 0
out_of_range_count = 0
empty_text_count = 0

rows = zip(filtered_df["path"], filtered_df["sentence"])
for wav_name, sentence in tqdm(rows, total=len(filtered_df)):
    # A clip without any transcript text cannot be used as a training pair.
    if not isinstance(sentence, str) or not sentence.strip():
        empty_text_count += 1
        continue

    wav_file = os.path.join(WAV_PATH, wav_name)
    if not os.path.exists(wav_file):
        missing_count += 1
        continue

    try:
        # sr=None keeps the file's own sampling rate. The whole file is
        # loaded, so a file that cannot be decoded raises here.
        y, sr = librosa.load(wav_file, sr=None)
        duration = librosa.get_duration(y=y, sr=sr)
    except Exception as e:  # per-file boundary: record and keep going
        # Some exceptions carry no message, so the type is printed as well.
        print(f"!!!Skipping {wav_file} due to error: {type(e).__name__}: {e}!!!")
        corrupt_files.append(wav_file)
        continue

    #only valid durations
    if min_duration <= duration <= max_duration:
        valid_data.append((wav_name, sentence, duration))
    else:
        out_of_range_count += 1

#Convert to DataFrame
final_df = pd.DataFrame(valid_data, columns=["wav_path", "transcript", "duration"])

#Save final processed dataset
final_df.to_csv(os.path.join(DATASET_PATH, "final_train.csv"), index=False)

print(f"Final dataset saved with {len(final_df)} valid samples!")

#Save list of corrupt files (the report is only written when there is something to report)
if corrupt_files:
    with open(os.path.join(DATASET_PATH, "corrupt_files.txt"), "w", encoding="utf-8") as f:
        f.writelines(path + "\n" for path in corrupt_files)
    print(f"Skipped {len(corrupt_files)} corrupted files. See 'corrupt_files.txt' for details.")
else:
    print("Skipped 0 corrupted files.")

print(
    f"Also dropped: {out_of_range_count} outside {min_duration}-{max_duration}s, "
    f"{missing_count} with no WAV file, {empty_text_count} with an empty transcript."
)

  y, sr = librosa.load(wav_file, sr=None)
 87%|████████▋ | 8041/9270 [00:24<00:03, 354.73it/s]

!!!Skipping /content/drive/MyDrive/Capstone/cv-corpus-20.0-2024-12-06/hy-AM/wav_clips/common_voice_hy-AM_39460081.wav due to error: !!!


100%|██████████| 9270/9270 [04:33<00:00, 33.90it/s]


Final dataset saved with 9269 valid samples!
Skipped 1 corrupted files. See 'corrupt_files.txt' for details.


**Split final_train.csv into Train/Validation**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Load the filtered training table and the official test split (the latter is
# only used for the size report at the end of the cell).
train_full_df = pd.read_csv(os.path.join(DATASET_PATH, "final_train.csv"))
test_df = pd.read_csv(os.path.join(DATASET_PATH, "test.tsv"), sep="\t")

#90% train, 10% validation
train_df, new_val_df = train_test_split(train_full_df, test_size=0.1, random_state=42)

dev_df = pd.read_csv(os.path.join(DATASET_PATH, "dev.tsv"), sep="\t")[["path", "sentence"]]
dev_df["path"] = dev_df["path"].apply(lambda x: x.replace(".mp3", ".wav"))

# Bring the dev rows to the same schema as final_train.csv before merging.
# pd.concat aligns on column names, so concatenating "path"/"sentence" with
# "wav_path"/"transcript" would yield five columns with NaN in half of every row.
# dev_df itself is left untouched in case it is used again further down.
dev_for_val_df = dev_df.rename(columns={"path": "wav_path", "sentence": "transcript"})

# The rows split off from final_train.csv carry cleaned transcripts; apply the
# same normalization to the dev rows so one file does not mix raw and cleaned text.
dev_for_val_df["transcript"] = dev_for_val_df["transcript"].apply(clean_text)

#Combine new validation split with the existing validation set
# (dev rows have no "duration" value; that column is NaN for them).
final_val_df = pd.concat([dev_for_val_df, new_val_df], ignore_index=True)

#Saving the train and validation sets
train_df.to_csv(os.path.join(DATASET_PATH, "train.csv"), index=False)
final_val_df.to_csv(os.path.join(DATASET_PATH, "validation.csv"), index=False)

print(f"New Train Set: {len(train_df)} samples")
print(f"Final Validation Set: {len(final_val_df)} samples (Including dev.tsv)")
print(f"Test Set: {len(test_df)} samples")



New Train Set: 8342 samples
Final Validation Set: 6633 samples (Including dev.tsv)
Test Set: 5856 samples
