In [2]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!ls tts_dataset/vocals_00ech6h9uiuf

chunks	vocals_converted.wav  vocals_metadata.csv


In [4]:
import os
import pandas as pd

def combine_metadata_files(root_dir: str, output_file: str = "big_metadata.csv") -> None:
    """
    Combines all vocals_metadata.csv files under root_dir into a single CSV.
    Only keeps 'file_name' and 'text' columns, skipping malformed rows.
    """
    combined = []

    for index,subdir in enumerate(os.listdir(root_dir)):
        print(f'{index} {subdir} working on')
        subdir_path = os.path.join(root_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        meta_path = os.path.join(subdir_path, "vocals_metadata.csv")

        if os.path.exists(meta_path):
            try:
                df = pd.read_csv(meta_path, usecols=['filename', 'text'], on_bad_lines='skip')
                df['filename'] = df['filename'].apply(lambda x: os.path.join(subdir, "chunks", x))
                combined.append(df)
            except Exception as e:
                print(f"❌ Skipping {meta_path} due to error: {e}")

    if combined:
        final_df = pd.concat(combined, ignore_index=True)
        final_df.to_csv(output_file, index=False)
        print(f"✅ Combined metadata saved to '{output_file}' with {len(final_df)} entries.")
    else:
        print("⚠️ No valid metadata files found.")



In [24]:
combine_metadata_files("tts_dataset", "my_tts_metadata.csv")

0 vocals_xata3d1xiacx working on
1 vocals_w8jc7jx8iw00 working on
2 vocals_z9g1p26cthw5 working on
3 vocals_rl42r0nh03up working on
4 vocals_l84b32vy9iyi working on
5 vocals_tid29hk4956s working on
6 vocals_26plman4n1xa working on
7 vocals_37gzp8k3a6tu working on
8 vocals_yehcjrrp82og working on
9 vocals_1r7knqym6q3e working on
10 vocals_h9j07pbskc36 working on
11 vocals_73izjeyhsti0 working on
12 vocals_k2y67wuiijoc working on
13 vocals_7xqtsl3qtrc4 working on
14 vocals_70gja33ffjrl working on
15 vocals_im0cgr8pn5d8 working on
16 vocals_p1yp1393wjbr working on
17 vocals_lqp08i5h82vs working on
18 vocals_2zeb40m73ho0 working on
19 vocals_9o66lywxx91m working on
20 vocals_yh0kqq1huvas working on
21 vocals_nbbtjktawubp working on
22 vocals_0h0wqpjqvl3p working on
23 vocals_3xtf3ejbe6kb working on
24 vocals_0bmj52bf155q working on
25 vocals_n95j54ziz2uz working on
26 vocals_0lof0l3fwvx8 working on
27 vocals_moot8msgqasi working on
28 vocals_vko41lning78 working on
29 vocals_xsbftoaabfy9 w

In [5]:
df = pd.read_csv('my_tts_metadata.csv')

In [6]:
import random
from IPython.display import Audio, display
import pandas as pd

In [7]:
def play_random_audio(df: pd.DataFrame, root_dir: str = ""):
    """
    Randomly picks a row from the DataFrame and plays the audio file.
    Also displays the corresponding transcript.

    Args:
        df (pd.DataFrame): DataFrame with 'file_name' and 'text' columns.
        root_dir (str): Root path to prepend if file paths are relative.
    """
    if df.empty:
        print("⚠️ DataFrame is empty.")
        return

    row = df.sample(1).iloc[0]
    file_path = os.path.join(root_dir, row['filename'])

    print(f"🎧 Playing: {file_path}")
    print(f"📝 Transcript: {row['text']}\n")

    try:
        display(Audio(filename=file_path))
    except Exception as e:
        print(f"❌ Could not play audio: {e}")


In [8]:
play_random_audio(df, root_dir="tts_dataset")

🎧 Playing: tts_dataset/vocals_o4liifizj41h/chunks/vocals_chunk_0084_357920ms_65vk.wav
📝 Transcript: اگر آپ غور کریں تو این اس وقت جب لکڑی ڈرون کی طرف پھینکی انہوں نے



In [9]:
import wave

def get_wav_duration(file_path: str) -> float:
    """
    Returns the duration of a WAV file in seconds.

    Args:
        file_path (str): Path to the WAV file.

    Returns:
        float: Duration in seconds.
    """
    with wave.open(file_path, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)
        return duration


In [10]:
# duration = get_wav_duration("tts_dataset/vocals_00ech6h9uiuf/chunks/vocals_chunk_0000_15180ms_qshq.wav")
# print(f"Duration: {duration:.2f} seconds")


In [11]:
def join_path(file_path,root_dir = 'tts_dataset'):
    file_path = os.path.join(root_dir, file_path)
    return file_path

In [12]:
df['filename'] = df['filename'].apply(join_path)

In [13]:
df['duration'] = df['filename'].apply(get_wav_duration)

In [14]:
print('total_hours',df['duration'].sum() / 3600)

total_hours 132.85004145679017


In [15]:
df['duration'].min()

0.0

In [16]:
df['duration'].max()

42.95995555555555

In [17]:
# Keep only rows where duration is between 1 and 20 seconds
filtered_df = df[(df['duration'] >= 1.0) & (df['duration'] <= 20.0)].reset_index(drop=True)

print(f"✅ Filtered dataset: {len(filtered_df)} entries remaining (from {len(df)})")

✅ Filtered dataset: 104521 entries remaining (from 106790)


In [60]:
# df

In [18]:
print('total_hours',filtered_df['duration'].sum() / 3600)

total_hours 130.7799937160494


In [22]:
df = filtered_df

In [23]:
df['duration'].min(),df['duration'].max(),

(1.0189777777777778, 19.999955555555555)

In [24]:
!pip install seaborn

[0m

In [25]:
df['duration_bucket'] = pd.cut(df['duration'], bins=[0, 1, 5, 10, 15, 20, 25], right=False)

# Count files in each bucket
bucket_counts = df['duration_bucket'].value_counts().sort_index()
print(bucket_counts)


duration_bucket
[0, 1)          0
[1, 5)      70181
[5, 10)     27924
[10, 15)     5713
[15, 20)      703
[20, 25)        0
Name: count, dtype: int64


In [26]:
df.to_csv('final_meta.csv',index=False)

In [27]:
df

Unnamed: 0,filename,text,duration,duration_bucket
0,tts_dataset/vocals_xata3d1xiacx/chunks/vocals_...,یہ واحد ایرانی میزائل ہوگا جو سات منٹ سے بھی پ...,4.419956,"[1, 5)"
1,tts_dataset/vocals_xata3d1xiacx/chunks/vocals_...,اس سوال کا جواب کہ اسرائیل کیا رسپونس کرے گا ب...,3.399956,"[1, 5)"
2,tts_dataset/vocals_xata3d1xiacx/chunks/vocals_...,اس کے لوکل چینلز چلا چلا کر بتا رہے ہیں۔,3.199956,"[1, 5)"
3,tts_dataset/vocals_xata3d1xiacx/chunks/vocals_...,اسرائیل کبھی نہیں چاہے گا کہ ایرانی ہائپرسونک ...,3.779956,"[1, 5)"
4,tts_dataset/vocals_xata3d1xiacx/chunks/vocals_...,کچھ اور ہو نہ ہو اسرائیلی ڈیفنس سسٹم کا ملٹری ...,3.619956,"[1, 5)"
...,...,...,...,...
104516,tts_dataset/vocals_ai84qlskv2r0/chunks/vocals_...,ٹچ کر کے دیکھ سکتے ہیں یہاں دیکھئے,1.799956,"[1, 5)"
104517,tts_dataset/vocals_ai84qlskv2r0/chunks/vocals_...,رومن امپائر کے اروجوزوال کی کہانی,2.139956,"[1, 5)"
104518,tts_dataset/vocals_ai84qlskv2r0/chunks/vocals_...,اور یہ رہی چین ووس اور امریکہ کے درمیان,2.619956,"[1, 5)"
104519,tts_dataset/vocals_ai84qlskv2r0/chunks/vocals_...,ٹریڈ گیم پر ہماری ڈاکمنٹری سیڈیز کا,2.159956,"[1, 5)"


In [28]:
!pip install datasets huggingface_hub

[0m

In [100]:
df = pd.read_csv('final_meta.csv')

In [101]:
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login



In [102]:
from datasets import Audio

In [103]:
df = df.dropna(subset=['filename', 'text', 'duration','filename'])

In [104]:
df = df.copy()

In [105]:
df['audio'] = df['filename']

In [106]:
df['filename'] = df['filename'].apply(os.path.basename)

In [95]:
# df

In [107]:
dataset_dict = df[['text','audio','duration','filename']].to_dict(orient='list')

In [108]:
# dataset_dict['audio']

In [48]:
# dataset_dict = df.to_dict(orient='list')

In [37]:
# dataset_dict['duration']

In [77]:
hf_dataset = Dataset.from_dict(dataset_dict)

In [78]:
# hf_dataset['audio']

In [112]:
hf_dataset = hf_dataset.cast_column("audio", Audio())

In [113]:
hf_dataset

Dataset({
    features: ['text', 'audio', 'duration'],
    num_rows: 104520
})

In [114]:
repo_name = "m-aliabbas1/myeidi"
hf_dataset.push_to_hub(repo_name)


Uploading the dataset shards:   0%|                                                                                                         | 0/42 [00:00<?, ?it/s]
Map:   0%|                                                                                                                         | 0/2489 [00:00<?, ? examples/s][A
Map:  16%|█████████████████▌                                                                                           | 400/2489 [00:00<00:00, 3066.06 examples/s][A
Map:  48%|████████████████████████████████████████████████████                                                        | 1200/2489 [00:00<00:00, 5322.79 examples/s][A
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2489/2489 [00:00<00:00, 5556.32 examples/s][A

Creating parquet from Arrow format:   0%|                                                                                                   | 0/25 [00:00<?, ?ba/s][A


CommitInfo(commit_url='https://huggingface.co/datasets/m-aliabbas1/myeidi/commit/147caca026b2e8b86858ae37e148c22ff48926bc', commit_message='Upload dataset', commit_description='', oid='147caca026b2e8b86858ae37e148c22ff48926bc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/m-aliabbas1/myeidi', endpoint='https://huggingface.co', repo_type='dataset', repo_id='m-aliabbas1/myeidi'), pr_revision=None, pr_num=None)

In [115]:
print('done')

done


In [116]:
print('DONE')

DONE
