In [1]:
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="mesolitica/Classification-Speech-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Classification-Speech-Instructions",
)

Fetching 21 files:   0%|          | 0/21 [00:00<?, ?it/s]

'/home/mesolitica/stt/Classification-Speech-Instructions'

In [2]:
from glob import glob
import json
import pandas as pd

files = glob('Classification-Speech-Instructions/data/gender_age-*.parquet')
files

['Classification-Speech-Instructions/data/gender_age-00003-of-00005.parquet',
 'Classification-Speech-Instructions/data/gender_age-00002-of-00005.parquet',
 'Classification-Speech-Instructions/data/gender_age-00004-of-00005.parquet',
 'Classification-Speech-Instructions/data/gender_age-00000-of-00005.parquet',
 'Classification-Speech-Instructions/data/gender_age-00001-of-00005.parquet']

In [4]:
import os
from tqdm import tqdm

ages = []
for f in files:
    f_only = os.path.split(f)[1].replace('.parquet', '')
    df = pd.read_parquet(f)
    for i in tqdm(range(len(df))):
        d = json.loads(df['metadata'].iloc[i])
        new_f = os.path.join('Classification-Speech-Instructions-audio', 
                                 f'{f_only}-{i}.mp3')
        if not os.path.exists(new_f):
            continue
        m = df.iloc[i].to_dict()
        m.pop('audio_filename')
        ages.append({
            'audio_filename': new_f,
            'metadata': json.dumps(m),
            'answer': d['age'],
        })

100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19790.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19764.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19911.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19924.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19953.65it/s]


In [6]:
unique_ages = str(list(set([r['answer'] for r in ages])))
unique_ages

"['twenties', 'fifties', 'nineties', 'teens', 'fourties', 'sixties', 'thirties', 'seventies', 'eighties']"

In [7]:
import random

questions = [
    'given the labels\n{labels}\n\nwhat is the label for the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

for i in range(len(ages)):
    ages[i]['question'] = random.choice(questions).format(labels = unique_ages)

In [8]:
genders = []
for f in files:
    f_only = os.path.split(f)[1].replace('.parquet', '')
    df = pd.read_parquet(f)
    for i in tqdm(range(len(df))):
        d = json.loads(df['metadata'].iloc[i])
        new_f = os.path.join('Classification-Speech-Instructions-audio', 
                                 f'{f_only}-{i}.mp3')
        if not os.path.exists(new_f):
            continue
        m = df.iloc[i].to_dict()
        m.pop('audio_filename')
        genders.append({
            'audio_filename': new_f,
            'metadata': json.dumps(m),
            'answer': d['gender'],
        })

100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19745.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19859.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19793.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20067.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20257.01it/s]


In [9]:
unique_genders = str(list(set([r['answer'] for r in genders])))
unique_genders

"['male_masculine', 'female_feminine']"

In [10]:
for i in range(len(genders)):
    genders[i]['question'] = random.choice(questions).format(labels = unique_genders)

In [12]:
locales = []
for f in files:
    f_only = os.path.split(f)[1].replace('.parquet', '')
    df = pd.read_parquet(f)
    for i in tqdm(range(len(df))):
        d = json.loads(df['metadata'].iloc[i])
        new_f = os.path.join('Classification-Speech-Instructions-audio', 
                                 f'{f_only}-{i}.mp3')
        if not os.path.exists(new_f):
            continue
        m = df.iloc[i].to_dict()
        m.pop('audio_filename')
        if len(d['locale']) < 2:
            continue
        locales.append({
            'audio_filename': new_f,
            'metadata': json.dumps(m),
            'answer': d['locale'],
        })

100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19679.12it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19763.98it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9753/9753 [00:00<00:00, 19363.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 20083.36it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:00<00:00, 19702.83it/s]


In [13]:
unique_locales = str(list(set([r['answer'] for r in locales])))
unique_locales

"['tr', 'as', 'zgh', 'ur', 'sl', 'el', 'ca', 'de', 'ar', 'is', 'ab', 'tt', 'pl', 'en', 'ro', 'ug', 'oc', 'da', 'ko', 'sv-SE', 'kmr', 'ne-NP', 'cs', 'az', 'skr', 'ba', 'pt', 'dv', 'ky', 'pa-IN', 'ia', 'rw', 'tk', 'bg', 'zh-CN', 'bn', 'eu', 'myv', 'nhi', 'mdf', 'eo', 'sq', 'tok', 'ka', 'tw', 'et', 'mrj', 'nn-NO', 'ti', 'sw', 'ps', 'lg', 'ml', 'rm-sursilv', 'th', 'nan-tw', 'hsb', 'fi', 'sat', 'yue', 'cnh', 'cy', 'uk', 'fa', 'it', 'hy-AM', 'yo', 'rm-vallader', 'ltg', 'sr', 'mt', 'br', 'mn', 'sc', 'hi', 'ha', 'lv', 'gn', 'sk', 'te', 'cv', 'hu', 'kab', 'gl', 'he', 'lo', 'ckb', 'kk', 'ta', 'mr', 'ga-IE', 'ig', 'lt', 'ast', 'vi', 'or', 'af', 'mhr', 'be', 'am', 'nl', 'es', 'ru', 'fy-NL', 'id', 'fr', 'sah', 'uz', 'mk', 'os', 'lij', 'ja', 'dyu']"

In [14]:
for i in range(len(locales)):
    locales[i]['question'] = random.choice(questions).format(labels = unique_locales)

In [15]:
len(ages), len(genders), len(locales)

(48767, 48767, 48767)

In [16]:
from datasets import Dataset

dataset = Dataset.from_list(ages)

In [17]:
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'age')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.26k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/4abdc3f0b7fe767462e04f5d0c750fec50754d3f', commit_message='Upload dataset', commit_description='', oid='4abdc3f0b7fe767462e04f5d0c750fec50754d3f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [21]:
dataset = Dataset.from_list(genders)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'gender')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/11.8M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/1e3a505b97cecd946c1358f16771c2b13e281fd0', commit_message='Upload dataset', commit_description='', oid='1e3a505b97cecd946c1358f16771c2b13e281fd0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [None]:
dataset = Dataset.from_list(locales)
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'language')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/12.0M [00:00<?, ?B/s]