In [10]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)


def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))



def loop(files):
    files, _ = files
    for zip_file_path in tqdm(files):
        destination_folder = './'
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(destination_folder)
        # os.remove(zip_file_path)

# multiprocessing(files, loop, cores = min(len(files), 20), returned = False)
# files = glob('TAU-*.zip')
# multiprocessing(files, loop, cores = min(len(files), 20), returned = False)

In [11]:
!ls TAU-urban-acoustic-scenes-2022-mobile-development

audio  evaluation_setup  LICENSE  meta.csv  README.html  README.md


In [12]:
files = glob('TAU-urban-acoustic-scenes-2022-mobile-development/audio/*')
len(files)

230350

In [13]:
files[0]

'TAU-urban-acoustic-scenes-2022-mobile-development/audio/park-paris-98-2761-4-a.wav'

In [17]:
!mkdir tau-2022-audio

In [19]:
import pandas as pd
import soundfile as sf
import librosa

df = pd.read_csv('TAU-urban-acoustic-scenes-2022-mobile-development/meta.csv', sep = '\t').to_dict(orient = 'records')
df[0]

{'filename': 'audio/airport-lisbon-1000-40000-0-a.wav',
 'scene_label': 'airport',
 'identifier': 'lisbon-1000',
 'source_label': 'a'}

In [20]:
labels = [r['scene_label'] for r in df]
labels = str(list(set(labels)))
labels

"['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']"

In [27]:
import random
import json
questions = [
    'given the labels\n{labels}\n\nclassify the audio',
    'what is the label for audio\n\nthe labels: {labels}'
]

def loop(rows):
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        f = os.path.join('TAU-urban-acoustic-scenes-2022-mobile-development', r['filename'])
        audio_filename = os.path.join('tau-2022-audio', f.replace('/', '_')).replace('.wav', '.mp3')
        
        if not os.path.exists(audio_filename):
            y, sr = librosa.load(f, sr = 16000)
            sf.write(audio_filename, y, sr)
        
        data.append({
            'question': random.choice(questions).format(labels=labels),
            'answer': r['scene_label'],
            'audio_filename': audio_filename,
            'metadata': json.dumps(r),
        })
    return data

In [28]:
processed = loop((df[:10], 0))

100%|████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 17848.10it/s]


In [29]:
processed[0]

{'question': "what is the label for audio\n\nthe labels: ['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']",
 'answer': 'airport',
 'audio_filename': 'tau-2022-audio/TAU-urban-acoustic-scenes-2022-mobile-development_audio_airport-lisbon-1000-40000-0-a.mp3',
 'metadata': '{"filename": "audio/airport-lisbon-1000-40000-0-a.wav", "scene_label": "airport", "identifier": "lisbon-1000", "source_label": "a"}'}

In [32]:
processed = multiprocessing(df, loop, cores = 20)

100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 115.26it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.37it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 119.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:36<00:00, 118.85it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.50it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 118.11it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11517/11517 [01:37<00:00, 117.89it/s]


In [33]:
len(processed)

230350

In [35]:
processed[0]

{'question': "what is the label for audio\n\nthe labels: ['street_traffic', 'street_pedestrian', 'public_square', 'shopping_mall', 'metro', 'airport', 'metro_station', 'bus', 'park', 'tram']",
 'answer': 'airport',
 'audio_filename': 'tau-2022-audio/TAU-urban-acoustic-scenes-2022-mobile-development_audio_airport-lisbon-1000-40000-0-a.mp3',
 'metadata': '{"filename": "audio/airport-lisbon-1000-40000-0-a.wav", "scene_label": "airport", "identifier": "lisbon-1000", "source_label": "a"}'}

In [36]:
from datasets import Dataset

dataset = Dataset.from_list(processed)

In [37]:
dataset.push_to_hub('mesolitica/Zeroshot-Audio-Classification-Instructions', split = 'tau2022')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/231 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/5.59M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/commit/80352deea4eed4770fefb65b8b324309bb6806f5', commit_message='Upload dataset', commit_description='', oid='80352deea4eed4770fefb65b8b324309bb6806f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/Zeroshot-Audio-Classification-Instructions'), pr_revision=None, pr_num=None)

In [38]:
from glob import glob

audio_files = glob('tau-2022-audio/*.mp3')
len(audio_files)

230350

In [39]:
import zipfile

with zipfile.ZipFile('tau-2022-audio.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    for f in audio_files:
        zipf.write(f, arcname=f)

In [40]:
!huggingface-cli upload mesolitica/Zeroshot-Audio-Classification-Instructions tau-2022-audio.zip \
--repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 1.23G/1.23G [00:25<00:00, 48.9MB/s]
https://huggingface.co/datasets/mesolitica/Zeroshot-Audio-Classification-Instructions/blob/main/tau-2022-audio.zip


In [43]:
!rm TAU-urban-acoustic-scenes-2022-mobile-development*.zip