In [1]:
from datasets import load_dataset, load_from_disk
from torch.utils.data import DataLoader
import os
import pandas as pd 
import librosa
import numpy as np
import matplotlib.pyplot as plt
import torch 
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load only the data file MusicBench_train_modified.json from the
# 'amaai-lab/MusicBench' dataset; later cells index the result as dataset['train'].
dataset = load_dataset('amaai-lab/MusicBench', data_files="MusicBench_train_modified.json")  
# Alternative kept for reference: without data_files the loader picks the files
# itself (presumably every file in the repository -- confirm before switching).
# dataset = load_dataset('amaai-lab/MusicBench')  

In [3]:
from musiclm_pytorch import MuLaN, AudioSpectrogramTransformer, TextTransformer


2024-04-19 15:20:56 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [4]:
# Both transformers are built with the same dim / depth / heads / dim_head
# values, so those four settings are declared once and shared.
shared_transformer_kwargs = dict(dim=512, depth=6, heads=8, dim_head=64)

# Audio side: the shared settings plus the spectrogram-specific options.
audio_transformer = AudioSpectrogramTransformer(
    spec_n_fft=128,
    spec_win_length=24,
    spec_aug_stretch_factor=0.8,
    **shared_transformer_kwargs,
)

# Text side: only the shared settings.
text_transformer = TextTransformer(**shared_transformer_kwargs)

# MuLaN is constructed from the two transformers created above.
mulan = MuLaN(audio_transformer=audio_transformer, text_transformer=text_transformer)


In [5]:
# Shuffled batches of 64 samples drawn from dataset['train']. The identity
# collate_fn hands each batch back exactly as the DataLoader assembled it,
# replacing the default collation step.
dataloader = DataLoader(dataset['train'], batch_size=64, shuffle=True, collate_fn=lambda x: x)


In [6]:
# Peek at the first sample of the dataset wrapped by the DataLoader
# (dataset['train']) to see which fields each record carries.
dataloader.dataset[0]


{'dataset': 'MusicBench',
 'location': 'data_aug2/-0SdAVK79lg_1.wav',
 'main_caption': 'This mellow instrumental track showcases a dominant electric guitar that opens with a descending riff, followed by arpeggiated chords, hammer-ons, and a slide. The percussion section keeps it simple with rim shots and a common time count, while the bass adds a single note on the first beat of every bar. Minimalist piano chords round out the song while leaving space for the guitar to shine. There are no vocals, making it perfect for a coffee shop or some chill background music. The key is in E major, with a chord progression that centers around that key and a straightforward 4/4 time signature.',
 'alt_caption': 'This song features an electric guitar as the main instrument. The guitar plays a descending run in the beginning then plays an arpeggiated chord followed by a double stop hammer on to a higher note and a descending slide followed by a descending chord run. The percussion plays a simple beat 

In [7]:
from datasets import load_dataset

# Load the 'train' split of 'google/MusicCaps'; the download cells below
# rebind `ds` to a processed subset of it.
ds = load_dataset('google/MusicCaps', split='train')

In [11]:
import subprocess
import os
from pathlib import Path

def download_clip(
    video_identifier,
    output_filename,
    start_time,
    end_time,
    tmp_dir='/tmp/musiccaps/',
    num_attempts=5,
    url_base='https://www.youtube.com/watch?v='
):
    """Download one section of a video's audio as a WAV file using yt-dlp.

    Args:
        video_identifier: Video id; appended to ``url_base`` to form the URL.
        output_filename: Path given to yt-dlp's ``-o`` option. Its existence
            after the run decides the returned status.
        start_time: Section start, placed in ``--download-sections``.
        end_time: Section end, placed in ``--download-sections``.
        tmp_dir: Unused; kept only so existing callers keep working.
        num_attempts: Maximum number of yt-dlp invocations. Values below 1
            are treated as 1 so the function always terminates.
        url_base: URL prefix the video id is appended to.

    Returns:
        A ``(status, log)`` tuple. ``status`` is True only when
        ``output_filename`` exists after a successful yt-dlp run. ``log`` is
        ``'Downloaded'`` on success, the captured yt-dlp output (bytes) when
        every attempt failed, or a short text message when yt-dlp could not
        be started or finished without producing the file.
    """
    # An argument list (no shell) keeps ids and paths containing spaces,
    # quotes or other shell metacharacters from being reinterpreted.
    command = [
        'yt-dlp', '--quiet', '--no-warnings',
        '-x', '--audio-format', 'wav',
        '-f', 'bestaudio',
        '-o', str(output_filename),
        '--download-sections', f'*{start_time}-{end_time}',
        f'{url_base}{video_identifier}',
    ]

    failure_output = b''
    for _ in range(max(1, num_attempts)):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            # yt-dlp ran but exited non-zero: remember its output and retry.
            failure_output = err.output
        except OSError as err:
            # yt-dlp could not be launched at all; retrying cannot help.
            return False, str(err)
        else:
            break
    else:
        # Every attempt failed.
        return False, failure_output

    # Check if the file was actually written before reporting success.
    if os.path.exists(output_filename):
        return True, 'Downloaded'
    return False, f'yt-dlp finished but {output_filename} was not created'

def process(example):
    """Ensure the clip for one dataset row is on disk and annotate the row.

    The target file is ``<data_dir>/<ytid>.wav`` (``data_dir`` is a
    module-level Path defined in another cell). When that file is missing,
    ``download_clip`` is called with the row's ``ytid``, ``start_s`` and
    ``end_s``. The row then gets two new keys: ``'audio'`` (the target path,
    set whether or not the download worked) and ``'download_status'``.
    """
    clip_path = str(data_dir / f"{example['ytid']}.wav")

    if os.path.exists(clip_path):
        # Already fetched by an earlier run: nothing to download.
        downloaded = True
    else:
        downloaded, _ = download_clip(
            example['ytid'],
            clip_path,
            example['start_s'],
            example['end_s'],
        )

    example['audio'] = clip_path
    example['download_status'] = downloaded
    return example

In [12]:
from datasets import Audio

# --- Download settings -------------------------------------------------
samples_to_load = 32             # number of rows kept from the dataset
cores = 4                        # worker processes used by ds.map
sampling_rate = 44100            # rate given to the Audio feature; keep at 44100
writer_batch_size = 1000         # examples held in memory per worker; lower it on OOM
data_dir = Path("./music_data")  # directory process() reads from / downloads into

# Keep only the first `samples_to_load` rows.
ds = ds.select(range(samples_to_load))

# Make sure the download directory exists before any worker writes to it.
data_dir.mkdir(parents=True, exist_ok=True)

# Fetch the clips in parallel, then have the 'audio' column decoded as audio.
ds = ds.map(
    process,
    num_proc=cores,
    writer_batch_size=writer_batch_size,
    keep_in_memory=False,
).cast_column('audio', Audio(sampling_rate=sampling_rate))

Map (num_proc=4): 100%|██████████| 32/32 [00:30<00:00,  1.04 examples/s]


In [13]:
# Show the first processed row: the original fields plus the decoded
# 'audio' entry and the 'download_status' flag added by process().
ds[0]

{'ytid': '-0Gj8-vB1q4',
 'start_s': 30,
 'end_s': 40,
 'audioset_positive_labels': '/m/0140xf,/m/02cjck,/m/04rlf',
 'aspect_list': "['low quality', 'sustained strings melody', 'soft female vocal', 'mellow piano melody', 'sad', 'soulful', 'ballad']",
 'caption': 'The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft female vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services.',
 'author_id': 4,
 'is_balanced_subset': False,
 'is_audioset_eval': True,
 'audio': {'path': 'music_data/-0Gj8-vB1q4.wav',
  'array': array([-2.98399059e-03, -3.63287400e-05,  2.31859041e-03, ...,
         -1.94326155e-02, -2.38079876e-02,  0.00000000e+00]),
  'sampling_rate': 44100},
 'download_status': True}