# Task 1: Unconditioned Symbolic Generation

In [None]:
# !git clone https://github.com/facebookresearch/audiocraft.git
# %cd audiocraft
# !uv pip install -e .
# !uv pip install dora-search numba ipython ipykernel librosa mido PyYAML

In [16]:
%cd /home/matt/audiocraft

/home/matt/audiocraft


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [24]:
import json
import os
import random
import re
import shutil
from collections import defaultdict
from functools import partial
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import librosa
import mido

import numpy as np
import torch
import yaml
from tqdm import tqdm

In [25]:
from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen

In [3]:
# Read the conditions JSON from disk into `condition_data`.
# NOTE: a later cell rebinds `condition_data` to the result of
# get_condition_data(), so whichever cell ran last decides what
# prepare_data() sees.
condition_data_path = Path("/data/matt/conditions.json")
with open(condition_data_path, "r") as f:
    condition_data = json.load(f)

In [8]:
# Dataset root directories on the local machine.
SLAKH_DIR = Path("/data/matt/slakh2100_flac_redux")
BABYSLAKH_DIR = Path("/data/matt/babyslakh_16k")
# Slakh mix path: group 1 is the directory directly under the dataset root
# (the split), group 2 is the digits following "Track".
TRACK_ID_PATTERN = re.compile(r"slakh2100_flac_redux\/(.+?)\/Track(\d+)\/mix\.flac$")
# BabySlakh mix path: only the digits following "Track" are captured.
BABYSLAKH_TRACK_ID_PATTERN = re.compile(r"\/Track(\d+)\/mix\.wav$")
# Program names used when a track's metadata.yaml cannot be read.
DEFAULT_INSTRUMENTS = ["Piano", "Bass", "Guitar", "Drums"]
# MIDI tempo used when no set_tempo message is found; it is passed to
# mido.tempo2bpm, which maps 500000 (microseconds per beat) to 120 BPM.
DEFAULT_MIDI_TEMPO = 500000
# Sample rates in Hz.
BABYSLAKH_SAMPLE_RATE = 16000
SLAKH_SAMPLE_RATE = 44100


def get_babyslakh_paths(root_dir: Path = BABYSLAKH_DIR) -> List[Path]:
    """Collect the ``mix.wav`` of every track folder directly under ``root_dir``.

    An entry qualifies when its name contains ``"Track"`` and a ``mix.wav``
    exists inside it. Results follow ``os.listdir`` order.
    """
    mix_paths: List[Path] = []
    for entry_name in os.listdir(root_dir):
        if "Track" not in entry_name:
            continue
        candidate = root_dir / entry_name / "mix.wav"
        if candidate.exists():
            mix_paths.append(candidate)
    return mix_paths


def get_slakh_paths(root_dir: Path = SLAKH_DIR) -> List[Path]:
    """Collect every ``mix.flac`` found in the split folders of ``root_dir``.

    Only entries of ``root_dir`` named ``train``, ``test`` or ``validation``
    are scanned. Within a split, a sub-entry qualifies when its name contains
    ``"Track"`` and it holds a ``mix.flac``. Results follow ``os.listdir``
    order at both levels.
    """
    known_splits = ("train", "test", "validation")
    mix_files: List[Path] = []
    for entry in os.listdir(root_dir):
        if entry in known_splits:
            split_root = root_dir / entry
            mix_files.extend(
                split_root / name / "mix.flac"
                for name in os.listdir(split_root)
                if "Track" in name and (split_root / name / "mix.flac").exists()
            )
    return mix_files


def extract_sample_id(path: str, is_babyslakh: bool = False) -> Tuple[str, str]:
    """Return ``(split, track_id)`` parsed from a mix-file path.

    For Slakh paths both values come from the path itself. BabySlakh paths
    carry no split, so one is drawn at random ("test" or "train") on every
    call; only the track id is read from the path.

    Raises:
        ValueError: if the path does not match the expected pattern.
    """
    if is_babyslakh:
        found = BABYSLAKH_TRACK_ID_PATTERN.search(path)
    else:
        found = TRACK_ID_PATTERN.search(path)
    if found is None:
        raise ValueError(f"Track ID not found in path: {path}")

    if not is_babyslakh:
        split_name, track_number = found.groups()
        return split_name, track_number

    # 0 -> "test", 1 -> "train"
    split_name = "train" if random.randint(0, 1) else "test"
    return split_name, found.group(1)


def get_midi_program_names(track_directory: Path) -> List[str]:
    """Return the MIDI program names listed in a track's ``metadata.yaml``.

    Every entry under the ``stems`` mapping that has a ``midi_program_name``
    key contributes one name, in file order.

    Args:
        track_directory: Directory containing ``metadata.yaml``.

    Returns:
        The collected program names. If the metadata cannot be read or does
        not have the expected layout, a message is printed and a fresh copy of
        ``DEFAULT_INSTRUMENTS`` is returned.
    """
    try:
        with open(track_directory / "metadata.yaml", "r") as f:
            metadata = yaml.safe_load(f)
        return [
            stem_info["midi_program_name"]
            for stem_info in metadata["stems"].values()
            if "midi_program_name" in stem_info
        ]
    except Exception as e:
        # Deliberately best-effort: one bad metadata file must not abort a
        # whole dataset scan.
        print(f"Failed to load metadata for {track_directory}: {e}")
        # Return a copy so callers cannot mutate the module-level default.
        return list(DEFAULT_INSTRUMENTS)


def get_tempo(mid):
    """Return the tempo of the first ``set_tempo`` message in ``mid``.

    Tracks are scanned in order, and messages in order within each track.
    ``DEFAULT_MIDI_TEMPO`` is returned when no such message exists.
    """
    tempo_events = (
        event
        for midi_track in mid.tracks
        for event in midi_track
        if event.type == "set_tempo"
    )
    first_event = next(tempo_events, None)
    if first_event is None:
        return DEFAULT_MIDI_TEMPO
    return first_event.tempo


def get_bpm(track_directory: Path) -> int:
    """Return the rounded BPM of ``all_src.mid`` inside ``track_directory``.

    If the MIDI file cannot be opened or scanned, a message is printed and the
    BPM is derived from ``DEFAULT_MIDI_TEMPO`` instead.
    """
    tempo = DEFAULT_MIDI_TEMPO
    try:
        tempo = get_tempo(mido.MidiFile(track_directory / "all_src.mid"))
    except Exception as err:
        print(f"Failed to get tempo for {track_directory}: {err}")
    return round(mido.tempo2bpm(tempo))


def get_condition_data(slakh_paths, is_babyslakh: bool = False) -> Dict[str, Any]:
    """Build a ``{split: {track_id: info}}`` mapping for the given mix files.

    Each ``info`` dict holds the track's ``bpm``, its ``midi_program_names``
    and the ``track_path`` as a string. A split named ``"train"`` is stored
    under the key ``"training"``. Tracks whose info cannot be gathered are
    reported with a printed message and left out.
    """
    per_split = defaultdict(dict)
    for mix_file in tqdm(slakh_paths):
        track_dir = mix_file.parent
        split_name, track_key = extract_sample_id(
            str(mix_file), is_babyslakh=is_babyslakh
        )
        split_name = "training" if split_name == "train" else split_name
        try:
            per_split[split_name][track_key] = {
                "bpm": get_bpm(track_dir),
                "midi_program_names": get_midi_program_names(track_dir),
                "track_path": str(mix_file),
            }
        except Exception as err:
            print(f"Failed on {mix_file}: {err}")
    return per_split

In [22]:
# Scan the BabySlakh root and rebuild `condition_data` from it. This replaces
# whatever `condition_data` held before (e.g. the JSON loaded earlier).
babyslakh_paths = get_babyslakh_paths()
condition_data = get_condition_data(babyslakh_paths, is_babyslakh=True)

100%|██████████| 20/20 [00:02<00:00,  6.83it/s]


In [25]:
# Directories that receive the train/test `data.jsonl` manifests written by
# prepare_data() below; prepare_data() creates them if they do not exist.
train_dir = Path("/data/matt/ac_bs/train")
test_dir = Path("/data/matt/ac_bs/test")


def write_jsonl(data: list[dict], file_path: Path) -> None:
    """Write ``data`` to ``file_path`` as JSON Lines, one object per line.

    Any existing file at ``file_path`` is overwritten.
    """
    with open(file_path, "w") as out:
        out.writelines(f"{json.dumps(record)}\n" for record in data)


def _build_manifest_entry(
    track_info: Dict[str, Any], expected_sr: int, file_extension: str
) -> Dict[str, Any]:
    """Analyse one track's audio file and return its manifest entry."""
    path = Path(track_info["track_path"])
    # sr=None keeps the file's native sample rate. librosa's default would
    # resample to 22050 Hz, and that rate would then be written to the manifest
    # even though the file on disk is unchanged.
    y, native_sr = librosa.load(path, sr=None)
    if native_sr != expected_sr:
        print(
            f"Sample rate mismatch for {path}: "
            f"expected {expected_sr}, found {native_sr}"
        )
    chroma = librosa.feature.chroma_stft(y=y, sr=native_sr)
    # "key" is the index of the chroma row with the largest summed value.
    key = np.argmax(np.sum(chroma, axis=1))
    length = librosa.get_duration(y=y, sr=native_sr)
    return {
        "key": str(key),
        "sample_rate": int(native_sr),
        "file_extension": file_extension,
        "description": "",
        "keywords": "",
        "duration": float(length),
        "bpm": track_info["bpm"],
        "genre": "",
        "title": "",
        "name": "",
        "instrument": ", ".join(track_info["midi_program_names"]),
        "moods": [],
        "path": str(path),
    }


def prepare_data(
    train_dir: Path,
    test_dir: Path,
    train_split_ratio: float = 0.8,
    sr: int = BABYSLAKH_SAMPLE_RATE,
    file_extension: str = "wav",
    conditions: Optional[Dict[str, Any]] = None,
):
    """Write train/test ``data.jsonl`` manifests for the tracks in ``conditions``.

    Every track is loaded at its native sample rate and described by one
    manifest entry. Each entry is then assigned to the train set with
    probability ``train_split_ratio``, otherwise to the test set. The split
    keys inside ``conditions`` are not used for this assignment.

    Args:
        train_dir: Directory receiving the training ``data.jsonl`` (created if missing).
        test_dir: Directory receiving the test ``data.jsonl`` (created if missing).
        train_split_ratio: Probability that a track lands in the train set.
        sr: Sample rate the audio is expected to have. The manifest always
            records the rate actually found in the file; a message is printed
            when the two differ.
        file_extension: Value stored in each entry's ``file_extension`` field.
        conditions: ``{split: {track_id: info}}`` mapping as produced by
            ``get_condition_data``. When omitted, the notebook-level
            ``condition_data`` variable is used.
    """
    if conditions is None:
        # Backward-compatible fallback to the notebook global.
        conditions = condition_data

    train_dir.mkdir(parents=True, exist_ok=True)
    test_dir.mkdir(parents=True, exist_ok=True)

    train_data = []
    test_data = []

    for split_data in conditions.values():
        for track_info in tqdm(split_data.values(), total=len(split_data)):
            entry = _build_manifest_entry(track_info, sr, file_extension)
            if random.random() < train_split_ratio:
                train_data.append(entry)
            else:
                test_data.append(entry)

    print(f"train size: {len(train_data)}, test size: {len(test_data)}")
    write_jsonl(train_data, train_dir / "data.jsonl")
    write_jsonl(test_data, test_dir / "data.jsonl")

In [26]:
prepare_data(train_dir, test_dir)

100%|██████████| 12/12 [00:08<00:00,  1.36it/s]
100%|██████████| 8/8 [00:06<00:00,  1.21it/s]

train size: 15, test size: 5





## run training with dora

In [34]:
# Shell command for a MusicGen run launched through dora, starting from the
# pretrained facebook/musicgen-small weights (continue_from) on GPUs 4-7.
# The trailing backslashes sit inside a non-raw Python string, so Python drops
# each backslash-newline pair and the shell receives one long line.
# NOTE(review): dset=audio/babyslakh presumably names a dataset config that
# points at the data.jsonl manifests written earlier - confirm it exists.
command = """\
CUDA_VISIBLE_DEVICES=4,5,6,7 dora -P audiocraft run \
  solver=musicgen/musicgen_base_32khz \
  +model.lm.model_scale=small \
  continue_from=//pretrained/facebook/musicgen-small \
  conditioner=text2music \
  dset=audio/babyslakh \
  dataset.num_workers=2 \
  dataset.valid.num_samples=1 \
  dataset.batch_size=2 \
  schedule.cosine.warmup=8 \
  optim.optimizer=adamw \
  optim.lr=1e-4 \
  optim.epochs=2 \
  optim.updates_per_epoch=100 \
  optim.adam.weight_decay=0.01 \
  generate.lm.prompted_samples=False \
  generate.lm.gen_gt_samples=True
"""

In [35]:
!{command}

Dora directory: /tmp/audiocraft_matt
Traceback (most recent call last):
  File "/data/matt/miniconda3/envs/cse253/bin/dora", line 10, in <module>
    sys.exit(main())
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/__main__.py", line 170, in main
    args.action(args, main)
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/run.py", line 51, in run_action
    xp = main.get_xp(args.argv)
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/hydra.py", line 190, in get_xp
    delta += self._get_delta(base, cfg)
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/hydra.py", line 297, in _get_delta
    for diff in _compare_config(init, other):
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/hydra.py", line 75, in _compare_config
    yield from _compare_config(ref_value, other_value, path)
  File "/data/matt/miniconda3/envs/cse253/lib/python3.9/site-packages/dora/hydra.p

## check generations from fit loop

In [6]:
samples_dir = Path("/tmp/audiocraft_matt/xps/ed9b1b62/samples")

In [7]:
os.listdir(samples_dir / "2")

['44934f915e9b73ce8431dcd1dbb6aba94fda896c_unprompted_description=none.json',
 '44934f915e9b73ce8431dcd1dbb6aba94fda896c_unprompted_description=none.wav',
 '75c79b8b41528b9679f23da52b9f21ef0b8499d0_unprompted_description=none.wav',
 '75c79b8b41528b9679f23da52b9f21ef0b8499d0_unprompted_description=none.json']

In [8]:
from IPython.display import Audio, display

In [9]:
# Pick one file from the run's "reference" samples folder.
# os.listdir order is arbitrary, so lsrd[0] is not a stable choice, and it
# raises IndexError if the folder is empty.
ref_dir = samples_dir / "reference"
lsrd = os.listdir(ref_dir)
ref_file = ref_dir / lsrd[0]
# display(Audio(ref_file, autoplay=True))

In [10]:
# Path to one generated sample inside the "2" samples folder (presumably the
# second epoch - confirm); print it together with whether it exists on disk.
audio_file = (
    samples_dir
    / "2"
    / "44934f915e9b73ce8431dcd1dbb6aba94fda896c_unprompted_description=none.wav"
)
print(audio_file, audio_file.exists())
# display(Audio(filename=audio_file))

/tmp/audiocraft_matt/xps/ed9b1b62/samples/2/44934f915e9b73ce8431dcd1dbb6aba94fda896c_unprompted_description=none.wav True


## export fine-tuned model params

In [4]:
# Root folder for exported checkpoints, and the subfolder that receives the
# exported files of the v1 fine-tune.
checkpoints_dir = Path("/data/matt/mg_checkpoints")
v1_checkpoints_dir = checkpoints_dir / "v1/finetune"

In [57]:
# Exporting .bin files from a training run:

from audiocraft import train
from audiocraft.utils import export

# Signature of the dora experiment to export (matches the xps/<sig> folder
# used for `samples_dir` above).
sig = "ed9b1b62"

# from https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md#importing--exporting-models
xp = train.main.get_xp_from_sig(sig)
v1_checkpoints_dir.mkdir(parents=True, exist_ok=True)
# Export the run's checkpoint.th to state_dict.bin in the fine-tune folder.
# export.export_lm(xp.folder / 'checkpoint.th', '/data/matt/mg_checkpoints/v1/finetune/state_dict.bin')
export.export_lm(xp.folder / "checkpoint.th", v1_checkpoints_dir / "state_dict.bin")
# Export the pretrained facebook/encodec_32khz compression model next to it.
# export.export_pretrained_compression_model('facebook/encodec_32khz', '/data/matt/mg_checkpoints/v1/finetune/compression_state_dict.bin')
export.export_pretrained_compression_model(
    "facebook/encodec_32khz", v1_checkpoints_dir / "compression_state_dict.bin"
)

## Generate New Samples

In [3]:
torch.cuda.set_device(4)

In [None]:
# Load the stock "small" MusicGen model as the baseline. The fine-tuned model
# is loaded from v1_checkpoints_dir in a later cell; the lines below are the
# earlier, now disabled, variant of that.
# musicgen = MusicGen.get_pretrained(v1_checkpoints_dir)
baseline = MusicGen.get_pretrained("small")
# musicgen.set_generation_params(duration=16)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Output folders for samples generated by the baseline and the fine-tuned model.
BASELINE_OUTPUT_DIR = Path("/data/matt/mg_baseline_output")
FINETUNE_OUTPUT_DIR = Path("/data/matt/mg_finetune_output")


def unconditional_generate_wrapper(
    model: MusicGen,
    duration: int = 16,
    num_samples: int = 32,
    output_dir: Path = BASELINE_OUTPUT_DIR,
    batch_size: int = 4,
):
    """Generate ``num_samples`` unconditional clips and write them to ``output_dir``.

    Clips are produced in batches of at most ``batch_size`` and saved through
    ``audio_write`` under the stems ``sample_0``, ``sample_1``, ... using the
    ``"loudness"`` strategy. ``output_dir`` is created if it does not exist.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    model.set_generation_params(duration=duration)
    for batch_start in tqdm(range(0, num_samples, batch_size)):
        # The last batch may be smaller than batch_size.
        remaining = num_samples - batch_start
        generated = model.generate_unconditional(
            num_samples=min(batch_size, remaining)
        ).cpu()
        for offset, waveform in enumerate(generated):
            stem = output_dir / f"sample_{batch_start + offset}"
            audio_write(stem, waveform, model.sample_rate, strategy="loudness")

In [20]:
# generate baseline
unconditional_generate_wrapper(
    baseline,
    duration=16,
    num_samples=32,
    output_dir=BASELINE_OUTPUT_DIR,
    batch_size=4,
)

  0%|          | 0/8 [00:00<?, ?it/s]CLIPPING /data/matt/mg_baseline_output/sample_0.wav happening with proba (a bit of clipping is okay): 0.0003828124899882823 maximum scale:  1.4274364709854126
CLIPPING /data/matt/mg_baseline_output/sample_1.wav happening with proba (a bit of clipping is okay): 0.0004394531133584678 maximum scale:  1.0900932550430298
CLIPPING /data/matt/mg_baseline_output/sample_2.wav happening with proba (a bit of clipping is okay): 0.0003632812586147338 maximum scale:  1.246445655822754
CLIPPING /data/matt/mg_baseline_output/sample_3.wav happening with proba (a bit of clipping is okay): 0.0019199218368157744 maximum scale:  1.6721266508102417
 12%|█▎        | 1/8 [00:22<02:39, 22.83s/it]CLIPPING /data/matt/mg_baseline_output/sample_4.wav happening with proba (a bit of clipping is okay): 0.0010078124469146132 maximum scale:  1.7724496126174927
CLIPPING /data/matt/mg_baseline_output/sample_5.wav happening with proba (a bit of clipping is okay): 0.004183593671768904 m

In [5]:
# generate from fine-tuned model
finetuned_model = MusicGen.get_pretrained(v1_checkpoints_dir)

In [8]:
# generate unconditional
unconditional_generate_wrapper(
    finetuned_model,
    duration=16,
    num_samples=32,
    output_dir=FINETUNE_OUTPUT_DIR,
    batch_size=4,
)

  0%|          | 0/8 [00:00<?, ?it/s]CLIPPING /data/matt/mg_finetune_output/sample_1 happening with proba (a bit of clipping is okay): 0.0019277343526482582 maximum scale:  1.7483588457107544
CLIPPING /data/matt/mg_finetune_output/sample_2 happening with proba (a bit of clipping is okay): 0.0013710937928408384 maximum scale:  1.5394408702850342
CLIPPING /data/matt/mg_finetune_output/sample_3 happening with proba (a bit of clipping is okay): 0.00041796875302679837 maximum scale:  1.3901313543319702
 12%|█▎        | 1/8 [00:22<02:40, 22.88s/it]CLIPPING /data/matt/mg_finetune_output/sample_4 happening with proba (a bit of clipping is okay): 1.9531250927684596e-06 maximum scale:  1.0083940029144287
CLIPPING /data/matt/mg_finetune_output/sample_5 happening with proba (a bit of clipping is okay): 0.0016152344178408384 maximum scale:  1.5649049282073975
CLIPPING /data/matt/mg_finetune_output/sample_6 happening with proba (a bit of clipping is okay): 0.0004648437607102096 maximum scale:  1.339