In [3]:
import copy
import pathlib
import subprocess

import music21

In [4]:
class ABCMusicConverter:
    """Convert a tune in ABC notation to MIDI, WAV and MP3 files.

    The tune is parsed once with ``music21.converter.parse``. ``to_midi`` writes a
    MIDI file, ``to_wav`` renders that MIDI file by running the ``fluidsynth``
    executable, and ``to_mp3`` encodes the WAV file by running ``ffmpeg``. Each
    step creates the previous intermediate file when it does not exist yet.
    """

    # Parsed tune, as returned by music21.converter.parse
    score: music21.stream.base.Score | None
    # Directory used for default output paths
    destination: pathlib.Path
    # Base name (without extension) used for default output paths
    filename: str
    # Paths of the most recently written outputs; None until the step succeeded
    midi_file: pathlib.Path | None
    wav_file: pathlib.Path | None
    mp3_file: pathlib.Path | None

    # Lower-cased attribute name -> object, for everything in music21.instrument
    # that exposes `bestName`; used to look up instruments by name in to_midi
    instruments: dict = {
        k.lower(): v
        for k, v in vars(music21.instrument).items()
        if hasattr(v, "bestName")
    }

    def __init__(
        self,
        abc: str,
        filename: str | None = None,
        destination: str | pathlib.Path = ".",
    ):
        """Parse the tune and remember where outputs should go.

        Args:
            abc: Path to an ABC file or ABC text; passed to ``music21.converter.parse``.
            filename: Base name for default output files. When omitted, the stem
                of ``abc`` is used, which only makes sense if ``abc`` is a path.
            destination: Directory for default output files.

        Raises:
            ValueError: If ``filename`` is omitted and ``abc`` is multi-line text.
        """
        if filename is None:
            # Multi-line input is ABC text, not a path, so there is no stem to reuse
            if "\n" in str(abc):
                raise ValueError(
                    "filename is required when abc is ABC text rather than a file path"
                )
            filename = pathlib.Path(abc).stem

        self.destination = pathlib.Path(destination)
        self.filename = filename

        self.midi_file = None
        self.wav_file = None
        self.mp3_file = None

        self.score = music21.converter.parse(abc)

    def _output_path(
        self, explicit: str | pathlib.Path | None, suffix: str
    ) -> pathlib.Path:
        """Return ``explicit`` as a Path, or the default path with ``suffix``."""
        if explicit is None:
            return (self.destination / self.filename).with_suffix(suffix)
        return pathlib.Path(explicit)

    def to_midi(
        self,
        midi_file: str | pathlib.Path | None = None,
        instrument: str | None = None,
        tempo: int | None = None,
    ) -> pathlib.Path:
        """Write the tune to a MIDI file.

        Args:
            midi_file: Output path; defaults to ``<destination>/<filename>.mid``.
            instrument: Case-insensitive key of ``instruments``; inserted at offset 0
                of every part.
            tempo: Number passed to ``music21.tempo.MetronomeMark``, inserted at
                offset 0 of the score.

        Returns:
            Path of the written MIDI file (also stored in ``self.midi_file``).

        Raises:
            ValueError: If ``instrument`` is not a key of ``instruments``.
        """
        target = self._output_path(midi_file, ".mid")

        # Resolve the instrument before touching any file so a typo fails cleanly
        instrument_cls = None
        if instrument is not None:
            instrument_cls = self.instruments.get(instrument.lower())
            if instrument_cls is None:
                raise ValueError(
                    f"Unknown instrument {instrument!r}; "
                    "see ABCMusicConverter.instruments for valid names"
                )

        # Delete midi file if exists
        target.unlink(missing_ok=True)

        # Work on a copy so repeated calls do not pile up instruments and tempo
        # marks in the parsed score
        score = self.score
        if instrument_cls is not None or tempo is not None:
            score = copy.deepcopy(self.score)

        if instrument_cls is not None:
            for part in score.parts:
                part.insert(0, instrument_cls())

        if tempo is not None:
            score.insert(0, music21.tempo.MetronomeMark(number=tempo))

        # Convert to midi
        mf = music21.midi.translate.music21ObjectToMidiFile(score)
        mf.open(target, "wb")
        try:
            mf.write()
        finally:
            mf.close()

        # Only record the path once the file has actually been written
        self.midi_file = target
        return self.midi_file

    def to_wav(
        self,
        wav_file: str | pathlib.Path | None = None,
        sound_font: str | pathlib.Path = "GeneralUser-GS.sf2",
        sampling_rate: int = 16000,
        **kwargs
    ) -> pathlib.Path:
        """Render the MIDI file to WAV by running ``fluidsynth``.

        Args:
            wav_file: Output path; defaults to ``<destination>/<filename>.wav``.
            sound_font: Path to the sound font file handed to fluidsynth.
            sampling_rate: Value passed to fluidsynth's ``-r`` option.
            **kwargs: Forwarded to ``to_midi``. When given, the MIDI file is
                regenerated even if one already exists, so the options take effect.

        Returns:
            Path of the written WAV file (also stored in ``self.wav_file``).

        Raises:
            AssertionError: If ``sound_font`` does not exist.
            subprocess.CalledProcessError: If fluidsynth exits with a non-zero status.
        """
        # Create midi file if necessary
        if self.midi_file is None:
            self.to_midi(**kwargs)
        elif kwargs:
            # Re-render at the same location unless the caller names another one
            self.to_midi(**{"midi_file": self.midi_file, **kwargs})

        target = self._output_path(wav_file, ".wav")

        # Check if sound_font exists (before removing any previous output)
        sound_font = pathlib.Path(sound_font)
        assert sound_font.exists(), f"Sound font not found: {sound_font}"

        # Remove file if exists
        target.unlink(missing_ok=True)

        # Convert to wav
        command = [
            "fluidsynth",
            "-ni",
            str(sound_font),
            str(self.midi_file),
            "-F",
            str(target),
            "-r",
            str(sampling_rate),
        ]

        subprocess.run(command, check=True, capture_output=True)

        # Only record the path once fluidsynth has succeeded
        self.wav_file = target
        return self.wav_file

    def to_mp3(
        self, mp3_file: str | pathlib.Path | None = None, **kwargs
    ) -> pathlib.Path:
        """Encode the WAV file to MP3 by running ``ffmpeg``.

        Args:
            mp3_file: Output path; defaults to ``<destination>/<filename>.mp3``.
            **kwargs: Forwarded to ``to_wav``. When given, the WAV file is
                regenerated even if one already exists, so the options take effect.

        Returns:
            Path of the written MP3 file (also stored in ``self.mp3_file``).

        Raises:
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        """
        # Create wave file if necessary
        if self.wav_file is None:
            self.to_wav(**kwargs)
        elif kwargs:
            # Re-render at the same location unless the caller names another one
            self.to_wav(**{"wav_file": self.wav_file, **kwargs})

        target = self._output_path(mp3_file, ".mp3")

        # Remove file if exists
        target.unlink(missing_ok=True)

        command = ["ffmpeg", "-i", str(self.wav_file), str(target)]

        subprocess.run(command, check=True, capture_output=True)

        # Only record the path once ffmpeg has succeeded
        self.mp3_file = target
        return self.mp3_file

In [167]:
# Render the same tune with two instruments at the same tempo. The violin call goes
# through to_mp3, which also creates cooleys.mid and cooleys.wav as intermediates;
# cooleys.wav is the file the CLAP cells load later.
ABCMusicConverter("cooleys.abc", "cooleys").to_mp3(instrument="violin", tempo=180)
ABCMusicConverter("cooleys.abc", "cooleys_flute").to_wav(instrument="flute", tempo=180)

PosixPath('cooleys_flute.wav')

# CLAP

In [4]:
import torch
from laion_clap import CLAP_Module

# Load pretrained CLAP model
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): presumably `enable_fusion` toggles LAION-CLAP's feature fusion for
# variable-length audio rather than anything specific to "audio-only" use — confirm
# against the library documentation.
model = CLAP_Module(enable_fusion=False)
model.load_ckpt()  # loads the default pretrained checkpoint (see the log output below)
model = model.to(device)

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Load our best checkpoint in the paper.
The checkpoint is already downloaded
Load Checkpoint...
logit_scale_a 	 Loaded
logit_scale_t 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_real.weight 	 Loaded
audio_branch.spectrogram_extractor.stft.conv_imag.weight 	 Loaded
audio_branch.logmel_extractor.melW 	 Loaded
audio_branch.bn0.weight 	 Loaded
audio_branch.bn0.bias 	 Loaded
audio_branch.patch_embed.proj.weight 	 Loaded
audio_branch.patch_embed.proj.bias 	 Loaded
audio_branch.patch_embed.norm.weight 	 Loaded
audio_branch.patch_embed.norm.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm1.weight 	 Loaded
audio_branch.layers.0.blocks.0.norm1.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.relative_position_bias_table 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.qkv.bias 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.weight 	 Loaded
audio_branch.layers.0.blocks.0.attn.proj.bias 	 Loaded
audio_branch.layers.0.blocks.0.norm2.we

In [5]:
import torchaudio

# Exploratory load of the violin render; `waveform` and `sr` are not used by any
# later cell visible in this notebook (load_audio below does its own loading).
waveform, sr = torchaudio.load("cooleys.wav")

In [6]:
import torchaudio


def load_audio(filepath, target_sr=16000):
    """Load an audio file as a mono waveform with a leading dimension of size 1.

    Args:
        filepath: Path handed to ``torchaudio.load``.
        target_sr: Sample rate the waveform is resampled to when the file's own
            rate differs.

    Returns:
        The waveform averaged over dim 0, with that dim kept as size 1. Assuming
        ``torchaudio.load`` returns ``(channels, samples)``, the result is
        ``(1, samples)``.

    NOTE(review): LAION-CLAP's published examples load audio at 48 kHz; confirm
    that 16 kHz is intended before feeding the result to the CLAP model.
    """
    signal, native_sr = torchaudio.load(filepath)
    needs_resampling = native_sr != target_sr
    if needs_resampling:
        signal = torchaudio.functional.resample(signal, native_sr, target_sr)
    # Average over dim 0 and keep it, which is the same as mean + unsqueeze(0)
    return signal.mean(dim=0, keepdim=True)

In [7]:
# Violin render of cooleys.abc as a mono waveform with a leading dim of size 1
audio_tensor = load_audio("cooleys.wav")

In [8]:
# Embed the violin render; gradients are not needed for this comparison
with torch.no_grad():
    embedding = model.get_audio_embedding_from_data(audio_tensor, use_tensor=True)

In [9]:
# Sanity check: one 512-dimensional embedding for the single clip (see output)
embedding.shape

torch.Size([1, 512])

In [10]:
# Same tune rendered with flute: load and embed it the same way as the violin clip
audio_tensor2 = load_audio("cooleys_flute.wav")
with torch.no_grad():
    embedding2 = model.get_audio_embedding_from_data(audio_tensor2, use_tensor=True)

In [11]:
# NOTE(review): this displays the whole embedding tensor; a slice or summary
# (e.g. shape, norm) would keep the notebook output readable.
embedding2

tensor([[-4.7669e-02,  1.5185e-02,  5.8923e-02, -1.2021e-02,  9.5482e-02,
         -3.4315e-03, -4.8566e-02,  1.8087e-02,  2.7128e-02,  1.4882e-02,
         -6.4931e-02, -5.3605e-03,  4.7362e-02, -1.5416e-03, -5.5228e-03,
         -4.7449e-02, -6.3107e-02, -2.0978e-02, -3.9053e-02,  7.6627e-02,
          2.2313e-02,  1.4599e-01, -1.2977e-02,  4.0569e-03,  1.7864e-02,
         -6.3704e-05,  1.7713e-02, -1.1361e-02,  1.8299e-02, -7.1580e-04,
          5.2289e-02,  1.0640e-01,  9.7967e-03, -7.6950e-02,  1.0185e-01,
         -6.2829e-02, -8.8202e-02, -1.9113e-02, -2.3685e-02,  6.1610e-02,
         -6.7963e-02,  1.8895e-02, -8.2265e-02,  6.9059e-03,  2.3369e-02,
         -5.4911e-02, -1.8842e-02,  9.8239e-03, -2.3676e-02,  3.2094e-03,
          2.8726e-02, -6.8386e-02, -2.8290e-02,  3.8179e-02,  2.3533e-02,
          2.7939e-02, -1.0502e-01, -2.0496e-02,  1.4185e-02,  2.3109e-02,
          1.3942e-03, -1.5327e-02,  2.7790e-02, -3.3877e-02,  7.1842e-03,
         -4.5241e-02,  1.3446e-02, -1.

In [12]:
# Cosine similarity between the violin and flute renders of cooleys.abc
torch.nn.functional.cosine_similarity(embedding, embedding2)

tensor([0.7682], device='cuda:0')

In [13]:
# Violin render of butterfly.abc. The output base name is passed explicitly as the
# second argument, matching the cooleys cell; the result is still butterfly.wav.
ABCMusicConverter("butterfly.abc", "butterfly").to_wav(instrument="violin", tempo=160)

PosixPath('butterfly.wav')

In [14]:
# Flute render of butterfly.abc. The output base name is passed explicitly as the
# second argument, matching the cooleys cell; the result is still butterfly_flute.wav.
ABCMusicConverter("butterfly.abc", "butterfly_flute").to_wav(
    instrument="flute", tempo=160
)

PosixPath('butterfly_flute.wav')

In [15]:
# Violin render of butterfly.abc: load and embed
audio_tensor3 = load_audio("butterfly.wav")
with torch.no_grad():
    embedding3 = model.get_audio_embedding_from_data(audio_tensor3, use_tensor=True)

In [16]:
# Flute render of butterfly.abc: load and embed
audio_tensor4 = load_audio("butterfly_flute.wav")
with torch.no_grad():
    embedding4 = model.get_audio_embedding_from_data(audio_tensor4, use_tensor=True)

In [17]:
# Cosine similarity between the violin and flute renders of butterfly.abc
torch.nn.functional.cosine_similarity(embedding3, embedding4)

tensor([0.8442], device='cuda:0')

In [18]:
import torch
import torch.nn.functional as F


def nt_xent_loss(z1, z2, temperature=0.07):
    """
    NT-Xent contrastive loss over a batch of paired embeddings.

    Row i of ``z1`` and row i of ``z2`` form a positive pair; every other row of
    the combined batch acts as a negative for both of them.

    Args:
        z1: Tensor of shape (N, D) – embeddings from view 1 (e.g., anchors)
        z2: Tensor of shape (N, D) – embeddings from view 2 (e.g., positives)
        temperature: Divisor applied to the cosine similarities before the softmax
    Returns:
        Scalar loss: cross-entropy averaged over all 2N rows
    """
    n_pairs = z1.size(0)
    total = 2 * n_pairs

    # Unit-normalise both views and stack them: rows 0..N-1 are view 1, N..2N-1 view 2
    stacked = torch.cat([F.normalize(z1, dim=1), F.normalize(z2, dim=1)], dim=0)

    # Temperature-scaled cosine similarities between all rows, shape (2N, 2N)
    logits = torch.matmul(stacked, stacked.T) / temperature

    # A row must never pick itself, so the diagonal is pushed to -inf
    self_pairs = torch.eye(total, device=stacked.device).bool()
    logits.masked_fill_(self_pairs, float("-inf"))

    # Row i's positive sits N rows away: i -> i+N for the first half, i -> i-N after
    row_ids = torch.arange(total, device=stacked.device)
    positives = torch.cat([row_ids[n_pairs:], row_ids[:n_pairs]])

    return F.cross_entropy(logits, positives)

In [19]:
# View 1: the violin renders of both tunes, one row per tune
z1 = torch.cat([embedding, embedding3], dim=0)

In [20]:
# View 2: the flute renders of the same tunes, in the same row order as z1
z2 = torch.cat([embedding2, embedding4], dim=0)

In [21]:
# Loss on this tiny batch: with two pairs, each row has one positive and two negatives
nt_xent_loss(z1, z2)

tensor(2.1133, device='cuda:0')

In [13]:
import sqlite3

# Fetch every stored version of the tune with TuneID 9 as a list of 1-tuples.
con = sqlite3.connect("database.db")
try:
    cur = con.execute("SELECT TuneVersion FROM TuneVersions WHERE TuneID = 9")

    res = cur.fetchall()
finally:
    # sqlite3's connection context manager only commits or rolls back; the
    # connection has to be closed explicitly, including when the query fails.
    con.close()

In [15]:
# NOTE(review): the ABC printed here is version 0 (res[0]) but the audio is rendered
# from version 1 (res[1]) — confirm this is intended. It also needs at least two rows.
print(res[0][0])
ABCMusicConverter(res[1][0], "banish_misfortune").to_wav(instrument="violin", tempo=120)

X: 1
T: Banish Misfortune
R: jig
M: 6/8
L: 1/8
K: Dmix
|:fed cAG|A2d cAG|F2D DED|FEF GFG|
AGA cAG|AGA cde|fed cAG|Ad^c d3:|
|:f2d d^cd|f2g agf|e2c cBc|e2f gfe|
f2g agf|e2f gfe|fed cAG|Ad^c d3:|
|:f2g e2f|d2e c2d|ABA GAG|F2F GED|
c3 cAG|AGA cde|fed cAG|Ad^c d3:|


PosixPath('banish_misfortune.wav')