# LSTM pour la GÃ©nÃ©ration Musicale (Format ABC)

# Chargement et exploration des donnÃ©es

In [3]:
from datasets import load_dataset

ds = load_dataset("sander-wood/irishman")

train_data = ds["train"]
val_data = ds["validation"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.json:   0%|          | 0.00/80.0M [00:00<?, ?B/s]

validation.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/214122 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2162 [00:00<?, ? examples/s]

In [4]:
print(f"Nombre de chansons (train) : {len(train_data)}")
print(f"Nombre de chansons (validation) : {len(val_data)}")

Nombre de chansons (train) : 214122
Nombre de chansons (validation) : 2162


In [5]:
train_data[0]

{'abc notation': 'X:1\nL:1/8\nM:4/4\nK:Emin\n|: E2 EF E2 EF | DEFG AFDF | E2 EF E2 B2 |1 efe^d e2 e2 :|2 efe^d e3 B |: e2 ef g2 fe | \n defg afdf |1 e2 ef g2 fe | efe^d e3 B :|2 g2 bg f2 af | efe^d e2 e2 ||',
 'control code': 'S:2\nB:5\nE:5\nB:6\n'}

# PrÃ©traitement des donnÃ©es
Dans cet exercice, vous allez prÃ©parer les donnÃ©es pour qu&#39;elles puissent Ãªtre utilisÃ©es avec un modÃ¨le
RNN.

## Ã‰tape 1 : Extraction des caractÃ¨res uniques

In [6]:
from tqdm import tqdm

In [7]:
texts = []

for song in tqdm(train_data, desc="Collecting training text"):
    texts.append(song["abc notation"])

all_text = "".join(texts)
unique_chars = sorted(set(all_text))

Collecting training text: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:06<00:00, 33812.86it/s]


In [8]:
# Nombre de caractÃ¨res uniques
print(f"Nombre de caractÃ¨res uniques : {len(unique_chars)}")

Nombre de caractÃ¨res uniques : 95


#### c) Pourquoi utiliser des indices plutÃ´t que des caractÃ¨res ?

Les rÃ©seaux de neurones **ne peuvent pas traiter directement des caractÃ¨res**. Ils manipulent uniquement des **valeurs numÃ©riques**.

ðŸ‘‰ On doit donc :

1. **Associer chaque caractÃ¨re Ã  un indice**
2. Convertir les sÃ©quences de texte en **sÃ©quences dâ€™entiers**
3. (Optionnel) Transformer ces indices en **one-hot vectors** ou en **embeddings**

In [None]:
# char2idx = {ch: i for i, ch in enumerate(unique_chars)}
# idx2char = {i: ch for ch, i in char2idx.items()}

# # Exemple de conversion :

# text = "ABC"
# encoded = [char2idx[c] for c in text]
# print(encoded)

## Ã‰tape 2 : Mapping caractÃ¨res-index
Pour convertir les caractÃ¨res en vecteurs numÃ©riques, nous allons crÃ©er une liste et un dictionnaire
permettant de faire les correspondances.

In [9]:
# a) Ã‰crivez un dictionnaire permettant de passer d&#39;un caractÃ¨re Ã  un index.
# b) Ã‰crivez une liste permettant de passer d&#39;un index Ã  un caractÃ¨re.

char2idx = {ch: i for i, ch in enumerate(unique_chars)}
idx2char = {i: ch for ch, i in char2idx.items()}

In [10]:
text = "ABC"
encoded = [char2idx[c] for c in text]
print(encoded)
decoded = [idx2char[i] for i in encoded]
decoded = "".join(decoded)
print(decoded)

[33, 34, 35]
ABC


## Ã‰tape 3 : Vectorisation des chaÃ®nes

In [11]:
def vectorize_string(text, char2idx):
    return [char2idx[ch] for ch in text]

In [12]:
vectorized_song = vectorize_string(train_data[0]["abc notation"], char2idx)
print(vectorized_song[:20])

[56, 26, 17, 0, 44, 26, 17, 15, 24, 0, 45, 26, 20, 15, 20, 0, 43, 26, 37, 77]


## Ã‰tape 4 : Padding des sÃ©quences

### a) Longueur maximale

In [13]:
max_len = max(
    len(song["abc notation"])
    for song in tqdm(train_data, desc="Computing max sequence length")
)

print("")
print("Longueur maximale :", max_len)

Computing max sequence length: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:05<00:00, 38356.56it/s]


Longueur maximale : 2968





### b) Fonction de padding / truncation

In [14]:
PAD_CHAR = " "
pad_idx = char2idx[PAD_CHAR]

def pad_string(text, max_len):
    if len(text) < max_len:
        return text + PAD_CHAR * (max_len - len(text))
    else:
        return text[:max_len]

# CrÃ©ation du dataset PyTorch

## Ã‰tape 1 : PrÃ©paration des donnÃ©es



In [15]:
def prepare_data(dataset):
    """
    PrÃ©pare les donnÃ©es pour un modÃ¨le RNN :
    - extrait le texte
    - construit le vocabulaire
    - crÃ©e les mappings char <-> index
    - calcule la longueur maximale
    - vectorise et pad les sÃ©quences
    """
    # 1. Extraction du texte
    texts = []
    for song in tqdm(dataset, desc="Extracting text"):
        texts.append(song["abc notation"])

    # 2. Vocabulaire
    all_text = "".join(texts)
    unique_chars = sorted(set(all_text))

    char2idx = {ch: i for i, ch in enumerate(unique_chars)}
    idx2char = {i: ch for ch, i in char2idx.items()}

    # 3. Longueur maximale
    max_len = max(
        len(song["abc notation"])
        for song in tqdm(train_data, desc="Computing max sequence length")
    )

    pad_char = " "
    pad_idx = char2idx[pad_char]

    # 4. Vectorisation + padding
    vectorized_texts = []

    for text in tqdm(texts, desc="Vectorizing & padding"):
        if len(text) < max_len:
            text = text + pad_char * (max_len - len(text))
        else:
            text = text[:max_len]

        vectorized_texts.append(
            [char2idx[ch] for ch in text]
        )

    return vectorized_texts, char2idx, idx2char, max_len

### PrÃ©paration des ensembles train / validation

In [16]:
train_sequences, char2idx, idx2char, max_len = prepare_data(train_data)
val_sequences, _, _, _ = prepare_data(val_data)

vocab_size = len(char2idx)

print("")
print("Vocab size :", vocab_size)
print("Max length :", max_len)

Extracting text: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:05<00:00, 38211.99it/s]
Computing max sequence length: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:04<00:00, 43823.47it/s]
Vectorizing & padding: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:30<00:00, 6910.30it/s]
Extracting text: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2162/2162 [00:00<00:00, 35696.49it/s]
Computing max sequence length: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 214122/214122 [00:04<00:00, 43885.08it/s]
Vectorizing & padding: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2162/2162 [00:00<00:00, 9857.95it/s]


Vocab size : 95
Max length : 2968





## Ã‰tape 2 : Dataset et DataLoader

In [17]:
import torch
from torch.utils.data import Dataset, DataLoader

In [40]:
class MusicDataset(Dataset):
    def __init__(self, sequences, seq_len=200, pad_idx=None):
        """
        sequences: list of vectorized sequences (list of indices)
        seq_len: max length of sequence for training
        pad_idx: index used for padding (e.g., index of " " or special token)
        """
        # self.sequences = sequences
        self.seq_len = seq_len
        self.pad_idx = pad_idx if pad_idx is not None else 0

        # Truncate or pad sequences
        self.sequences = []
        for seq in sequences:
            if len(seq) < seq_len:
                seq = seq + [self.pad_idx] * (seq_len - len(seq))
            else:
                seq = seq[:seq_len]
            self.sequences.append(seq)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]

        x = torch.tensor(seq[:-1], dtype=torch.long)
        y = torch.tensor(seq[1:], dtype=torch.long)

        return x, y

In [19]:
# DataLoaders (batch size = 8)

# batch_size = 8

# train_dataset = MusicDataset(train_sequences)
# val_dataset = MusicDataset(val_sequences)

# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [41]:
seq_len = 200  # truncate sequences to 200 characters
batch_size = 8

# If your padding index is the space character
pad_idx = char2idx[" "]

train_dataset = MusicDataset(train_sequences, seq_len=seq_len, pad_idx=pad_idx)
val_dataset   = MusicDataset(val_sequences, seq_len=seq_len, pad_idx=pad_idx)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size)

In [42]:
x_batch, y_batch = next(iter(train_loader))

print("Input batch shape :", x_batch.shape)
print("Target batch shape :", y_batch.shape)

print("\nPremiÃ¨re sÃ©quence d'entrÃ©e (indices) :")
print(x_batch[0][:20])

print("\nPremiÃ¨re sÃ©quence cible (indices) :")
print(y_batch[0][:20])

Input batch shape : torch.Size([8, 199])
Target batch shape : torch.Size([8, 199])

PremiÃ¨re sÃ©quence d'entrÃ©e (indices) :
tensor([56, 26, 17, 23, 19, 19, 23, 21,  0, 44, 26, 17, 15, 24,  0, 45, 26, 20,
        15, 20])

PremiÃ¨re sÃ©quence cible (indices) :
tensor([26, 17, 23, 19, 19, 23, 21,  0, 44, 26, 17, 15, 24,  0, 45, 26, 20, 15,
        20,  0])


In [43]:
print("EntrÃ©e :", "".join(idx2char[i.item()] for i in x_batch[0][:20]))
print("Cible  :", "".join(idx2char[i.item()] for i in y_batch[0][:20]))

EntrÃ©e : X:173375
L:1/8
M:4/4
Cible  : :173375
L:1/8
M:4/4



Nous avons implÃ©mentÃ© un Dataset PyTorch personnalisÃ© permettant de gÃ©rer des sÃ©quences musicales en notation ABC.
Chaque Ã©lÃ©ment du dataset est constituÃ© dâ€™une sÃ©quence dâ€™entrÃ©e et de sa sÃ©quence cible dÃ©calÃ©e dâ€™un pas, ce qui permet au modÃ¨le RNN dâ€™apprendre Ã  prÃ©dire le caractÃ¨re suivant Ã  chaque pas de temps.

# ImplÃ©mentation du modÃ¨le

## Ã‰tape 1 : Architecture du modÃ¨le

In [44]:
import torch
import torch.nn as nn

In [45]:
class MusicRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim
        )

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True
        )

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden=None):
        """
        x : (batch_size, seq_len)
        """
        x = self.embedding(x)              # (batch, seq, embedding_dim)
        out, hidden = self.lstm(x, hidden) # (batch, seq, hidden_size)
        logits = self.fc(out)              # (batch, seq, vocab_size)

        return logits, hidden

## Ã‰tape 2 : Boucle d&#39;entraÃ®nement

In [46]:
# HyperparamÃ¨tres
num_training_iterations = 3000
batch_size = 256
learning_rate = 5e-3
embedding_dim = 256
hidden_size = 1024

In [47]:
# Accuracy caractÃ¨re-par-caractÃ¨re
def accuracy_fn(logits, targets):
    preds = torch.argmax(logits, dim=-1)
    correct = (preds == targets).float()
    return correct.mean()

### Fonction dâ€™entraÃ®nement

In [48]:
torch.backends.cudnn.benchmark = True

In [49]:
from torch.utils.tensorboard import SummaryWriter

In [50]:
def train_model(
    model,
    train_loader,
    val_loader,
    num_iterations,
    learning_rate,
    device,
    patience=5,
    log_every=50
    # validate_every=200
):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    writer = SummaryWriter("runs/music_rnn")

    best_val_loss = float("inf")
    early_stop_counter = 0
    iteration = 0

    model.to(device)

    while iteration < num_iterations:
        model.train()
        train_bar = tqdm(train_loader, desc=f"Training (iter {iteration})", leave=False)

        for x, y in train_bar:
            if iteration >= num_iterations:
                break

            x, y = x.to(device), y.to(device)

            optimizer.zero_grad(set_to_none=True)
            logits, _ = model(x)

            loss = criterion(
                logits.view(-1, logits.size(-1)),
                y.view(-1)
            )

            # acc = accuracy_fn(logits, y)

            loss.backward()
            optimizer.step()

            # writer.add_scalar("Train/Loss", loss.item(), iteration)
            # writer.add_scalar("Train/Accuracy", acc.item(), iteration)

            # train_bar.set_postfix(loss=loss.item(), acc=acc.item())

            if iteration % log_every == 0:
                acc = accuracy_fn(logits, y)
                writer.add_scalar("Train/Loss", loss.item(), iteration)
                writer.add_scalar("Train/Accuracy", acc.item(), iteration)
                train_bar.set_postfix(loss=loss.item(), acc=acc.item())
            else:
                train_bar.set_postfix(loss=loss.item())

            iteration += 1

        # ===== VALIDATION =====
        # if iteration % validate_every == 0:
        model.eval()
        val_loss = 0.0
        val_acc = 0.0

        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits, _ = model(x)

                loss = criterion(
                    logits.view(-1, logits.size(-1)),
                    y.view(-1)
                )

                acc = accuracy_fn(logits, y)
                val_loss += loss.item()
                val_acc += acc.item()

        val_loss /= len(val_loader)
        val_acc /= len(val_loader)

        writer.add_scalar("Val/Loss", val_loss, iteration)
        writer.add_scalar("Val/Accuracy", val_acc, iteration)

        print(f"[Validation  @ {iteration}] Loss={val_loss:.4f} | Acc={val_acc:.4f}")

        # ===== EARLY STOPPING =====
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), "best_music_rnn.pt")
            print("âœ… Best model saved")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("â›” Early stopping triggered")
                break

    writer.close()

### EntraÃ®nement

In [51]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device : {device}")

model = MusicRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size
)

train_model(
    model,
    train_loader,
    val_loader,
    num_training_iterations,
    learning_rate,
    device
)

device : cuda




[Validation  @ 3000] Loss=5.7984 | Acc=0.1745
âœ… Best model saved


# GÃ©nÃ©ration de musique

In [52]:
import torch.nn.functional as F

In [53]:
def generate_music(
    model,
    start_sequence,
    char2idx,
    idx2char,
    length=200,
    temperature=1.0,
    device="cpu"
):
    model.eval()

    input_seq = torch.tensor(
        [char2idx[ch] for ch in start_sequence],
        dtype=torch.long
    ).unsqueeze(0).to(device)

    generated = start_sequence
    hidden = None

    for _ in range(length):
        logits, hidden = model(input_seq, hidden)
        logits = logits[:, -1, :] / temperature

        probs = F.softmax(logits, dim=-1)
        next_idx = torch.multinomial(probs, 1).item()
        next_char = idx2char[next_idx]

        generated += next_char
        input_seq = torch.tensor([[next_idx]], dtype=torch.long).to(device)

    return generated

In [55]:
model.load_state_dict(torch.load("best_music_rnn.pt"))

start = "X:4\nM:3/4\nK:A\n"
generated_song = generate_music(
    model,
    start,
    char2idx,
    idx2char,
    length=200,
    temperature=0.8,
    device=device
)

print(generated_song)

X:4
M:3/4
K:A
 A3 A A :: g | f2 ef | g2 fe | f2 ed | e3 e | f4 | d2 ef | g2 fe | f2 f2 | e2 e/^d/e/d/ | c4 | B3 z | d2 B3 | A2 z2 | F2 D2 | F2 c2 | A4 G2 | F2 F2 G2 :| A2 c2 | e3 e | 
 e2 a2 | e2 a2 | gf ed | c2 A2


# Bonus â€” Augmentation de donnÃ©es

### Transposition (concept)

* Modifier `K:C â†’ K:D`
* DÃ©caler les notes (`Câ†’D`, `Dâ†’E`, â€¦)

### Simplification rythmique

* `1/8 â†’ 1/4`
* Suppression de dÃ©corations (`!fermata!`)

In [None]:
def transpose_abc(text):
    mapping = {"C": "D", "D": "E", "E": "F#", "F": "G", "G": "A", "A": "B", "B": "c"}
    return "".join(mapping.get(ch, ch) for ch in text)

# Conclusion finale

> Un modÃ¨le LSTM avec embeddings a Ã©tÃ© implÃ©mentÃ© pour la gÃ©nÃ©ration de musique en notation ABC.
> Lâ€™entraÃ®nement repose sur une prÃ©diction caractÃ¨re-par-caractÃ¨re avec une sÃ©quence cible dÃ©calÃ©e.
> Le modÃ¨le est entraÃ®nÃ© avec early stopping et monitoring via TensorBoard, puis utilisÃ© pour gÃ©nÃ©rer
> de nouvelles partitions musicales par Ã©chantillonnage probabiliste.