In [1]:
# Imports

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext.vocab import vocab
from collections import OrderedDict
import random
import math
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Token inventory used by the transformer.
#
# Four token families are defined (NOTE, DUR, TIM, VEL). Every token is a
# one-letter prefix followed by an integer: n<i>, d<i>, t<i>, v<i>.
# NOTE(review): presumably pitch / duration / time offset / velocity of a
# note event — confirm against the data-preparation code.

# Number of distinct values in each family.
NOTE_SIZE = 128
DUR_SIZE = 160
TIM_SIZE = 1000
VEL_SIZE = 128

NOTE_TOKS = ["n" + str(k) for k in range(NOTE_SIZE)]
DUR_TOKS = ["d" + str(k) for k in range(DUR_SIZE)]
TIM_TOKS = ["t" + str(k) for k in range(TIM_SIZE)]
VEL_TOKS = ["v" + str(k) for k in range(VEL_SIZE)]

# Beginning-of-sequence marker.
BOS_TOK = "BOS"

# "dummy" only occupies slot 0 so that every real token is paired with a
# value >= 1 in DICT, which is what the vocab() helper requires.
VOCAB = ["dummy", BOS_TOK, *NOTE_TOKS, *DUR_TOKS, *TIM_TOKS, *VEL_TOKS]

# (token, position-in-VOCAB) pairs, in VOCAB order.
DICT = list(zip(VOCAB, range(len(VOCAB))))


In [3]:
# Build the vocabulary object and its index -> token lookup table.
# torchtext's vocab() treats the OrderedDict values as token frequencies and
# keeps only tokens whose frequency is at least min_freq (1 is the library
# default, written out here).
# NOTE(review): "dummy" carries the value 0, so it is presumably filtered out
# and "BOS" ends up at index 0 — consistent with the 1417-row embedding shown
# in the model summary further down; confirm.
custom_vocab = vocab(OrderedDict(DICT), min_freq=1)
itos_vocab = custom_vocab.get_itos()

# Number of tokens actually kept in the vocabulary.
vocab_size = len(itos_vocab)

Les deux cellules suivantes définissent l'architecture utilisée, afin que les poids enregistrés dans le fichier du modèle puissent y être chargés.

In [4]:


class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal table to token embeddings, then apply dropout.

    Adapted from https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    Args:
        dim_model: width of the embeddings the table is added to.
        dropout_p: dropout probability applied to the sum.
        max_len: number of rows precomputed in the table.
    """

    def __init__(self, dim_model, dropout_p, max_len):
        super().__init__()
        self.dropout = nn.Dropout(dropout_p)

        # Column vector 0, 1, ..., max_len - 1 with shape (max_len, 1).
        position = torch.arange(0, max_len, dtype=torch.float).view(-1, 1)
        # 1 / 10000^(2i / dim_model): one inverse frequency per (sin, cos) pair.
        inv_freq = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model)
        angles = position * inv_freq

        table = torch.zeros(max_len, dim_model)
        # PE(pos, 2i)     = sin(pos / 10000^(2i / dim_model))
        table[:, 0::2] = torch.sin(angles)
        # PE(pos, 2i + 1) = cos(pos / 10000^(2i / dim_model))
        table[:, 1::2] = torch.cos(angles)

        # Registered as a buffer: saved in the state dict, never trained.
        # Stored with shape (max_len, 1, dim_model).
        self.register_buffer("pos_encoding", table.unsqueeze(1))

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        """Return dropout(token_embedding + table rows selected by dim 0)."""
        # NOTE(review): rows are selected by the size of dim 0 and broadcast
        # over dim 1. The Transformer in this file is built with
        # batch_first=True and documents its inputs as (batch, seq), so dim 0
        # is the batch axis here: every position of a sequence receives the
        # same table row. Kept as is because the checkpoint loaded by this
        # notebook was presumably trained with this behaviour — confirm
        # before changing.
        rows = token_embedding.size(0)
        shifted = token_embedding + self.pos_encoding[:rows, :]
        return self.dropout(shifted)
class Transformer(nn.Module):
    """
    Token-level encoder/decoder built on ``nn.Transformer``.

    Model from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/p/c80afbc9ffb1/

    Source and target share one embedding table; the decoder output is
    projected back to ``num_tokens`` logits.
    """

    def __init__(
        self,
        num_tokens,
        dim_model,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        dropout_p,
    ):
        super().__init__()

        # Bookkeeping
        self.model_type = "Transformer"
        self.dim_model = dim_model

        # Sub-modules (attribute names are the state-dict keys of the checkpoint)
        self.positional_encoder = PositionalEncoding(dim_model=dim_model, dropout_p=dropout_p, max_len=5000)
        self.embedding = nn.Embedding(num_tokens, dim_model)
        self.transformer = nn.Transformer(
            d_model=dim_model,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout=dropout_p,
            batch_first=True,
        )
        self.out = nn.Linear(dim_model, num_tokens)

    # TODO: use 4 different output heads depending on the case
    def forward(self, src, tgt, tgt_mask=None, src_pad_mask=None, tgt_pad_mask=None):
        """Return logits of shape (batch_size, tgt sequence length, num_tokens).

        ``src`` is (batch_size, src sequence length) and ``tgt`` is
        (batch_size, tgt sequence length), both holding token ids. The masks
        are forwarded unchanged to ``nn.Transformer``.
        """
        # Embed, scale by sqrt(dim_model), then add the positional table.
        # Result: (batch_size, sequence length, dim_model).
        scale = math.sqrt(self.dim_model)
        src = self.positional_encoder(self.embedding(src) * scale)
        tgt = self.positional_encoder(self.embedding(tgt) * scale)

        decoded = self.transformer(
            src,
            tgt,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=tgt_pad_mask,
        )
        return self.out(decoded)

    def get_tgt_mask(self, size) -> torch.tensor:
        """Build the (size, size) additive triangular mask.

        Entries on and below the diagonal are 0, entries above it are -inf,
        so each row lets one more position be seen. For size=5:

            [[0., -inf, -inf, -inf, -inf],
             [0.,   0., -inf, -inf, -inf],
             [0.,   0.,   0., -inf, -inf],
             [0.,   0.,   0.,   0., -inf],
             [0.,   0.,   0.,   0.,   0.]]
        """
        # triu(..., diagonal=1) keeps -inf strictly above the diagonal and
        # writes 0 everywhere else.
        return torch.triu(torch.full((size, size), float("-inf")), diagonal=1)

    # The pad mask will become useful once PAD tokens are added
    # def create_pad_mask(self, matrix: torch.tensor, pad_token: int) -> torch.tensor:
    #     # If matrix = [1,2,3,0,0,0] where pad_token=0, the result mask is
    #     # [False, False, False, True, True, True]
    #     return (matrix == pad_token)

In [5]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [6]:
# Load the pre-trained model.
# The architecture is rebuilt first; the checkpoint only carries the weights,
# so these hyper-parameters have to match the ones it was saved with.
model = Transformer(
    num_tokens=len(custom_vocab),
    dim_model=512,
    num_heads=8,
    num_encoder_layers=1,
    num_decoder_layers=4,
    dropout_p=0.1,
)
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint saved from a GPU session still loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints that
# come from a trusted source.
model.load_state_dict(torch.load("modelRAS.pth", map_location=device))

model.to(device)

Transformer(
  (positional_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embedding): Embedding(1417, 512)
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerD

Génération

In [7]:
# Input fed to the encoder: a block filled with token id 0 (an all-zero
# vector, as during training according to the original author).
# Shape (1, taille_bloc): the leading 1 is the batch dimension.
taille_bloc = 120
X = torch.zeros((1, taille_bloc), dtype=torch.long, device=device)

In [8]:


def generate_sequence(model, start_tokens, max_length=100, temperature=1.0, src=None, causal_mask=True):
    """Autoregressively sample token ids after a prompt.

    Args:
        model: module called as ``model(src, tgt)`` (plus ``tgt_mask=...``
            when ``causal_mask`` is on) and returning logits of shape
            (batch, tgt_len, vocab). It is switched to eval mode.
        start_tokens: non-empty sequence of int token ids used as the prompt.
            It is copied, never modified.
        max_length: total length of the returned sequence, prompt included.
            Nothing is generated when the prompt is already that long.
        temperature: strictly positive divisor applied to the logits before
            the softmax; larger values flatten the distribution (more noise).
        src: encoder input of shape (1, src_len). Defaults to the
            notebook-level tensor ``X``.
        causal_mask: when True, pass a triangular ``tgt_mask`` so each
            position only attends to itself and earlier positions.
            NOTE(review): assumes training used such a mask, which the
            notebook does not show; pass False to call the model without one.

    Returns:
        A new list of int token ids: the prompt followed by the samples.

    Raises:
        ValueError: if ``start_tokens`` is empty or ``temperature`` <= 0.
    """
    if len(start_tokens) == 0:
        raise ValueError("start_tokens must contain at least one token")
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    if src is None:
        # Fall back to the encoder input defined at notebook level.
        src = X
    # Keep the decoder input on the same device as the encoder input.
    run_device = src.device

    model.eval()
    # Work on a copy so the caller's prompt list is left untouched.
    tokens = list(start_tokens)

    with torch.no_grad():
        for _ in range(max_length - len(tokens)):
            # unsqueeze(0) adds the batch dimension (batch size 1) the model expects.
            tgt = torch.tensor(tokens, dtype=torch.long, device=run_device).unsqueeze(0)

            if causal_mask:
                steps = tgt.size(1)
                # 0 on/below the diagonal, -inf above it.
                mask = torch.triu(
                    torch.full((steps, steps), float("-inf"), device=run_device),
                    diagonal=1,
                )
                output = model(src, tgt, tgt_mask=mask)
            else:
                output = model(src, tgt)

            # Logits are unnormalised scores; only the last position predicts
            # the next token.
            logits = output[:, -1, :] / temperature
            # softmax turns logits into probabilities; multinomial draws one
            # index weighted by them; item() converts it to a Python int.
            probabilities = F.softmax(logits, dim=-1)
            tokens.append(torch.multinomial(probabilities, num_samples=1).item())

    return tokens




In [11]:

# Prompt for inference, written as vocabulary strings.
# Alternative prompt kept for reference:
#start_tokens = [custom_vocab["n60"], custom_vocab["d1"], custom_vocab["t1"], custom_vocab["v64"]]
start_tokens = [
    'n65', 'd2', 'v104',
    'n63', 'd2', 't4', 'v109',
    'n58', 'd1', 't3', 'v103',
    'n61', 'd3', 't2', 'v104',
    'n63', 'd2', 't4', 'v114',
    'n58', 'd2', 't5', 'v106',
]
# Map each string to its integer id in the vocabulary.
start_tokens = list(map(custom_vocab.__getitem__, start_tokens))

# Sample until the sequence holds 500 tokens, prompt included.
generated_tokens = generate_sequence(model, start_tokens, max_length=500, temperature=1.0)

# Turn the ids back into their string form and store them on disk.
decoded_tokens = list(map(itos_vocab.__getitem__, generated_tokens))
np.save("generation4.npy", decoded_tokens)

print("Generated sequence:", decoded_tokens)

Generated sequence: ['n65', 'd2', 'v104', 'n63', 'd2', 't4', 'v109', 'n58', 'd1', 't3', 'v103', 'n61', 'd3', 't2', 'v104', 'n63', 'd2', 't4', 'v114', 'n58', 'd2', 't5', 'v106', 'n58', 'd4', 't4', 'v93', 't722', 'v100', 'n60', 'd1', 't2', 'v101', 'n60', 'd2', 't4', 'v113', 'n58', 'd3', 't2', 'v109', 'n66', 'd2', 't2', 'v92', 'n58', 'd1', 't3', 'v90', 'n60', 'd1', 't2', 't67', 'n58', 'd1', 't3', 'v102', 'n63', 'd2', 't2', 'v98', 'n60', 'd1', 't2', 'v111', 'n55', 'd5', 't4', 'v114', 'n58', 'd1', 't2', 'v103', 'n61', 'd2', 't5', 'v100', 'n55', 'd4', 'd111', 'n55', 'd2', 't1', 'v111', 'n61', 'd1', 't1', 'v108', 'n68', 'd2', 't4', 'v105', 'n58', 'd1', 't1', 'v113', 'n66', 'd1', 't5', 'v104', 'n58', 'd2', 't2', 'v105', 'n57', 'd1', 't2', 'v99', 'n62', 'd2', 't13', 'v117', 'n58', 'd2', 't1', 'v104', 'n77', 'd3', 't1', 'v101', 'n70', 'd6', 't2', 'v103', 'n59', 'd2', 't1', 'v95', 'n60', 'd1', 't2', 'v120', 'n63', 'd2', 't20', 'v112', 'n63', 'd2', 't4', 'v109', 'n55', 'd2', 't4', 'v115', 'n61', '