In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchmetrics as metrics
import numpy as np
import audio_preprocessor

saved_var_path = "D:/dlp/"
data_path = "data/maestro-v3.0.0/"
meta = audio_preprocessor.load_metadata(data_path + 'maestro-v3.0.0.json')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def data_idx():
    train_idx, test_idx, val_idx = [], [], []
    for i in range(len(meta['duration'])):
        if meta['split'][str(i)] == 'train':
            train_idx.append(i)
        if meta['split'][str(i)] == 'test':
            test_idx.append(i)
        if meta['split'][str(i)] == 'val':
            val_idx.append(i)
    return np.array(train_idx), np.array(test_idx), np.array(val_idx)

train_idx, test_idx, val_idx = data_idx()


In [4]:
def sample_train_batch(batch_size=2):
    idx = np.random.choice(train_idx)
    X = np.load(saved_var_path + 'train/spec/train_spec_' + str(idx) + '.npy')
    start = np.random.randint(0, X.shape[0] - batch_size)
    X = X[start:start + batch_size]
    y = np.load(saved_var_path + 'train/midi/train_midi_' + str(idx) + '.npy')
    y = y[start:start + batch_size]
    b = torch.nn.functional.one_hot(torch.tensor(y[0]).to(torch.int64), num_classes=512)
    for i in range(batch_size - 1):
        b = torch.stack((b, torch.nn.functional.one_hot(torch.tensor(y[i + 1]).to(torch.int64), num_classes=512)))

    return torch.tensor(X).to(device), b.to(torch.float32).to(device)

sample_train_batch()

(tensor([[[0.0000e+00, 3.1589e-02, 1.1039e-02,  ..., 4.5782e-06,
           2.2328e-06, 2.4014e-06],
          [0.0000e+00, 3.8172e-02, 1.3339e-02,  ..., 3.9699e-06,
           1.7287e-06, 2.6934e-06],
          [0.0000e+00, 4.3522e-02, 1.5209e-02,  ..., 3.6432e-06,
           1.6371e-06, 2.9135e-06],
          ...,
          [0.0000e+00, 1.2246e-01, 4.2792e-02,  ..., 7.4013e-06,
           7.5451e-06, 6.8976e-06],
          [0.0000e+00, 1.3193e-01, 4.6101e-02,  ..., 6.3619e-06,
           7.5539e-06, 7.8163e-06],
          [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]],
 
         [[0.0000e+00, 1.4271e-01, 4.9868e-02,  ..., 5.1201e-06,
           7.3039e-06, 8.1336e-06],
          [0.0000e+00, 1.5432e-01, 5.3926e-02,  ..., 4.0330e-06,
           6.7496e-06, 7.6835e-06],
          [0.0000e+00, 1.6617e-01, 5.8069e-02,  ..., 3.3594e-06,
           5.9003e-06, 6.5843e-06],
          ...,
          [0.0000e+00, 1.0306e-02, 3.6015e-03,  ..., 5.819

In [5]:
model = nn.Transformer(nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, batch_first=True).to(device)

torch.cuda.empty_cache()

src = torch.rand((2, 128, 512)).to(device)
tgt = torch.rand((2, 2048, 512)).to(device)
out = model(src, tgt)

print(out.shape)

del src
del tgt
torch.cuda.empty_cache()

optimizer = optim.Adam(model.parameters())
loss_function = nn.CrossEntropyLoss()

torch.Size([2, 2048, 512])


In [10]:
def train(model, loss_function, optimizer):
    model.train()
    loss = 0
    for i in range(200000):
        X, y = sample_train_batch()
        # zero out the parameter gradients
        optimizer.zero_grad()
        # do forward pass with current batch of input
        outs = model(X, y)
        # get loss with model predictions and true labels
        loss = loss_function(outs, y)
        # update model parameters
        loss.backward()
        optimizer.step()
        if i % 1000 == 0:
            log = "Training Step " + str(i) + ": loss: " + str(loss)
            print(log)
            torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, "model")



In [None]:
train(model, loss_function, optimizer)

Training Step 0: loss: tensor(30.5020, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 1000: loss: tensor(30.4986, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 2000: loss: tensor(30.4991, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 3000: loss: tensor(30.4986, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 4000: loss: tensor(30.4984, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 5000: loss: tensor(30.4986, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 6000: loss: tensor(30.4988, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 7000: loss: tensor(30.4985, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 8000: loss: tensor(30.4984, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 9000: loss: tensor(30.4984, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 10000: loss: tensor(30.4984, device='cuda:0', grad_fn=<DivBackward1>)
Training Step 11000: loss: tensor(30.4986, device='cuda:0', grad_fn=<DivBackward1>)
Train