## LSTM на оригинальном датасете

Попытка сделать монофонический выход из сетки

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np

Сделаем также пользовательский импорт

In [2]:
from decode_patterns import data_conversion

Загружаем датасет

In [3]:
# import dataset
drum, bass = data_conversion.make_lstm_dataset(height=16, patterns_file="decode_patterns/train.tsv", mono=True)


# define shuffling of dataset
def shuffle(A, B, p=0.8):
    # take 80% to training, other to testing
    L = len(A)
    idx = np.arange(L) < p*L
    np.random.shuffle(idx)
    yield A[idx]
    yield B[idx]
    yield A[np.logical_not(idx)]
    yield B[np.logical_not(idx)]
    
# we can shuffle train and test set like this:
drum_train, bass_train, drum_test, bass_test = shuffle(drum, bass)

# selecting a validation set
drum_validation, bass_validation = data_conversion.make_lstm_dataset(height=16
                                                        , patterns_file="decode_patterns/validation.tsv"
                                                        , mono=True)
drum_validation = torch.tensor(drum_validation, dtype=torch.float)
bass_validation = torch.tensor(bass_validation, dtype=torch.float)
    

In [4]:
drum_train.shape

(40000, 16, 14)

Модель определим в самом простом варианте, который только можно себе представить -- как в примере с конечным автоматом

In [5]:
# попробуем определить модель LSTM как конечный автомат
class DrumNBassLSTM(nn.Module):
    def __init__(self):
        super(DrumNBassLSTM, self).__init__()
        # one input neuron, one output neuron, one layer in LSTM block
        self.input_size = 14
        self.hidden_size = 34
        self.layer_count = 1
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.layer_count)
        self.embed_layer = nn.Linear(self.hidden_size, 1)
        self.sigm = nn.Sigmoid()
    
    def forward(self, input):
        # пусть в input у нас приходит вектор размерности (64, 32, 14)
        # то есть 64 отсчёта, тридцать два примера (минибатч), 14 значение в каждом (барабанная партия)
        output, _ = self.lstm(input)
        output = self.sigm(self.embed_layer(output))*37
        return output

In [6]:
# часть обучения
dnb_lstm = DrumNBassLSTM()

criterion = nn.MSELoss()

# для сравнения моделей необходим reconstruction_loss
reconstruction_loss = criterion

# оценим также и разнообразие мелодии по её.. дисперсии?)
# def melody_variety(melody):
#     return 1/(1 + (melody.sum(axis=2) > 1).int())
    
# criterion = nn.NLLLoss() # -- этот товарищ требует, чтобы LSTM выдавал классы,
# criterion = nn.CrossEntropyLoss() # и этот тоже
# (числа от 0 до C-1), но как всё-таки его заставить это делать?...
# optimizer = optim.SGD(dnb_lstm.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(dnb_lstm.parameters(), lr=0.001)

Найденные баги и их решения:

https://stackoverflow.com/questions/56741087/how-to-fix-runtimeerror-expected-object-of-scalar-type-float-but-got-scalar-typ

https://stackoverflow.com/questions/49206550/pytorch-error-multi-target-not-supported-in-crossentropyloss/49209628

https://stackoverflow.com/questions/56243672/expected-target-size-50-88-got-torch-size50-288-88

In [7]:
epoch_count = 72
batch_size = 32
shuffle_every_epoch = True
    
if shuffle_every_epoch:
    print(f"shuffle_every_epoch is on")
else:
    print(f"shuffle_every_epoch is off")
    # shuffle train and test set:
    drum_train, bass_train, drum_test, bass_test = shuffle(drum, bass)
    drum_train = torch.tensor(drum_train, dtype=torch.float)
    bass_train = torch.tensor(bass_train, dtype=torch.float)
    drum_test = torch.tensor(drum_test, dtype=torch.float)
    drum_test = torch.tensor(drum_test, dtype=torch.float)
        
for epoch in range(epoch_count):  # loop over the dataset multiple times
    print(f"Epoch #{epoch}")
    if shuffle_every_epoch:
        # shuffle train and test set:
        drum_train, bass_train, drum_test, bass_test = shuffle(drum, bass)
        drum_train = torch.tensor(drum_train, dtype=torch.float)
        bass_train = torch.tensor(bass_train, dtype=torch.float)
        drum_test = torch.tensor(drum_test, dtype=torch.float)
        bass_test = torch.tensor(bass_test, dtype=torch.float)
        
    examples_count = drum_train.size()[0]
    examples_id = 0
    
    running_loss = 0.0
    runnint_count = 0
    batch_id = 0
    while examples_id < examples_count:
        batch_drum_train = drum_train[examples_id:examples_id + batch_size,:,:].transpose(0,1)
        batch_bass_train = bass_train[examples_id:examples_id + batch_size,].transpose(0,1)
        # transpose нужен для обмена размерности батча и размерности шагов
#         print(f"batch_drum_train:{batch_drum_train.size()}, batch_bass_train:{batch_bass_train.size()}")

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        bass_outputs = dnb_lstm(batch_drum_train).squeeze()
        
        # loss = criterion(bass_outputs, batch_bass_train.long())
        loss = criterion(bass_outputs, batch_bass_train)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        runnint_count += 1
        period = 5
        if batch_id % period == 0 or examples_id + batch_size >= examples_count:
            print('[%d, %5d] train loss: %.7f' %
                  (epoch + 1, batch_id + 1, running_loss / runnint_count))
            running_loss = 0.0
            runnint_count = 1
            
        # update batch info
        examples_id += batch_size
        batch_id += 1
        
    # here we can insert measure error on test set
    with torch.no_grad():
        bass_outputs = dnb_lstm(drum_test).squeeze()
        
        test_count = len(drum_test)
        test_loss = 0
        for k in range(test_count):
            test_loss += reconstruction_loss(bass_outputs[k], bass_test[k])
        print(f"#{epoch + 1} reconstruction test loss: {test_loss/test_count}")
    

#should check accuracy on validation set
with torch.no_grad():
    bass_outputs = dnb_lstm(drum_validation).squeeze()

    validation_count = len(drum_validation)
    validation_loss = 0
    for k in range(validation_count):
        validation_loss += reconstruction_loss(bass_outputs[k], bass_validation[k])
    print(f"#{epoch + 1} reconstruction validation loss: {validation_loss/validation_count}")
print('Finished Training')

shuffle_every_epoch is on
Epoch #0
[1,     1] train loss: 189.2635040
[1,     6] train loss: 172.8012441
[1,    11] train loss: 151.4795583
[1,    16] train loss: 143.3264440
[1,    21] train loss: 131.4135666
[1,    26] train loss: 116.6867142
[1,    31] train loss: 97.0210177
[1,    36] train loss: 82.6438230
[1,    41] train loss: 64.6381620
[1,    46] train loss: 61.3383236
[1,    51] train loss: 58.1539574
[1,    56] train loss: 59.5128174
[1,    61] train loss: 57.8884252
[1,    66] train loss: 58.2775796
[1,    71] train loss: 56.7767785
[1,    76] train loss: 60.5449511
[1,    81] train loss: 53.7443765
[1,    86] train loss: 58.1582381
[1,    91] train loss: 58.6878757
[1,    96] train loss: 54.8057639
[1,   101] train loss: 52.2772503
[1,   106] train loss: 59.5588964
[1,   111] train loss: 54.9090055
[1,   116] train loss: 53.9119129
[1,   121] train loss: 53.6447436
[1,   126] train loss: 51.6982117
[1,   131] train loss: 55.0293427
[1,   136] train loss: 58.9198023
[1,   1

[1,  1196] train loss: 45.7841606
[1,  1201] train loss: 45.0578690
[1,  1206] train loss: 47.5325565
[1,  1211] train loss: 43.6357231
[1,  1216] train loss: 55.8497849
[1,  1221] train loss: 43.5845477
[1,  1226] train loss: 49.6684755
[1,  1231] train loss: 39.3908653
[1,  1236] train loss: 50.5263259
[1,  1241] train loss: 44.8678157
[1,  1246] train loss: 48.5691649
[1,  1250] train loss: 45.1638329
#1 reconstruction test loss: 54.022300720214844
Epoch #1
[2,     1] train loss: 61.5175056
[2,     6] train loss: 42.3451525
[2,    11] train loss: 46.3499959
[2,    16] train loss: 48.5815932
[2,    21] train loss: 47.7377809
[2,    26] train loss: 44.2246806
[2,    31] train loss: 43.4164486
[2,    36] train loss: 44.0706278
[2,    41] train loss: 43.9981117
[2,    46] train loss: 44.6941700
[2,    51] train loss: 40.3781090
[2,    56] train loss: 45.0588748
[2,    61] train loss: 47.6568139
[2,    66] train loss: 44.3869247
[2,    71] train loss: 42.9164124
[2,    76] train loss: 45

KeyboardInterrupt: 

In [8]:
batch_drum_train = drum_train[:,:,:].transpose(0,1)
batch_bass_train = bass_train[:,:].transpose(0,1)
with torch.no_grad():
    bass_outputs = dnb_lstm(batch_drum_train)

In [9]:
result = bass_outputs.squeeze().int()
result

tensor([[5, 5, 6,  ..., 7, 7, 7],
        [6, 7, 6,  ..., 7, 5, 7],
        [7, 8, 6,  ..., 7, 7, 7],
        ...,
        [6, 7, 6,  ..., 7, 5, 8],
        [7, 8, 6,  ..., 7, 6, 7],
        [6, 7, 6,  ..., 8, 5, 5]], dtype=torch.int32)

Попробуем сохранить результаты работы сети. На anaconda нет mido, поэтому сохраняем результаты работы просто в массивчик npy... Однако, как альтернатива, его можно поставить чере pip в conda:
https://github.com/mido/mido/issues/198

In [10]:
import mido
from decode_patterns.data_conversion import build_track, DrumMelodyPair, NumpyImage, Converter


# вспомогательная функция для генерации midi
# переводит посл-ть нот в "картинку"
def seq_to_img(bass_seq):
    bass_output = []
    for bass_note in bass_seq:
        bass_row = np.eye(1, 36, bass_note - 1)[0]
        bass_output.append(bass_row)
    bass_output = torch.tensor(bass_output).int().squeeze()
    return bass_output

converter = Converter((16,50))

# batch_drum = torch.cat((drum_train, drum_test, torch.tensor(drum_validation))).transpose(0,1)
# batch_bass = torch.cat((bass_train.int(), bass_test.int(), torch.tensor(bass_validation).int())).transpose(0,1)
def output_midi(batch_drum, batch_bass, folder, output_original=False):
    batch_drum = batch_drum.transpose(0,1)
    batch_bass = batch_bass.int().transpose(0,1)
    with torch.no_grad():
        bass_outputs = dnb_lstm(batch_drum)
        bass_outputs = bass_outputs.squeeze().int()

        for i in range(bass_outputs.size()[1]):
            def output_seq(bass_seq, prefix="sample"):
                bass_output = seq_to_img(bass_seq)

                img_dnb = torch.cat((batch_drum[:,i,:].int(),bass_output), axis=1)
                numpy_pair = NumpyImage(np.array(img_dnb), 120, 1, 1, 36)
                pair = converter.convert_numpy_image_to_pair(numpy_pair)
                mid = build_track(pair, tempo=pair.tempo)
                mid.save(f"{folder}/{prefix}{i+1}.mid")
                
            output_seq(bass_outputs[:,i])
            
            # TODO неплохо вынести генерацию оригинальной музыки в отдельный .py файл... т.е. нужно убрать этот костыль...
            if output_original:
                output_seq(batch_bass[:,i], "orig/original")

Выводим обучающую и валидационную выборку

In [11]:
# # если очень надо послушать тренировчную -- лучше её перезагрузить, потому что она перемешивается
# drum, bass = data_conversion.make_lstm_dataset(height=16, patterns_file="decode_patterns/train.tsv", mono=True)
# drum = torch.tensor(drum, dtype=torch.float)
# bass = torch.tensor(bass, dtype=torch.float)
# output_midi(drum, bass, "midi/lstm_mono/train")
output_midi(drum_validation, bass_validation, "midi/lstm_mono/validation")

По вкусу, выводим тот же результат для кожанных мешков на ассесмент. На самом деле ничем от валидационной выборки не отличается :)

In [12]:
drum_hum, bass_hum = data_conversion.make_lstm_dataset(height=16, patterns_file="decode_patterns/human.tsv", mono=True)
drum_hum = torch.tensor(drum_hum, dtype=torch.float)
bass_hum = torch.tensor(bass_hum, dtype=torch.float)
output_midi(drum_hum, bass_hum, "midi/lstm_mono/human")