In [1]:
import torch
import math
import torch.nn as nn
from torch.nn import functional as F
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from decode_patterns.create_images import create_images, crop_data, train_test
import numpy as np

In [10]:
batch_size = 32
num_epochs = 500
num_epochs = int(num_epochs)

In [11]:
# load data
limit = 9600
drum, bass, tempo = create_images(file_name="../patterns_pairs.tsv", limit=limit)
# drop conditionining

In [12]:
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        # Linear function 1: 128 * 14 (+16 with cond)= 1792 --> 2048
        # веса накидываются тут
        self.fc1 = nn.Linear(input_dim, hidden_dim) 
        # решение по весам
#         nn.init.normal_(self.fc1.weight, mean=0, std=1)
#         self.fc1.weight = nn.Parameter(self.fc1.weight * math.sqrt(input_dim/2))
        self.relu1 = nn.ReLU()
        
        # Linear function 2: 2048 --> 2048
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
#         # Non-linearity 2
#         nn.init.normal_(self.fc2.weight, mean=0, std=1)
#         self.fc2.weight = nn.Parameter(self.fc2.weight * math.sqrt(hidden_dim/2))
        self.relu2 = nn.ReLU()

        # Linear function 3: 2048 --> 2048
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 3
        nn.init.normal_(self.fc3.weight, mean=0, std=1)
        self.fc3.weight = nn.Parameter(self.fc3.weight * math.sqrt(hidden_dim/2))
        self.relu3 = nn.ReLU()

        # Linear function 4 (readout): 2048 --> 128 * 36 = 4608
        self.fc4 = nn.Linear(hidden_dim, output_dim)  
        nn.init.normal_(self.fc4.weight, mean=0, std=1)
        self.fc4.weight = nn.Parameter(self.fc4.weight * math.sqrt(hidden_dim/2))
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Linear function 1
        out = self.fc1(x)
        # Non-linearity 1
        out = self.relu1(out)

        # Linear function 2
        out = self.fc2(out)
        # Non-linearity 2
        out = self.relu2(out)

        # Linear function 2
        out = self.fc3(out)
        # Non-linearity 2
        out = self.relu3(out)

        # Linear function 4 (readout)
        out = self.fc4(out)
        out = self.sigmoid(out)
        return out

In [9]:
input_dim = 128 * 14 
output_dim = 128 * 36
hidden_dim = 2048
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
model.to(device)

FeedforwardNeuralNetModel(
  (fc1): Linear(in_features=1792, out_features=2048, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=2048, out_features=2048, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=2048, out_features=2048, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=2048, out_features=4608, bias=True)
  (sigmoid): Sigmoid()
)

In [13]:
learning_rate = 0.001

In [14]:
criterion = nn.MSELoss()

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
iter = 0
iter_epoch = 0
for epoch in range(num_epochs):
    (train_set, train_t), (test_set, test_t) = train_test(drum, bass, tempo, batch_size=batch_size, img_size=(128, 50))
    iter_epoch += 1
    print(f"Epoch # {iter_epoch}")
    for i, (images, labels) in enumerate(zip(*train_set)): 
        images = images.view(-1, input_dim).requires_grad_().to(device).float()
        labels = labels.to(device).float().reshape((32, 4608))

        optimizer.zero_grad()

        outputs = model(images)

        loss = criterion(outputs, labels.view(-1, output_dim).float())

        loss.backward()

        # Updating parameters
        optimizer.step()

        iter += 1

        if iter % 100 == 0:
            error = 0
            for img, lbl in zip(*test_set):
            # Calculate Accuracy         
                # Iterate through test dataset
                out = model(img.view(-1, input_dim).to(device).float())
                error += criterion(out.float().to(device), lbl.float().to(device))
            print('Iteration: {}. error: {}'.format(iter, error))

Epoch # 1
Iteration: 100. error: 923.1240844726562
Iteration: 200. error: 918.2725219726562
Epoch # 2
Iteration: 300. error: 907.4913330078125
Iteration: 400. error: 916.2799072265625
Epoch # 3
Iteration: 500. error: 915.0504760742188
Iteration: 600. error: 906.0369262695312
Iteration: 700. error: 911.4366455078125
Epoch # 4
Iteration: 800. error: 916.8666381835938
Iteration: 900. error: 915.8767700195312
Epoch # 5
Iteration: 1000. error: 908.9605712890625
Iteration: 1100. error: 907.869873046875


KeyboardInterrupt: 

In [20]:
# загрузить ранее обученную модель
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
model.to(device)
model.load_state_dict(torch.load("../model_0_state"))
model.eval()

FeedforwardNeuralNetModel(
  (fc1): Linear(in_features=1792, out_features=2048, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=2048, out_features=2048, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=2048, out_features=2048, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=2048, out_features=4608, bias=True)
  (sigmoid): Sigmoid()
)

In [46]:
# получить мелодии 
drum, bass, tempo = create_images(file_name="../patterns_pairs.tsv", limit=limit)
(train_set, train_tempo), (test_set, test_tempo) = train_test(drum, bass, tempo, batch_size = batch_size)
drum_set = torch.cat((train_set[0].reshape([-1, 128 * 14]), test_set[0]), 0)
melody_set = torch.cat((train_set[1].reshape([-1, 128, 36]), test_set[1].reshape([-1, 128, 36])), 0)
temp_set = np.concatenate((train_tempo,test_tempo))
result = []
for d in drum_set:
    output = (model(d.view(-1, 128 * 14).float().to(device)).cpu() > 0.5).float().reshape([128, 36])
    d = d.reshape([128, 14]).float()
    result.append(np.array(torch.cat((d, output), 1)))

result = np.array(result)
origin = []
for i in range(len(drum_set)):
    origin.append(np.array(torch.cat((drum_set[i].reshape([128, 14]), melody_set[i]), 1)))
    

In [48]:
import mido
from decode_patterns.data_conversion import build_track, DrumMelodyPair, Converter

converter = Converter((128,50))

with torch.no_grad():

    bass_outputs = result

    for i in range(0, 500):
        img_dnb = bass_outputs[i]
        t = temp_set[i]
        pair = converter.convert_numpy_image_to_pair(np.array(img_dnb))
        pair = DrumMelodyPair(pair.drum_pattern, pair.melody, tempo, t[1], pair.denominator)
        mid = build_track(pair, tempo=t[0])
        mid.save(f"../midi/model_0_gen/sample{i+1}.mid")
    for i in range(0, 500):
        img_dnb = origin[i]
        t = temp_set[i]
        pair = converter.convert_numpy_image_to_pair(np.array(img_dnb))
        pair = DrumMelodyPair(pair.drum_pattern, pair.melody, tempo, t[1], pair.denominator)
        mid = build_track(pair, tempo=t[0])
        mid.save(f"../midi/origin/sample{i+1}.mid")