In [9]:
# OMR Model 
# Goal: recognize images of music excerpts

# Modules
import torch
from torch.autograd import Variable
import numpy as np
import pylab as pl
import torch.nn.init as init
import torch.optim as optim
import torch.nn as nn
import cv2
import ctc_utils
from primus import CTC_PriMuS
from torch import cuda

import matplotlib as mpl


In [2]:
# Data
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
corpus = './Data/package'  # corpus path handed to CTC_PriMuS
# Named `train_set` (not `set`) so the built-in `set` type stays usable in
# every later cell of the notebook.
train_set = 'Data/train.txt'
vocabulary = 'Data/vocabulary_semantic.txt'
# Raw string: same path value as before, without relying on the invalid
# escape sequence "\s" (which newer Python versions warn about).
save_model = r'./trained_\semantic_model'

# Loads the dataset; 10% of it is held out for validation (val_split).
primus = CTC_PriMuS(corpus, train_set, vocabulary, semantic = True, val_split = 0.1)

Training with 70880 and validating with 7875


In [3]:
# Variables

# Training hyper-parameters
N_EPOCHS = 10
BATCH_SIZE = 16
dropout = 0.5

# Input geometry, and the label-set size reported by the loaded dataset
img_height = 128
vocabulary_size = primus.vocabulary_size

In [4]:
# Default params
# With image height of 128, width will be 1870
# Built as one literal; batch_size reuses BATCH_SIZE so the value handed to
# the data loader cannot drift from the one used by the training loop.
params = {
    'img_height': img_height,
    'img_width': None,
    'batch_size': BATCH_SIZE,
    'img_channels': 1,
    'conv_blocks': 4,
    'conv_filter_n': [32, 64, 128, 256],
    'conv_filter_size': [ [3,3], [3,3], [3,3], [3,3] ],
    'conv_pooling_size': [ [2,2], [2,2], [2,2], [2,2] ],
    'rnn_units': 512,
    'rnn_layers': 2,
    'vocabulary_size': vocabulary_size,
    'max_width': 1500,
}

In [5]:
# Model Classes

class cnn_model(torch.nn.Module):
    """Three-stage convolutional feature extractor.

    Each stage applies Conv2d (3x3 kernel, no padding) -> BatchNorm2d ->
    LeakyReLU -> 2x2 max-pooling. Channel counts go 1 -> 16 -> 32 -> 64.
    """

    def __init__(self, batch_size):
        # `batch_size` is accepted for the caller's convenience but is not
        # used anywhere in this module.
        super().__init__()

        widths = (1, 16, 32, 64)
        # Registers conv1/batch1, conv2/batch2, conv3/batch3 in that order.
        for stage, (c_in, c_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, 'conv%d' % stage, nn.Conv2d(c_in, c_out, kernel_size=[3, 3]))
            setattr(self, 'batch%d' % stage, nn.BatchNorm2d(c_out))

        # Activation and pooling are shared by all three stages.
        self.act = nn.LeakyReLU()
        self.pool = nn.MaxPool2d(2, 2)

    def forward(self, x):
        """Run the three stages in sequence and return the final feature map."""
        stages = (
            (self.conv1, self.batch1),
            (self.conv2, self.batch2),
            (self.conv3, self.batch3),
        )
        for conv, norm in stages:
            x = self.pool(self.act(norm(conv(x))))
        return x

In [6]:
# Loop RNN
# Number of steps: image width
# Batch size: 16
# n_inputs: 64 * 14 (taken from the CNN output)
# CNN output: 64 x 14 x width, where width is the same across a batch

class LoopRNN(nn.Module):
    """Single-layer RNN followed by a linear layer, applied slice by slice.

    `forward` iterates over the first axis of its input, feeds every slice to
    the RNN, and projects the resulting hidden state to `n_outputs` scores.

    Args:
        batch_size: accepted for compatibility; not used by the module.
        n_inputs: feature size of each RNN input vector.
        n_neurons: hidden size of the RNN.
        n_outputs: size of the linear projection (defaults to
            vocabulary_size + 1).
    """

    def __init__(self, batch_size = 16, n_inputs = 896, n_neurons = 4, n_outputs = vocabulary_size +1): # N_ STEPS AFTER BATCH_SIZE
        super(LoopRNN, self).__init__()

        self.n_neurons = n_neurons
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs

        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons)

        self.FC = nn.Linear(self.n_neurons, self.n_outputs)

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (1, batch_size, n_neurons).

        The tensor is created on the same device and with the same dtype as
        the module's weights, so it stays valid after `.cuda()` / `.to(...)`;
        a CPU-only hidden state would clash with GPU-resident parameters.
        """
        reference = self.FC.weight
        return torch.zeros(1, batch_size, self.n_neurons,
                           device=reference.device, dtype=reference.dtype)

    def forward(self, X):
        """Run the RNN over every slice of `X` along its first axis.

        `X.size(2)` is used as the RNN batch dimension, so each slice `x`
        must be laid out as (seq_len, X.size(2), n_inputs). The hidden state
        is reset at the start of the call and then carried over from one
        slice to the next.

        Returns:
            list with one tensor per slice: the linear projection of the
            hidden state after that slice, shape (1, X.size(2), n_outputs).
        """
        self.batch_size = X.size(2)
        self.hidden = self.init_hidden(self.batch_size)

        out = []

        for x in X:
            # Only the final hidden state is used; per-step outputs are dropped.
            _, self.hidden = self.basic_rnn(x, self.hidden)
            out.append(self.FC(self.hidden))

        return out

In [7]:
# CTC Loss (homemade from Medium article recipe)

def ctcloss_reference(log_probs, targets, input_lengths, target_lengths, blank=0, reduction='mean'):
    """Reference (non-log-space) implementation of the CTC loss.

    Args:
        log_probs: log-probabilities indexed as [time, batch, class].
        targets: either a 2-D tensor [batch, max_target_len] (each row is
            truncated to its target length) or a 1-D tensor holding all
            target sequences concatenated.
        input_lengths: per-sample number of valid time steps.
        target_lengths: per-sample target length.
        blank: index of the blank class.
        reduction: 'mean' divides each sample's loss by its target length and
            averages over the batch; 'sum' adds the losses up; any other
            value returns the per-sample losses.

    Returns:
        A scalar tensor for 'mean' / 'sum', otherwise a 1-D tensor with one
        loss per sample, cast back to the dtype of `log_probs`.
    """
    input_lengths = torch.as_tensor(input_lengths, dtype=torch.long)
    target_lengths = torch.as_tensor(target_lengths, dtype=torch.long)
    dt = log_probs.dtype
    # Work in double precision: the recursion multiplies raw probabilities.
    log_probs = log_probs.double()
    targets = targets.long()
    cum_target_lengths = target_lengths.cumsum(0)
    losses = []

    for i in range(log_probs.size(1)):
        input_length = input_lengths[i].item()
        target_length = target_lengths[i].item()
        cum_target_length = cum_target_lengths[i].item()

        # Extended target: blanks at even positions, labels at odd positions.
        targets_prime = targets.new_full((2 * target_length + 1,), blank)
        if targets.dim() == 2:
            targets_prime[1::2] = targets[i, :target_length]
        else:
            targets_prime[1::2] = targets[cum_target_length - target_length:cum_target_length]

        # Convert this sample's inputs out of log space.
        probs = log_probs[:input_length, i].exp()

        # Forward variable, one entry per position of the extended target.
        alpha = log_probs.new_zeros((target_length * 2 + 1,))
        alpha[0] = probs[0, blank]
        alpha[1] = probs[0, targets_prime[1]]

        # True only where a[current] != a[current - 2]; blanks sit two apart
        # from other blanks, so the condition never holds for them.
        mask_third = (targets_prime[:-2] != targets_prime[2:])

        for t in range(1, input_length):
            alpha_next = alpha.clone()

            # a[current-1] always contributes to a[current]
            alpha_next[1:] += alpha[:-1]

            # a[current-2] contributes only where the mask allows the skip
            alpha_next[2:] += torch.where(mask_third, alpha[:-2], alpha.new_zeros(1))
            alpha = probs[t, targets_prime] * alpha_next

        # Maximum-likelihood error: negative natural log of the probability
        # of ending in either of the last two extended-target positions.
        losses.append(-alpha[-2:].sum().log()[None])

    # Reduce only after every sample has been processed; doing this inside
    # the loop would return after the first sample.
    output = torch.cat(losses, 0)

    if reduction == 'mean':
        return (output / target_lengths.to(dtype=output.dtype, device=output.device)).mean()

    elif reduction == 'sum':
        return output.sum()

    output = output.to(dt)

    return output


In [11]:
learning_rate = 0.0001
# NOTE(review): CTCLoss defaults to blank=0, while LoopRNN adds one extra
# output class (vocabulary_size + 1). Confirm that label 0 is not a real
# vocabulary symbol; otherwise pass blank=<index of the extra class>.
criterion = torch.nn.CTCLoss()
model_cnn = cnn_model(BATCH_SIZE)
model_rnn = LoopRNN()
# One optimizer drives both networks.
optimizer = optim.Adam(list(model_cnn.parameters()) + list(model_rnn.parameters()), lr = learning_rate)
len_data = len(primus.training_list) + len(primus.validation_list)

len_data_train = len(primus.training_list)
len_data_valid = len(primus.validation_list)

# The training cell applies this to a (time, batch, classes) tensor, so the
# scores must be normalised over the class axis (dim=2). dim=1 would
# normalise across the samples of the batch instead.
log_softmax = nn.LogSoftmax(dim=2)

gpu = cuda.is_available()

In [13]:
# Train

def _batch_ctc_loss(batch):
    """Run CNN -> RNN -> log-softmax on one batch and return its CTC loss.

    Shared by the training and validation branches so both compute the loss
    in exactly the same way.
    """
    data = batch['inputs'] # size (batch, height, width, channels)
    max_input_length = data.shape[2]
    # Only the per-sample target lengths are needed here.
    _, lengths = ctc_utils.pad_sequences(batch['targets'], maxlen=max_input_length)

    # (batch, height, width, channels) -> (batch, channels, height, width)
    images = torch.permute(torch.from_numpy(data), (0, 3, 1, 2))
    if gpu:
        images = images.cuda()

    cnn_output = model_cnn(images)
    n_samples, n_channels, feat_height, feat_width = cnn_output.shape

    # Bring the width axis forward BEFORE flattening, so that each feature
    # vector holds the channels*height values of one image column. A plain
    # reshape would mix values coming from different columns.
    output_to_loop = cnn_output.permute(0, 3, 1, 2).reshape(
        n_samples, 1, feat_width, n_channels * feat_height)

    # The RNN returns one (1, width, classes) tensor per sample.
    loop_output = model_rnn(output_to_loop)

    # (batch, width, classes) -> (width, batch, classes)
    loop_tensor = torch.cat(loop_output, dim=0).permute(1, 0, 2)
    log_probs = log_softmax(loop_tensor)

    # All target sequences concatenated into a single 1-D tensor.
    targets = torch.cat([torch.as_tensor(seq) for seq in batch['targets']])
    if gpu:
        targets = targets.cuda()

    target_lengths = tuple(lengths)
    input_lengths = tuple(int(b) for b in batch['seq_lengths'])

    return criterion(log_probs, targets, input_lengths, target_lengths)


# Move the models once, up front, instead of on every iteration.
if gpu:
    model_cnn = model_cnn.cuda()
    model_rnn = model_rnn.cuda()

# Best-so-far tracking has to exist before the first comparison.
valid_loss_min = float('inf')
best_epoch = -1

for epoch in range(N_EPOCHS):
    train_loss = 0.
    valid_loss = 0.

    model_cnn.train()
    model_rnn.train()

    for offset in range(0, len_data, BATCH_SIZE):

        # Training
        if offset < len_data_train:
            optimizer.zero_grad()

            loss = _batch_ctc_loss(primus.nextBatch(params))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Validation
        else:
            model_cnn.eval()
            model_rnn.eval()

            with torch.no_grad():
                # NOTE(review): this is the same nextBatch call used for
                # training; confirm that it actually serves validation
                # samples at this point.
                valid_loss += _batch_ctc_loss(primus.nextBatch(params)).item()

    epoch_loss_train = train_loss/(len_data_train/BATCH_SIZE)
    epoch_loss_valid = valid_loss/(len_data_valid/BATCH_SIZE)

    if epoch_loss_valid < valid_loss_min:
        # Save model
        torch.save(model_cnn.state_dict(), './model_cnn.pt')
        torch.save(model_rnn.state_dict(), './model_rnn.pt')

        # Store the same quantity that is compared above (per-batch average).
        valid_loss_min = epoch_loss_valid
        best_epoch = epoch

    print('training loss for epoch %d:' %epoch)
    print(epoch_loss_train)

    print('validation loss for epoch %d:' %epoch)
    print(epoch_loss_valid)

print("best epoch %d with loss %f" % (best_epoch, valid_loss_min))

NVIDIA GeForce RTX 3070 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the NVIDIA GeForce RTX 3070 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

Training with 70880 and validating with 7875
