In [1]:
# OMR Model 
# Goal: recognize images of music excerpts

# Modules
import torch
from torch.autograd import Variable
import numpy as np
import pylab as pl
import torch.nn.init as init
import torch.optim as optim
import torch.nn as nn
import cv2

import matplotlib as mpl

class cnn_model(torch.nn.Module):
    def __init__(self, batch_size):
        super(cnn_model, self).__init__()

        kernel_size = [3,3]

        self.conv1 = nn.Conv2d(1, 16, kernel_size = kernel_size)
        self.batch1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16,32, kernel_size = kernel_size)
        self.batch2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32,64, kernel_size = kernel_size)
        self.batch3 = nn.BatchNorm2d(64)

        self.act = nn.LeakyReLU()
        self.pool = nn.MaxPool2d(2,2)

    def forward(self, x):

        # FORWARD PASS
        x = self.conv1(x)
        x = self.batch1(x)
        x = self.act(x)
        x = self.pool(x)

        x = self.conv2(x)
        x = self.batch2(x)
        x = self.act(x)
        x = self.pool(x)

        x = self.conv3(x)
        x = self.batch3(x)
        x = self.act(x)
        x = self.pool(x)

        output = x

        return x

class rnn_model(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size):
        super(rnn_model, self).__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.rnn = nn.RNN(input_size = embed_size, hidden_size = hidden_size,num_layers = 2)
        self.fc = nn.Linear(hidden_size, vocab_size + 1)

    def forward(x):

        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)

        out, _ = self.rnn(x, (h0, c0))

        out = out.reshape(out.shape,[0], -1)
        out = self.fc(out)

        return out
   

In [2]:
import ctc_utils
from primus import CTC_PriMuS

In [3]:
# Data
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
corpus = './Data/package'# PATH
set = 'Data/train.txt' 
vocabulary = 'Data/vocabulary_semantic.txt'  
save_model = './trained_\semantic_model'

primus = CTC_PriMuS(corpus, set, vocabulary, semantic = True, val_split = 0.1)
primus.training_list

Training with 70880 and validating with 7875


['000121703-1_1_1',
 '000124231-1_1_1',
 '220001481-1_1_1',
 '190101241-1_1_1',
 '100017462-1_1_2',
 '000136731-1_1_1',
 '212001212-1_5_1',
 '210097099-1_5_1',
 '225001756-1_3_1',
 '000107899-1_1_1',
 '000105044-1_1_1',
 '230002016-1_2_1',
 '201008619-1_4_1',
 '230002061-1_5_2',
 '000116852-12_1_1',
 '000115890-1_1_1',
 '190013615-1_1_1',
 '000126109-1_1_1',
 '000126818-1_1_1',
 '000120454-1_1_1',
 '000127235-1_1_1',
 '000127858-1_1_2',
 '200021899-1_75_1',
 '201002394-1_3_2',
 '190007135-1_1_1',
 '200022645-1_32_1',
 '000111391-1_1_1',
 '211002118-1_1_1',
 '000115380-1_1_1',
 '211006664-1_1_1',
 '000142173-1_1_1',
 '100501047-1_1_2',
 '000127352-1_1_1',
 '000100256-1_1_1',
 '000125165-1_1_1',
 '220000638-1_1_2',
 '000119915-1_1_1',
 '190023633-1_1_1',
 '190026744-1_1_1',
 '000140871-1_3_1',
 '000141397-1_2_1',
 '000131611-1_1_1',
 '201008592-1_6_2',
 '000125206-1_2_1',
 '000104189-1_1_1',
 '220015798-1_1_1',
 '210000304-1_6_1',
 '000120963-1_1_1',
 '000130989-1_1_1',
 '100017393-1_1_1

In [4]:
import os
os.getcwd()

'/home/myranda/Documents/DSI/ML/OMR'

In [9]:
#IMAGE DEBUGGING
sample_filepath = primus.training_list[0]
sample_fullpath = corpus + '/' + sample_filepath + '/' + sample_filepath
print(sample_fullpath)

# Get image
sample_img = cv2.imread(sample_fullpath + '.png', 0)
print(sample_img.shape)


./Data/package/000142396-1_1_1/000142396-1_1_1
(130, 1427)


In [6]:
# IMAGE DEBUGGING - MPL
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

PATH = './Data/package/' + sample_filepath + '/' + sample_filepath

img = mpimg.imread(PATH + '.png')
print(img)

[[[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 ...

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]]


In [4]:
img_height = 128
max_epochs = 1
dropout = 0.5

batch_size = 16
vocabulary_size = primus.vocabulary_size
model_cnn = cnn_model(batch_size)
model_rnn = rnn_model(embed_size = 512, hidden_size = 512, vocab_size = primus.vocabulary_size)

In [5]:
# Loss and optimizer

learning_rate = 0.001
criterion = torch.nn.CTCLoss()
optimizer = optim.Adam(model_cnn.parameters(), lr = learning_rate) ## ADD MODEL PARAMS


In [6]:
# Default params
# With image height of 128, width will be 1870
params = dict()
params['img_height'] = img_height
params['img_width'] = None
params['batch_size'] = 16
params['img_channels'] = 1
params['conv_blocks'] = 4
params['conv_filter_n'] = [32, 64, 128, 256]
params['conv_filter_size'] = [ [3,3], [3,3], [3,3], [3,3] ]
params['conv_pooling_size'] = [ [2,2], [2,2], [2,2], [2,2] ]
params['rnn_units'] = 512
params['rnn_layers'] = 2
params['vocabulary_size'] = vocabulary_size


In [None]:
# Input shape for CTC loss
input_shape = (None, params['img_height'])

In [13]:
for epoch in range(max_epochs):
    train_loss = 0.
    valid_loss = 0.
    
    train_acc = 0.
    valid_acc = 0.
    
    for i in range(0, 70880 + 7875, 16):
        batch = primus.nextBatch(params)

        data = batch['inputs']

        targets = ctc_utils.sparse_tuple_from(batch['targets'])
        
        tensor_data = torch.from_numpy(data)
        print(tensor_data.shape)
        tensor_data_reshape = torch.permute(tensor_data,(0,3, 1, 2))
        
        output = model_cnn(tensor_data_reshape)
        print(output.shape)
        
        # Reshape output for RNN
        #features = torch.permute(output, (3, 0, 2, 1))
        output_rnn = model_rnn(features)
        
        #Input and target shape
        input_shape = (None, params['img_height'], tensor_data_reshape.shape[3],1)
        target_shape = batch['seq_lengths']
        
        loss = criterion(output, targets, input_shape, target_shape)

        loss.backward()
        optimizer.step()

        #Calc loss
        train_loss += loss.detach().item()
        train_acc += 0 # ADD ACCURACY
    print(train_loss)

torch.Size([16, 128, 1722, 1])
torch.Size([16, 64, 14, 213])


TypeError: forward() takes 1 positional argument but 2 were given