In [3]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.5.4.60-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (60.3 MB)
     |████████████████████████████████| 60.3 MB 53.8 MB/s            
Installing collected packages: opencv-python
Successfully installed opencv-python-4.5.4.60


In [1]:
# OMR Model 
# Goal: recognize images of music excerpts

# Modules
import torch
from torch.autograd import Variable
import numpy as np
import pylab as pl
import torch.nn.init as init
import torch.optim as optim
import torch.nn as nn
import cv2

import matplotlib as mpl

class cnn_model(torch.nn.Module):
    def __init__(self, batch_size):
        super(cnn_model, self).__init__()

        kernel_size = [3,3]

        self.conv1 = nn.Conv2d(1, 16, kernel_size = kernel_size)
        self.batch1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16,32, kernel_size = kernel_size)
        self.batch2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32,64, kernel_size = kernel_size)
        self.batch3 = nn.BatchNorm2d(64)

        self.act = nn.LeakyReLU()
        self.pool = nn.MaxPool2d(2,2)

    def forward(self, x):

        # FORWARD PASS
        x = self.conv1(x)
        x = self.batch1(x)
        x = self.act(x)
        x = self.pool(x)

        x = self.conv2(x)
        x = self.batch2(x)
        x = self.act(x)
        x = self.pool(x)

        x = self.conv3(x)
        x = self.batch3(x)
        x = self.act(x)
        x = self.pool(x)

        output = x

        return x

class rnn_model(torch.nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size):
        super(rnn_model, self).__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        #self.rnn = nn.LSTMCell(input_size = embed_size, hidden_size = hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size + 1)

    def forward(self,x, input_size):

        #h0 = torch.zeros(16, x.size(0), self.hidden_size).to(device)
        #c0 = torch.zeros(16, x.size(0), self.hidden_size).to(device)
        
        h0 = torch.zeros(16,self.hidden_size,self.hidden_size)#.to(device)
        c0 = torch.zeros(16, self.hidden_size,self.hidden_size)#.to(device)
        
        self.rnn = nn.LSTMCell(input_size = input_size, hidden_size = self.hidden_size)
        out, _ = self.rnn(x, (h0, c0))


        out = out.reshape(out.shape[0], -1)
        out = self.fc(out)

        return out
   

In [2]:
class BasicRNN(nn.Module):
    def __init__(self, batch_size, n_steps, n_inputs, n_neurons, n_outputs):
        super(BasicRNN, self).__init__()
        
        self.n_neurons = n_neurons
        self.batch_size = batch_size
        self.n_steps = n_steps
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        self.basic_rnn = nn.RNN(self.n_inputs, self.n_neurons) 
        
        self.FC = nn.Linear(self.n_neurons, self.n_outputs)
        
    def init_hidden(self,):
        # (num_layers, batch_size, n_neurons)
        return (torch.zeros(1, self.batch_size, self.n_neurons))
        
    def forward(self, X):
        # transforms X to dimensions: n_steps X batch_size X n_inputs
        #X = X.permute(1, 0, 2) 
        
        self.batch_size = X.size(1)
        self.hidden = self.init_hidden()
        
        # lstm_out => n_steps, batch_size, n_neurons (hidden states for each time step)
        # self.hidden => 1, batch_size, n_neurons (final state from each lstm_out)
        #lstm_out, self.hidden = self.basic_rnn(X, self.hidden)      
        out, self.hidden = nn.basic_rnn(n_inputs, n_neurons)
        out = self.FC(self.hidden)
        
        return out#.view(-1, self.n_outputs) # batch_size X n_output

In [3]:
import ctc_utils
from primus import CTC_PriMuS

In [4]:
# Data
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
corpus = './Data/package'# PATH
set = 'Data/train.txt' 
vocabulary = 'Data/vocabulary_semantic.txt'  
save_model = './trained_\semantic_model'

primus = CTC_PriMuS(corpus, set, vocabulary, semantic = True, val_split = 0.1)
primus.training_list

Training with 70880 and validating with 7875


['000136122-1_2_1',
 '230003636-1_21_2',
 '000123529-1_1_2',
 '000118332-1_1_2',
 '000135764-1_1_1',
 '000110955-1_1_1',
 '190014525-1_1_1',
 '210000218-1_2_1',
 '000122545-1_1_2',
 '000106165-1_1_1',
 '000115764-1_1_1',
 '190101947-1_1_1',
 '000115976-11_1_1',
 '220014638-1_1_2',
 '000102615-1_1_2',
 '211005421-1_4_1',
 '000127845-1_2_1',
 '190015388-1_1_1',
 '200185762-1_1_1',
 '000120336-1_1_1',
 '190018598-1_1_1',
 '211004611-1_5_1',
 '000102431-1_1_1',
 '000136986-1_1_1',
 '000105383-1_1_1',
 '000140766-1_2_1',
 '000126811-1_1_1',
 '212003679-1_1_1',
 '211007011-1_12_1',
 '230001487-1_1_1',
 '110002343-1_2_1',
 '190003571-1_1_1',
 '100016392-1_1_1',
 '000100153-1_2_1',
 '230005816-1_1_1',
 '000104575-1_2_1',
 '180000107-1_8_1',
 '210097285-1_26_1',
 '211004455-1_2_1',
 '000114468-1_1_2',
 '190001219-1_1_1',
 '190001990-1_1_1',
 '000124874-1_1_1',
 '201004334-1_21_1',
 '000142431-1_1_1',
 '225001058-1_55_1',
 '190012417-1_1_1',
 '230002835-1_3_1',
 '220000595-1_1_1',
 '000136800-1_

In [10]:
import os
os.getcwd()

'/home/myranda/Documents/DSI/ML/OMR'

In [11]:
#IMAGE DEBUGGING
sample_filepath = primus.training_list[0]
sample_fullpath = corpus + '/' + sample_filepath + '/' + sample_filepath
print(sample_fullpath)

# Get image
sample_img = cv2.imread(sample_fullpath + '.png', 0)
print(sample_img.shape)


./Data/package/000118390-1_1_2/000118390-1_1_2
(155, 1639)


In [12]:
# IMAGE DEBUGGING - MPL
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

PATH = './Data/package/' + sample_filepath + '/' + sample_filepath

img = mpimg.imread(PATH + '.png')
print(img)

[[[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 ...

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]

 [[1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  ...
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]
  [1. 1. 1. 1.]]]


In [5]:
img_height = 128
max_epochs = 1
dropout = 0.5

batch_size = 16
vocabulary_size = primus.vocabulary_size
model_cnn = cnn_model(batch_size)
model_rnn = rnn_model(embed_size = 512, hidden_size = 512, vocab_size = primus.vocabulary_size)

In [6]:
# Loss and optimizer

learning_rate = 0.001
criterion = torch.nn.CTCLoss()
optimizer_cnn = optim.Adam(model_cnn.parameters(), lr = learning_rate) ## ADD MODEL PARAMS
optimizer_rnn = optim.Adam(model_rnn.parameters(), lr = learning_rate)
optimizer = optim.Adam(list(model_cnn.parameters()) + list(model_rnn.parameters()))

In [7]:
# Default params
# With image height of 128, width will be 1870
params = dict()
params['img_height'] = img_height
params['img_width'] = None
params['batch_size'] = 16
params['img_channels'] = 1
params['conv_blocks'] = 4
params['conv_filter_n'] = [32, 64, 128, 256]
params['conv_filter_size'] = [ [3,3], [3,3], [3,3], [3,3] ]
params['conv_pooling_size'] = [ [2,2], [2,2], [2,2], [2,2] ]
params['rnn_units'] = 512
params['rnn_layers'] = 2
params['vocabulary_size'] = vocabulary_size


In [8]:
# Input shape for CTC loss
input_shape = (None, params['img_height'])

In [74]:
data[2].shape

(128, 2017, 1)

In [12]:
# Train using model_rnn
for epoch in range(max_epochs):
    train_loss = 0.
    valid_loss = 0.
    
    train_acc = 0.
    valid_acc = 0.
    
    for i in range(0, 70880 + 7875, 16):
        batch = primus.nextBatch(params)

        data = batch['inputs']

        targets = ctc_utils.sparse_tuple_from(batch['targets'])
        
        tensor_data = torch.from_numpy(data)
        print(tensor_data.shape)
        tensor_data_reshape = torch.permute(tensor_data,(0,3, 1, 2))
        
        output = model_cnn(tensor_data_reshape)
        print(output.shape)
        #output_size = 64 * 14 * output.shape[3]
        output_size = output.shape[3]
        # Reshape output for RNN
        output = output.view(output.size(0), output.size(3), -1)
        output = output.permute(0,2,1)
        #features = torch.permute(output, (3, 0, 2, 1))
        #features = torch.reshape(features, (16, features.shape[0], 64 * 14)) # width, batch, features
        output_rnn = model_rnn(output, input_size = 64*14)
        
        #Input and target shape
        input_shape = (None, params['img_height'], tensor_data_reshape.shape[3],1)
        target_shape = batch['seq_lengths']
        
        loss = criterion(output, targets, input_shape, target_shape)

        loss.backward()
        optimizer.step()

        #Calc loss
        train_loss += loss.detach().item()
        train_acc += 0 # ADD ACCURACY
    print(train_loss)

torch.Size([16, 128, 2153, 1])
torch.Size([16, 64, 14, 267])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (14336x267 and 896x2048)

In [72]:
14336/896

16.0

In [105]:
# Train using Basic RNN

# Setup
BATCH_SIZE = 16
IMG_HEIGHT = img_height
N_EPOCHS = 1
N_OUTPUTS = vocabulary_size + 1
N_NEURONS = 512
#N_INPUTS = 512
N_INPUTS = 896
model_cnn = cnn_model(BATCH_SIZE)
basic_rnn = BasicRNN(BATCH_SIZE, 1, N_INPUTS, N_NEURONS, N_OUTPUTS)
optimizer = optim.Adam(list(model_cnn.parameters()) + list(basic_rnn.parameters()))
len_data = len(primus.training_list) + len(primus.validation_list)

In [106]:
# Train

for epoch in range(N_EPOCHS):
    train_loss = 0.
    train_acc = 0.
    model_cnn.train()
    basic_rnn.train()
    
    for i in range(0, len_data, BATCH_SIZE):
        # zero parameter gradients
        optimizer.zero_grad()
        
        # reset hidden states
        basic_rnn.hidden = basic_rnn.init_hidden()
        
        # Get inputs
        batch = primus.nextBatch(params)

        data = batch['inputs'] # size (batch, height, width, channels)
        #print(data)
        #print(data.shape)
        max_input_length = data.shape[2]
        
        # list of indices, values, shape
        seq_len = int(batch['seq_lengths'][0])
        targets = ctc_utils.sparse_tuple_from(batch['targets'])
        #print(tuple(targets[2]))
        #print(type(t[0]) for t in targets)
        #targets = torch.sparse_coo_tensor(targets[0], targets[1], tuple(targets[2]))
        targets_0 = torch.as_tensor((targets[0]))
        #print(targets.shape)
        #targets = torch.reshape(targets, (16, 1))
        padded_targets, lengths = ctc_utils.pad_sequences(batch['targets'], maxlen=max_input_length)
        padded_targets_tensor = torch.tensor(padded_targets)
        
        tensor_data = torch.from_numpy(data)
        #print(tensor_data.shape)
        tensor_data_reshape = torch.permute(tensor_data,(0,3, 1, 2))
        
        # forward, backward, optim
        cnn_output = model_cnn(tensor_data_reshape)
        output_size = 64 * 14 * cnn_output.shape[3]
        #print(cnn_output.shape)
        #print(cnn_output[0])
        print(cnn_output[0].shape)
        print(cnn_output[0][0].shape)
        
        # Change shape for rnn
        output = torch.reshape(cnn_output, (cnn_output.shape[3], 16, 64 * 14)) # width, batch, features
        print(output.shape)
        rnn_output = basic_rnn(output)
        print(rnn_output[0].shape)
        print(batch['seq_lengths'])
        
        #rnn_output_reshape = torch.reshape(rnn_output, (cnn_output[0].shape[2], BATCH_SIZE, N_OUTPUTS))
        #rnn_output_reshape = torch.reshape(rnn_output[0], (1, BATCH_SIZE, N_OUTPUTS))
        #rnn_output_reshape = rnn_output[0].view(-1, BATCH_SIZE, N_OUTPUTS)
        
        
        log_probs = nn.functional.log_softmax(rnn_output)
        #Input and target shape
        #print(rnn_output_reshape.shape)
        input_shape = (BATCH_SIZE, params['img_height'], tensor_data_reshape.shape[3],1)
        input_len = tuple([1 for i in range (0, BATCH_SIZE)])
        #print(input_shape)
        target_shape = tuple(int(b) for b in batch['seq_lengths'])
        
        # MUST BE TENSOR, TENSOR, TUPLE, TUPLE OR TENSOR TENSOR TENSOR TENSOR
        #loss = criterion(rnn_output_reshape, padded_targets_tensor, input_len, target_shape)
        #loss = criterion(log_probs, padded_targets_tensor, input_len, target_shape)
        loss = criterion(log_probs, padded_targets_tensor, target_shape, tuple(lengths))
        loss.backward()
        optimizer.step()
        
        train_loss += loss.detach().item()
        train_acc += 0
        print("Loss: %f", train_loss)
        
    #model.eval()
    print('training loss:')
    print(train_loss)
        

torch.Size([64, 14, 179])
torch.Size([14, 179])
torch.Size([179, 16, 896])


TypeError: cannot unpack non-iterable LSTM object

In [35]:
print("DONE")

DONE


In [263]:
28512/16
# 16 times seq_len * n_inputs

1782.0

In [261]:
128 * data.shape[1]

16384

In [146]:
targets[2].shape

(2,)

In [158]:
torch.as_tensor(tuple(batch['targets']))

ValueError: expected sequence of length 26 at dim 1 (got 20)

In [47]:
padded_targets, lengths = ctc_utils.pad_sequences(batch['targets'], maxlen=125)
len(padded_targets)
lengths

array([25, 36, 24, 31, 21, 18, 27, 31, 15, 26, 26, 18, 17, 15, 18, 38])

In [46]:
torch.tensor(padded_targets).shape

torch.Size([16, 125])

In [168]:
padded_targets_list = [torch.tensor(padded_targets[i]) for i in range(0,len(padded_targets))]
padded_targets_list

[tensor([[1.0000e+01, 2.3400e+02, 1.7790e+03, 1.5990e+03, 0.0000e+00, 1.0180e+03,
          1.0180e+03, 1.0180e+03, 1.0180e+03, 1.6470e+03, 1.4830e+03, 1.2370e+03,
          1.0360e+03, 0.0000e+00, 8.2300e+02, 6.0400e+02, 8.5300e+02, 4.0200e+02,
          1.0180e+03, 6.0400e+02, 4.2600e+02, 1.6180e+03, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [7.0000e+00, 2.2800e+02, 1.7800e+03, 1.7220e+03, 9.8300e+02, 0.0000e+00,
          9.8300e+02, 9.8300e+02, 0.0000e+00, 3.8100e+02, 5.6100e+02, 7.7900e+02,
          0.0000e+00, 9.9200e+02, 7.9000e+02, 5.5600e+02, 0.0000e+00, 3.7400e+02,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.000

In [50]:
torch.tensor(lengths)

tensor([29, 31, 23, 26, 37, 27, 22, 23, 13, 20, 20, 21, 17, 30, 24, 25])

In [245]:
len(data[0][1])

1743

In [203]:
rnn_output.shape

torch.Size([16, 1782])

In [209]:
len(data[1])

128

In [130]:
sum(target_shape)

1520

In [27]:
# DEBUGGING
# Train

for epoch in range(N_EPOCHS):
    train_loss = 0.
    train_acc = 0.
    #model.train()
    
    for i in range(0, len_data, BATCH_SIZE):
        # zero parameter gradients
        optimizer.zero_grad()
        
        # reset hidden states
        basic_rnn.hidden = basic_rnn.init_hidden()
        
        # Get inputs
        batch = primus.nextBatch(params)

        data = batch['inputs'] # size (batch, height, width, channels)
        #print(data)
        #print(data.shape)
        max_input_length = data.shape[2]
        
        # list of indices, values, shape
        seq_len = int(batch['seq_lengths'][0])
        targets = ctc_utils.sparse_tuple_from(batch['targets'])
        #print(tuple(targets[2]))
        #print(type(t[0]) for t in targets)
        #targets = torch.sparse_coo_tensor(targets[0], targets[1], tuple(targets[2]))
        targets_0 = torch.as_tensor((targets[0]))
        #print(targets.shape)
        #targets = torch.reshape(targets, (16, 1))
        padded_targets, lengths = ctc_utils.pad_sequences(batch['targets'], maxlen=max_input_length)
        padded_targets_tensor = torch.tensor(padded_targets)
        
        tensor_data = torch.from_numpy(data)
        #print(tensor_data.shape)
        tensor_data_reshape = torch.permute(tensor_data,(0,3, 1, 2))
        
        # forward, backward, optim
        cnn_output = model_cnn(tensor_data_reshape)
        
        # Change shape for rnn
        output = cnn_output.view(cnn_output.size(0), cnn_output.size(1), -1)
        print(output.shape)
        output.permute(2,0,1)
        rnn_output = basic_rnn(output)
        print(rnn_output.shape)
        #print(batch['seq_lengths'])
        
        #rnn_output_reshape = torch.reshape(rnn_output, (cnn_output[0].shape[2], BATCH_SIZE, N_OUTPUTS))
        rnn_output_reshape = torch.reshape(rnn_output, (1, BATCH_SIZE, N_OUTPUTS))
        
        log_probs = nn.functional.log_softmax(rnn_output_reshape)
        #Input and target shape
        #print(rnn_output_reshape.shape)
        input_shape = (BATCH_SIZE, params['img_height'], tensor_data_reshape.shape[3],1)
        input_len = tuple([1 for i in range (0, BATCH_SIZE)])
        #print(input_shape)
        target_shape = tuple(int(b) for b in batch['seq_lengths'])
        
        # MUST BE TENSOR, TENSOR, TUPLE, TUPLE OR TENSOR TENSOR TENSOR TENSOR
        #loss = criterion(rnn_output_reshape, padded_targets_tensor, input_len, target_shape)
        loss = criterion(log_probs, padded_targets_tensor, input_len, target_shape)
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.detach().item()
        train_acc += 0
        
    #model.eval()
    print('training loss:')
    print(train_loss)
        

ModuleNotFoundError: No module named 'ctc_model'

In [22]:
torch.load('./Models/Semantic-Model.zip')

RuntimeError: [enforce fail at inline_container.cc:115] . file in archive is not in a subdirectory: semantic_model.data-00000-of-00001