In [1]:
import os
import random
import numpy as np
from skimage.color import gray2rgb

import tensorflow as tf
from tensorflow.keras.applications import nasnet, mobilenet_v2
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dot, Concatenate, Softmax, GlobalAveragePooling2D, Dropout
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import FalsePositives, Precision

from config import models_folder, output_data_folder
from config import n_mels

In [2]:
# Expose only CUDA device 1 to this process. This has to be in place before the
# CUDA runtime is initialised (i.e. before the first GPU op / device query).
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# Side length of the square images fed to the network. It is taken from the
# project config's `n_mels` (presumably the number of mel bands, i.e. the
# spectrogram height -- confirm in config.py) so a slice spans the full height.
IMG_HEIGHT = n_mels

In [4]:
# Backbone: MobileNetV2 without its classification head (include_top=False),
# rebuilt for square IMG_HEIGHT x IMG_HEIGHT RGB inputs.
# nasnet.NASNetMobile (with include_top=True) was tried earlier as an alternative.
base_model = mobilenet_v2.MobileNetV2(
    input_shape=(IMG_HEIGHT, IMG_HEIGHT, 3),
    include_top=False,
)
base_model.summary()

Model: "mobilenetv2_1.00_128"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
Conv1_pad (ZeroPadding2D)       (None, 129, 129, 3)  0           input_1[0][0]                    
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 64, 64, 32)   864         Conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_Conv1 (BatchNormalization)   (None, 64, 64, 32)   128         Conv1[0][0]                      
_______________________________________________________________________________

In [5]:
# images = mobilenet_v2.preprocess_input(images)
# preds = model.predict(images)
# preds = mobilenet_v2.decode_predictions(preds)
# preds

### TODO: Create generator for contrastive training
* Read numpy spectrograms
* Convert spectrograms to RGB
* Generate batches (with labels)
* Resize batches

### TODO: Contrastive Classifier
* Add encoding layer
* Add classifier
* Freeze relevant weights

### TODO: Overall
* Train classifier to test if it runs
* Train classifier with validation and model saving
* Build binary classifier

In [6]:
class DataGenerator:
    """Endless generator of (query, candidates) batches cut from spectrogram files.

    Each batch holds one query image followed by ``batch_size`` candidate images.
    Exactly one candidate is sliced from the same spectrogram as the query (the
    positive); the remaining candidates come from other spectrograms. The label
    is a one-hot row marking the positive's position among the candidates.
    """

    def __init__(self, spectrogram_samples_files, batch_size, num_batches, num_sub_samples, img_height):
        """Store the sampling configuration; no file is read here.

        Args:
            spectrogram_samples_files: list of file paths loadable with ``np.load``.
            batch_size: candidates per batch (1 positive, ``batch_size - 1`` negatives).
            num_batches: batches produced before a fresh set of files is drawn.
            num_sub_samples: number of files drawn (without replacement) per epoch.
            img_height: expected spectrogram height; slices are square with this side.
        """
        self.spectrogram_samples_files = spectrogram_samples_files   # list of filepaths
        self.batch_size = batch_size   # 1 positive, n-1 negatives
        self.num_batches = num_batches   # num batches per epoch
        self.num_sub_samples = num_sub_samples   # num sub-samples per epoch
        self.img_height = img_height   # height of square img to be generated in the batches
        self.sub_samples = []   # list of RGB converted spectrograms

    def generate_batches(self):
        """Yield ``(batch_imgs, labels)`` tuples forever.

        ``batch_imgs`` is a list of ``batch_size + 1`` arrays (query first, then
        the candidates), each with a leading axis of length 1. ``labels`` is an
        integer array of shape ``(1, batch_size)`` with a single 1 at the
        positive candidate's position.

        Raises:
            ValueError: if ``batch_size`` exceeds ``num_sub_samples`` (raised by
                ``random.sample``), or a spectrogram is narrower than its height.
        """
        while True:
            # A new random subset of files is loaded at the start of every epoch.
            self.create_sub_samples()
            for _ in range(self.num_batches):
                chosen = random.sample(range(self.num_sub_samples), self.batch_size)   # sample a batch
                pos_idx = chosen[0]   # positive sample
                # The query is an independent random slice of the positive spectrogram.
                query_img = self.get_img_slice_from_spectrogram(self.sub_samples[pos_idx])
                # Shuffle so the positive does not always sit in the first slot.
                random.shuffle(chosen)
                candidate_imgs = [
                    self.get_img_slice_from_spectrogram(self.sub_samples[idx]) for idx in chosen
                ]
                labels = np.asarray([[int(idx == pos_idx) for idx in chosen]])
                batch_imgs = [
                    np.asarray([self._normalize(img)])   # add leading axis of length 1
                    for img in [query_img, *candidate_imgs]
                ]
                yield (batch_imgs, labels)

    def create_sub_samples(self):
        """Reload ``self.sub_samples`` from a fresh random subset of the files.

        Raises:
            ValueError: if ``num_sub_samples`` exceeds the number of files
                (raised by ``random.sample``).
            AssertionError: if a loaded spectrogram is not 2D or its height
                differs from ``img_height``.
        """
        self.sub_samples = []   # reset
        files = random.sample(self.spectrogram_samples_files, self.num_sub_samples)   # sampling without replacement
        for file in files:
            spectrogram = np.load(file)
            assert spectrogram.shape[0] == self.img_height, "Input spectrogram height does not match img height"
            self.sub_samples.append(self.spectrogram_to_RGB(spectrogram))   # RGB converted spectrograms

    @staticmethod
    def _normalize(img):
        """Scale ``img`` by its peak absolute value so it lies in [-1, 1].

        An all-zero image is returned as zeros instead of being divided by
        zero, which would fill the batch with NaNs.
        """
        peak = np.amax(np.absolute(img))
        # `!= 0` (rather than `> 0`) leaves NaN peaks untouched, as before.
        return img / (peak if peak != 0 else 1)

    @classmethod
    def spectrogram_to_RGB(cls, spectrogram):
        """Convert a 2D spectrogram to a 3-channel image with ``gray2rgb``."""
        assert len(spectrogram.shape) == 2, "Spectrogram input should be a 2D array"
        return gray2rgb(spectrogram)

    @classmethod
    def get_img_slice_from_spectrogram(cls, spectrogram):
        """Return a random window of the spectrogram that is as wide as it is tall.

        The window start is drawn uniformly from every valid position,
        including the last one (``width - height``).

        Raises:
            ValueError: if the spectrogram is narrower than it is tall.
        """
        height = spectrogram.shape[0]
        last_start = spectrogram.shape[1] - height
        if last_start < 0:
            raise ValueError(
                "Spectrogram width (%d) is smaller than its height (%d)"
                % (spectrogram.shape[1], height)
            )
        # random.randint includes both endpoints, so no extra "- 1" is needed.
        slice_start = random.randint(0, last_start)
        return spectrogram[:, slice_start:slice_start + height]
        

In [7]:
# training_folder = os.path.join(output_data_folder, "training_dataset_full_spectrogram/vox1_dev_wav")
# spectrogram_samples_files = [os.path.join(training_folder, file) for file in os.listdir(training_folder)]
# batch_size = 5
# num_batches = 5
# # num_sub_samples = 100
# num_sub_samples = 10

# data_generator = DataGenerator(spectrogram_samples_files, batch_size, num_batches, num_sub_samples, IMG_HEIGHT)
# i = 0
# for batch in data_generator.generate_batches():
#     batch_imgs, labels = batch
#     print(batch_imgs.shape)
#     for img in batch_imgs: print(img[0:2,0,0])
#     print(labels)
#     i += 1
#     if i == 5: break

In [8]:
# ### Validation data

# validation_data_size = 50

# training_folder = os.path.join(output_data_folder, "training_dataset_full_spectrogram/vox1_test_wav")
# spectrogram_samples_files = [os.path.join(training_folder, file) for file in os.listdir(training_folder)]
# batch_size = 5
# num_batches = validation_data_size
# num_sub_samples = 10
# # num_sub_samples = len(spectrogram_samples_files)

# x_val = []
# y_val = []
# i = 0
# validation_data_generator = DataGenerator(spectrogram_samples_files, batch_size, num_batches, num_sub_samples, IMG_HEIGHT)
# for batch in validation_data_generator.generate_batches():
#     x_val.append(batch[0])
#     y_val.append(batch[1])
#     i += 1
# #     print(batch[0].shape)
#     print(batch[0])
#     print(type(batch[0]))
# #     print(batch[1].shape)
#     print(batch[1])
#     print(type(batch[1]))
#     if i == validation_data_size: break

# validation_data = tf.data.Dataset.from_tensor_slices((x_val, y_val)) 
# validation_data

In [9]:
### Training data

# Sampling hyper-parameters
batch_size = 3   # candidates per batch: 1 positive + (batch_size - 1) negatives
num_batches = 1000   # batches per epoch
num_sub_samples = 70   # spectrogram files loaded per epoch
# num_sub_samples = 20

# Every entry in the training folder is treated as a spectrogram file.
training_folder = os.path.join(output_data_folder, "training_dataset_full_spectrogram/vox1_dev_wav")
spectrogram_samples_files = [
    os.path.join(training_folder, entry) for entry in os.listdir(training_folder)
]

training_data_generator = DataGenerator(
    spectrogram_samples_files=spectrogram_samples_files,
    batch_size=batch_size,
    num_batches=num_batches,
    num_sub_samples=num_sub_samples,
    img_height=IMG_HEIGHT,
)
# Disabled alternative: wrap the generator in a tf.data pipeline.
# training_data_generator = tf.data.Dataset.from_generator(
#     training_data_generator.generate_batches,
#     output_types=(tf.float64, tf.int32)
# )

In [10]:
### Create multi-siamese model

# Inputs: one query image plus `batch_size` candidate images, all square RGB.
input_shape = (IMG_HEIGHT, IMG_HEIGHT, 3)
input_query_img = Input(shape=input_shape)
inputs_batch_imgs = [Input(shape=input_shape) for _ in range(batch_size)]

# Shared encoder: backbone -> flatten -> 128-unit dense projection (no activation).
# The backbone stays trainable; to freeze it, set `layer.trainable = False`
# on every layer of `base_model` before building the encoder.
input_ = Input(shape=input_shape)
encoded = base_model(input_)
encoded = Flatten()(encoded)
# Alternatives tried: GlobalAveragePooling2D() instead of Flatten(), and
# Dropout(0.3, name="dropout") before the projection.
encoded = Dense(128, name="encoder")(encoded)
base_encoding_model = Model(inputs=input_, outputs=encoded)   # create model

# The same encoder instance (shared weights) is applied to the query and to every candidate.
encoding_model_query_img = base_encoding_model(input_query_img)
encoding_models_batch_imgs = [
    base_encoding_model(candidate_input) for candidate_input in inputs_batch_imgs
]

# Cosine similarity layer: Dot with normalize=True L2-normalises both vectors first.
query_encoder = encoding_model_query_img
cosine_similarity_batch_imgs = [   # cosine sim for all batch encodings vs query encoding
    Dot(axes=1, normalize=True)([query_encoder, batch_encoder])
    for batch_encoder in encoding_models_batch_imgs
]

# Output: concatenate the similarities and turn them into a distribution over candidates.
# NOTE(review): the similarities lie in [-1, 1], so the softmax can never get
# close to one-hot; a temperature/scale factor may be worth trying.
output = Concatenate(name="concat_cosine_sim")(cosine_similarity_batch_imgs)   # concat cosine sim.
# output = Dense(batch_size, activation="softmax", name="Output")(output)
output = Softmax(name="softmax")(output)

# Overall model: inputs are ordered query first, then candidates, matching
# the order in which DataGenerator.generate_batches builds its input list.
inputs = [input_query_img, *inputs_batch_imgs]
outputs = output
model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss=categorical_crossentropy,
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy', FalsePositives(), "mean_absolute_error", Precision()]
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 128, 128, 3) 0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 128, 128, 3) 0                                            
____________________________________________________________________________________________

In [11]:
### Train

epochs = 50

# generate_batches() never terminates, so `steps_per_epoch` is what ends an
# epoch. Using `num_batches` keeps Keras epochs aligned with the generator's
# own reload of spectrogram files, which happens every `num_batches` batches.
# Validation data and multiprocessing workers are not used yet.
model.fit(
    training_data_generator.generate_batches(),
    steps_per_epoch=num_batches,
    epochs=epochs,
    verbose=1,
)


  ...
    to  
  ['...']
Train for 1000 steps
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50


Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x27dc1690048>

In [12]:
# Sanity check: list the GPUs TensorFlow can see in this process.
tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [13]:
# NOTE(review): this repeats the assignment made near the top of the notebook.
# TensorFlow has already enumerated its GPUs by this point, so this probably
# has no effect on the running kernel -- consider removing the cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"