In [17]:
import tensorflow as tf
# Shorthand for the Keras PReLU layer class; every `prelu()` call below creates
# a fresh layer instance with its own learnable slopes.
# NOTE(review): no `shared_axes` is passed, so one slope is learned per spatial
# position and channel (the model summary in this notebook lists 4,194,304
# parameters for a single PReLU on a 512x512x16 feature map). Consider
# `shared_axes=[1, 2]` if per-channel slopes are what is intended -- confirm.
prelu=tf.keras.layers.PReLU

In [25]:
def encoder_block(inputs, num_filters): 
    """Apply one down-sampling stage: 3x3 'same' convolution, PReLU, 2x2 max-pool.

    Args:
        inputs: feature map fed to the convolution.
        num_filters: number of output channels of the convolution.

    Returns:
        The pooled feature map; the stride-2 pooling halves height and width.
    """
    stages = (
        tf.keras.layers.Conv2D(num_filters, (3, 3), padding='same'),
        prelu(),
        tf.keras.layers.MaxPool2D(pool_size=2, strides=2),
    )
    features = inputs
    for stage in stages:
        features = stage(features)
    return features


def decoder_block(inputs, skip_features, num_filters): 
    """Apply one up-sampling stage and merge it with a skip connection.

    Args:
        inputs: feature map to up-sample.
        skip_features: encoder feature map forwarded to this stage.
        num_filters: number of output channels of the transposed convolution.

    Returns:
        The up-sampled, PReLU-activated map concatenated with `skip_features`.
    """
    layers = tf.keras.layers

    # Stride-2 transposed convolution with 'same' padding doubles height and width.
    upsampled = layers.Conv2DTranspose(
        num_filters, 2, strides=2, padding='same'
    )(inputs)
    activated = prelu()(upsampled)

    # Forward the fine-grained encoder features. `Concatenate` uses its default
    # axis, so `skip_features` must already match the up-sampled map in every
    # other dimension (no resizing is done here).
    return layers.Concatenate()([activated, skip_features])


def tower(inputs, num_filters1, num_filters2, num_filters_target, strides1, strides2, strides3, strides4):
    """Run two parallel conv/pool branches over `inputs` and concatenate them.

    Branch 1: conv(num_filters1) -> max-pool(strides1) -> conv(num_filters_target)
    -> max-pool(strides2), with 'same'-padded pooling and no activation.
    Branch 2: conv(num_filters2) -> PReLU -> max-pool(strides3)
    -> conv(num_filters_target) -> PReLU -> max-pool(strides4), with the
    pooling layers' default padding.

    Each `stridesN` value is used as both the pool size and the stride of its
    pooling layer. All convolutions are 3x3 with 'same' padding.

    Returns:
        The two branch outputs concatenated along axis 1.
    """
    layers = tf.keras.layers

    # Branch 1: plain convolutions, 'same'-padded pooling.
    branch_plain = inputs
    for filters, pool in ((num_filters1, strides1), (num_filters_target, strides2)):
        branch_plain = layers.Conv2D(filters, (3, 3), padding='same')(branch_plain)
        branch_plain = layers.MaxPooling2D(
            pool_size=pool, strides=pool, padding='same'
        )(branch_plain)

    # Branch 2: convolutions followed by PReLU, default-padded pooling.
    branch_prelu = inputs
    for filters, pool in ((num_filters2, strides3), (num_filters_target, strides4)):
        branch_prelu = layers.Conv2D(filters, (3, 3), padding='same')(branch_prelu)
        branch_prelu = prelu()(branch_prelu)
        branch_prelu = layers.MaxPool2D(pool_size=pool, strides=pool)(branch_prelu)

    # NOTE(review): axis=1 is the first dimension after the batch axis, so for
    # channels-last 4-D maps the branches are stacked along height rather than
    # along channels -- confirm this is intended (axis=-1 would merge channels).
    return layers.concatenate([branch_plain, branch_prelu], axis=1)

In [66]:
# Unet code
import tensorflow as tf

def unet_model(input_shape = (512, 512, 3), num_filters=512, num_classes = 10):
    """Build a U-Net-style encoder/decoder followed by tower blocks and a softmax head.

    Block 1 is a contracting path (three `encoder_block` stages plus a
    bottleneck) and an expansive path (three `decoder_block` stages plus a final
    transposed convolution). Block 2 chains four `tower` blocks. The result is
    flattened and classified by a dense softmax layer.

    Args:
        input_shape: shape of one input sample, passed to `tf.keras.layers.Input`.
            The feature maps are halved four times and doubled four times, so
            height and width should be multiples of 16 for the skip
            concatenations to line up.
        num_filters: widest channel count; the other layers use this value
            integer-divided by 2, 4, 8, 16 and 32. Must be at least 32.
        num_classes: number of units of the final softmax layer.

    Returns:
        A `tf.keras.models.Model` named 'U-Net'.

    Raises:
        ValueError: if `num_filters` is smaller than 32, which would leave the
            narrowest layers with zero filters.
    """
    # Keras expects integer filter counts. True division yields floats
    # (e.g. 512 / 32 == 16.0), which only some Keras versions silently cast,
    # so derive every width with integer arithmetic.
    num_filters = int(num_filters)
    if num_filters < 32:
        raise ValueError(
            f"num_filters must be at least 32 so that num_filters // 32 >= 1, got {num_filters}"
        )
    f2, f4, f8, f16, f32 = (num_filters // divisor for divisor in (2, 4, 8, 16, 32))

    inputs = tf.keras.layers.Input(input_shape)

    #       BLOCK 1

    # convolution with prelu
    c1 = tf.keras.layers.Conv2D(f32, (3, 3), padding = 'same')(inputs)
    c1 = prelu()(c1)
    # pool_size=1 with strides=1 leaves the feature map unchanged; the layer is
    # kept so the layer graph stays the same as before.
    c1 = tf.keras.layers.MaxPool2D(pool_size = 1, strides = 1)(c1)

    # Contracting Path: each stage halves height/width and doubles the channels
    s1 = encoder_block(c1, f16)
    s2 = encoder_block(s1, f8)
    s3 = encoder_block(s2, f4)

    # Bottleneck
    b1 = tf.keras.layers.Conv2D(f2, (3, 3), padding = 'same')(s3)
    b1 = prelu()(b1)
    b1 = tf.keras.layers.MaxPool2D(pool_size = 2, strides = 2)(b1)

    # Expansive Path: each stage doubles height/width and merges the encoder
    # output of matching resolution
    s4 = decoder_block(b1, s3, f4)
    s5 = decoder_block(s4, s2, f8)
    s6 = decoder_block(s5, s1, f16)

    # final deconvolution back to the resolution of `c1`
    c2 = tf.keras.layers.Conv2DTranspose(f32, (2, 2), strides = 2, padding = 'same')(s6)

    #       BLOCK 2

    # parallel convolutions
    pc1 = tower(c2,  f4, f8, f4, 1, 1, 1, 1)
    pc2 = tower(pc1, f4, f2, f2, 1, 1, 1, 1)
    pc3 = tower(pc2, f2, f2, num_filters, 1, 4, 2, 2)
    pc4 = tower(pc3, num_filters, num_filters, num_filters, 1, 2, 2, 1)

    #       OUTPUT
    flattened = tf.keras.layers.Flatten()(pc4)
    outputs = tf.keras.layers.Dense(num_classes, activation = 'softmax')(flattened)

    model = tf.keras.models.Model(inputs = inputs, outputs = outputs, name = 'U-Net')
    return model

In [69]:
# Build the network for 512x512 three-channel inputs and 10 output classes
# (`num_filters` keeps its default), then print the layer-by-layer summary.
model = unet_model(input_shape=(512, 512,3), num_classes=10) 
model.summary()

Model: "U-Net"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_19 (InputLayer)       [(None, 512, 512, 3)]        0         []                            
                                                                                                  
 conv2d_79 (Conv2D)          (None, 512, 512, 16)         448       ['input_19[0][0]']            
                                                                                                  
 p_re_lu_111 (PReLU)         (None, 512, 512, 16)         4194304   ['conv2d_79[0][0]']           
                                                                                                  
 max_pooling2d_74 (MaxPooli  (None, 511, 511, 16)         0         ['p_re_lu_111[0][0]']         
 ng2D)                                                                                        

  total_memory_size += weight_shape * per_param_size


In [None]:
import librosa
import matplotlib.pyplot as plt
import numpy as np

# Load the audio file
# NOTE: relative path -- the file has to be in the notebook's working directory.
audio_file = "pop.00006.wav"
# `y` is the waveform and `sr` its sampling rate. No `sr` argument is given, so
# librosa applies its default target rate (22050 Hz per the librosa docs).
y, sr = librosa.load(audio_file)
# Alternative, currently disabled: STFT magnitude spectrogram converted to dB
#D = librosa.stft(y)
#spectrogram = librosa.amplitude_to_db(abs(D), ref=np.max)

In [None]:
# Explicit import: older librosa releases do not expose `librosa.display`
# through a plain `import librosa`.
import librosa.display

# Mel-scaled power spectrogram of the clip; kept in linear power units because
# later cells feed it to the model.
spectrogram=librosa.feature.melspectrogram(y=y,sr=sr)

# Convert to decibels for display only, so the colour bar's "dB" label is true.
spectrogram_db = librosa.power_to_db(spectrogram, ref=np.max)

# Plot the spectrogram; the rows are mel bands, so use the mel frequency axis
plt.figure(figsize=(10, 6))
librosa.display.specshow(spectrogram_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram')
plt.show()

In [None]:
# Add batch and channel axes in a new variable, so re-running this cell does not
# keep stacking extra dimensions onto `spectrogram`.
spectrogram_batch = spectrogram[np.newaxis, ..., np.newaxis]

# The model was built for a fixed input size, while the spectrogram has one
# channel and a different height/width. Resize it to the size the model
# expects and repeat the single channel to match the channel count.
target_height, target_width, target_channels = model.input_shape[1:]
model_input = tf.image.resize(spectrogram_batch, (target_height, target_width))
model_input = tf.repeat(model_input, repeats=target_channels, axis=-1).numpy()

prediction=model.predict(model_input)
prediction