In [1]:
import os
import librosa
import time
import glob
import csv
import cv2
import librosa.display
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd


from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D, Activation
from keras.layers.normalization import BatchNormalization
from numpy import array, asarray, ndarray, swapaxes

Using TensorFlow backend.


### Transforms images from a directory in spectrograms

In [2]:
def audio_to_image(filepath, savepath='./images/'):
    
    filepath = os.path.abspath(filepath)
    savepath = os.path.abspath(savepath)
    mpl.rcParams['savefig.pad_inches'] = 0
    # Reads every audio file from filepath, returns amplitude in time(y) and sample rate(sr)
    for folder in os.listdir(filepath):
        folderpath = '{}/{}'.format(filepath, folder)
        
        for filename in os.listdir(folderpath):
            fullpath = '{}/{}'.format(folderpath, filename)
            print(fullpath)
            audio_wave, sample_rate = librosa.load(fullpath)

            # Applies Fourier transform in audio's amplitude
            fourier = librosa.stft(audio_wave)

            # Converts amplitude to DBs
            """
            ref:scalar or callable
                If scalar, the amplitude abs(S) is scaled relative to ref: 20 * log10(S / ref). 
                Zeros in the output correspond to positions where S == ref.

                If callable, the reference value is computed as ref(S).
            """
            D = librosa.amplitude_to_db(np.abs(fourier), ref=np.max)
            
           
            librosa.display.specshow(D, x_axis='time', y_axis='log')
            plt.axis('off')
            #plt.axis('off')
            
            savefolder = '{}/{}'.format(savepath, folder)
            
            if not os.path.exists(savefolder):
                os.makedirs(savefolder)
            
            plt.savefig(savefolder + '/' + filename.split('.')[0] + '-spectogram.png', bbox_inches='tight',  pad_inches=0)

In [None]:
audio_to_image('./teste/')

### Handles Genres DataSet import and conversion

In [2]:
### Loads audio Data CSV
df_tracks = pd.read_csv('fma_metadata/tracks.csv')
df_tracks.dropna(subset = ['genre_top'], inplace = True)

genres_id = {"Hip-Hop": 1,
             "Pop": 2,
             "Rock": 3,
             "Experimental": 4,
             "Spoken": 5,
             "Folk": 6,
             "Jazz": 7,
             "Electronic": 8,
             "International": 9,
             "Soul-RnB": 10,
             "Blues": 11,
             "Country": 12,
             "Classical": 13,
             "Old-Time / Historic": 14,
             "Instrumental": 15,
             "Easy Listening": 16,
             "1": 3}

df_tracks = df_tracks.replace({"genre_top": genres_id})
df_tracks.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,track_id,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,...,publisher,tags,title,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
0,2,256000,0,,2008-11-26 01:48:12,2008-11-26 00:00:00,168,2,1,[21],...,,[],Food,,,,,,,
1,3,256000,0,,2008-11-26 01:48:14,2008-11-26 00:00:00,237,1,1,[21],...,,[],Electric Ave,,,,,,,
2,5,256000,0,,2008-11-26 01:48:20,2008-11-26 00:00:00,206,6,1,[21],...,,[],This World,,,,,,,
3,10,192000,0,Kurt Vile,2008-11-25 17:49:06,2008-11-26 00:00:00,161,178,2,[10],...,,[],Freeway,,,,,,,
9,134,256000,0,,2008-11-26 01:43:19,2008-11-26 00:00:00,207,3,1,[21],...,,[],Street Music,,,,,,,


### Defines model's training options and params

In [3]:
# Number of samples propagated through the network
batch_size = 1

# Number of epochs  to avoid overfitting
epochs = 4

# Default Training size
training_size = 0.7

# Image's dimensions
img_rows, img_cols = 248, 387 

# Training data size (Where 95 = n of images)
length = int(95*training_size)

### Create train and tests sets from image's directory

In [4]:
def create_set(filepath):
    
    # Initializes train sets
    x_test = []
    x_train = []

    y_test = []
    y_train = []
    
    filepath = os.path.abspath(filepath)
    
    i = 0
    for folder in os.listdir(filepath):
        folderpath = '{}/{}'.format(filepath, folder)
        
        for filename in os.listdir(folderpath):
            fullpath = '{}/{}'.format(folderpath, filename)
            
            current_track_id = filename.split('-')[0]
            current_track_id = int(current_track_id)
            
            # Searchs for current track in df
            genre = df_tracks[df_tracks.track_id == current_track_id]
            
            # Gets genre from pandas Series
            genre = genre.genre_top.to_list()[0]
            
            print(i, fullpath, current_track_id, genre)

            img = cv2.imread(fullpath)

            if(i < length):
                x_train.append(img)
                y_train.append(genre)
            else:
                x_test.append(img)
                y_test.append(genre)
        
            i = i + 1

    return x_train, y_train, x_test, y_test   

In [5]:
x_train, y_train, x_test, y_test = create_set('images')

# Converting the data from lists to numpy arrays
x_train = asarray(x_train)
x_test = asarray(x_test)
y_train = asarray(y_train)
y_test = asarray(y_test)

# Scaling down the RGB data
x_train = x_train / 255
x_test = x_test / 255

# Printing stats about the features
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

0 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001069-spectogram.png 1069 4
1 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001040-spectogram.png 1040 3
2 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001039-spectogram.png 1039 3
3 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001066-spectogram.png 1066 4
4 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033049-spectogram.png 33049 6
5 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033064-spectogram.png 33064 8
6 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033069-spectogram.png 33069 9
7 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033070-spectogram.png 33070 9
8 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033050-spectogram.png 33050 6
9 /home/iago/Documentos/visao_computacional/music-genre-ai/images/033/033020-spectogram.png 33020 3
10 /

### Model's definitions and Convolutions

In [None]:
model = Sequential()

model.add(Conv2D(128,data_format = 'channels_last', kernel_size=(3, 3),
                 input_shape=(img_rows, img_cols,3)))
model.add(Activation('relu'))
model.add(Dropout(0.3))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(32, (2,2)))
model.add(Activation('relu'))
model.add(Dropout(0.2))

# Dense layers
model.add(Flatten())

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.15))

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.15))

model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dropout(0.15))

# Output
model.add(Dense(1))

# Printing model summary
print(model.summary())

# Compiling the model
model.compile(optimizer='RMSprop', loss='mse', metrics=['mae'])

# Training the model
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 246, 385, 128)     3584      
_________________________________________________________________
activation_1 (Activation)    (None, 246, 385, 128)     0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 246, 385, 128)     0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 244, 383, 64)      73792     
_________________________________________________________________
activation_2 (Activation)    (None, 244, 383, 64)      0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 81, 127, 64)       0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 81, 127, 64)       0         
__________

In [1]:
# Testing the model
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

NameError: name 'model' is not defined