In [4]:
import os
import librosa
import time
import glob
import csv
import cv2
import librosa.display
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd


from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D, Activation
from keras.layers.normalization import BatchNormalization
from numpy import array, asarray, ndarray

### Transforms images from a directory in spectrograms

In [None]:
def audio_to_image(filepath, savepath='./images/'):
    
    filepath = os.path.abspath(filepath)
    savepath = os.path.abspath(savepath)
    mpl.rcParams['savefig.pad_inches'] = 0
    # Reads every audio file from filepath, returns amplitude in time(y) and sample rate(sr)
    for folder in os.listdir(filepath):
        folderpath = '{}/{}'.format(filepath, folder)
        
        for filename in os.listdir(folderpath):
            fullpath = '{}/{}'.format(folderpath, filename)
            print(fullpath)
            audio_wave, sample_rate = librosa.load(fullpath)

            # Applies Fourier transform in audio's amplitude
            fourier = librosa.stft(audio_wave)

            # Converts amplitude to DBs
            """
            ref:scalar or callable
                If scalar, the amplitude abs(S) is scaled relative to ref: 20 * log10(S / ref). 
                Zeros in the output correspond to positions where S == ref.

                If callable, the reference value is computed as ref(S).
            """
            D = librosa.amplitude_to_db(np.abs(fourier), ref=np.max)
            
           
            librosa.display.specshow(D, x_axis='time', y_axis='log')
            plt.axis('off')
            #plt.axis('off')
            
            savefolder = '{}/{}'.format(savepath, folder)
            
            if not os.path.exists(savefolder):
                os.makedirs(savefolder)
            
            plt.savefig(savefolder + '/' + filename.split('.')[0] + '-spectogram.png', bbox_inches='tight',  pad_inches=0)

In [None]:
audio_to_image('./teste/')

In [63]:
### Loads audio Data CSV
df_tracks = pd.read_csv('fma_metadata/tracks.csv')
df_tracks.dropna(subset = ['genre_top'], inplace = True)

In [52]:
# Training options

# Number of samples propagated through the network
batch_size = 25
epochs = 1
training_size = 0.7

# Image's dimensions
img_rows, img_cols = 224, 224


x_test = []
x_train = []

y_test = []
y_train = []
tempY = []

# Training data size
length = int(95*training_size)

In [101]:
def create_set(filepath):
    
    filepath = os.path.abspath(filepath)
    
    i = 0
    for folder in os.listdir(filepath):
        folderpath = '{}/{}'.format(filepath, folder)
        
        for filename in os.listdir(folderpath):
            fullpath = '{}/{}'.format(folderpath, filename)
            
            current_track_id = filename.split('-')[0]
            genre = df_tracks[df_tracks.track_id == current_track_id]
            #genre.reset_index()
            #print(genre)
            #genre = genre.at[0, 'genre_top']
            
            print(i, fullpath, current_track_id, genre)
            #genre = genre.genre_top.to_list()[0]
            
            
            img = cv2.imread(fullpath)
            
            if(i < length):
                x_train.append(array(img))
                y_train.append(genre)
            else:
                x_test.append(array(img))
                y_test.append(genre)
        
        i = i + 1

    return x_train, y_train, x_test, y_test   

In [102]:
x_train, y_train, x_test, y_test = create_set('images')

# Converting the data from lists to numpy arrays

x_train = asarray(x_train, dtype = float)
x_test = asarray(x_test, dtype = float)
y_train = asarray(y_train, dtype = float)
y_test = asarray(y_test, dtype = float)

# Scaling down the RGB data
x_train /= 255
x_test /= 255

# Printing stats about the features
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

0 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001069-spectogram.png 001069 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
0 /home/iago/Documentos/visao_computacional/music-genre-ai/images/001/001040-spectogram.png 001040 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
0 /home/iago/Documentos/visao_computacional/music-genre-ai/image

4 /home/iago/Documentos/visao_computacional/music-genre-ai/images/030/030519-spectogram.png 030519 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
4 /home/iago/Documentos/visao_computacional/music-genre-ai/images/030/030520-spectogram.png 030520 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
4 /home/iago/Documentos/visao_computacional/music-genre-ai/image

7 /home/iago/Documentos/visao_computacional/music-genre-ai/images/038/038354-spectogram.png 038354 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
8 /home/iago/Documentos/visao_computacional/music-genre-ai/images/000/000210-spectogram.png 000210 Empty DataFrame
Columns: [track_id, bit_rate, comments, composer, date_created, date_recorded, duration, favorites, genre_top, genres, genres_all, information, interest, language_code, license, listens, lyricist, number, publisher, tags, title, Unnamed: 21, Unnamed: 22, Unnamed: 23, Unnamed: 24, Unnamed: 25, Unnamed: 26, Unnamed: 27]
Index: []

[0 rows x 28 columns]
8 /home/iago/Documentos/visao_computacional/music-genre-ai/image

ValueError: setting an array element with a sequence.

In [None]:
#defining the model
model = Sequential()
#input
model.add(Conv2D(128,data_format = 'channels_last', kernel_size=(3, 3),
                 input_shape=(img_rows, img_cols,3)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
#convolutions

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.3))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(64, (2,2)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(32, (2,2)))
model.add(Activation('relu'))
model.add(Dropout(0.2))

#dense layers
model.add(Flatten())

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.15))

model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dropout(0.15))

model.add(Dense(8))
model.add(Activation('relu'))
model.add(Dropout(0.15))

#output
model.add(Dense(1))

#printing model summary
print(model.summary())

#compiling the model
model.compile(optimizer='RMSprop', loss='mse', metrics=['mae'])

#training the model
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

#testing the model
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [77]:
teste = int("001069")
df_tracks[df_tracks.track_id == teste]

Unnamed: 0,track_id,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,...,publisher,tags,title,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27
806,1069,192000,0,,2008-11-26 02:33:08,2005-04-15 00:00:00,110,6,Experimental,[22],...,,[],Un Lagrima en la Discoteca,,,,,,,
