In [1]:
import os 
from os import path
import subprocess
import pydub
from pydub import AudioSegment
import glob
import pandas as pd

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import librosa
import librosa.display
import scipy
import cv2

In [2]:
def create_balckwhite_128(librosa_audio_data, number_of_windows_per_file = 1305):
    
    pic = np.ones(shape=(128,number_of_windows_per_file), dtype=float)*255.0
    for i in range(0,number_of_windows_per_file):
        #multiplying fileds 1 and 2 with 1000 to change them to msec
        #I am using 23 here, because the hop_length is 23
        if((float(librosa_audio_data[1])*1000)<=(i*23) and (i*23 + 46)<float(librosa_audio_data[2])*1000):
            for k in range(0,128):
                pic[k][i]=0 #because we want the balckwhite picture to be in the middle
    return pic

In [3]:
def build_spectrogram_annotate(audio_file_path, i=0, img_path = "img_spectrogram", windows_size = 256, annotationFormat="image"):
    y, sr = librosa.load(audio_file_path, sr=44100)
    S=librosa.feature.melspectrogram(y, sr=44100, n_mels=128, n_fft=2028, hop_length=1014)
    log_S = librosa.power_to_db(S, ref=np.max)

    log_S =np.abs(log_S)/ np.max(np.abs(log_S),axis=0)
    log_S *= (255.0 )
    spectogram_img_path = os.path.join(img_path,"/".join(audio_file_path.split("/")[1:-1]))
    os.makedirs(spectogram_img_path,exist_ok=True) # Extract folder name & create folder if not already existing  : img_spectrogram/Common Whitethroat
    # spectogram_img_path = os.path.join(img_path,"/".join(audio_file_path.split("/")[1:]))[:-4]
    img = log_S[:,:windows_size]
    # img = cv2.resize(log_S, (128, windows_size))
    cv2.imwrite(spectogram_img_path+'/'+str(i)+'.png', img)
    # imageio.imwrite(spectogram_img_path+'/'+str(i)+'.png', log_S)

    
    # annotation
    if annotationFormat=="image":
        # imageio.imwrite(directory_name+'/'+str(i)+'.png', p)
        p = create_balckwhite_128(y, windows_size)
        directory_name = os.path.join(spectogram_img_path,str(i))
    
        os.makedirs(directory_name,exist_ok=True)
        cv2.imwrite(directory_name+'/'+str(i)+'.png', p)


## Convert mp3 to wav and build spectrogram

## Use little sample

In [4]:
annotation_template = """
<annotation>
	<folder>{folderName}</folder>
	<filename>{imageName}</filename>
	<path>{fullNamePath}</path>
	<source>
		<database>Unknown</database>
	</source>
	<size>
		<width>{imgWidth}</width>
		<height>{imgHeight}</height>
		<depth>{channel}</depth>
	</size>
	<segmented>0</segmented>
	<object>
		<name>{className}</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>{bbox_xmin}</xmin>
			<ymin>{bbox_ymin}</ymin>
			<xmax>{bbox_xmax}</xmax>
			<ymax>{bbox_ymax}</ymax>
		</bndbox>
	</object>
</annotation>
"""

In [11]:
!rm -R audio_wav/*
!rm -R temp_datasets/*
!rm -R temp_datasets/annots/*

In [6]:
folder_path = "audio_mp3/**"
dest_folder = "audio_wav"
i = 0
data_list = []
samples = 5
img_path = "temp_datasets"
windows_size = 256
n_mels = 128

In [7]:
# meta_info = {}
for path2file in glob.glob(folder_path):
    # print(path2file)
    cnt = 0
    for files in glob.glob(path2file+"/*.mp3"):
        if(cnt>samples-1):
            break
        temp = files.split(".mp3")[0] # discard the extension
        temp = temp.split("-")
        id = temp[-1]
        classe = temp[-2].split("/")[-1]
        meta_info = {
        "id":i,
        "audio_id":id,
        "class":classe,
        "t_start_sec":"0",
        "t_end_sec":"5"
        }   
        
        new_path = os.path.join(dest_folder, classe)
        os.makedirs(new_path,exist_ok=True)
        data_list.append(meta_info)
        audio_file_path = os.path.join(new_path, str(id + ".wav"))
        subprocess.call([pydub.AudioSegment.converter, '-i', files,audio_file_path])        
        build_spectrogram_annotate(audio_file_path, i, img_path+"/images", windows_size, annotationFormat=None)

        directory_name = os.path.join(img_path,"annots")
    
        os.makedirs(directory_name,exist_ok=True)
        with open(directory_name+'/'+str(i)+'.xml',"w") as fp:
            fp.write(annotation_template.format(folderName = img_path,
                                                imageName =str(i)+".png",
                                                fullNamePath =img_path+"/"+str(i)+".png",
                                                imgWidth = windows_size,
                                                imgHeight = n_mels,
                                                channel = 1,
                                                className = classe.replace(" ","_"),
                                                bbox_xmin = 1,
                                                bbox_ymin = 1,
                                                bbox_xmax = windows_size-1,
                                                bbox_ymax = n_mels-1)
                    )
            fp.close()
        cnt+=1
        i+=1

ffmpeg version 4.0 Copyright (c) 2000-2018 the FFmpeg developers
  built with gcc 7.2.0 (crosstool-NG fa8859cb)
  configuration: --prefix=/home/obasekore/anaconda3/envs/RL_local --cc=/opt/conda/conda-bld/ffmpeg_1531088893642/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-shared --enable-static --enable-zlib --enable-pic --enable-gpl --enable-version3 --disable-nonfree --enable-hardcoded-tables --enable-avresample --enable-libfreetype --disable-openssl --disable-gnutls --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --disable-libx264
  libavutil      56. 14.100 / 56. 14.100
  libavcodec     58. 18.100 / 58. 18.100
  libavformat    58. 12.100 / 58. 12.100
  libavdevice    58.  3.100 / 58.  3.100
  libavfilter     7. 16.100 /  7. 16.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  1.100 /  5.  1.100
  libswresample   3.  1.100 /  3.  1.100
  libpostproc    55.  1.100 / 55.  1.100
Input #0, mp3, from 'audio_mp3/Common Whitethroat

In [8]:
df = pd.DataFrame(data_list)
df.to_csv("datasets/dataset.csv")
df

Unnamed: 0,id,audio_id,class,t_start_sec,t_end_sec
0,0,199781,Common Whitethroat,0,5
1,1,492805,Common Whitethroat,0,5
2,2,619033,Common Whitethroat,0,5
3,3,256109,Common Whitethroat,0,5
4,4,736699,Common Whitethroat,0,5
5,5,518779,Common Blackbird,0,5
6,6,630362,Common Blackbird,0,5
7,7,631014,Common Blackbird,0,5
8,8,598974,Common Blackbird,0,5
9,9,625892,Common Blackbird,0,5


In [None]:
df.groupby("class").head(3)

### copy all images and annotations in the temporary dataset folder to the 
### actual dataset

In [13]:
pwd

'g:\\birdSounds\\Mask_RCNN\\samples\\birdsounds'

In [7]:
!mkdir ${PWD}/datasets/images
!mkdir ${PWD}/datasets/annots
!find ${PWD}/temp_datasets -name '*.png' -exec cp -t ${PWD}/datasets/images {} +
!cp ${PWD}/temp_datasets/annots/*.xml ${PWD}/datasets/annots/*.xml 

File not found - '*.png'


## Train