<a href="https://colab.research.google.com/github/marccasals98/PiuPiuNet/blob/main/Download_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download bird songs
A script to download bird sound files from the www.xeno-canto.org archives with metadata
The program downloads all the files found with the search terms into subdirectory data/xeno-canto-dataset/searchTerm with corresponding json files.


In [None]:
import urllib.request, json
import sys
import os

bPath = "../data-fixed/xeno-canto-dataset-full-all-Countries/"
basePath = "../data-fixed/xeno-canto-dataset-full-all-Countries/"

## Defining necessary functions
* Creates the subdirectory data/xeno-canto-dataset if necessary
* Downloads and saves json files for number of pages in a query
* and directory path to saved json's

In [None]:
def save_json(searchTerms, birdName, country):
    numPages = 1
    page = 1
    # create a path to save json files and recordings
    path = bPath + birdName+ "/" + country
    if not os.path.exists(path):
        print("Creating subdirectory " + path + " for downloaded files...")
        os.makedirs(path)
        # download a json file for every page found in a query
    while page < numPages + 1:
        print("Loading page " + str(page) + "...")
        url = 'https://www.xeno-canto.org/api/2/recordings?query={0}&page={1}'.format(searchTerms.replace(' ', '%20'),
                                                                                      page)
        print(url)
        jsonPage = urllib.request.urlopen(url)
        jsondata = json.loads(jsonPage.read().decode('utf-8'))
        filename = path + "/jsondata_p" + str(page) + ".json"
        with open(filename, 'w') as outfile:
            json.dump(jsondata, outfile)
        # check number of pages
        numPages = jsondata['numPages']
        page = page + 1
    print("Found ", numPages, " pages in total.")
    # return number of files in json
    # each page contains 500 results, the last page can have less than 500 records
    print("Saved json for ", (numPages - 1) * 500 + len(jsondata['recordings']), " files")
    return path


# reads the json and return the list of values for selected json part
# i.e. "id" - ID number, "type": type of the bird sound such as call or song
# for all Xeno Canto files found with the given search terms.
def read_data(searchTerm, path):
    data = []
    numPages = 1
    page = 1
    # read all pages and save results in a list
    while page < numPages + 1:
        # read file
        with open(path + "/jsondata_p" + str(page) + ".json", 'r') as jsonfile:
            jsondata = jsonfile.read()
        jsondata = json.loads(jsondata)
        # check number of pages
        numPages = jsondata['numPages']
        # find "recordings" in a json and save a list with a search term
        for k in range(len(jsondata['recordings'])):
            data.append(jsondata["recordings"][k][searchTerm])
        page = page + 1
    return data


# downloads all sound files found with the search terms into xeno-canto directory
# into catalogue named after the search term (i.e. Apus apus)
# filename have two parts: the name of the bird in latin and ID number
def download(searchTerms, birdName, country):
    # create data/xeno-canto-dataset directory
    path = save_json(searchTerms, birdName, country)
    # get filenames: recording ID and bird name in latin from json
    filenamesID = read_data('id', path)
    filenamesCountry = read_data('cnt', path)
    # get website recording http download address from json
    fileaddress = read_data('file', path)
    numfiles = len(filenamesID)
    print("A total of ", numfiles, " files will be downloaded")
    for i in range(0, numfiles):
        print("Saving file ", i + 1, "/", numfiles,
              basePath + birdName.replace(':', '') + filenamesID[
                  i] + ".mp3")      
        urllib.request.urlretrieve(fileaddress[i], path + "/" + birdName + filenamesID[i] + ".mp3")

## Download
Example download with all sounds type song, from bird list recorded in selected countries 

*example query*: query = "Dendrocopos major cnt:Poland type:song"
can be downloaded as:

download(query)

Other options can be specified according to xeno-canto query list
http://www.xeno-canto.org/explore?query=common+snipe

In [None]:
countries = ['Spain']
'''
birds = ['Dendrocopos major',
         'Chloris chloris',
         'Corvus frugilegus',
         'Coccothraustes coccothraustes',
         'Columba palumbus',
         'Delichon urbicum',
         'Apus apus',
         'Sitta europaea',
         'Corvus monedula',
         'Phoenicurus ochruros',
         'Turdus merula',
         'Turdus pilaris',
         'Passer montanus',
         'Phylloscopus trochilus',
         'Phylloscopus collybita',
         'Phoenicurus phoenicurus',
         'Motacilla alba',
         'Erithacus rubecula',
         'Streptopelia decaocto',
         'Parus major',
         'Parus caeruleus',
         'Alauda arvensis',
         'Luscinia luscinia',
         'Garrulus glandarius',
         'Turdus philomelos',
         'Pica pica',
         'Troglodytes troglodytes',
         'Carduelis carduelis',
         'Sturnus vulgaris',
         'Emberiza citrinella',
         'Passer domesticus',
         'Corvus corone',
         'Fringilla coelebs']
'''

birds = ['Turdus merula',
         #'Passer domesticus',
         #'Carduelis carduelis',
         #'Sturnus unicolor',
         #'Motacilla alba',
         #'Upupa epops',
         #'Pica pica',
         #'Erithacus rubecula',
         #'Carduelis cannabina',
         #'Galerida cristata',
         #'Miliaria calandra' 
]

'''
birds = ['Sturnus unicolor',
         'Upupa epops',
         'Pica pica',
         'Carduelis cannabina',
         'Miliaria calandra',
         'Merops apiaster'
]
'''
'''
birds = ['Carduelis cannabina',
         'Miliaria calandra',
         'Merops apiaster'
]
'''
for country in countries:
    for bird in birds:
        download(bird + ' cnt:' + country + '&type:song', bird.replace(' ', ''), country)


In [None]:
%matplotlib inline
import matplotlib
matplotlib.interactive(False)
matplotlib.use('Agg')

# find all of the files in the directory
import os
import gc
basePath="../data-fixed/xeno-canto-dataset-full-all-Countries/"
melsPath= "../data-fixed/mels-27class/"

birds=[] # list of all birds
for root, dirs, files in os.walk(basePath):
    if root == basePath:
        birds=dirs
birds50=[]                
flist=[] # list of all files
blist=[] # list of files for one bird 
i50=0;
for i, bird in enumerate(birds):
    for root, dirs, files in os.walk(basePath+bird):
        for file in files:
            if file.endswith(".mp3"):
                blist.append(os.path.join(root, file))
    if len(blist) > 50:
        i50 = i50+1;
        print(i50, ". Found ", len(blist), ' files for ', bird ,'(',i+1,')')
        birds50.append(bird)
        flist.append(blist)
    blist=[]
print(birds50) 
print(root)

1 . Found  130  files for  Meropsapiaster ( 1 )
['Meropsapiaster']
../data-fixed/xeno-canto-dataset-full-all-Countries/Cardueliscannabina/Spain


In [None]:
def saveMel(signal, directory):
    gc.enable()
    # MK_spectrogram modified
    N_FFT = 1024         # 
    HOP_SIZE = 1024      #  
    N_MELS = 128          # Higher   
    WIN_SIZE = 1024      # 
    WINDOW_TYPE = 'hann' # 
    FEATURE = 'mel'      # 
    FMIN = 1400

    fig = plt.figure(1,frameon=False)
    fig.set_size_inches(6,6)

    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    
    S = librosa.feature.melspectrogram(y=signal, sr=sr,
                                        n_fft=N_FFT,
                                        hop_length=HOP_SIZE, 
                                        n_mels=N_MELS, 
                                        htk=True, 
                                        fmin=FMIN, # higher limit ##high-pass filter freq.
                                        fmax=sr/2) # AMPLITUDE
    librosa.display.specshow(librosa.power_to_db(S**2,ref=np.max), fmin=FMIN) #power = S**2

    fig.savefig(directory)
    plt.ioff()
    #plt.show(block=False)
    fig.clf()
    ax.cla()
    plt.clf()
    plt.close('all')

In [None]:
import warnings
warnings.filterwarnings('ignore')

import sys
from tqdm import tqdm_notebook as tqdm
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

size = {'desired': 5, # [seconds]
        'minimum': 4, # [seconds]
        'stride' : 0, # [seconds]
        'name': 5 # [number of letters]
       } # stride should not be bigger than desired length

print('Number of directories to check and cut: ', len(flist))

#step = (size['desired']-size['stride'])*sr # length of step between two cuts in seconds
step=1
if step>0:
    for bird, birdList in enumerate(flist):
        print("Processing ",bird,'. ', birds50[bird], "...")
        for birdnr, path in tqdm(enumerate(birdList)):
            # load the mp3 file
            directory=melsPath+str(bird)+birds50[bird][:size['name']]+"/"

            if not os.path.exists(directory):
                os.makedirs(directory)
                
            if not os.path.exists(directory+path.rsplit('/',1)[1].replace(' ', '')[:-4]+"1_1.png"):
                  
                signal, sr = librosa.load(path) # sr = sampling rate
                step = (size['desired']-size['stride'])*sr # length of step between two cuts in seconds
                
                nr=0;
                for start, end in zip(range(0,len(signal),step),range(size['desired']*sr,len(signal),step)):
                    # cut file and save each piece
                    nr=nr+1
                    # save the file if its length is higher than minimum
                    if end-start > size['minimum']*sr:
                        melpath=path.rsplit('/',1)[1]
                        melpath=directory+melpath.replace(' ', '')[:-4]+str(nr)+"_"+str(nr)+".png"
                        saveMel(signal[start:end],melpath)
                    #print('New file...',start/sr,' - ',end/sr)
                    #print('Start: ',start,'end: ', end, 'length: ', end-start)
                
            pass
else:    
    print("Error: Stride should be lower than desired length.")
    
print('Number of files after cutting: ')

Number of directories to check and cut:  1
Processing  0 .  Meropsapiaster ...


0it [00:00, ?it/s]

Number of files after cutting: 


In [None]:
# Test
import matplotlib.image as mpimg
ilist=[]
for root, dirs, fil


es in os.walk(melsPath):
    print(dirs)
    for file in files:
        if file.endswith(".png"):
            ilist.append(os.path.join(root, file))
img=mpimg.imread(ilist[0])
imgplot = plt.imshow(img)
plt.show()
img=mpimg.imread(ilist[100])
imgplot = plt.imshow(img)
plt.show()
img=mpimg.imread(ilist[70])
imgplot = plt.imshow(img)
plt.show()
img=mpimg.imread(ilist[2035])
imgplot = plt.imshow(img)
plt.show()
print("Found ",len(ilist)," files")

['0Merop']
[]
Found  3515  files



..data-fixed/mels-27class

/data-fixed/mels-27class

In [None]:
!zip -r /content/mels.zip /data-fixed/mels-27class

In [None]:
from google.colab import files
files.download("mels.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Download all the mp3 files of the provided list of birds from Spain

In [None]:
countries = ['Spain']

birds = [#'Turdus merula',
         #'Passer domesticus',
         #'Carduelis carduelis',
         #'Sturnus unicolor',
         #'Motacilla alba',
         #'Upupa epops',
         #'Pica pica',
         #'Erithacus rubecula',
         #'Carduelis cannabina', # any samples found
         #'Galerida cristata',
         #'Miliaria calandra' # any samples found
         'Merops apiaster',
]

for country in countries:
    for bird in birds:
        download(bird + ' cnt:' + country + '&type:song', bird.replace(' ', ''), country)

Creating subdirectory ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster/Spain for downloaded files...
Loading page 1...
https://www.xeno-canto.org/api/2/recordings?query=Merops%20apiaster%20cnt:Spain&type:song&page=1
Found  1  pages in total.
Saved json for  130  files
A total of  130  files will be downloaded
Saving file  1 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster746925.mp3
Saving file  2 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster742422.mp3
Saving file  3 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster740249.mp3
Saving file  4 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster740076.mp3
Saving file  5 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster740074.mp3
Saving file  6 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapiaster740072.mp3
Saving file  7 / 130 ../data-fixed/xeno-canto-dataset-full-all-Countries/Meropsapia

### Splitting mp3 files into Train, Valid and Test. For each bird class split in 80% train, 10% valid and 10% test.

In [None]:
train_path = os.path.join("/data-fixed", 'train')
if not os.path.exists(train_path):
                os.makedirs(train_path)

valid_path = os.path.join("/data-fixed", 'validation')
if not os.path.exists(valid_path):
                os.makedirs(valid_path)

test_path = os.path.join("/data-fixed", 'test')
if not os.path.exists(test_path):
                os.makedirs(test_path)

In [None]:
import random
import math
import shutil
'''
birds = ['Turdusmerula',
         'Passerdomesticus',
         'Cardueliscarduelis',
         'Sturnusunicolor',
         'Motacillaalba',
         'Upupaepops',
         'Picapica',
         'Erithacusrubecula',
         'Galeridacristata',
]
''' 
birds = ['Meropsapiaster']
country = "Spain"

val_percentage = 10
test_percentage = 10

for bird_name in birds:

    print(f'---------[{bird_name}]----------')
    mdata_path = os.path.join("/data-fixed/xeno-canto-dataset-full-all-Countries", bird_name, country)

    mp3_files = [f for f in sorted(os.listdir(mdata_path))]
    print(f'We have a total of {len(mp3_files)} mp3 files.')
    random.shuffle(mp3_files) # randomly shuffle the files

    val_samples = math.floor(val_percentage/100*len(mp3_files))
    print(f'We will have {val_samples} mp3 files from {bird_name} on the Validation split')

    test_samples = math.floor(test_percentage/100*len(mp3_files))
    print(f'We will have {test_samples} mp3 files from {bird_name} on the Test split')
    print(f'We will have {len(mp3_files)-val_samples-test_samples} mp3 files from {bird_name} on the Training split')

    for idx, f in enumerate(mp3_files):

      if idx < val_samples: # copy file to validation folder
          destination_path = os.path.join(valid_path, bird_name, country)
          if not os.path.exists(destination_path):
              os.makedirs(destination_path)

          shutil.copyfile(os.path.join(mdata_path, f), os.path.join(destination_path, f))

      elif idx < val_samples + test_samples: # copy file to test folder
          destination_path = os.path.join(test_path, bird_name, country)
          if not os.path.exists(destination_path):
              os.makedirs(destination_path)

          shutil.copyfile(os.path.join(mdata_path, f), os.path.join(destination_path, f))

      else: # copy file to train folder
          destination_path = os.path.join(train_path, bird_name, country)
          if not os.path.exists(destination_path):
              os.makedirs(destination_path)

          shutil.copyfile(os.path.join(mdata_path, f), os.path.join(destination_path, f))
      
    # Check number of samples on each split

    train_files = [f for f in os.listdir(os.path.join(train_path, bird_name, country))]
    valid_files = [f for f in os.listdir(os.path.join(valid_path, bird_name, country))]
    test_files = [f for f in os.listdir(os.path.join(test_path, bird_name, country))]
    print('\nChecking: ...')
    print(f'There are {len(train_files)}/{len(valid_files)}/{len(test_files)} train/valid/test mp3 files of {bird_name}.')
    print("\n------------------------------------------------------------\n")

---------[Meropsapiaster]----------
We have a total of 131 mp3 files.
We will have 13 mp3 files from Meropsapiaster on the Validation split
We will have 13 mp3 files from Meropsapiaster on the Test split
We will have 105 mp3 files from Meropsapiaster on the Training split

Checking: ...
There are 105/13/13 train/valid/test mp3 files of Meropsapiaster.

------------------------------------------------------------



In [None]:
!zip -r /data-fixed/train-data.zip /data-fixed/train/Meropsapiaster
!zip -r /data-fixed/valid-data.zip /data-fixed/validation/Meropsapiaster
!zip -r /data-fixed/test-data.zip /data-fixed/test/Meropsapiaster

  adding: data-fixed/train/Meropsapiaster/ (stored 0%)
  adding: data-fixed/train/Meropsapiaster/Spain/ (stored 0%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster578317.mp3 (deflated 8%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster179274.mp3 (deflated 1%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster33995.mp3 (deflated 10%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster176058.mp3 (deflated 1%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster392185.mp3 (deflated 2%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster430815.mp3 (deflated 2%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster698379.mp3 (deflated 1%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster392184.mp3 (deflated 2%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster61503.mp3 (deflated 4%)
  adding: data-fixed/train/Meropsapiaster/Spain/Meropsapiaster284637.mp3 (deflated 4%)
  adding: data-

In [None]:
from google.colab import files
files.download("/data-fixed/train-data.zip")
files.download("/data-fixed/valid-data.zip")
files.download("/data-fixed/test-data.zip")