## Downloading audio data

In [None]:
import json
import os
import urllib.parse
import urllib.request

# Root directory under which all downloaded data is saved.
# Keep the trailing "/": the functions below build paths by plain string
# concatenation (base_path + bird_name + "/" + country).
base_path = "../data/xeno-canto-dataset/"

In [None]:
def save_json(search_terms, bird_name, country):
    """Fetch every result page of a xeno-canto query and save each as JSON.

    Pages are written to ``<base_path><bird_name>/<country>/jsondata_p<N>.json``
    (spaces in ``bird_name`` are replaced by underscores). The number of pages
    to fetch is taken from the ``numPages`` field of each response.

    Args:
        search_terms: Raw (un-encoded) query string, e.g.
            ``"Parus major cnt:Poland type:song"``.
        bird_name: Species name used for the first directory level.
        country: Name used for the second directory level.

    Returns:
        The directory path (a string) the JSON pages were written to.

    Raises:
        urllib.error.URLError: If a page request fails.
        KeyError: If a response lacks ``numPages`` or ``recordings``.
    """
    path = base_path + bird_name.replace(' ', '_') + "/" + country
    if not os.path.exists(path):
        print("Creating subdirectory " + path + " for downloaded files...")
    # exist_ok avoids a crash if the directory appears between the check
    # above and the creation here.
    os.makedirs(path, exist_ok=True)

    # Percent-encode the whole query rather than only its spaces, so quotes,
    # '&', '+' or non-ASCII characters cannot corrupt the URL. ':' is kept
    # literal because the query syntax uses tags such as "cnt:" and "type:".
    query = urllib.parse.quote(search_terms, safe=':')

    num_pages = 1
    page = 1
    num_recordings = 0
    while page < num_pages + 1:
        print("Loading page " + str(page) + "...")
        url = 'https://www.xeno-canto.org/api/2/recordings?query={0}&page={1}'.format(
            query, page)
        print(url)
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(url) as json_page:
            json_data = json.loads(json_page.read().decode('utf-8'))
        filename = path + "/jsondata_p" + str(page) + ".json"
        with open(filename, 'w') as outfile:
            json.dump(json_data, outfile)
        # The total page count is only known once a response has been read.
        num_pages = json_data['numPages']
        # Count what was actually saved instead of assuming a fixed page size.
        num_recordings += len(json_data['recordings'])
        page = page + 1
    print("Found ", num_pages, " pages in total.")
    print("Saved json for ", num_recordings, " files")
    return path



In [None]:
def read_data(search_term, path):
    """Collect one field from every recording in the saved JSON pages.

    Reads ``jsondata_p1.json``, ``jsondata_p2.json``, ... from ``path``,
    continuing for as many pages as the ``numPages`` field of the most
    recently read page reports.

    Args:
        search_term: Key to extract from each recording (e.g. ``'id'``).
        path: Directory containing the ``jsondata_p<N>.json`` files.

    Returns:
        A list with the ``search_term`` value of each recording, in page order.
    """
    values = []
    total_pages = 1
    current = 1

    while current <= total_pages:
        page_file = path + "/jsondata_p" + str(current) + ".json"
        with open(page_file, 'r') as handle:
            payload = json.loads(handle.read())
        # Each page carries the overall page count; refresh it every time.
        total_pages = payload['numPages']
        values.extend([entry[search_term] for entry in payload['recordings']])
        current += 1
    return values



In [None]:
def download(search_terms, bird_name, country):
    """Download every recording matching a xeno-canto query as an MP3 file.

    Saves the query's JSON result pages with ``save_json``, reads the
    recording ids and file addresses back with ``read_data``, and stores each
    recording as ``<id>.mp3`` in the directory returned by ``save_json``.

    Args:
        search_terms: Query string, passed through to ``save_json``.
        bird_name: Species name, passed through to ``save_json``.
        country: Subdirectory name, passed through to ``save_json``.
    """
    path = save_json(search_terms, bird_name, country)
    filenames_id = read_data('id', path)
    file_addresses = read_data('file', path)

    num_files = len(filenames_id)
    print("A total of ", num_files, " files will be downloaded")

    for index, (file_id, address) in enumerate(zip(filenames_id, file_addresses), start=1):
        # Report the path the file is really written to (includes the country
        # subdirectory), not just the species directory.
        target = path + "/" + file_id + ".mp3"
        print("Saving file ", index, "/", num_files, target)
        if not address:
            # An empty address cannot be fetched; skip it rather than abort
            # the whole batch on an invalid "http:" URL.
            print("No file address for recording " + file_id + ", skipping.")
            continue
        # Scheme-less addresses (e.g. "//host/...") need the "http:" scheme
        # prepended; full URLs are used unchanged.
        url = address if address.startswith("http") else "http:" + address
        urllib.request.urlretrieve(url, target)



In [None]:
# Countries and bird species (scientific names) to query on xeno-canto
countries = ['Poland', 'Germany', 'Slovakia', 'Czech', 'Lithuania']
birds = [
    'Dendrocopos major',
    'Chloris chloris',
    'Corvus frugilegus',
    'Coccothraustes coccothraustes',
    'Columba palumbus',
    'Delichon urbicum',
    'Apus apus',
    'Sitta europaea',
    'Corvus monedula',
    'Phoenicurus ochruros',
    'Turdus merula',
    'Turdus pilaris',
    'Passer montanus',
    'Phylloscopus trochilus',
    'Phylloscopus collybita',
    'Phoenicurus phoenicurus',
    'Motacilla alba',
    'Erithacus rubecula',
    'Streptopelia decaocto',
    'Parus major',
    'Parus caeruleus',
    'Alauda arvensis',
    'Luscinia luscinia',
    'Garrulus glandarius',
    'Turdus philomelos',
    'Pica pica',
    'Troglodytes troglodytes',
    'Carduelis carduelis',
    'Sturnus vulgaris',
    'Emberiza citrinella',
    'Passer domesticus',
    'Corvus corone',
    'Fringilla coelebs',
]

# First pass: song recordings of every species, restricted to each country.
# The species name is passed without spaces, so it becomes the directory name.
for country in countries:
    for bird in birds:
        search_terms = '{0} cnt:{1} type:song'.format(bird, country)
        download(search_terms, bird.replace(' ', ''), country)

# Second pass: song recordings of every species with no country filter,
# stored under a subdirectory literally named 'countries'.
for bird in birds:
    search_terms = '{0} type:song'.format(bird)
    download(search_terms, bird.replace(' ', ''), 'countries')
