# Bird Data Download

Download bird data from [Macaulay Library – Collecting, Archiving, and Distributing Wildlife Media Since 1929](https://www.macaulaylibrary.org/#_ga=2.137140912.1870833980.1539159576-815199649.1538639191)

## Meta Data Downloader

In [1]:
import requests
import os
from tqdm import tqdm, trange
from io import BytesIO
import time
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool 
import numpy as np

In [84]:
class BirdMetaDataDownloader(object):
    """Download monthly photo-metadata CSV exports from the Macaulay Library.

    Each request asks the catalog CSV endpoint for photos (``mediaType='p'``)
    taken in one calendar month of one year, optionally restricted to a
    single taxon, and stores the response under ``../data/meta/``.
    """

    CATALOG_URL = "https://search.macaulaylibrary.org/catalog.csv"

    # Responses of this size or smaller are not saved.
    # NOTE(review): presumably 703 bytes is about the size of a header-only
    # (empty) export — TODO confirm against a real empty response.
    MIN_CONTENT_BYTES = 703

    # Seconds to wait on the server before giving up instead of hanging
    # the whole multi-hour download loop.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        # NOTE(review): these are hard-coded browser session cookies; they
        # look session-specific and should probably be loaded from
        # configuration rather than committed with the code.
        self.cookies = {
            "__hssrc": "1",
            "__hstc": "264660688.9cc6bee4b68d06ca45c0fdddce762800.1539180999451.1539180999451.1539180999451.1",
            "_dc_gtm_UA-51396009-1": "1",
            "_ga": "GA1.3.42668022.1539180956",
            "_gid": "GA1.3.2078748312.1539180956",
            "hubspotutk": "9cc6bee4b68d06ca45c0fdddce762800",
            "PIZOTE_SESSIONID": "19E257E0ACD97C2086B7E21786D242DC",
        }

    @staticmethod
    def _build_params(year, month, query=None, taxon_code=None):
        """Return the query parameters for one month of one year."""
        params = {'mediaType': 'p'}
        if taxon_code is not None:
            params['taxonCode'] = taxon_code
        if query is not None:
            params['q'] = query
        params.update({
            'yr': 'YCUSTOM',
            'mr': 'MCUSTOM',
            'sort': 'rating_rank_desc',
            'by': year,
            'ey': year,
            'bmo': month,
            'emo': month,
        })
        return params

    def _fetch_csv(self, params, out_dir, file_name):
        """Request the catalog CSV and save it when it holds data.

        Returns True when a file was written, False otherwise.
        """
        r = requests.get(
            self.CATALOG_URL,
            params=params,
            cookies=self.cookies,
            timeout=self.REQUEST_TIMEOUT,
        )

        # Skip error responses so an HTML/JSON error page is never stored
        # as if it were a metadata CSV.
        if not r.ok or len(r.content) <= self.MIN_CONTENT_BYTES:
            return False

        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, file_name), 'wb') as f:
            f.write(r.content)
        return True

    def download_an_query_in_a_month_in_a_year(self, query, taxon_code, year, month):
        """Save the metadata CSV of one taxon for a single month.

        The file is written to
        ``../data/meta/<taxon_code>/<taxon_code>_<year>_<MM>.csv``.
        Always returns 0.
        """
        self._fetch_csv(
            self._build_params(year, month, query=query, taxon_code=taxon_code),
            os.path.join('..', f'data/meta/{taxon_code}'),
            f'{taxon_code}_{year}_{month:0>2}.csv',
        )
        return 0

    def download_random_query_in_a_month_in_a_year(self, year, month):
        """Save the metadata CSV of an unfiltered search for a single month.

        The file is written to ``../data/meta/random/<year>_<MM>.csv``.
        Always returns 0.
        """
        self._fetch_csv(
            self._build_params(year, month),
            os.path.join('..', 'data/meta/random'),
            f'{year}_{month:0>2}.csv',
        )
        return 0

    def download_an_query_all_time(self, query, taxon_code, start_year=1900, end_year=2019):
        """Download one taxon for every month of ``start_year..end_year-1``.

        ``end_year`` is exclusive, matching ``range``. Always returns 0.
        """
        for year in tqdm(range(start_year, end_year)):
            for month in range(1, 13):
                self.download_an_query_in_a_month_in_a_year(query, taxon_code, year, month)

        return 0

    def download_random_query_all_time(self, start_year=1900, end_year=2019):
        """Download the unfiltered search for every month of the year range.

        ``end_year`` is exclusive, matching ``range``. Always returns 0.
        """
        for year in tqdm(range(start_year, end_year)):
            for month in range(1, 13):
                self.download_random_query_in_a_month_in_a_year(year, month)

        return 0

    def download_all_egrets(self):
        """Download all-time metadata for the four egret species.

        Always returns 0.
        """
        egrets = [
            ('Great Egret - Ardea alba', 'greegr'),
            ('Intermediate Egret - Ardea intermedia', 'integr'),
            ('Little Egret - Egretta garzetta', 'litegr'),
            ('Cattle Egret - Bubulcus ibis', 'categr'),
        ]

        for query, taxon_code in egrets:
            print(f'{query}: ')
            # NOTE(review): presumably this pause lets the printed label
            # appear before the tqdm bar starts drawing — TODO confirm.
            time.sleep(1)
            self.download_an_query_all_time(query, taxon_code)

        return 0

In [81]:
# Create the metadata downloader (this only sets up its request cookies).
bmd_dldr = BirdMetaDataDownloader()

In [82]:
# Fetch the monthly metadata CSVs for the four egret species into ../data/meta/<taxon_code>/.
bmd_dldr.download_all_egrets()

Cattle Egret - Bubulcus ibis: 



  0%|          | 0/43 [00:00<?, ?it/s][A
100%|██████████| 43/43 [13:45<00:00, 19.19s/it]


0

In [85]:
# Fetch the monthly metadata CSVs of the unfiltered search into ../data/meta/random/.
bmd_dldr.download_random_query_all_time()

100%|██████████| 119/119 [44:23<00:00, 22.38s/it]


0

## Merge Meta Data

We select data since 2010 to keep the data aligned and to remove poor-quality data. We do this by creating directories named "old" to store the metadata for the older data.

In [111]:
def merge_meta_data(taxon_code, dir_path):
    """Merge every CSV in ``dir_path`` into ``../data/meta_merge/<taxon_code>.csv``.

    Parameters:
        taxon_code: Label used for the output file name. When it is
            ``"random"``, rows whose ``Common Name`` contains one of the
            four egret names are removed from the merged table.
        dir_path: Directory holding the per-month metadata CSV files.

    Raises:
        ValueError: If ``dir_path`` contains no ``.csv`` files.
    """
    # endswith() rather than a substring test, so files such as
    # "x.csv.bak" are not read; sorted for a reproducible row order.
    file_paths = sorted(
        os.path.join(dir_path, name)
        for name in os.listdir(dir_path)
        if name.endswith('.csv')
    )
    if not file_paths:
        raise ValueError(f'no .csv files found in {dir_path}')

    df = pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)

    if taxon_code == "random":
        egret_pattern = "Great Egret|Intermediate Egret|Little Egret|Cattle Egret"
        # na=True treats rows with a missing name as matches, so they are
        # dropped exactly as the previous `== False` comparisons dropped them.
        is_egret = df.loc[:, 'Common Name'].str.contains(egret_pattern, na=True)
        df = df[~is_egret]

    print(f'{taxon_code}: {len(df)} photos')

    # The output directory is not created anywhere else, so make sure it exists.
    out_dir = os.path.join('..', 'data/meta_merge')
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, f'{taxon_code}.csv'), index=False)

In [112]:
# Taxa to merge: the four egret species plus the unfiltered "random" set.
taxon_codes = ['greegr', 'integr', 'litegr', 'categr', 'random']

# Each taxon's per-month CSV files live in ../data/meta/<taxon_code>.
dir_names = [os.path.join('..', f'data/meta/{code}') for code in taxon_codes]

# Write one merged CSV per taxon.
for taxon_code, dir_name in zip(taxon_codes, dir_names):
    merge_meta_data(taxon_code, dir_name)

greegr: 35002 photos
integr: 2239 photos
litegr: 8480 photos
categr: 19167 photos
random: 105250 photos


## Data Downloader

In [4]:
class BirdDataDownloader(object):
    """Download the photos listed in metadata CSV files.

    Catalog numbers are read from the ``ML Catalog #`` column and each
    image is stored as ``../data/data/<taxon_code>/<ml_id>.jpg``.
    """

    # Taxa handled by the bulk entry points.
    TAXON_CODES = ('greegr', 'integr', 'litegr', 'categr', 'random')

    # Seconds to wait on the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 60

    def download_by_ml_id(self, taxon_code, ml_id):
        """Download one image by catalog number. Always returns 0.

        Responses with an HTTP error status are reported and skipped, so
        an error body is never saved as a ``.jpg`` file.
        """
        url = f"https://download.ams.birds.cornell.edu/api/v1/asset/{ml_id}/large"

        r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if not r.ok:
            print(f'skipped {ml_id}: HTTP {r.status_code}')
            return 0

        out_dir = os.path.join('..', f'data/data/{taxon_code}')
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, f'{ml_id}.jpg'), 'wb') as f:
            f.write(r.content)

        return 0

    def download_by_file_sub(self, param):
        """Thread worker: download every id of a ``(taxon_code, ml_ids)`` pair.

        Always returns 0.
        """
        taxon_code, ml_ids = param

        for ml_id in ml_ids:
            self.download_by_ml_id(taxon_code, ml_id)

        return 0

    def download_by_file(self, taxon_code, file_path):
        """Download, one at a time, every image listed in one CSV file.

        Always returns 0.
        """
        df = pd.read_csv(file_path, usecols=['ML Catalog #'])

        for ml_id in tqdm(df.loc[:, 'ML Catalog #'].values):
            self.download_by_ml_id(taxon_code, ml_id)

        return 0

    def download_by_file_mt(self, taxon_code, file_path, nt=20):
        """Download every image listed in one CSV file using ``nt`` threads.

        The catalog numbers are split into ``nt`` chunks and each chunk is
        handled by one worker thread. Always returns 0.
        """
        df = pd.read_csv(file_path, usecols=['ML Catalog #'])

        chunks = np.array_split(df.loc[:, 'ML Catalog #'].values, nt)
        jobs = [(taxon_code, chunk) for chunk in chunks]

        # The context manager releases the worker threads once map() has
        # returned; previously the pool was never closed.
        with ThreadPool(nt) as pool:
            pool.map(self.download_by_file_sub, jobs)

        return 0

    def download_by_directory(self, taxon_code, dir_path):
        """Download the images listed in every CSV file of a directory.

        Always returns 0.
        """
        # Fixed: this used the non-existent `os.join`; also match on the
        # extension instead of any name merely containing ".csv".
        file_paths = [
            os.path.join(dir_path, name)
            for name in os.listdir(dir_path)
            if name.endswith('.csv')
        ]

        for file_path in file_paths:
            self.download_by_file(taxon_code, file_path)

        return 0

    def download_all(self):
        """Download images for all taxa from the per-month CSV directories.

        Reads ``../data/meta/<taxon_code>`` for each taxon. Always returns 0.
        """
        for taxon_code in self.TAXON_CODES:
            dir_name = os.path.join('..', f'data/meta/{taxon_code}')
            # Fixed: the loop previously passed the undefined name `dir_path`.
            self.download_by_directory(taxon_code, dir_name)

        return 0

    def download_all_from_merge_meta(self, mt=False):
        """Download images for all taxa from the merged CSV files.

        Reads ``../data/meta_merge/<taxon_code>.csv`` for each taxon. With
        ``mt=True`` the download uses 10 threads per file; otherwise it
        runs sequentially. Always returns 0.
        """
        for taxon_code in self.TAXON_CODES:
            file_path = os.path.join('..', f'data/meta_merge/{taxon_code}.csv')
            if mt:
                self.download_by_file_mt(taxon_code, file_path, nt=10)
            else:
                self.download_by_file(taxon_code, file_path)

        return 0

In [5]:
# Create the image downloader.
bd_dlr = BirdDataDownloader()

In [6]:
# Download every image listed in the merged metadata CSVs, using the thread-pool path (mt=True).
bd_dlr.download_all_from_merge_meta(mt=True)

0