In [1]:
import requests
import time
import os
import zipfile
import pandas as pd

<h3>Non-chordata request for GBIF</h3>
Excludes Chordata <b>except</b> tunicates (Ascidiacea, Thaliacea, Appendicularia) </br>
<b>and</b> lancelets (Leptocardii) </br>
Includes only Animalia


In [2]:

request = '''{
    "creator": "mihtmo",
    "notificationAddresses": [
        "mitch.d.webb@gmail.com"
    ],
    "format": "SIMPLE_CSV",
    "sendNotification": true,
    "predicate": {
        "type": "and",
        "predicates": [
            {
                "type": "not",
                "predicate": {
                    "type": "equals", 
                    "key": "PHYLUM_KEY", 
                    "value": "44"
                }
            },
            {
                "type": "equals",
                "key": "KINGDOM_KEY",
                "value": "1"
            },
            {
                "type": "in",
                "key": "DATASET_KEY",
                "values": [
                    "96193ea2-f762-11e1-a439-00145eb45e9a", 
                    "50c9509d-22c7-4a22-a47d-8c48425ef4a7",
                    "ba9984d8-d982-4fe6-b81c-a7585790034a"
                ]
            }
        ]
    }
}'''

In [None]:
url = 'occurrence/download/request'
# # Currently UTIC (ba9984d8-d982-4fe6-b81c-a7585790034a), A&M (96193ea2-f762-11e1-a439-00145eb45e9a), and iNat Research Grade (50c9509d-22c7-4a22-a47d-8c48425ef4a7)

# TODO:
# This is, for some reason, only getting records from the iNat catalogue. Delve into this.

request = '''{
    "creator": "mihtmo",
    "notificationAddresses": [
        "mitch.d.webb@gmail.com"
    ],
    "format": "DWCA",
    "sendNotification": true,
    "predicate": {
        "type": "and",
        "predicates": [
            {
                "type": "or",
                "predicates": [
                    {
                        "type": "not",
                        "predicate": {
                            "type": "equals", 
                            "key": "PHYLUM_KEY", 
                            "value": "44"
                        }
                    },
                    {
                        "type": "in",
                        "key": "CLASS_KEY",
                        "values": [
                            "207",
                            "356",
                            "211",
                            "7375758"
                        ]
                    }
                ]
            },
            {
                "type": "equals",
                "key": "KINGDOM_KEY",
                "value": "1"
            },
            {
                "type": "in",
                "key": "DATASET_KEY",
                "values": [
                    "96193ea2-f762-11e1-a439-00145eb45e9a", 
                    "50c9509d-22c7-4a22-a47d-8c48425ef4a7",
                    "ba9984d8-d982-4fe6-b81c-a7585790034a"
                ]
            },
            {
                "type": "equals",
                "key": "OCCURRENCE_STATUS",
                "value": "PRESENT"
            }
        ]
    }
}'''

In [3]:
def GBIF_request(endpoint: str, request_body: str, test=False):
    
    headers = {
        "Content-Type": "application/json"
    }
    
    if test == True:
        GBIF_url = "http://api.gbif-uat.org/v1/"
    else:
        GBIF_url = "http://api.gbif.org/v1/"
    
    full_url = GBIF_url + endpoint
    
    try:
        response = requests.post(full_url, data=request_body, auth=("mihtmo", "NmRC3yhF2jz#3L"), headers=headers)
        if response.status_code == 201:
            print('Download request submitted successfully.')
            print(f'Find this download request at https://www.gbif.org/occurrence/download/{response.text}')
            return response.text
        elif response.status_code == 400:
            print(f'Error: {response.status_code}: Invalid Query')
            return 
        elif response.status_code == 429:
            print(f'Error: {response.status_code}: Too many concurrent downloads')
            return
        else:
            print(f'Error: {response.status_code}: {response.text}')
            
    except Exception as e:
        print(f"Request failed: {e}")
        return None

In [4]:
# Waits for GBIF download, attempting a download every 2 minutes for a default of around 17 minutes
def get_GBIF_download(key: str, output_fp: str, time_to_wait: int = 1000):
    
    waiting_interval = 60
    start_time = time.time()
    end_time = start_time + time_to_wait
    
    while time.time() < end_time:
        try:
            # Send API request
            response = requests.get(f'http://api.gbif.org/v1/occurrence/download/request/{key}')
            if response.status_code == 302:
                print(f'Download found. {response.text}')
                return
            elif response.status_code == 200:
                print('Download found. Getting File.')
                zip_fp = os.path.join(output_fp, f'{key}.zip') 
                with open(zip_fp, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
                # with zipfile.ZipFile(zip_fp, 'r') as zip_ref:
                #     print('Extracting files from zip...')
                #     zip_ref.extractall(output_fp)
                print('Deleting original zip...')
                os.remove(zip_fp)
                print(f'File downloaded successfully to {output_fp}')
                return
            elif response.status_code == 404:
                print(f"No response for that key. Download is likely still being processed in GBIF's system. Trying again in {waiting_interval} seconds.")
            elif response.status_code == 410:
                print('Occurrence download file was erased and no longer exists.')
                return
            else:
                print(f'Attempt failed. Status code: {response.status_code}.')
                return
        except Exception as e:
            print(f'Error occurred: {e}')
            
        time.sleep(waiting_interval)
    
    print(f'No successful response received within {time_to_wait} seconds.')
    return None
            
    

In [5]:
# Define data root for output
current_directory = os.getcwd()
data_root = os.path.join(current_directory, 'GBIF_downloads')

In [6]:
# Request download
key = GBIF_request(url, request)

Download request submitted successfully.
Find this download request at https://www.gbif.org/occurrence/download/0007239-250310093411724


In [12]:
 
# Attempt to get .zip download and unpack .csv
get_GBIF_download(key, data_root, 2000)

No response for that key. Download is likely still being processed in GBIF's system. Trying again in 60 seconds.
No response for that key. Download is likely still being processed in GBIF's system. Trying again in 60 seconds.
No response for that key. Download is likely still being processed in GBIF's system. Trying again in 60 seconds.
No response for that key. Download is likely still being processed in GBIF's system. Trying again in 60 seconds.
No response for that key. Download is likely still being processed in GBIF's system. Trying again in 60 seconds.
Download found. Getting File.
Deleting original zip...
File downloaded successfully to c:\Users\mdw2685\Desktop\Projects\early_tests\GBIF_downloads


In [40]:
df = pd.read_csv(os.path.join(data_root, f'{key}.csv'), sep='\t')


# df = pd.read_csv(os.path.join(output_fp, '0000970-250227182400228.csv'), sep='\t')

In [41]:
print(df['datasetKey'].unique())

['50c9509d-22c7-4a22-a47d-8c48425ef4a7']


In [44]:
df['order'].unique()

array(['Stolidobranchia', 'Aplousobranchia', 'Phlebobranchia', 'Salpida',
       'Pyrosomatida', nan], dtype=object)