# Create AVH collectors data set

Create a data set of the 1000 most prolific collectors in the Australasian Virtual Herbarium (AVH)

## Get collectors from AVH

Use `occurrences/search` API (https://biocache-ws.ala.org.au/ws/occurrences/search) with the following query parameters:

parameter | value | comment
-|-|-
q | data_hub_uid:dh9 | AVH hub
fq | country:Australia | Australian records only
fq | occurrence_date:[* TO *] | records must have eventDate
pageSize | 1 | only return 1 occurrence
facets | collector | 
flimit | 1000 | 
foffset | 0 |
fsort | count


In [4]:
import json
import requests
import pandas as pd
import os

# Ask the ALA occurrence search for the `collector` facet of the AVH hub:
# the 1000 most frequent collector strings (flimit/fsort) among Australian
# records that have an occurrence date. Only the facet counts are needed, so
# a single occurrence is requested (pageSize=1).
url = 'https://biocache-ws.ala.org.au/ws/occurrences/search'
params = [
    ('q', 'data_hub_uid:dh9'),
    ('fq', 'country:Australia'),
    ('fq', 'occurrence_date:[* TO *]'),
    ('pageSize', 1),
    ('facets', 'collector'),
    ('flimit', 1000),
    ('foffset', 0),
    ('fsort', 'count'),
]

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors directly instead of failing later
# while decoding or indexing the body.
response = requests.get(url, params=params, timeout=60)
response.raise_for_status()

# Deliberately not named `dict`: rebinding that name at module level would
# hide the builtin for every later cell.
search_result = response.json()
collectors = search_result['facetResults'][0]['fieldResult']

# One row per collector string: label, i18nCode, count, fq.
df = pd.json_normalize(collectors)

# Output directory for the CSV written later; exist_ok avoids the
# check-then-create race.
os.makedirs('data', exist_ok=True)

print(df.head())


               label                     i18nCode  count  \
0  Beauglehole, A.C.  collector.Beauglehole, A.C.  90942   
1      Forster, P.I.      collector.Forster, P.I.  64649   
2         Hyland, B.         collector.Hyland, B.  57265   
3           Latz, P.           collector.Latz, P.  51230   
4      Streimann, H.      collector.Streimann, H.  45346   

                              fq  
0  collector:"Beauglehole, A.C."  
1      collector:"Forster, P.I."  
2         collector:"Hyland, B."  
3           collector:"Latz, P."  
4      collector:"Streimann, H."  


## Get collecting period

For each collector string, two queries are sent to the ALA occurrence search: one to get the first record and one to get the last record from a collector.

In [6]:
from datetime import datetime
import numpy as np

def _year_from_epoch_ms(epoch_ms):
    """Return the UTC calendar year of an epoch-milliseconds value, or None.

    None is returned when the value is not an integer (for example when the
    field is missing) or is outside the range `datetime` can represent.
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timedelta

    if isinstance(epoch_ms, bool) or not isinstance(epoch_ms, (int, np.integer)):
        return None
    try:
        # Plain arithmetic from the epoch instead of datetime.fromtimestamp():
        # the result does not depend on the machine's local time zone, and
        # negative (pre-1970) values work on every platform.
        return (datetime(1970, 1, 1) + timedelta(milliseconds=int(epoch_ms))).year
    except OverflowError:
        return None


def _search_first_year(url, params):
    """Run one occurrence search; return (totalRecords, year of the first hit)."""
    response = requests.get(url, params=params, timeout=60)
    response.raise_for_status()
    payload = response.json()

    occurrences = payload.get('occurrences') or []
    event_date = occurrences[0].get('eventDate') if occurrences else None
    return payload['totalRecords'], _year_from_epoch_ms(event_date)


def get_activity_period(collstr):
    """Find the first and last collecting year for one collector string.

    Two searches are sent to the ALA occurrence search, both restricted to
    AVH records from Australia that have an occurrence date and whose
    `collector` matches `collstr`. They are sorted by occurrence date, once
    with the default direction and once with `dir=desc`, and only the first
    record of each is read.

    Parameters
    ----------
    collstr : str
        Collector string as it appears in the `collector` facet.

    Returns
    -------
    dict
        `{'startDate': year or None, 'endDate': year or None}`. Both values
        are None when the search matches no records; a single value is None
        when the record returned has no usable integer `eventDate`.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or HTTP error responses.
    """
    url = 'https://biocache-ws.ala.org.au/ws/occurrences/search'

    # Escape backslashes and double quotes so the collector string cannot
    # terminate the quoted phrase early.
    # NOTE(review): assumes standard Solr phrase escaping - confirm for ALA.
    phrase = collstr.replace('\\', '\\\\').replace('"', '\\"')

    params = [
        ('q', 'data_hub_uid:dh9'),
        ('fq', 'country:Australia'),
        ('fq', 'occurrence_date:[* TO *]'),
        ('fq', 'collector:"' + phrase + '"'),
        ('facet', 'off'),
        ('fl', 'id,occurrence_date'),
        ('sort', 'occurrence_date'),
        ('pageSize', 1),
    ]

    total_records, start_date = _search_first_year(url, params)
    if total_records == 0:
        return {
            'startDate': None,
            'endDate': None
        }

    # Same query with the sort direction reversed yields the latest record.
    params.append(('dir', 'desc'))
    _, end_date = _search_first_year(url, params)

    return {
        'startDate': start_date,
        'endDate': end_date
    }

# Smoke test: look up the collecting period for a single collector string.
test = get_activity_period(collstr="Klazenga, N.")
print(test)


{'startDate': 1999, 'endDate': 2018}


In [7]:
# Start with empty period columns; they are filled one collector at a time.
df['start_date'] = None
df['end_date'] = None

# Only the label is needed for the lookup, so iterate that column directly.
for idx, label in df['label'].items():
    period = get_activity_period(label)

    # Progress report every 50 collectors.
    if idx % 50 == 0:
        print(f'{idx} records done...')

    df.at[idx, 'start_date'] = period['startDate']
    df.at[idx, 'end_date'] = period['endDate']

print(df.head())
df.to_csv('data/avh_collectors.csv')


0 records done...
50 records done...
100 records done...
150 records done...
200 records done...
250 records done...
300 records done...
350 records done...
400 records done...
450 records done...
500 records done...
550 records done...
600 records done...
650 records done...
700 records done...
750 records done...
800 records done...
850 records done...
900 records done...
950 records done...
1000 records done...
               label                     i18nCode  count  \
0  Beauglehole, A.C.  collector.Beauglehole, A.C.  90942   
1      Forster, P.I.      collector.Forster, P.I.  64649   
2         Hyland, B.         collector.Hyland, B.  57265   
3           Latz, P.           collector.Latz, P.  51230   
4      Streimann, H.      collector.Streimann, H.  45346   

                              fq start_date end_date  
0  collector:"Beauglehole, A.C."       1865     2005  
1      collector:"Forster, P.I."       1955     2018  
2         collector:"Hyland, B."       1952     2008  
3

In [9]:
# start_date/end_date are object columns that hold None for collectors whose
# period could not be determined; subtracting those directly raises
# TypeError. Convert to nullable integers first so such rows get a missing
# span while the others stay whole numbers.
end_year = pd.to_numeric(df['end_date'], errors='coerce').astype('Int64')
start_year = pd.to_numeric(df['start_date'], errors='coerce').astype('Int64')
df['activity_span'] = end_year - start_year
df.to_csv('data/avh_collectors.csv')
df.head()

Unnamed: 0,label,i18nCode,count,fq,start_date,end_date,activity_span
0,"Beauglehole, A.C.","collector.Beauglehole, A.C.",90942,"collector:""Beauglehole, A.C.""",1865,2005,140
1,"Forster, P.I.","collector.Forster, P.I.",64649,"collector:""Forster, P.I.""",1955,2018,63
2,"Hyland, B.","collector.Hyland, B.",57265,"collector:""Hyland, B.""",1952,2008,56
3,"Latz, P.","collector.Latz, P.",51230,"collector:""Latz, P.""",1875,2019,144
4,"Streimann, H.","collector.Streimann, H.",45346,"collector:""Streimann, H.""",1896,2001,105
