# Create CHAH collectors and illustrators data set

Content is scraped from the [Australian Plant Collectors and Illustrators](http://www.anbg.gov.au/bot-biog/) page on the CHAH website.

In [3]:
import re
from pathlib import Path

import pandas as pd
import requests

The parse function loads and parses the content of a single page. Its argument is the letter – or letter group, such as `UV`, for pages that cover several letters – with which the collectors' family names on that page begin.

In [4]:
# Column names given to the five cells of a table row, in page order.
_COLLECTOR_COLUMNS = ['name', 'active', 'comment', 'info_link', 'portrait_link']

# Literal opening tag of the data table, and the tag that closes it.
_TABLE_START = '<table BORDER cellpadding=2 cellspacing="0" bordercolor="#CCCCCC" >'
_TABLE_END = '</table>'

# Raw strings: '\s' in a plain string literal is an invalid escape sequence.
_ROW_BREAK = re.compile(r'</tr>\s*<tr[^>]*>')
_CELL_BREAK = re.compile(r'</td>\s*<td[^>]*>')
_ANY_TAG = re.compile(r'<.*?>')


def parse_collector_page(firstLetter):
    """Download one collectors page and return its table as a DataFrame.

    Parameters
    ----------
    firstLetter : str
        Page suffix inserted into the URL, e.g. 'A' or 'UV'.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with the columns 'name', 'active', 'comment',
        'info_link' and 'portrait_link'. The frame is empty (but still has
        these columns) when the response status is not 200 or when the
        expected table markup is not present in the page.

    Notes
    -----
    Rows with fewer than five cells are padded with None. Rows with more
    than five cells are not altered, so pandas still raises for them.
    """
    url = 'http://www.anbg.gov.au/bot-biog/bot-biog-' + firstLetter + '.html'

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, timeout=30)

    recs = []
    if response.status_code == 200:
        text = response.text

        start = text.find(_TABLE_START)
        end = text.find(_TABLE_END, start) if start != -1 else -1

        # str.find returns -1 when a marker is missing; slicing with that
        # value would yield an arbitrary fragment of the page.
        if start != -1 and end != -1:
            table = text[start + len(_TABLE_START):end]

            # A table with no content at all yields no records.
            if table.strip():
                for row in _ROW_BREAK.split(table):
                    rec = []
                    for cell in _CELL_BREAK.split(row):
                        clcell = _ANY_TAG.sub('', cell)
                        clcell = clcell.strip(' \r\n').replace('&ndash;', '–').replace('&nbsp;', '')
                        rec.append(clcell)
                    # Pad short rows so that a page on which no row reaches
                    # five cells still fits the fixed column list.
                    rec.extend([None] * (len(_COLLECTOR_COLUMNS) - len(rec)))
                    recs.append(rec)

    return pd.DataFrame.from_records(recs, columns=_COLLECTOR_COLUMNS)

# test = parse_collector_page('UV')
# test.head()


Iterate over all letters in the alphabet. Some letters are lumped together on a page, so these combinations of letters are appended to the letter list.

In [6]:
import string

# Page suffixes to request: every single capital letter, followed by the
# letter groups that share a page.
letters = [*string.ascii_uppercase, 'PQ', 'UV']  # There is an error in the X-Z page, so it has been left out

# One DataFrame per requested page, in the order of `letters`.
frames = [parse_collector_page(letter) for letter in letters]

# Stack the per-page frames; each page's own row index is kept as-is.
df = pd.concat(frames)

df.tail()

Unnamed: 0,name,active,comment,info_link,portrait_link
40,"VON MYGIND, F., see MYGIND",,,,
41,"VON SCHOMBURGK, M.R., see SCHOMBURGK",,,,
42,"VON SZENT-LELER, J.K., see KOVATS",,,,
43,"VOS, Hubertus",1933-,"Nurseryman, forestry technician",notes,
44,"VROLAND, Anton W.R.",fl. 1936,Botanist,,


In [87]:
# Write the scraped data only when the CSV does not exist yet. The file is
# edited by hand after the first export (X, Y and Z are added manually), so
# re-running this cell must not overwrite it. Delete the file first to force
# a fresh export.
raw_csv_path = Path('data/chah_collectors.csv')
if raw_csv_path.exists():
    print(f'{raw_csv_path} already exists; keeping it so that manual additions are not lost.')
else:
    df.to_csv(raw_csv_path)

X, Y and Z have been added to the CSV manually, so from here on we work from the CSV file.

In [12]:
# Reload the collectors table from disk, replacing the scraped frame in `df`.
# read_csv is called without index_col, so the index column stored in the file
# comes back as an ordinary 'Unnamed: 0' column and the frame gets a fresh
# 0-based row index.
df = pd.read_csv('data/chah_collectors.csv')
df.head()

Unnamed: 0,name,active,comment,info_link,portrait_link
0,"ABBOTT, Francis, Jnr",1834–1903,Gardens Superintendent,notes,portrait
1,"ABID, Munir. A., See MUNIR",,,,
2,"ABRAHAMS, L.",fl. 1910,,,
3,"ABRAHAMSON, Ada",fl. 1890s,"Amateur seaweed collector, WA",notes,
4,"ACKLAND, Judith Joan",fl. 1960s,Botanist,,


In [59]:
# Show how given names and run-together initials are split into words:
# a space is inserted after every full stop before splitting on whitespace.
for sample in ("Judith Joan", "T.W."):
    print(sample.replace('.', '. ').split())


['Judith', 'Joan']
['T.', 'W.']


In [61]:
def _split_collector_name(raw_name):
    """Split a raw 'name' value into (surname, initials, canonical string).

    The '^^' and '*' markers are removed and the value is split on ', '.
    The first part is the surname. Initials are built from the first
    character of every word in the second part, with a space inserted after
    each full stop first so that 'T.W.' yields two words. Any further parts
    (such as 'Jnr' or 'see MUNIR') are ignored.

    Returns
    -------
    tuple
        (surname, initials, canonical) where initials is None if there is no
        second part or it contains no words, and canonical is
        'SURNAME, I.N.' or just the surname when initials is None.
    """
    name = raw_name.replace('^^', '').replace('*', '').strip()
    bits = name.split(', ')
    family_name = bits[0]

    init = None
    if len(bits) > 1:
        words = bits[1].replace('.', '. ').split()
        if words:
            init = '.'.join(word[0] for word in words) + '.'

    # Concatenating None to the surname raises TypeError, so a name without
    # usable initials falls back to the surname alone.
    if init is None:
        return family_name, None, family_name
    return family_name, init, family_name + ', ' + init


surname = []
initials = []
canonical = []
coll_index = []

# Number the rows by position (1-based) rather than by index label, so the
# identifiers do not depend on what the DataFrame index happens to contain.
for n, raw_name in enumerate(df['name'], start=1):
    family_name, init, canonical_string = _split_collector_name(raw_name)
    coll_index.append(f'c{n:05}')
    surname.append(family_name)
    initials.append(init)
    canonical.append(canonical_string)

df['surname'] = surname
df['initials'] = initials
df['canonical_string'] = canonical
df['coll_index'] = coll_index

# Keep only the derived and source columns, in output order.
df = df[['coll_index', 'surname', 'initials', 'canonical_string', 'name', 'active', 'comment', 'info_link', 'portrait_link']]

df.to_csv('data/chah_collectors_clean.csv')
