# NSL export to CoLDP

In [1]:
import pandas as pd
import numpy as np
import urllib.request
import re


group = 'bryophytes'

urls = {
    'bryophytes': 'https://moss.biodiversity.org.au/nsl/services/export/',
    'fungi': 'https://fungi.biodiversity.org.au/nsl/services/export/',
    'lichens': 'https://lichen.biodiversity.org.au/nsl/services/export/',
    'algae': 'https://algae.biodiversity.org.au/nsl/services/export/',
    'tracheophytes': 'https://biodiversity.org.au/nsl/services/export/'
}

urllib.request.urlretrieve(urls[group] + 'namesCsv', 'data/' + group + '/names.csv')
urllib.request.urlretrieve(urls[group] + 'taxonCsv', 'data/' + group + '/taxa.csv')

# Map Latin rank names to less wankerish English ones
ranks = {
    'genus': 'genus',
    'species': 'species',
    'familia': 'family',
    'subfamilia': 'subfamily',
    'tribus': 'tribe',
    'subtribus': 'subtribe',
    'ordo': 'order',
    'subordo': 'suborder',
    'superordo': 'superorder',
    'subspecies': 'subspecies',
    'classis': 'class',
    'subclassis': 'subclass',
    'subdivision': 'subphylum',
    'subbdivision': 'subphylum',
    'varietas': 'variety',
    'nothovarietas': 'nothovariety',
    'subvarietas': 'subvariety',
    'subgenus': 'subgenus',
    'superspecies': 'superspecies',
    'forma': 'form',
    'subforma': 'subform',
    'division': 'phylum',
    'regnum': 'kingdom',
    'special form': 'special form',
    'sectio': 'section',
    'subsectio': 'subsection',
    'series': 'series',
    'subseries': 'subseries',
    'regio': 'domain',
    '[unknown]': '[unknown]',
    '[unranked]': '[unranked]',
    '[infragenus]': '[infragenus]',
    '[infraspecies]': '[infraspecies]'
}

### Names

In [2]:
df_names = pd.read_csv('data/' + group + '/names.csv')

# Remove excess columns
columns = [
    'scientificNameID', 
    'nameAccordingToID', 
    'taxonRank', 
    'scientificName', 
    'genericName', 
    'specificEpithet', 
    'infraspecificEpithet', 
    'scientificNameAuthorship', 
    'nomenclaturalStatus',
    'originalNameUsageID', 
    'originalNameUsage'
]

df1 = df_names[columns]

# Instance ID is in nameAccordingToID for reasons unfathomable; rename to taxonID
df1 = df1.rename(columns={'nameAccordingToID': 'taxonID'})

# Filter for names with originalNameUsage
# Replace instance IDs in originalNameUsageID with scientificNameIDs; we will not need taxonID anymore after that
df2 = df1[~df1['originalNameUsageID'].isna()].merge(df1, how='left', left_on='originalNameUsageID', right_on='taxonID')[[
    'scientificNameID_x', 
    'taxonRank_x', 
    'scientificName_x', 
    'genericName_x', 
    'specificEpithet_x', 
    'infraspecificEpithet_x', 
    'scientificNameAuthorship_x', 
    'nomenclaturalStatus_x', 
    'scientificNameID_y', 
    'scientificName_y'
]]

column_mappings = {
    'scientificNameID_x': 'scientificNameID',
    'scientificName_x': 'scientificName',
    'taxonRank_x': 'taxonRank',
    'genericName_x': 'genericName',
    'specificEpithet_x': 'specificEpithet',
    'infraspecificEpithet_x': 'infraspecificEpithet',
    'scientificNameAuthorship_x': 'scientificNameAuthorship',
    'nomenclaturalStatus_x': 'nomenclaturalStatus',
    'scientificNameID_y': 'originalNameUsageID',
    'scientificName_y': 'originalNameUsage'
}

df2 = df2.rename(columns=column_mappings)

# Filter for names without originalNameUsage
df3 = df_names[df_names['originalNameUsage'].isna()][columns]
df3.drop(columns=['nameAccordingToID'], inplace=True)

# Merge dataframes
df5 = pd.concat([df2, df3])
df5 = df5.sort_values(by='scientificName')

# Make taxonRank lowercase
df5['taxonRank'] = df5['taxonRank'].str.lower()

# Map Latin rank names to less wankerish English ones
taxon_ranks = []
uninomials = []
generic_names = []
for index, row in df5.iterrows():
    taxon_ranks.append(ranks[row['taxonRank']])

    uninomial = np.NaN
    generic_name = row['genericName']

    if not isinstance(row['genericName'], str):
        if not isinstance(row['scientificNameAuthorship'], str):
            uninomial = row['scientificName']
        else:
            uninomial = row['scientificName'][0:len(row['scientificName'])-len(row['scientificNameAuthorship'])-1]

    if row['taxonRank'] == 'genus':
        uninomial = row['genericName']
        generic_name = np.NaN

    uninomials.append(uninomial)
    generic_names.append(generic_name)

df5['taxonRank'] = taxon_ranks
df5['genericName'] = generic_names
df5['uninomial'] = uninomials

# Add nomenclaturalCode; 'ICN' in GBIF vocab. but 'botanical' in CoLDP
df5['code'] = 'botanical'

df5 = df5[~df5['nomenclaturalStatus'].isin(['isonym', 
        'nom. illeg.', 
        'nom. illeg., nom. rej.', 
        'nom. illeg., nom. superfl.', 
        'nom. inval.', 
        'nom. inval., nom. nud.', 
        'nom. inval., pro syn.', 
        'nom. superfl.', 
        'orth. var.'])]

df5 = df5[[
        'scientificNameID', 
        'taxonRank', 
        'scientificName', 
        'uninomial',
        'genericName', 
        'specificEpithet', 
        'infraspecificEpithet', 
        'scientificNameAuthorship', 
        'code', 
        'nomenclaturalStatus', 
        'originalNameUsageID', 
        'originalNameUsage']]

df_nam = df5

df5

Unnamed: 0,scientificNameID,taxonRank,scientificName,uninomial,genericName,specificEpithet,infraspecificEpithet,scientificNameAuthorship,code,nomenclaturalStatus,originalNameUsageID,originalNameUsage
2796,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crinitum (Hook.f. & Wilson) Bro...,,Acanthocladium,crinitum,,(Hook.f. & Wilson) Broth. ex Paris,botanical,,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum crinitum Hook.f. & Wilson
7582,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crossii Broth. & Geh. ex Broth.,,Acanthocladium,crossii,,Broth. & Geh. ex Broth.,botanical,,,
2797,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium extenuatum (Brid.) Mitt.,,Acanthocladium,extenuatum,,(Brid.) Mitt.,botanical,,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum extenuatum Brid.
8149,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium macgregorii (Broth. & Geh.) Broth.,,Acanthocladium,macgregorii,,(Broth. & Geh.) Broth.,botanical,,,
7587,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium rigidifolium Dixon,,Acanthocladium,rigidifolium,,Dixon,botanical,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
6279,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon obtusifolius Hook.,,Zygodon,obtusifolius,,Hook.,botanical,,,
7394,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon preissianus Hampe,,Zygodon,preissianus,,Hampe,botanical,,,
6280,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon reinwardtii (Hornsch.) A.Braun,,Zygodon,reinwardtii,,(Hornsch.) A.Braun,botanical,,,
7831,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon remotidens Müll.Hal.,,Zygodon,remotidens,,Müll.Hal.,botanical,,,


### Clean up scientific names
Remove ex-authors and remove authorship from autonyms

In [3]:
def remove_ex_author(auth):
    return auth[auth.index(' ex ')+4:] if ' ex ' in auth else auth

print(remove_ex_author('Hook.f. & Taylor'))
print(remove_ex_author('Taylor ex Lehm.'))
print(remove_ex_author('Broth. & Geh. ex Broth.'))

Hook.f. & Taylor
Lehm.
Broth.


In [4]:
def cleanup_authorship(authorship):
    authorshipParentheses = None
    if '(' in authorship:
        authorshipParentheses = remove_ex_author(authorship[1:authorship.index(')')])
        authorship = remove_ex_author(authorship[authorship.index(')')+2:])
    if authorshipParentheses:
        return '(' + authorshipParentheses + ') ' + authorship
    else:
        return remove_ex_author(authorship)

print(cleanup_authorship('(Hook.f. & Taylor) Taylor ex Lehm.'))
print(cleanup_authorship('Broth. & Geh. ex Broth.'))

(Hook.f. & Taylor) Lehm.
Broth.


In [5]:
clean_names = []
for index, row in df_names.iterrows():
    if row['taxonRank'] in ['Regnum', 'Phylum', 'Division', 'Classis', 'Subclassis', 'Superordo', 'Ordo', 'Subordo'] \
                or (row['infraspecificEpithet'] and row['infraspecificEpithet'] == row['specificEpithet']) \
                or not isinstance(row['scientificNameAuthorship'], str):
        clean_names.append({
            'scientificNameID': row['scientificNameID'],
            'scientificName': row['canonicalName'],
            'authorship': np.NaN
        })
    else:
        clean_names.append({
            'scientificNameID': row['scientificNameID'],
            'scientificName': row['canonicalName'],
            'authorship': cleanup_authorship(row['scientificNameAuthorship'])
        })

df_clean_names = pd.DataFrame.from_dict(clean_names)

df_clean_names
        

Unnamed: 0,scientificNameID,scientificName,authorship
0,https://id.biodiversity.org.au/name/ausmoss/20...,Acrobolbus cinerascens f. attenuata,Rodway
1,https://id.biodiversity.org.au/name/ausmoss/13...,Acrobolbus cinerascens,(Lehm. & Lindenb.) Schiffn.
2,https://id.biodiversity.org.au/name/ausmoss/20...,Acrobolbus cinerascens,(Lehm. & Lindenb.) Bastow
3,https://id.biodiversity.org.au/name/ausmoss/13...,Acrobolbus concinnus,(Mitt.) Grolle
4,https://id.biodiversity.org.au/name/ausmoss/21...,Acrobolbus cyaneus,Herzog
...,...,...,...
9073,https://id.biodiversity.org.au/name/ausmoss/12...,Jungermanniaceae,Rchb.
9074,https://id.biodiversity.org.au/name/ausmoss/12...,Lophocoleaceae,Vanden Berghen
9075,https://id.biodiversity.org.au/name/ausmoss/12...,Scapaniaceae,Mig.
9076,https://id.biodiversity.org.au/name/ausmoss/12...,Lepidoziaceae,Limpr.


In [6]:
df_merge = df_nam.merge(df_clean_names, how='left', left_on='scientificNameID', right_on='scientificNameID')

df_merge.rename(columns={'scientificName_y': 'scientificName'}, inplace=True)
df_merge.drop(columns=['scientificName_x'], inplace=True)

df_nam = df_merge[['scientificNameID', 'taxonRank', 'scientificName', 'uninomial', 'genericName', 'specificEpithet', 
                'infraspecificEpithet', 'authorship', 'code', 'nomenclaturalStatus', 
                'originalNameUsageID', 'originalNameUsage']]

df_nam

Unnamed: 0,scientificNameID,taxonRank,scientificName,uninomial,genericName,specificEpithet,infraspecificEpithet,authorship,code,nomenclaturalStatus,originalNameUsageID,originalNameUsage
0,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crinitum,,Acanthocladium,crinitum,,(Hook.f. & Wilson) Paris,botanical,,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum crinitum Hook.f. & Wilson
1,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crossii,,Acanthocladium,crossii,,Broth.,botanical,,,
2,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium extenuatum,,Acanthocladium,extenuatum,,(Brid.) Mitt.,botanical,,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum extenuatum Brid.
3,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium macgregorii,,Acanthocladium,macgregorii,,(Broth. & Geh.) Broth.,botanical,,,
4,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium rigidifolium,,Acanthocladium,rigidifolium,,Dixon,botanical,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7833,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon obtusifolius,,Zygodon,obtusifolius,,Hook.,botanical,,,
7834,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon preissianus,,Zygodon,preissianus,,Hampe,botanical,,,
7835,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon reinwardtii,,Zygodon,reinwardtii,,(Hornsch.) A.Braun,botanical,,,
7836,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon remotidens,,Zygodon,remotidens,,Müll.Hal.,botanical,,,


### Set nomenclatural status

In [7]:
df_nam.rename(columns={'nomenclaturalStatus': 'status'}, inplace=True)

stat = []
for index, row in df_nam.iterrows():
    if isinstance(row['status'], str):
        if 'cons.' in row['status']:
            stat.append('conserved')
        else:
            if 'rej.' in row['status']:
                stat.append('rejected')
            else:
                stat.append(np.NaN)
    else:
        stat.append('acceptable')

df_nam['status'] = stat

df_nam

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nam.rename(columns={'nomenclaturalStatus': 'status'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nam['status'] = stat


Unnamed: 0,scientificNameID,taxonRank,scientificName,uninomial,genericName,specificEpithet,infraspecificEpithet,authorship,code,status,originalNameUsageID,originalNameUsage
0,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crinitum,,Acanthocladium,crinitum,,(Hook.f. & Wilson) Paris,botanical,acceptable,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum crinitum Hook.f. & Wilson
1,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium crossii,,Acanthocladium,crossii,,Broth.,botanical,acceptable,,
2,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium extenuatum,,Acanthocladium,extenuatum,,(Brid.) Mitt.,botanical,acceptable,https://id.biodiversity.org.au/name/ausmoss/10...,Hypnum extenuatum Brid.
3,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium macgregorii,,Acanthocladium,macgregorii,,(Broth. & Geh.) Broth.,botanical,acceptable,,
4,https://id.biodiversity.org.au/name/ausmoss/10...,species,Acanthocladium rigidifolium,,Acanthocladium,rigidifolium,,Dixon,botanical,acceptable,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7833,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon obtusifolius,,Zygodon,obtusifolius,,Hook.,botanical,acceptable,,
7834,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon preissianus,,Zygodon,preissianus,,Hampe,botanical,acceptable,,
7835,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon reinwardtii,,Zygodon,reinwardtii,,(Hornsch.) A.Braun,botanical,acceptable,,
7836,https://id.biodiversity.org.au/name/ausmoss/10...,species,Zygodon remotidens,,Zygodon,remotidens,,Müll.Hal.,botanical,acceptable,,


### Name relations

basionyms and replaced names

In [8]:
df_name_relationships = df5[~df5['originalNameUsageID'].isna()][['scientificNameID', 'scientificName', 'originalNameUsageID', 'originalNameUsage']]

namerels = []
for index, row in df_name_relationships.iterrows():
    rel = {}
    if '(' in row['scientificName']:
        rel['nameID'] = row['scientificNameID']
        # rel['name'] = row['scientificName']
        rel['type'] = 'basionym'
        rel['relatedNameID'] = row['originalNameUsageID']
        # rel['relatedName'] = row['originalNameUsage']
    else:
        rel['nameID'] = row['originalNameUsageID']
        # rel['name'] = row['originalNameUsage']
        rel['type'] = 'replacement name'
        rel['relatedNameID'] = row['scientificNameID']
        # rel['relatedName'] = row['scientificName']
    namerels.append(rel)

df_namerel = pd.DataFrame.from_dict(namerels)

# Remove originalNameUsage columns from Names
drop_columns = [
    'originalNameUsageID',
    'originalNameUsage',
    'nomenclaturalStatus'
]

rename_columns = {
    'scientificNameID': 'ID',
    'taxonRank': 'rank',
    'scientificName': 'scientificName',
    'genericName': 'genus',
    'scientificNameAuthorship': 'authorship'
}

df5.drop(columns=drop_columns, inplace=True)
df5.rename(columns=rename_columns, inplace=True)

### Taxa

In [9]:
# Read Taxon export into dataframe
df_taxa = pd.read_csv('data/' + group + '/taxa.csv')

# Create dataframe with IDs
# This is used later to replace tree element IDs (in taxonID) with instance IDs (in taxonConceptID)
df_id = df_taxa[['taxonID', 'taxonConceptID']]
# df_id.to_csv('coldp/' + group + '/id.tsv', sep='\t', index=False)

df_taxa = df_taxa[df_taxa['nameType'].isin(['scientific', 'autonym'])]

In [10]:
# Filter on accepted names; these are the taxa
df_tax = df_taxa[df_taxa['taxonomicStatus'] == 'accepted']

# Replace tree element IDs in taxonID with instance IDs (see above)
df_tax['taxonID'] = df_tax['taxonConceptID']

# Replace tree element IDs in parentNameUsageID with instance IDs
df_tax = df_tax.merge(df_id, how='left', left_on='parentNameUsageID', right_on='taxonID')
df_tax['parentNameUsageID'] = df_tax['taxonConceptID_y']
df_tax.drop(columns=['taxonID_y', 'taxonConceptID_y'], inplace=True)
df_tax.rename(columns={'taxonID_x': 'taxonID', 'taxonConceptID_x': 'taxonConceptID'}, inplace=True)

df_tax = df_tax[['taxonID',
 'scientificNameID',
 'scientificName',
 'scientificNameAuthorship',
 'nameAccordingTo',
 'nameAccordingToID',
 'parentNameUsageID',
 'taxonRank',
 'taxonRankSortOrder',
 'kingdom',
 'class',
 'subclass',
 'family',
 'taxonConceptID',
 'taxonRemarks',
 'higherClassification'
]]

# Translate ranks into English
df_tax['taxonRank'] = df_tax['taxonRank'].str.lower()

taxon_ranks = []
for index, row in df_tax.iterrows():
    taxon_ranks.append(ranks[row['taxonRank']])

df_tax['taxonRank'] = taxon_ranks

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tax['taxonID'] = df_tax['taxonConceptID']


#### Higher classification

(will not be added to CoLDP)

In [11]:
# Create dictionary with higher taxa from instance ID
def get_higher_taxa(id, higher={}):
    row = df_tax.loc[df_tax['taxonID'] == id]

    if len(row) > 0:
        parent = row[[
            'taxonRank', 
            'scientificName', 
            'scientificNameAuthorship', 
            'parentNameUsageID']].to_dict(orient='records')[0]
        
        if isinstance(parent['scientificNameAuthorship'], str):
            higher[parent['taxonRank']] = parent['scientificName'][0:len(parent['scientificName'])-len(parent['scientificNameAuthorship'])-1]
        else:
            higher[parent['taxonRank']] = parent['scientificName']

        if isinstance(parent['parentNameUsageID'], str):
            return get_higher_taxa(parent['parentNameUsageID'], higher)
        else:
            return higher
    else:
        return higher

In [12]:
# Create higher classification
def create_higher_classification(id):
    higher = get_higher_taxa(id, {})
    keys = list(higher.keys())

    hcl = {}

    hcl['taxonID'] = id
    hcl['species'] = higher['species'] if 'species' in keys else np.NaN
    hcl['section'] = higher['section'] if 'section' in keys else np.NaN
    hcl['subgenus'] = higher['subgenus'] if 'subgenus' in keys else np.NaN
    hcl['genus'] = higher['genus'] if 'genus' in keys else np.NaN
    hcl['subtribe'] = higher['subtribe'] if 'subtribe' in keys else np.NaN
    hcl['tribe'] = higher['tribe'] if 'tribe' in keys else np.NaN
    hcl['subfamily'] = higher['subfamily'] if 'subfamily' in keys else np.NaN
    hcl['family'] = higher['family'] if 'family' in keys else np.NaN
    hcl['superfamily'] = higher['superfamily'] if 'superfamily' in keys else np.NaN
    hcl['suborder'] = higher['suborder'] if 'suborder' in keys else np.NaN
    hcl['order'] = higher['order'] if 'order' in keys else np.NaN
    hcl['subclass'] = higher['subclass'] if 'subclass' in keys else np.NaN
    hcl['class'] = higher['class'] if 'class' in keys else np.NaN
    hcl['subphylum'] = higher['subphylum'] if 'subphylum' in keys else np.NaN
    hcl['phylum'] = higher['phylum'] if 'phylum' in keys else np.NaN
    hcl['kingdom'] = higher['kingdom'] if 'kingdom' in keys else np.NaN

    cl = list(higher.values())
    for i in range(len(cl) // 2):
        cl[i], cl[-1 - i] = cl[-1 - i], cl[i]

    hcl['classification'] = ' | '.join(cl)

    return hcl


In [13]:
cl = []
for index, row in df_tax.iterrows():
    cl.append(create_higher_classification(row['taxonID']))

df_higher = pd.DataFrame.from_dict(cl)

df_higher = df_higher.merge(df_tax[['taxonID', 'scientificName', 'taxonRank']], how='left', left_on='taxonID', right_on='taxonID')

df_higher = df_higher[['taxonID',
 'scientificName',
 'taxonRank',
 'kingdom',
 'phylum',
 'subphylum',
 'class',
 'subclass',
 'order',
 'suborder',
 'superfamily',
 'family',
 'tribe',
 'subfamily',
 'subtribe',
 'genus',
 'subgenus',
 'section',
 'species',
 'classification']]

df_higher.rename(columns={'classification': 'higherClassification'}, inplace=True)

df_higher

Unnamed: 0,taxonID,scientificName,taxonRank,kingdom,phylum,subphylum,class,subclass,order,suborder,superfamily,family,tribe,subfamily,subtribe,genus,subgenus,section,species,higherClassification
0,https://id.biodiversity.org.au/instance/ausmos...,Plantae Haeckel,kingdom,Plantae,,,,,,,,,,,,,,,,Plantae
1,https://id.biodiversity.org.au/instance/ausmos...,Anthocerotophyta Rothm. ex Stotler & Crand.-St...,phylum,Plantae,Anthocerotophyta,,,,,,,,,,,,,,,Plantae | Anthocerotophyta
2,https://id.biodiversity.org.au/instance/ausmos...,Anthocerotopsida de Bary ex Jancz.,class,Plantae,Anthocerotophyta,,Anthocerotopsida,,,,,,,,,,,,,Plantae | Anthocerotophyta | Anthocerotopsida
3,https://id.biodiversity.org.au/instance/ausmos...,Anthocerotidae Rosenv.,subclass,Plantae,Anthocerotophyta,,Anthocerotopsida,Anthocerotidae,,,,,,,,,,,,Plantae | Anthocerotophyta | Anthocerotopsida ...
4,https://id.biodiversity.org.au/instance/ausmos...,Anthocerotales Limpr.,order,Plantae,Anthocerotophyta,,Anthocerotopsida,Anthocerotidae,Anthocerotales,,,,,,,,,,,Plantae | Anthocerotophyta | Anthocerotopsida ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2607,https://id.biodiversity.org.au/instance/ausmos...,Riella halophila Banwell,species,Plantae,Marchantiophyta,,Marchantiopsida,Marchantiidae,Sphaerocarpales,,,Riellaceae,,,,Riella,,,Riella halophila,Plantae | Marchantiophyta | Marchantiopsida | ...
2608,https://id.biodiversity.org.au/instance/ausmos...,Riella spiculata J.Taylor,species,Plantae,Marchantiophyta,,Marchantiopsida,Marchantiidae,Sphaerocarpales,,,Riellaceae,,,,Riella,,,Riella spiculata,Plantae | Marchantiophyta | Marchantiopsida | ...
2609,https://id.biodiversity.org.au/instance/ausmos...,Sphaerocarpaceae Heeg,family,Plantae,Marchantiophyta,,Marchantiopsida,Marchantiidae,Sphaerocarpales,,,Sphaerocarpaceae,,,,,,,,Plantae | Marchantiophyta | Marchantiopsida | ...
2610,https://id.biodiversity.org.au/instance/ausmos...,Sphaerocarpos Boehm.,genus,Plantae,Marchantiophyta,,Marchantiopsida,Marchantiidae,Sphaerocarpales,,,Sphaerocarpaceae,,,,Sphaerocarpos,,,,Plantae | Marchantiophyta | Marchantiopsida | ...


#### Synonyms

In [14]:
# Filter on synonyms

if group == 'bryophytes':
    df_syn = df_taxa[df_taxa['taxonomicStatus'].isin(['taxonomic synonym']) & ~df_taxa['acceptedNameUsageID'].isna()][[
        'taxonID',
        'scientificNameID',
        'scientificName',
        'acceptedNameUsageID',
        'acceptedNameUsage',
        'taxonomicStatus'
    ]]
    df_syn = df_syn[~df_syn['scientificName'].str.contains('\(')]
else:
    df_syn = df_taxa[df_taxa['taxonomicStatus'].isin(['synonym', 'nomenclatural synonym', 'taxonomic synonym']) & ~df_taxa['acceptedNameUsageID'].isna()][[
        'taxonID',
        'scientificNameID',
        'scientificName',
        'acceptedNameUsageID',
        'acceptedNameUsage',
        'taxonomicStatus'
    ]]

# Replace tree element IDs in acceptedNameUsageID with instance IDs
df_syn = df_syn.merge(df_id, how='left', left_on='acceptedNameUsageID', right_on='taxonID')

df_syn.drop(columns=['acceptedNameUsageID', 'taxonID_y'], inplace=True)
df_syn.rename(columns={
    'taxonID_x': 'ID',
    'taxonConceptID': 'taxonID',
    'scientificNameID': 'nameID'    
}, inplace=True)
df_syn = df_syn[['ID', 'taxonID', 'nameID', 'scientificName', 'acceptedNameUsage', 'taxonomicStatus']]

# Get accepted names
df_dwc_syn = df_syn.merge(df_tax[['taxonID', 'scientificName']], left_on='taxonID', right_on='taxonID')

df_dwc_syn.drop(columns=['taxonID', 'nameID', 'acceptedNameUsage'], inplace=True)

df_dwc_syn.rename(columns={
    'ID': 'taxonID',
    'scientificName_x': 'scientificName',
    'scientificName_y': 'acceptedNameUsage'
}, inplace=True)

# df_syn.drop(columns=['scientificName', 'acceptedNameUsage', 'taxonomicStatus'], inplace=True)

df_syn = df_syn[['ID', 'taxonID', 'acceptedNameUsage', 'nameID', 'scientificName']].rename(columns={'acceptedNameUsage': 'acceptedName', 'scientificName': 'synonym'})
df_syn['remarks'] = 'heterotypic synonym'
df_syn

Unnamed: 0,ID,taxonID,acceptedName,nameID,synonym,remarks
0,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/instance/ausmos...,Anthoceros fragilis Steph.,https://id.biodiversity.org.au/name/ausmoss/20...,Anthoceros fertilis Steph.,heterotypic synonym
1,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Megaceros pellucidus (Colenso) E.A.Hodgs.,https://id.biodiversity.org.au/name/ausmoss/12...,Anthoceros longispirus Carrington & Pearson,heterotypic synonym
2,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Phaeoceros carolinianus (Michx.) Prosk.,https://id.biodiversity.org.au/name/ausmoss/20...,Anthoceros multicapsulus Steph.,heterotypic synonym
3,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Phaeoceros carolinianus (Michx.) Prosk.,https://id.biodiversity.org.au/name/ausmoss/20...,Anthoceros brotheri Steph.,heterotypic synonym
4,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Phaeoceros carolinianus (Michx.) Prosk.,https://id.biodiversity.org.au/name/ausmoss/20...,Anthoceros communis Steph.,heterotypic synonym
...,...,...,...,...,...,...
1746,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Riccia limbata Bisch.,https://id.biodiversity.org.au/name/ausmoss/21...,Riccia lata Taylor,heterotypic synonym
1747,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Riccia macrospora Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Riccia rubrispora Steph.,heterotypic synonym
1748,https://id.biodiversity.org.au/instance/apni/8...,https://id.biodiversity.org.au/instance/ausmos...,Riccia macrospora Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Riccia sellingii S.W.Arnell,heterotypic synonym
1749,https://id.biodiversity.org.au/instance/apni/9...,https://id.biodiversity.org.au/instance/ausmos...,Riccia multifida (Steph.) Steph. var. multifida,https://id.biodiversity.org.au/name/ausmoss/21...,Riccia burnettensis Steph.,heterotypic synonym


In [15]:
# basionym of accepted name
df_acc_comb = df_tax[~df_tax['scientificNameAuthorship'].isna() & df_tax['scientificNameAuthorship'].str.contains('\(')][['taxonID', 'scientificNameID', 'scientificName', 'scientificNameAuthorship']]
df_m1 = df_acc_comb.merge(df_namerel[df_namerel['type'] == 'basionym'], how='inner', left_on='scientificNameID', right_on='nameID')
df_m2 = df_m1.merge(df_nam[['scientificNameID', 'scientificName']], how='left', left_on='relatedNameID', right_on='scientificNameID')

df_acc_bas = df_m2[['taxonID', 'scientificName_x', 'relatedNameID', 'scientificName_y']].rename(columns={'relatedNameID': 'nameID', 'scientificName_x': 'acceptedName', 'scientificName_y': 'synonym'})
df_acc_bas['remarks'] = 'basionym of accepted name'
df_acc_bas


Unnamed: 0,taxonID,acceptedName,nameID,synonym,remarks
0,https://id.biodiversity.org.au/instance/ausmos...,Folioceros fuciformis (Mont.) D.C.Bhardwaj,https://id.biodiversity.org.au/name/ausmoss/14...,Anthoceros fuciformis,basionym of accepted name
1,https://id.biodiversity.org.au/instance/ausmos...,Folioceros glandulosus (Lehm. & Lindenb.) D.C....,https://id.biodiversity.org.au/name/ausmoss/12...,Anthoceros glandulosus,basionym of accepted name
2,https://id.biodiversity.org.au/instance/ausmos...,Dendroceros crispatus (Hook.) Nees,https://id.biodiversity.org.au/name/ausmoss/21...,Monoclea crispata,basionym of accepted name
3,https://id.biodiversity.org.au/instance/ausmos...,Megaceros carnosus (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/17...,Anthoceros carnosus,basionym of accepted name
4,https://id.biodiversity.org.au/instance/ausmos...,Megaceros denticulatus (Lehm.) Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Anthoceros denticulatus,basionym of accepted name
...,...,...,...,...,...
1039,https://id.biodiversity.org.au/instance/ausmos...,Preissia commutata (Lindenb.) Nees,https://id.biodiversity.org.au/name/ausmoss/21...,Marchantia commutata,basionym of accepted name
1040,https://id.biodiversity.org.au/instance/ausmos...,Riccia multifida (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Ricciella multifida,basionym of accepted name
1041,https://id.biodiversity.org.au/instance/ausmos...,Riccia papulosa (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Ricciella papulosa,basionym of accepted name
1042,https://id.biodiversity.org.au/instance/ausmos...,Riccia vesiculosa (Carrington & Pearson) Steph.,https://id.biodiversity.org.au/name/ausmoss/21...,Riccia bullosa var. vesiculosa,basionym of accepted name


In [16]:
# other combinations of basionyms of accepted names
df_m3 = df_acc_bas.merge(df_namerel[df_namerel['type'] == 'basionym'], how='left', left_on='nameID', right_on='relatedNameID')
df_m4 = df_m3.merge(df_nam[['scientificNameID', 'scientificName']], how='left', left_on='nameID_y', right_on='scientificNameID')

df_acc_other_comb = df_m4[df_m4['acceptedName'] != df_m4['scientificName']][['taxonID', 'acceptedName', 'scientificNameID', 'scientificName']].rename(columns={'scientificName': 'synonym', 'scientificNameID': 'nameID'})
df_acc_other_comb['remarks'] = 'other combination of basionym of accepted name'
df_acc_other_comb

Unnamed: 0,taxonID,acceptedName,nameID,synonym,remarks
0,https://id.biodiversity.org.au/instance/ausmos...,Folioceros fuciformis (Mont.) D.C.Bhardwaj,https://id.biodiversity.org.au/name/ausmoss/14...,Folioceros fuciformis,other combination of basionym of accepted name
1,https://id.biodiversity.org.au/instance/ausmos...,Folioceros glandulosus (Lehm. & Lindenb.) D.C....,https://id.biodiversity.org.au/name/ausmoss/20...,Aspiromitus glandulosus,other combination of basionym of accepted name
2,https://id.biodiversity.org.au/instance/ausmos...,Folioceros glandulosus (Lehm. & Lindenb.) D.C....,https://id.biodiversity.org.au/name/ausmoss/13...,Folioceros glandulosus,other combination of basionym of accepted name
3,https://id.biodiversity.org.au/instance/ausmos...,Dendroceros crispatus (Hook.) Nees,https://id.biodiversity.org.au/name/ausmoss/17...,Dendroceros crispatus,other combination of basionym of accepted name
4,https://id.biodiversity.org.au/instance/ausmos...,Megaceros carnosus (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/17...,Megaceros carnosus,other combination of basionym of accepted name
...,...,...,...,...,...
1638,https://id.biodiversity.org.au/instance/ausmos...,Riccia multifida (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/13...,Riccia multifida,other combination of basionym of accepted name
1639,https://id.biodiversity.org.au/instance/ausmos...,Riccia papulosa (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/15...,Riccia papulosa,other combination of basionym of accepted name
1640,https://id.biodiversity.org.au/instance/ausmos...,Riccia papulosa (Steph.) Steph.,https://id.biodiversity.org.au/name/ausmoss/19...,Riccia papulosa var. papulosa,other combination of basionym of accepted name
1641,https://id.biodiversity.org.au/instance/ausmos...,Riccia vesiculosa (Carrington & Pearson) Steph.,https://id.biodiversity.org.au/name/ausmoss/15...,Riccia vesiculosa,other combination of basionym of accepted name


In [17]:
# replaced names
df_acc_orig = df_tax[~(df_tax['scientificNameAuthorship'].isna() | df_tax['scientificNameAuthorship'].str.contains('\('))][['taxonID', 'scientificNameID', 'scientificName', 'scientificNameAuthorship']]
df_m5 = df_acc_orig.merge(df_namerel[df_namerel['type'] == 'replacement name'], how='inner', left_on='scientificNameID', right_on='relatedNameID')
df_m6 = df_m5.merge(df_nam[['scientificNameID', 'scientificName']], how='inner', left_on='nameID', right_on='scientificNameID')

df_replaced = df_m6[['taxonID', 'scientificName_x', 'nameID', 'scientificName_y']].rename(columns={'scientificName_x': 'acceptedName', 'scientificName_y': 'synonym'})
df_replaced['remarks'] = 'replaced name'
df_replaced

Unnamed: 0,taxonID,acceptedName,nameID,synonym,remarks


In [18]:
# combinations of replaced names
df_m7 = df_replaced.merge(df_namerel[df_namerel['type'] == 'basionym'], how='inner', left_on='nameID', right_on='relatedNameID')
df_m8 = df_m7.merge(df_nam[['scientificNameID', 'scientificName']], how='inner', left_on='relatedNameID', right_on='scientificNameID')
df_m8

df_replaced_comb = df_m8[['taxonID', 'acceptedName', 'nameID_y', 'scientificName']].rename(columns={'nameID_y': 'nameID', 'scientificName': 'synonym'})
df_replaced_comb['remarks'] = 'combination of replaced name'
df_replaced_comb

Unnamed: 0,taxonID,acceptedName,nameID,synonym,remarks


In [19]:
df_het_comb = df_syn.merge(df_namerel[df_namerel['type'] == 'basionym'], how='inner', left_on='nameID', right_on='relatedNameID')\
        .merge(df_nam[['scientificNameID', 'scientificName']], how='inner', left_on='nameID_y', right_on='scientificNameID')\
        [['taxonID', 'acceptedName', 'scientificNameID', 'scientificName']]\
        .rename(columns={'scientificNameID': 'nameID', 'scientificName': 'synonym'})
df_het_comb['remarks'] = 'combination of heterotypic synonym'
df_het_comb

Unnamed: 0,taxonID,acceptedName,nameID,synonym,remarks
0,https://id.biodiversity.org.au/instance/ausmos...,Megaceros pellucidus (Colenso) E.A.Hodgs.,https://id.biodiversity.org.au/name/ausmoss/12...,Megaceros longispirus,combination of heterotypic synonym
1,https://id.biodiversity.org.au/instance/ausmos...,Andreaea acutifolia Hook.f. & Wilson,https://id.biodiversity.org.au/name/ausmoss/10...,Andreaea amblyophylla var. attenuata,combination of heterotypic synonym
2,https://id.biodiversity.org.au/instance/ausmos...,Pyrrhobryum mnioides subsp. contortum (Müll.Ha...,https://id.biodiversity.org.au/name/ausmoss/10...,Rhizogonium hookeri,combination of heterotypic synonym
3,https://id.biodiversity.org.au/instance/ausmos...,Pyrrhobryum mnioides subsp. contortum (Müll.Ha...,https://id.biodiversity.org.au/name/ausmoss/10...,Rhizogonium mossmanianum,combination of heterotypic synonym
4,https://id.biodiversity.org.au/instance/ausmos...,Pyrrhobryum mnioides subsp. contortum (Müll.Ha...,https://id.biodiversity.org.au/name/ausmoss/10...,Pogonatum gullweri,combination of heterotypic synonym
...,...,...,...,...,...
678,https://id.biodiversity.org.au/instance/ausmos...,Symphyogyna podophylla (Thunb.) Mont. & Nees,https://id.biodiversity.org.au/name/ausmoss/21...,Symphyogyna obovata,combination of heterotypic synonym
679,https://id.biodiversity.org.au/instance/ausmos...,Plagiochasma rupestre (J.R.Forst. & G.Forst.) ...,https://id.biodiversity.org.au/name/ausmoss/21...,Aitonia australis,combination of heterotypic synonym
680,https://id.biodiversity.org.au/instance/ausmos...,Reboulia hemisphaerica (L.) Raddi,https://id.biodiversity.org.au/name/ausmoss/13...,Reboulia queenslandica,combination of heterotypic synonym
681,https://id.biodiversity.org.au/instance/ausmos...,Marchantia berteroana Lehm. & Lindenb.,https://id.biodiversity.org.au/name/ausmoss/21...,Marchantia polymorpha var. tabularis,combination of heterotypic synonym


In [20]:
df_inferred = pd.concat([df_acc_bas, df_acc_other_comb, df_replaced, df_replaced_comb, df_het_comb])
df_inferred.insert(0, 'ID', range(1, 1 + len(df_inferred)))
df_inferred = df_inferred.astype({'ID': 'str'})
df_inferred['ID'] = 'ausmoss-synonym-inferred-' + df_inferred['ID']

df_hom = df_inferred[df_inferred['remarks'] != 'combination of heterotypic synonym']

df_het = df_inferred[df_inferred['remarks'] == 'combination of heterotypic synonym']

df_syn2 = pd.concat([df_hom, df_syn, df_het])
df_syn = df_syn2.merge(df_nam[['scientificNameID']], how='inner', left_on='nameID', right_on='scientificNameID').drop(columns=['scientificNameID', 'acceptedName', 'synonym'])



### Remove names that are not used in taxonomy

In [21]:
df1 = df_tax[['taxonID', 'scientificNameID']].rename(columns={'taxonID': 'ID', 'scientificNameID': 'nameID'})
df2 = df_syn[['ID', 'nameID']]
df_used = pd.concat([df1, df2])
# df_used = df_used.drop_duplicates()
df_used

df_nam = df_nam.merge(df_used, how='inner', left_on='scientificNameID', right_on='nameID').drop(columns=['ID', 'nameID'])

In [22]:
df_namerel = df_namerel.merge(df_nam[['scientificNameID']], how='inner', left_on='nameID', right_on='scientificNameID').drop(columns=['scientificNameID'])\
        .merge(df_nam[['scientificNameID']], how='inner', left_on='relatedNameID', right_on='scientificNameID').drop(columns=['scientificNameID'])\
        .drop_duplicates()

#### Remove excess columns

In [23]:
df_tax = df_tax[['taxonID',
 'scientificNameID',
 'nameAccordingToID',
 'parentNameUsageID',
 'taxonRemarks']]

df_tax.rename(columns={
    'taxonID': 'ID',
    'scientificNameID': 'nameID',
    'nameAccordingToID': 'accordingToID',
    'parentNameUsageID': 'parentID',
    'taxonRemarks': 'remarks'
}, inplace=True)

df_tax

Unnamed: 0,ID,nameID,accordingToID,parentID,remarks
0,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/10...,https://id.biodiversity.org.au/reference/ausmo...,,
1,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/23...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
2,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/23...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
3,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/24...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
4,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/14...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
...,...,...,...,...,...
2607,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/21...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
2608,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/21...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
2609,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/14...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,
2610,https://id.biodiversity.org.au/instance/ausmos...,https://id.biodiversity.org.au/name/ausmoss/13...,https://id.biodiversity.org.au/reference/ausmo...,https://id.biodiversity.org.au/instance/ausmos...,


### Reference

In [24]:
df_ref = df_taxa[['nameAccordingToID', 'nameAccordingTo']]
df_ref.drop_duplicates(inplace=True)
df_ref.rename(columns={'nameAccordingToID': 'ID', 'nameAccordingTo': 'citation'}, inplace=True)
df_ref

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ref.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ref.rename(columns={'nameAccordingToID': 'ID', 'nameAccordingTo': 'citation'}, inplace=True)


Unnamed: 0,ID,citation
0,https://id.biodiversity.org.au/reference/ausmo...,"Klazenga, N. (2015), AusMoss: Catalogue of Aus..."
1,https://id.biodiversity.org.au/reference/ausmo...,"Renzaglia, K.S., Villarreal, J.C. & Duff, R.J...."
5,https://id.biodiversity.org.au/reference/ausmo...,"CHAH (2011), Australian Plant Census"
6,https://id.biodiversity.org.au/reference/ausmo...,"CHAH (2010), Australian Plant Census"
7,https://id.biodiversity.org.au/reference/ausmo...,"McCarthy, P.M. (2003), Catalogue of Australian..."
10,https://id.biodiversity.org.au/reference/ausmo...,"Klazenga, N. (2023), Census of Australian Bryo..."
41,https://id.biodiversity.org.au/reference/ausmo...,"CHAH (2013), Australian Plant Census"
68,https://id.biodiversity.org.au/reference/ausmo...,"Goffinet, B. & W.R. Buck (2017), Classificatio..."
73,https://id.biodiversity.org.au/reference/ausmo...,"Klazenga, N. (2023), Census of Australian bryo..."
196,https://id.biodiversity.org.au/reference/ausmo...,"Klazenga, N. (2019), Census of Australian bryo..."


### Alias identifiers

(HTTP identifiers break ChecklistBank)

In [25]:
# # taxon

# df_tax['ID'] = df_tax['ID'].str.replace('apni', 'ausmoss')
# df_tax['ID'] = df_tax['ID'].str.replace('https://id.biodiversity.org.au/instance/ausmoss/', 'ausmoss-taxon-')
# df_tax['nameID'] = df_tax['nameID'].str.replace('apni', 'ausmoss')
# df_tax['nameID'] = df_tax['nameID'].str.replace('https://id.biodiversity.org.au/name/ausmoss/', 'ausmoss-name-')
# df_tax['accordingToID'] = df_tax['accordingToID'].str.replace('apni', 'ausmoss')
# df_tax['accordingToID'] = df_tax['accordingToID'].str.replace('https://id.biodiversity.org.au/reference/ausmoss/', 'ausmoss-reference-')
# df_tax['parentID'] = df_tax['parentID'].str.replace('apni', 'ausmoss')
# df_tax['parentID'] = df_tax['parentID'].str.replace('https://id.biodiversity.org.au/instance/ausmoss/', 'ausmoss-taxon-')

# # normalise whitespace
# _RE_COMBINE_WHITESPACE = re.compile(r"\s+")

# remarks = []
# for index, row in df_tax.iterrows():
#     if isinstance(row['remarks'], str):
#         remarks.append(_RE_COMBINE_WHITESPACE.sub(" ", row['remarks']).strip())
#     else:
#         remarks.append(np.NaN)

# df_tax['remarks'] = remarks

# df_tax

In [26]:
# # name

# df_nam.rename(columns={
#         'scientificNameID': 'ID', 
#         'taxonRank': 'rank', 
#         'genericName': 'genus', 
#         'nomenclaturalStatus': 'status'
#     }, inplace=True)
# df_nam.drop(columns=['originalNameUsageID', 'originalNameUsage'], inplace=True)

# df_nam['ID'] = df_nam['ID'].str.replace('apni', 'ausmoss')
# df_nam['ID'] = df_nam['ID'].str.replace('https://id.biodiversity.org.au/name/ausmoss/', 'ausmoss-name-')
# df_nam

In [27]:
# # namerelation

# df_namerel['nameID'] = df_namerel['nameID'].str.replace('apni', 'ausmoss-name-')
# df_namerel['nameID'] = df_namerel['nameID'].str.replace('https://id.biodiversity.org.au/name/ausmoss/', 'ausmoss')
# df_namerel['relatedNameID'] = df_namerel['relatedNameID'].str.replace('apni', 'ausmoss')
# df_namerel['relatedNameID'] = df_namerel['relatedNameID'].str.replace('https://id.biodiversity.org.au/name/ausmoss/', 'ausmoss-name-')
# df_namerel

In [28]:
# # synonym 

# df_syn['ID'] = df_syn['ID'].str.replace('apni', 'ausmoss')
# df_syn['ID'] = df_syn['ID'].str.replace('https://id.biodiversity.org.au/instance/ausmoss/', 'ausmoss-synonym-')
# df_syn['taxonID'] = df_syn['taxonID'].str.replace('apni', 'ausmoss')
# df_syn['taxonID'] = df_syn['taxonID'].str.replace('https://id.biodiversity.org.au/instance/ausmoss/', 'ausmoss-taxon-')
# df_syn['nameID'] = df_syn['nameID'].str.replace('apni', 'ausmoss')
# df_syn['nameID'] = df_syn['nameID'].str.replace('https://id.biodiversity.org.au/name/ausmoss/', 'ausmoss-name-')

# df_syn

In [29]:
# # reference

# df_ref['ID'] = df_ref['ID'].str.replace('apni', 'ausmoss')
# df_ref['ID'] = df_ref['ID'].str.replace('https://id.biodiversity.org.au/reference/', 'ausmoss-reference-')
# df_ref

### Remove duplicates

In [30]:
print(df_tax.shape)
df_tax.drop_duplicates(inplace=True)
print(df_tax.shape)

print(df_nam.shape)
df_nam.drop_duplicates(inplace=True)
print(df_nam.shape)

(2612, 5)
(2612, 5)
(7280, 12)
(6205, 12)


### Create CoLDP

In [31]:
import os
from zipfile import ZipFile

os.chdir('/home/niels/code/jupyter-notebooks/nsl_export/coldp/' + group)

df_tax.to_csv('taxon.tsv', sep='\t', index=False)
df_syn.to_csv('synonym.tsv', sep='\t', index=False)
df_nam.to_csv('name.tsv', sep='\t', index=False)
df_name_relationships.to_csv('dwc_basionyms.tsv', sep='\t', index=False)
df_namerel.to_csv('namerelation.tsv', sep='\t', index=False)
df_higher.to_csv('dwc_higherclassification.tsv', sep='\t', index=False)
df_dwc_syn.to_csv('dwc_synonym.tsv', sep='\t', index=False)
df_ref.to_csv('reference.tsv', sep='\t', index=False)

with ZipFile('nsl_' + group + '_coldp.zip', 'w') as zipobj:
    zipobj.write('taxon.tsv')
    zipobj.write('name.tsv')
    zipobj.write('synonym.tsv')
    zipobj.write('namerelation.tsv')
    zipobj.write('reference.tsv')

os.chdir('/home/niels/code/jupyter-notebooks/nsl_export')

