# NSL export to CoLDP: Fungi

In [24]:
import pandas as pd
import numpy as np

taxonfile = 'data/fungi/FungI-taxon-2023-06-19-2814.csv'
namefile = 'data/fungi/FNI-names-2023-06-21-3253.csv'

### Names

In [25]:
df_names = pd.read_csv(namefile)

# Remove excess columns
columns = [
    'scientificNameID', 
    'nameAccordingToID', 
    'taxonRank', 
    'scientificName', 
    'genericName', 
    'specificEpithet', 
    'infraspecificEpithet', 
    'scientificNameAuthorship', 
    'nomenclaturalStatus',
    'originalNameUsageID', 
    'originalNameUsage'
]

df1 = df_names[columns]

# Instance ID is in nameAccordingToID for reasons unfathomable; rename to taxonID
df1 = df1.rename(columns={'nameAccordingToID': 'taxonID'})

# Filter for names with originalNameUsage
# Replace instance IDs in originalNameUsageID with scientificNameIDs; we will not need taxonID anymore after that
df2 = df1[~df1['originalNameUsageID'].isna()].merge(df1, how='left', left_on='originalNameUsageID', right_on='taxonID')[[
    'scientificNameID_x', 
    'taxonRank_x', 
    'scientificName_x', 
    'genericName_x', 
    'specificEpithet_x', 
    'infraspecificEpithet_x', 
    'scientificNameAuthorship_x', 
    'nomenclaturalStatus_x', 
    'scientificNameID_y', 
    'scientificName_y'
]]

column_mappings = {
    'scientificNameID_x': 'scientificNameID',
    'scientificName_x': 'scientificName',
    'taxonRank_x': 'taxonRank',
    'genericName_x': 'genericName',
    'specificEpithet_x': 'specificEpithet',
    'infraspecificEpithet_x': 'infraspecificEpithet',
    'scientificNameAuthorship_x': 'scientificNameAuthorship',
    'nomenclaturalStatus_x': 'nomenclaturalStatus',
    'scientificNameID_y': 'originalNameUsageID',
    'scientificName_y': 'originalNameUsage'
}

df2 = df2.rename(columns=column_mappings)

# Filter for names without originalNameUsage
df3 = df_names[df_names['originalNameUsage'].isna()][columns]
df3.drop(columns=['nameAccordingToID'], inplace=True)

# Merge dataframes
df5 = pd.concat([df2, df3])
df5 = df5.sort_values(by='scientificName')

# Make taxonRank lowercase
df5['taxonRank'] = df5['taxonRank'].str.lower()

# Map Latin rank names to less wankerish English ones
ranks = {
    'genus': 'genus',
    'species': 'species',
    'familia': 'family',
    'ordo': 'order',
    'subspecies': 'subspecies',
    'classis': 'class',
    'subclassis': 'subclass',
    'subdivision': 'subphylum',
    'subbdivision': 'subphylum',
    'varietas': 'variety',
    'subgenus': 'subgenus',
    'superspecies': 'superspecies',
    'forma': 'form',
    'division': 'phylum',
    'regnum': 'kingdom',
    'special form': 'special form',
    'sectio': 'section',
    'regio': 'domain',
    '[unknown]': '[unknown]'
}

taxon_ranks = []
uninomials = []
generic_names = []
for index, row in df5.iterrows():
    taxon_ranks.append(ranks[row['taxonRank']])

    uninomial = np.NaN
    generic_name = row['genericName']

    if not isinstance(row['genericName'], str):
        if not isinstance(row['scientificNameAuthorship'], str):
            uninomial = row['scientificName']
        else:
            uninomial = row['scientificName'][0:len(row['scientificName'])-len(row['scientificNameAuthorship'])-1]

    if row['taxonRank'] == 'genus':
        uninomial = row['genericName']
        generic_name = np.NaN

    uninomials.append(uninomial)
    generic_names.append(generic_name)

df5['taxonRank'] = taxon_ranks
df5['genericName'] = generic_names
df5['uninomial'] = uninomials

# Add nomenclaturalCode; 'ICN' in GBIF vocab. but 'botanical' in CoLDP
df5['code'] = 'botanical'

df5 = df5[[
    'scientificNameID', 
    'taxonRank', 
    'scientificName', 
    'uninomial',
    'genericName', 
    'specificEpithet', 
    'infraspecificEpithet', 
    'scientificNameAuthorship', 
    'code', 
    'nomenclaturalStatus', 
    'originalNameUsageID', 
    'originalNameUsage'
  ]]

df_nam = df5
df5

Unnamed: 0,scientificNameID,taxonRank,scientificName,uninomial,genericName,specificEpithet,infraspecificEpithet,scientificNameAuthorship,code,nomenclaturalStatus,originalNameUsageID,originalNameUsage
19746,https://id.biodiversity.org.au/name/fungi/6001...,genus,Abrothallus De Not.,Abrothallus,,,,De Not.,botanical,,,
11319,https://id.biodiversity.org.au/name/fungi/6001...,species,Abrothallus oxysporus Tul.,,Abrothallus,oxysporus,,Tul.,botanical,,,
19745,https://id.biodiversity.org.au/name/fungi/6001...,species,Abrothallus parmeliarum (Sommerf.) Arnold,,Abrothallus,parmeliarum,,(Sommerf.) Arnold,botanical,,,
4471,https://id.biodiversity.org.au/name/fungi/6001...,genus,Absidia Tiegh.,Absidia,,,,Tiegh.,botanical,,,
4469,https://id.biodiversity.org.au/name/fungi/6003...,species,Absidia butleri Lendn.,,Absidia,butleri,,Lendn.,botanical,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
18447,https://id.biodiversity.org.au/name/fungi/6007...,[unknown],trigonospora Bres.,,Clitocybe,semitalis,trigonospora,Bres.,botanical,,,
187,https://id.biodiversity.org.au/name/fungi/6007...,[unknown],vernus Bull.,,Agaricus,bulbosus,vernus,Bull.,botanical,,,
200,https://id.biodiversity.org.au/name/fungi/6007...,[unknown],villaticus Brond.,,Agaricus,campestris,villaticus,Brond.,botanical,,,
19160,https://id.biodiversity.org.au/name/fungi/6003...,variety,vulgaris d bromivora Tul. & C.Tul.,,Ustilago,carbo,vulgaris d bromivora,Tul. & C.Tul.,botanical,nom. inval.,,


### Name relations

In [26]:
df_name_relationships = df5[~df5['originalNameUsageID'].isna()][['scientificNameID', 'scientificName', 'originalNameUsageID', 'originalNameUsage']]

df_namerel = df_name_relationships[['scientificNameID', 'originalNameUsageID']]
df_namerel['type'] = 'basionym'
df_namerel.rename(columns={'scientificNameID': 'nameID', 'originalNameUsageID': 'relatedNameID'}, inplace=True)

# Remove originalNameUsage columns from Names
rename_columns = {
    'scientificNameID': 'ID',
    'taxonRank': 'rank',
    'scientificName': 'scientificName',
    'genericName': 'genus',
    'scientificNameAuthorship': 'authorship',
    'nomenclaturalStatus': 'status'
}

df5.drop(columns=drop_columns, inplace=True)
df5.rename(columns=rename_columns, inplace=True)

#df_name_relationships
df_namerel

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_namerel['type'] = 'basionym'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_namerel.rename(columns={'scientificNameID': 'nameID', 'originalNameUsageID': 'relatedNameID'}, inplace=True)


Unnamed: 0,nameID,relatedNameID,type
2014,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/name/fungi/6003...,basionym
13,https://id.biodiversity.org.au/name/fungi/6002...,https://id.biodiversity.org.au/name/fungi/6002...,basionym
4122,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/name/fungi/6003...,basionym
4123,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/name/fungi/6003...,basionym
830,https://id.biodiversity.org.au/name/fungi/6002...,https://id.biodiversity.org.au/name/fungi/6002...,basionym
...,...,...,...
2179,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/name/fungi/6003...,basionym
656,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/name/fungi/6001...,basionym
5222,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/name/fungi/6001...,basionym
2304,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/name/fungi/6002...,basionym


### Taxa

In [27]:
# Read Taxon export into dataframe
df_taxa = pd.read_csv(taxonfile)
df_taxa = df_taxa[df_taxa['nameType'] == 'scientific']

# Create dataframe with IDs
# This is used later to replace tree element IDs (in taxonID) with instance IDs (in taxonConceptID)
df_id = df_taxa[['taxonID', 'taxonConceptID']]

  df_taxa = pd.read_csv(taxonfile)


In [28]:
# Filter on accepted names; these are the taxa
df_tax = df_taxa[df_taxa['taxonomicStatus'] == 'accepted']

# Replace tree element IDs in taxonID with instance IDs (see above)
df_tax['taxonID'] = df_tax['taxonConceptID']

# Replace tree element IDs in parentNameUsageID with instance IDs
df_tax = df_tax.merge(df_id, how='left', left_on='parentNameUsageID', right_on='taxonID')
df_tax['parentNameUsageID'] = df_tax['taxonConceptID_y']
df_tax.drop(columns=['taxonID_y', 'taxonConceptID_y'], inplace=True)
df_tax.rename(columns={'taxonID_x': 'taxonID', 'taxonConceptID_x': 'taxonConceptID'}, inplace=True)

df_tax = df_tax[['taxonID',
 'scientificNameID',
 'scientificName',
 'scientificNameAuthorship',
 'nameAccordingTo',
 'nameAccordingToID',
 'parentNameUsageID',
 'taxonRank',
 'taxonRankSortOrder',
 'kingdom',
 'class',
 'subclass',
 'family',
 'taxonConceptID',
 'taxonRemarks',
 'higherClassification'
]]

# Translate ranks into English
df_tax['taxonRank'] = df_tax['taxonRank'].str.lower()
ranks = {
    'genus': 'genus',
    'species': 'species',
    'familia': 'family',
    'ordo': 'order',
    'subspecies': 'subspecies',
    'classis': 'class',
    'subclassis': 'subclass',
    'subdivision': 'subphylum',
    'subbdivision': 'subphylum',
    'varietas': 'variety',
    'subgenus': 'subgenus',
    'superspecies': 'superspecies',
    'forma': 'form',
    'division': 'phylum',
    'regnum': 'kingdom',
    'special form': 'special form',
    'sectio': 'section',
    'regio': 'domain',
    '[unknown]': '[unknown]'
}

taxon_ranks = []
for index, row in df_tax.iterrows():
    taxon_ranks.append(ranks[row['taxonRank']])

df_tax['taxonRank'] = taxon_ranks


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tax['taxonID'] = df_tax['taxonConceptID']


#### Higher classification

In [29]:
# Create dictionary with higher taxa from instance ID
def get_higher_taxa(id, higher={}):
    parent = df_tax.loc[df_tax['taxonID'] == id][[
      'taxonRank', 
      'scientificName', 
      'scientificNameAuthorship', 
      'parentNameUsageID']].to_dict(orient='records')[0]
    
    if isinstance(parent['scientificNameAuthorship'], str):
        higher[parent['taxonRank']] = parent['scientificName'][0:len(parent['scientificName'])-len(parent['scientificNameAuthorship'])-1]
    else:
        higher[parent['taxonRank']] = parent['scientificName']

    if isinstance(parent['parentNameUsageID'], str):
        return get_higher_taxa(parent['parentNameUsageID'], higher)
    else:
        return higher

In [30]:
# Create higher classification
def create_higher_classification(id):
    higher = get_higher_taxa(id, {})
    keys = list(higher.keys())

    hcl = {}

    hcl['taxonID'] = id
    hcl['species'] = higher['species'] if 'species' in keys else np.NaN
    hcl['section'] = higher['section'] if 'section' in keys else np.NaN
    hcl['subgenus'] = higher['subgenus'] if 'subgenus' in keys else np.NaN
    hcl['genus'] = higher['genus'] if 'genus' in keys else np.NaN
    hcl['subtribe'] = higher['subtribe'] if 'subtribe' in keys else np.NaN
    hcl['tribe'] = higher['tribe'] if 'tribe' in keys else np.NaN
    hcl['subfamily'] = higher['subfamily'] if 'subfamily' in keys else np.NaN
    hcl['family'] = higher['family'] if 'family' in keys else np.NaN
    hcl['superfamily'] = higher['superfamily'] if 'superfamily' in keys else np.NaN
    hcl['suborder'] = higher['suborder'] if 'suborder' in keys else np.NaN
    hcl['order'] = higher['order'] if 'order' in keys else np.NaN
    hcl['subclass'] = higher['subclass'] if 'subclass' in keys else np.NaN
    hcl['class'] = higher['class'] if 'class' in keys else np.NaN
    hcl['subphylum'] = higher['subphylum'] if 'subphylum' in keys else np.NaN
    hcl['phylum'] = higher['phylum'] if 'phylum' in keys else np.NaN
    hcl['kingdom'] = higher['kingdom'] if 'kingdom' in keys else np.NaN

    cl = list(higher.values())
    for i in range(len(cl) // 2):
        cl[i], cl[-1 - i] = cl[-1 - i], cl[i]

    hcl['classification'] = ' | '.join(cl)

    return hcl


In [31]:
cl = []
for index, row in df_tax.iterrows():
    cl.append(create_higher_classification(row['taxonID']))

df_higher = pd.DataFrame.from_dict(cl)

df_higher = df_higher.merge(df_tax[['taxonID', 'scientificName', 'taxonRank']], how='left', left_on='taxonID', right_on='taxonID')

df_higher = df_higher[['taxonID',
 'scientificName',
 'taxonRank',
 'kingdom',
 'phylum',
 'subphylum',
 'class',
 'subclass',
 'order',
 'suborder',
 'superfamily',
 'family',
 'tribe',
 'subfamily',
 'subtribe',
 'genus',
 'subgenus',
 'section',
 'species',
 'classification']]

df_higher.rename(columns={'classification': 'higherClassification'}, inplace=True)

df_higher

Unnamed: 0,taxonID,scientificName,taxonRank,kingdom,phylum,subphylum,class,subclass,order,suborder,superfamily,family,tribe,subfamily,subtribe,genus,subgenus,section,species,higherClassification
0,https://id.biodiversity.org.au/instance/fungi/...,Eukaryota,domain,,,,,,,,,,,,,,,,,Eukaryota
1,https://id.biodiversity.org.au/instance/fungi/...,Chromista,kingdom,Chromista,,,,,,,,,,,,,,,,Eukaryota | Chromista
2,https://id.biodiversity.org.au/instance/fungi/...,Bigyra,phylum,Chromista,Bigyra,,,,,,,,,,,,,,,Eukaryota | Chromista | Bigyra
3,https://id.biodiversity.org.au/instance/fungi/...,Labyrinthulea,class,Chromista,Bigyra,,Labyrinthulea,,,,,,,,,,,,,Eukaryota | Chromista | Bigyra | Labyrinthulea
4,https://id.biodiversity.org.au/instance/fungi/...,Thraustochytriida,order,Chromista,Bigyra,,Labyrinthulea,,Thraustochytriida,,,,,,,,,,,Eukaryota | Chromista | Bigyra | Labyrinthulea...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11621,https://id.biodiversity.org.au/instance/fungi/...,Pocheina rosea (Cienk.) A.R.Loebl. & Tappan,species,Protista,Percolozoa,Tetramitia,Heterolobosea,,Acrasida,,,Guttulinaceae,,,,Pocheina,,,Pocheina rosea,Eukaryota | Protista | Percolozoa | Tetramitia...
11622,https://id.biodiversity.org.au/instance/fungi/...,Rozella Cornu,genus,Protista,,,,,,,,,,,,Rozella,,,,Eukaryota | Protista | Rozella
11623,https://id.biodiversity.org.au/instance/fungi/...,Rozella allomycis Foust,species,Protista,,,,,,,,,,,,Rozella,,,Rozella allomycis,Eukaryota | Protista | Rozella | Rozella allom...
11624,https://id.biodiversity.org.au/instance/fungi/...,Rozella irregularis (E.J.Butler) Sparrow,species,Protista,,,,,,,,,,,,Rozella,,,Rozella irregularis,Eukaryota | Protista | Rozella | Rozella irreg...


#### Synonyms

In [32]:
# Filter on synonyms
df_syn = df_taxa[df_taxa['taxonomicStatus'].isin(['synonym', 'nomenclatural synonym', 'taxonomic synonym']) & ~df_taxa['acceptedNameUsageID'].isna()][[
    'taxonID',
    'scientificNameID',
    'scientificName',
    'acceptedNameUsageID',
    'acceptedNameUsage',
    'taxonomicStatus'
]]

# Replace tree element IDs in acceptedNameUsageID with instance IDs
df_syn = df_syn.merge(df_id, how='left', left_on='acceptedNameUsageID', right_on='taxonID')

df_syn.drop(columns=['acceptedNameUsageID', 'taxonID_y'], inplace=True)
df_syn.rename(columns={
    'taxonID_x': 'ID',
    'taxonConceptID': 'taxonID',
    'scientificNameID': 'nameID'    
}, inplace=True)
df_syn = df_syn[['ID', 'taxonID', 'nameID', 'scientificName', 'acceptedNameUsage', 'taxonomicStatus']]

# Get accepted names
df_dwc_syn = df_syn.merge(df_tax[['taxonID', 'scientificName']], left_on='taxonID', right_on='taxonID')

df_dwc_syn.drop(columns=['taxonID', 'nameID', 'acceptedNameUsage'], inplace=True)

df_dwc_syn.rename(columns={
    'ID': 'taxonID',
    'scientificName_x': 'scientificName',
    'scientificName_y': 'acceptedNameUsage'
}, inplace=True)

df_syn.drop(columns=['scientificName', 'acceptedNameUsage', 'taxonomicStatus'], inplace=True)

df_syn

Unnamed: 0,ID,taxonID,nameID
0,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...
1,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...
2,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...
3,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...
4,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...
...,...,...,...
7883,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...
7884,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...
7885,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...
7886,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...


#### Remove excess columns

In [33]:
df_tax = df_tax[['taxonID',
 'scientificNameID',
 'nameAccordingToID',
 'parentNameUsageID',
 'taxonRemarks']]

df_tax.rename(columns={
    'taxonID': 'ID',
    'scientificNameID': 'nameID',
    'nameAccordingToID': 'accordingToID',
    'parentNameUsageID': 'parentID',
    'taxonRemarks': 'remarks'
}, inplace=True)

df_tax

Unnamed: 0,ID,nameID,accordingToID,parentID,remarks
0,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,,
1,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
2,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
3,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
4,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
...,...,...,...,...,...
11621,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
11622,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6001...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
11623,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,
11624,https://id.biodiversity.org.au/instance/fungi/...,https://id.biodiversity.org.au/name/fungi/6003...,https://id.biodiversity.org.au/reference/fungi...,https://id.biodiversity.org.au/instance/fungi/...,


### Create CoLDP

In [34]:
import os
from zipfile import ZipFile

os.chdir('/home/niels/code/jupyter-notebooks/nsl_export/coldp/fungi')

df_tax.to_csv('taxon.tsv', sep='\t', index=False)
df_syn.to_csv('synonym.tsv', sep='\t', index=False)
df_nam.to_csv('name.tsv', sep='\t', index=False)
df_name_relationships.to_csv('dwc_basionyms.tsv', sep='\t', index=False)
df_namerel.to_csv('namerelation.tsv', sep='\t', index=False)
df_higher.to_csv('dwc_higherclassification.tsv', sep='\t', index=False)
df_dwc_syn.to_csv('dwc_synonym.tsv', sep='\t', index=False)

with ZipFile('nsl_fungi_coldp.zip', 'w') as zipobj:
    zipobj.write('taxon.tsv')
    zipobj.write('name.tsv')
    zipobj.write('synonym.tsv')
    zipobj.write('namerelation.tsv')

os.chdir('/home/niels/code/jupyter-notebooks/nsl_export')

