## Nordicmicroalgae NOMAC 2017.

Prepares the NOMAC 2017 species list.

In [1]:
import pandas as pd
import numpy as np
import datetime

#### Import species files downloaded from Artsnavnebase.

In [2]:
def read_from_artsnavnebase(file_name):
    """Read one Artsnavnebase export from ./in_data/ and return it with English column names.

    The file is parsed as semicolon-separated text encoded in cp1252. Only the
    thirteen taxonomy/status columns listed below are loaded, and each
    Norwegian header is renamed to its English counterpart.

    Only the literal text 'nan' is treated as a missing value; because
    keep_default_na is False, empty fields are kept as empty strings.

    Parameters
    ----------
    file_name : str
        Name of the file inside the './in_data/' directory.

    Returns
    -------
    pandas.DataFrame
        The selected columns under their English names.
    """
    # Norwegian header in the export -> column name used in the rest of the notebook.
    column_names = {
        'Rike': 'kingdom',
        'Rekke': 'phylum',
        'Klasse': 'class_name',
        'Orden': 'order',
        'Familie': 'family',
        'Slekt': 'genus',
        'Art': 'species',
        'Underart': 'subspecies',
        'Varietet': 'variety',
        'Form': 'forma',
        'Autorstreng': 'author',
        'Hovedstatus': 'status',
        'Bistatus': 'status-2',
    }
    species_table = pd.read_csv(
        './in_data/' + file_name,
        sep=';',
        encoding='cp1252',
        usecols=list(column_names),
        na_values=['nan'],
        keep_default_na=False,
    )
    return species_table.rename(columns=column_names)

In [3]:
# Artsnavnebase exports (downloaded 2017-03-03), one file per taxonomic group.
# The order matters: the frames are bound to df1..df6 in this order.
artsnavnebase_files = [
    'Chlorophyta_ArtsnavnebaseCSV_20170303.txt',
    'Chromista_ArtsnavnebaseCSV_20170303.txt',
    'Cyanobacteria_ArtsnavnebaseCSV_20170303.txt',
    'Plantae_Glaucophyta_ArtsnavnebaseCSV_20170303.txt',
    'Plantae_Rhodophyta_ArtsnavnebaseCSV_20170303.txt',
    'Protozoa_ArtsnavnebaseCSV_20170303.txt',
]
df1, df2, df3, df4, df5, df6 = [
    read_from_artsnavnebase(export_name) for export_name in artsnavnebase_files
]

In [4]:
# Report the size of every imported frame, then stack them into one frame.
source_frames = [df1, df2, df3, df4, df5, df6]
for frame_number, frame in enumerate(source_frames, start=1):
    print('Length df' + str(frame_number) + ': ' + str(len(frame)))
# The original row labels of each frame are kept (no ignore_index).
df = pd.concat(source_frames)
print('Length total df:', len(df))

Length df1: 1289
Length df2: 5388
Length df3: 831
Length df4: 10
Length df5: 501
Length df6: 720
Length total df: 8739


In [5]:
# Save the combined list as a tab-separated, cp1252-encoded file without the index column.
all_species_path = './out_data/' + 'all_species_from_artsnavnebase.txt'
df.to_csv(all_species_path, sep='\t', encoding='cp1252', index=False)

#### Filter on status = 'Gyldig' to get valid taxa.

In [6]:
# Keep only the rows whose main status is 'Gyldig'.
is_valid = df['status'] == 'Gyldig'
df_valid = df[is_valid]
print('Length df_valid:', len(df_valid))

Length df_valid: 8739


In [7]:
# Save the valid taxa as a tab-separated, cp1252-encoded file without the index column.
valid_species_path = './out_data/' + 'valid_species_from_artsnavnebase.txt'
df_valid.to_csv(valid_species_path, sep='\t', encoding='cp1252', index=False)

#### Extract kingdom and phylum and add to NOMAC 2017.
Note: Taxa of lower rank are not included.

In [8]:
#add_kingdom_list = ["Bacteria", "Chromista", "Plantae", "Protozoa"]

In [9]:
#add_phylum_list = ["Cyanobacteria", "Bacillariophyta", "Cercozoa", "Haptophyta", "Miozoa", 
#                   "Not_assigned_3409", "Ochrophyta", "Oomycota", "Charophyta", "Chlorophyta", 
#                   "Cyanidiophyta", "Glaucophyta", "Rhodophyta", "Streptophyta", 
#                   "(Rekke) Incertae sedis", "Choanozoa", "Euglenozoa", "Metamonada", 
#                   "Myzozoa"]

In [10]:
#df_top_kingdom = df_valid[df_valid.phylum=='']
#df_top_kingdom = df_top_kingdom[df_top_kingdom.kingdom.isin(add_kingdom_list)]
#print('Length df_top_kingdom: ' + str(len(df_top_kingdom)))
#df_top_kingdom

In [11]:
#df_top_phylum = df_valid[df_valid.class_name=='']
#df_top_phylum = df_top_phylum[df_top_phylum.phylum.isin(add_phylum_list)]
#print('Length df_valid_phylum: ' + str(len(df_top_phylum)))
##df_top_phylum

#### Add phylum and class recursively.

In [12]:
##phylum_list = ["Tracheophyta", "Chromista phylum incertae sedis", "Ciliophora", "Hyphochytriomycota"]
#add_all_phylum_list = ["Chromista phylum incertae sedis", "Ciliophora", "Hyphochytriomycota"]
##phylum_list

In [13]:
#add_all_class_list = ["Charophyceae", "Chlorokybophyceae", "Coleochaetophyceae", "Conjugatophyceae ", 
#"Klebsormidiophyceae", "Mesostigmatophyceae", "Chlorodendrophyceae", "Chlorophyceae", 
#"Chlorophyta incertae sedis", "Mamiellophyceae", "Nephrophyceae", "Palmophyllophyceae", 
#"Pedinophyceae", "Pyramimonadophyceae", "Trebouxiophyceae", "Ulvophyceae", "Glaucophyceae", 
#"Bangiophyceae", "Compsopogonophyceae", "Cyanidiophyceae", "Florideophyceae", 
#"Porphyridiophyceae", "Rhodellophyceae", "Rhodophyta incertae sedis", "Stylonematophyceae", 
#"Bacillariophyta classis incertae sedis", "Bacillariophyceae", "Coscinodiscophyceae", 
#"Mediophyceae", "Bikosea", "Chlorarachniophyceae", "Filosa", "Imbricatea", "Cryptophyceae", 
#"Polythalamea", "Foraminifera incertae sedis", "Globothalamea", "Monothalamea", "Tubothalamea", 
#"Coccolithophyceae", "Pavlovophyceae", "Katablepharidophyceae", "Apicomonadea", "Colponemea", 
#"Dinophyceae", "Ellobiopsea", "Myzomonadea", "Noctilucea", "Oxyrrhida", "Perkinsea", "Syndinea", 
#"Bolidophyceae", "Chrysomerophyceae", "Chrysophyceae", "Dictyochophyceae", "Pelagophyceae", 
#"Phaeophyceae", "Phaeothamniophyceae", "Picophagophyceae", "Pinguiophyceae", "Placidiophyceae", 
#"Raphidophyceae", "Schizocladiophyceae", "Synchromophyceae", "Synurophyceae", "Xanthophyceae", 
#"Oomycetes", "Telonemia classis ineditae", "Choanoflagellatea", "Cristidiscoidia", "Diplonemea", 
#"Euglenophyceae", "Kinetoplastea", "Postgaardea", "Jakobea", "Trepomonadea", "Heterolobosea", 
#"Picomonadea", "Ebriophyceae", "Cyanophyceae"]
##class_list

In [14]:
#df_add_all_phylum = df_valid[df_valid.phylum.isin(add_all_phylum_list)]
#print('Length df_add_all_phylum: ' + str(len(df_add_all_phylum)))
##df_add_all_phylum.head()

In [15]:
#df_add_all_class = df_valid[df_valid.class_name.isin(add_all_class_list)]
#print('Length df_add_all_class: ' + str(len(df_add_all_class)))
##df_add_all_class.head()

#### Concatenate to NOMAC 2017.

In [16]:
#df_nomac2017 = pd.concat([df_top_kingdom, df_top_phylum, df_add_all_phylum, df_add_all_class])
# The NOMAC 2017 list starts out as the full set of valid taxa.
df_nomac2017 = df_valid
print('Length df_nomac2017:', len(df_nomac2017))

Length df_nomac2017: 8739


#### Test if the add_all_phylum_list and add_all_class_list content was found in NOMAC 2017.

In [17]:
#for phylum in add_all_phylum_list:
#    df_test = df_nomac2017[df_nomac2017.phylum == phylum]
#    print("Test phylum: " + phylum + "          Number of rows: " + str(len(df_test)) )

In [18]:
#for class_name in add_all_class_list:
#    df_test = df_nomac2017[df_nomac2017.class_name == class_name]
#    print("Test class: " + class_name + "          Number of rows: " + str(len(df_test)) )

#### Remove taxa recursively.

In [19]:
# Taxa to drop from NOMAC 2017. Each list is matched against the column of the
# same rank, so a row is removed when its value at that rank is listed here.
remove_phylum_list = [
    'Rhodophyta',
    'Oomycota',
]
remove_class_list = [
    'Ulvophyceae',
    'Schizocladiophyceae',
]
# NOTE(review): the names ending in '-phyceae' below look like class names, but
# this list is only compared with the 'order' column - confirm whether they
# belong in remove_class_list instead.
remove_order_list = [
    'Vaucheriales',
    'Tribonemiales',
    'Bangiophyceae',
    'Compsogonophyceae',
    'Compsopogonophyceae',
    'Florideophyceae',
    'Phaeophyceae',
    'Phaeothamniophyceae',
]
remove_family_list = []
remove_genus_list = [
    'Prasiola',
    'Rosenvingiella',
]

In [20]:
# Drop the unwanted taxa rank by rank, from phylum down to genus, reporting the
# remaining number of rows after each step.
print('Length df_nomac2017 before:', len(df_nomac2017))
removal_steps = [
    # (column to test, names to remove, label used in the progress message)
    ('phylum', remove_phylum_list, 'phylym'),
    ('class_name', remove_class_list, 'class'),
    ('order', remove_order_list, 'order'),
    ('family', remove_family_list, 'family'),
    ('genus', remove_genus_list, 'genus'),
]
for rank_column, unwanted_names, label in removal_steps:
    is_unwanted = df_nomac2017[rank_column].isin(unwanted_names)
    df_nomac2017 = df_nomac2017[~is_unwanted]
    print('Length df_nomac2017 after ' + label + ' removed :', len(df_nomac2017))

Length df_nomac2017 before: 8739
Length df_nomac2017 after phylym removed : 8034
Length df_nomac2017 after class removed : 7809
Length df_nomac2017 after order removed : 7773
Length df_nomac2017 after family removed : 7773
Length df_nomac2017 after genus removed : 7762


#### Calculate new columns for NOMAC 2017.

In [21]:
# Append the four derived NOMAC columns, each initialised to an empty string.
# They are added one at a time so that the column order is fixed.
for new_column in ('nomac_scientific_name', 'nomac_rank', 'nomac_parent', 'nomac_classification'):
    df_nomac2017 = df_nomac2017.assign(**{new_column: ""})
df_nomac2017.head()

Unnamed: 0,kingdom,phylum,class_name,order,family,genus,species,subspecies,variety,forma,author,status,status-2,nomac_scientific_name,nomac_rank,nomac_parent,nomac_classification
0,Plantae,Chlorophyta,,,,,,,,,Rchb.,Gyldig,Ingen_bistatus,,,,
1,Plantae,Chlorophyta,Bryopsidophyceae,,,,,,,,Bessey,Gyldig,Ingen_bistatus,,,,
2,Plantae,Chlorophyta,Chlorophyceae,,,,,,,,Wille,Gyldig,Ingen_bistatus,,,,
3,Plantae,Chlorophyta,Nephrophyceae,,,,,,,,"Cavalier-Smith, 1993",Gyldig,Ingen_bistatus,,,,
4,Plantae,Chlorophyta,Pedinophyceae,,,,,,,,Moestrup,Gyldig,Ingen_bistatus,,,,


In [22]:
def calc_scientific_name(kingdom, phylum, class_name, order, family, genus, species, subspecies, variety, forma):
    """Derive the NOMAC name, rank, parent and classification for one taxon.

    Each argument holds the name at that rank, or an empty string when the
    rank is not given. Values that are not strings (for example None or a
    float NaN from a missing cell) are treated as "not given".

    The lowest rank that is given decides the result:

    - Kingdom to genus: the scientific name is the name at that rank and the
      parent is the value of the rank directly above (possibly empty).
    - Species: the scientific name is 'genus species' and the parent is the genus.
    - Subspecies, variety, forma: the scientific name is 'genus species'
      followed by ' spp. ', ' var. ' or ' f. ' and the epithet. The parent is
      the species binomial 'genus species', so that it matches the scientific
      name of the species.

    The classification lists the given ranks from kingdom to genus separated
    by ' - ', followed by the species and infraspecific parts.

    NOTE(review): ' spp. ' is kept unchanged as the subspecies marker; the
    usual abbreviation for a subspecies is 'ssp.' or 'subsp.' - confirm
    before changing, since it alters the generated names.

    Returns
    -------
    tuple of str
        (scientific_name, rank, parent, classification). All four are empty
        strings when no rank is given.
    """
    def as_text(value):
        # Missing values must not reach len() or string concatenation.
        return value if isinstance(value, str) else ''
    #
    kingdom, phylum, class_name, order, family = map(
        as_text, (kingdom, phylum, class_name, order, family))
    genus, species, subspecies, variety, forma = map(
        as_text, (genus, species, subspecies, variety, forma))
    #
    scientific_name = ''
    rank = ''
    parent = ''  # Defined up front so that a row without any rank still returns.
    #
    higher_ranks = [
        ('Kingdom', kingdom),
        ('Phylum', phylum),
        ('Class', class_name),
        ('Order', order),
        ('Family', family),
        ('Genus', genus),
    ]
    lineage = []
    rank_above = ''
    for rank_name, taxon_name in higher_ranks:
        if len(taxon_name) > 0:
            scientific_name = taxon_name
            rank = rank_name
            parent = rank_above
            lineage.append(taxon_name)
        # The parent is always the value one rank up, even when it is empty.
        rank_above = taxon_name
    # Joining only the given ranks avoids a leading ' - ' when kingdom is empty.
    classification = ' - '.join(lineage)
    #
    species_name = genus + ' ' + species
    if len(species) > 0:
        scientific_name = species_name
        rank = 'Species'
        classification = classification + ' ' + species
        parent = genus
    #
    infraspecific_ranks = [
        ('Subspecies', ' spp. ', subspecies),
        ('Variety', ' var. ', variety),
        ('Forma', ' f. ', forma),
    ]
    for rank_name, marker, epithet in infraspecific_ranks:
        if len(epithet) > 0:
            scientific_name = species_name + marker + epithet
            rank = rank_name
            classification = classification + marker + epithet
            parent = species_name
    #
    return scientific_name, rank, parent, classification
#
# Fill the four derived NOMAC columns for every taxon.
# The values are computed first and then written back as whole columns. Rows
# yielded by DataFrame.iterrows() may be copies, so assigning into them is not
# guaranteed to change the DataFrame.
taxon_columns = ['kingdom', 'phylum', 'class_name', 'order', 'family',
                 'genus', 'species', 'subspecies', 'variety', 'forma']
nomac_columns = ['nomac_scientific_name', 'nomac_rank', 'nomac_parent', 'nomac_classification']
# One (scientific_name, rank, parent, classification) tuple per row, in row order.
nomac_values = [
    calc_scientific_name(*taxon)
    for taxon in df_nomac2017[taxon_columns].itertuples(index=False, name=None)
]
for position, column in enumerate(nomac_columns):
    # Plain lists are assigned by position, so the result does not depend on the index labels.
    df_nomac2017 = df_nomac2017.assign(**{column: [values[position] for values in nomac_values]})
#
print('Length df_nomac2017: ' + str(len(df_nomac2017)))
df_nomac2017

Length df_nomac2017: 7762


Unnamed: 0,kingdom,phylum,class_name,order,family,genus,species,subspecies,variety,forma,author,status,status-2,nomac_scientific_name,nomac_rank,nomac_parent,nomac_classification
0,Plantae,Chlorophyta,,,,,,,,,Rchb.,Gyldig,Ingen_bistatus,Chlorophyta,Phylum,Plantae,Plantae - Chlorophyta
1,Plantae,Chlorophyta,Bryopsidophyceae,,,,,,,,Bessey,Gyldig,Ingen_bistatus,Bryopsidophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Bryopsidophyceae
2,Plantae,Chlorophyta,Chlorophyceae,,,,,,,,Wille,Gyldig,Ingen_bistatus,Chlorophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Chlorophyceae
3,Plantae,Chlorophyta,Nephrophyceae,,,,,,,,"Cavalier-Smith, 1993",Gyldig,Ingen_bistatus,Nephrophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Nephrophyceae
4,Plantae,Chlorophyta,Pedinophyceae,,,,,,,,Moestrup,Gyldig,Ingen_bistatus,Pedinophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Pedinophyceae
5,Plantae,Chlorophyta,Pleurastrophyceae,,,,,,,,Mattox & K.D.Stewart,Gyldig,Ingen_bistatus,Pleurastrophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Pleurastrophyceae
6,Plantae,Chlorophyta,Prasinophyceae,,,,,,,,T.A.Chr. ex P.C.Silva,Gyldig,Ingen_bistatus,Prasinophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Prasinophyceae
7,Plantae,Chlorophyta,Trebouxiophyceae,,,,,,,,Friedl,Gyldig,Ingen_bistatus,Trebouxiophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Trebouxiophyceae
9,Plantae,Chlorophyta,Incertae sedis,,,,,,,,,Gyldig,Ingen_bistatus,Incertae sedis,Class,Chlorophyta,Plantae - Chlorophyta - Incertae sedis
10,Plantae,Chlorophyta,Mamiellophyceae,,,,,,,,B.Marin & Melkonian,Gyldig,Ingen_bistatus,Mamiellophyceae,Class,Chlorophyta,Plantae - Chlorophyta - Mamiellophyceae


In [23]:
# Save the NOMAC 2017 list to a file stamped with today's date (ISO format),
# tab-separated and cp1252-encoded, without the index column.
date_iso = datetime.date.today().isoformat()
nomac_path = './out_data/' + 'NOMAC_2017_version_' + date_iso + '.txt'
df_nomac2017.to_csv(nomac_path, sep='\t', encoding='cp1252', index=False)