In [1]:
import pandas as pd
import pickle

In [2]:
# Put the local checkout of the `caryocar` project at the front of the import
# path so the `caryocar.*` imports below resolve to it.
import os
import sys

sys.path.insert(
    0,
    os.path.expanduser('~/Documents/caryocar'),
)

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to the local `caryocar` sources are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import pandas as pd
from caryocar.models import CWN, SCN
from caryocar.cleaning import NamesAtomizer, namesFromString
from caryocar.cleaning import normalize,read_NamesMap_fromJson
from caryocar.cleaning import getNamesIndexes

In [4]:
# Load only the columns used in this notebook from the occurrences file.
# `read_table` is called without an explicit separator, so pandas' default
# delimiter for that function (tab) applies despite the `.csv` extension.
dsetPath = '~/datasets/ub_herbarium/occurrence.csv'
cols = [
    'recordedBy', 'scientificName', 'taxonRank',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
    'countryCode', 'rightsHolder', 'eventDate',
]

# Keep only records that have a collector string, a scientific name and a
# species-level identification; rows missing any of these are discarded.
occs = (
    pd.read_table(dsetPath, usecols=cols, low_memory=False, parse_dates=['eventDate'])
    .dropna(subset=['recordedBy', 'scientificName', 'species'])
)

In [5]:
# Preview the first rows of the filtered occurrences table.
occs.head()

Unnamed: 0,kingdom,phylum,class,order,family,genus,species,taxonRank,scientificName,countryCode,eventDate,rightsHolder,recordedBy
0,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Lychnophora,Lychnophora pinaster,SPECIES,Lychnophora pinaster Mart.,BR,1980-10-01,UB - Herbário da Universidade de Brasília,"Ferreira, VF"
1,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Lychnophora,Lychnophora pinaster,SPECIES,Lychnophora pinaster Mart.,BR,1980-10-08,UB - Herbário da Universidade de Brasília,"Ferreira, VF"
2,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Lychnophora,Lychnophora pinaster,SPECIES,Lychnophora pinaster Mart.,BR,1980-10-01,UB - Herbário da Universidade de Brasília,"Ferreira, VF"
3,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Lychnophora,Lychnophora pinaster,SPECIES,Lychnophora pinaster Mart.,BR,1971-01-18,UB - Herbário da Universidade de Brasília,"Irwin, HS"
4,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Arundo,Arundo donax,SPECIES,Arundo donax L.,BR,2017-02-08,UB - Herbário da Universidade de Brasília,"Gomes, SM; Silva, ALH"


In [6]:
# List the columns that were loaded.
occs.columns

Index(['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species',
       'taxonRank', 'scientificName', 'countryCode', 'eventDate',
       'rightsHolder', 'recordedBy'],
      dtype='object')

---

# Names atomization

In [7]:
# Create the atomizer, with `namesFromString` as its atomization operation.
# NOTE(review): presumably this splits a multi-collector `recordedBy` string
# into individual names — confirm in caryocar.cleaning.
na = NamesAtomizer(atomizeOp=namesFromString)

Replaces: manually replace some entries that do not follow the naming standards (for example, a `;` used where a `,` should separate a surname from its initials).

In [8]:
# JSON file with the manual replacements for malformed `recordedBy` entries
# (relative path, resolved against the notebook's working directory).
names_replaces_file = '../caryocar/cleaning/data/ub_collectors_replaces.json'

In [9]:
# Show the contents of the replacements file.
!cat {names_replaces_file}

{
    "_replaces": {
        "Barbosa; M.G.": "Barbosa, M.G.",
        "Bueno; S.B.": "Bueno, S.B.",
        "Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB": "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB",
        "Hatschbach, G; M.": "Hatschbach, G; Hatschbach, M",
        "Hällström; E.": "Hällström, E.",
        "Irwin, HS; Souza, R; Santos; RR": "Irwin, HS; Souza, R; Santos, RR",
        "Kirkbride Junior, JH; Ono; E.K.M; et al.": "Kirkbride Junior, JH; Ono, E.K.M; et al.",
        "Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.": "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R.",
        "Silva; D.R.; Colvéquia; L.P.T": "Silva, D.R.; Colvéquia, L.P.T",
        "Sr. Air, Sr. Milton, Rodrigo": "Sr. Air; Sr. Milton; Rodrigo",
        "Sônia / Josefina": "Sônia; Josefina",
        "Yushun.; K.": "Yushun., K."
    }
}

In [10]:
# Load the manual replacements from the JSON file into the atomizer.
# NOTE(review): presumably they are applied to the raw strings during
# atomization — confirm in NamesAtomizer.
na.read_replaces(names_replaces_file)

Now we create a new column, `recordedBy_atomized`, holding the atomized collector names:

In [11]:
# Store the atomizer's output for every `recordedBy` value in a new column.
occs['recordedBy_atomized']=na.atomize(occs['recordedBy'])

---

# Names mapping

In [12]:
# JSON file holding the names map; its `_map_prim_norm` section maps each raw
# collector name to a normalized form (relative path, resolved against the
# notebook's working directory).
namesMap_file = '../caryocar/cleaning/data/ub_namesmap.json'

In [13]:
!head {namesMap_file} -n 20

{
    "_map_prim_norm": {
        ".": "",
        "1980 Sino-Amer Exped.": "sinoamerexped",
        "?": "",
        "A.J.N.V.": "ajnv",
        "A.M.": "am",
        "Abbas, B": "abbas,b",
        "Abdala, GC": "abdala,gc",
        "Abdo, MSA": "abdo,msa",
        "Abdon": "abdon",
        "Abe, LB": "abe,lb",
        "Abe, LM": "abe,lm",
        "Abrahim, MA": "abrahim,ma",
        "Abreu, CG": "abreu,cg",
        "Abreu, GX": "abreu,gx",
        "Abreu, I": "abreu,i",
        "Abreu, LC": "abreu,lc",
        "Abreu, LCR": "abreu,lcr",
        "Abreu, M": "abreu,m",


In [14]:
# Build the names map object from the JSON file, passing `normalize` as the
# normalization function.
nm = read_NamesMap_fromJson(namesMap_file, normalizationFunc=normalize)

In case there are any collector names which were not in the names map file, add them now:

In [15]:
# Gather the distinct names held in the atomizer's cache. Each cached entry is
# unpacked as a 3-tuple, of which only the first element (the name) is used.
collectors_names = list({name for name, _, _ in na.getCachedNames()})

# Register those names in the names map.
nm.addNames(collectors_names)

---

# Names index

The names index maps each normalized collector name to the indices of the dataframe rows where it appears.

In [16]:
# Build the names index from the atomized names column, using the mapping
# returned by `nm.getMap()`.
ni = getNamesIndexes(occs,'recordedBy_atomized', namesMap=nm.getMap())

---

# Building the models

## SCN

In [17]:
# Build the SCN model from the species column and the atomized collectors
# column, passing the names map object.
scn = SCN(
    species=occs['species'],
    collectors=occs['recordedBy_atomized'],
    namesMap=nm,
)

In [18]:
# Normalized names to drop from the SCN model. '' is what entries such as "."
# and "?" map to in the names map file; the remaining ones are presumably
# placeholders that do not identify an actual collector ('etal' looks like the
# normalized form of "et al.") — confirm.
cols_to_filter = [
    '',
    'ignorado',
    'ilegivel',
    'incognito',
    'etal',
]
scn.remove_nodes_from(cols_to_filter)

## CWN

In [19]:
# Build the CWN model, passing each record's atomized collectors as the cliques
# together with the names map object.
cwn = CWN(
    cliques=occs['recordedBy_atomized'],
    namesMap=nm,
)

In [20]:
# Normalized names to drop from the CWN model. '' is what entries such as "."
# and "?" map to in the names map file; the remaining ones are presumably
# placeholders that do not identify an actual collector — confirm.
cols_to_filter = [
    '',
    'ignorado',
    'ilegivel',
    'incognito',
    'etal',
]
cwn.remove_nodes_from(cols_to_filter)