# The Names Atomizer Class

In [1]:
import numpy as np
import pandas as pd

from modules.cleaning.names import namesFromString

## Loading the dataset

In [2]:
# Absolute path of the occurrence file that is loaded into a dataframe below
dsetPath = '/home/pedro/datasets/ub_herbarium/occurrence.txt'

In [3]:
# Only these three columns are read from the occurrence file
cols = ['recordedBy', 'scientificName', 'collectionCode']
# The file is parsed as tab-separated; usecols restricts parsing to the columns listed above
occs = pd.read_csv(dsetPath, sep='\t',usecols=cols)

## The NamesAtomizer class

In [4]:
import json
from collections import Counter
class NamesAtomizer:
    """
    Splits ("atomizes") names strings into individual names.

    An instance holds a default atomizing operation, a dict of names strings
    that must be replaced before atomization, and a cache with the input and
    output of the last cached call to `atomize`.
    """

    def __init__(self, atomizeOp, replaces=None):
        """
        The NamesAtomizer is built with an atomizing operation to be defined
        as the instance's default and an optional list with names to be replaced.
        Names to be replaced must be passed in a list of tuples, in any of the
        following ways:

        >> rep = [('n1', 'correct_n1'), ('n2', 'correct_n2')]
        or
        >> rep = [(['n1', 'n1_2'], 'correct_n1'), (['n2'], 'correct_n2')]
        or
        >> expr1 = lambda x: x.replace(';', '_')
        >> expr2 = lambda x: x.replace('&', '_')
        >> rep = [(['n1;1', 'n1;2'], expr1), (['n2&1', 'n2&2'], expr2)]

        Note that if you pass an expression as the second item of the tuple this
        expression must evaluate to a string!

        Parameters
        ----------

        atomizeOp : function
            Default operation applied to each names string by `atomize`.

        replaces : list of tuples, optional
            Replacements in any of the formats shown above.
        """
        self._replaces = self._buildReplaces(replaces)
        self._operation = atomizeOp
        # Filled by atomize() with a (input column, atomized column) pair
        self._cache = None

    def _buildReplaces(self, replacesList):
        """
        Builds a replaces dict from a list. The input list must contain tuples
        in which the first element is a names string, or a list/tuple/set of
        names strings, that must be replaced by the string in the tuple's
        second element. The second element can alternatively be a callable,
        which is called with each source string to compute its replacement.

        Parameters
        ----------

        replacesList : list of tuples or None
            The replacements to convert. None results in an empty dict.

        Returns
        -------

        A dict mapping each source names string to its replacement.

        Raises
        ------

        ValueError
            If the first element of a tuple is neither a string nor a
            list, tuple or set.
        """
        res = {}

        if replacesList is None:
            return res

        for srcs, tgt in replacesList:
            if isinstance(srcs, str):
                # A lone string is treated as a one-element group of sources
                sources = [srcs]
            elif isinstance(srcs, (list, tuple, set)):
                sources = srcs
            else:
                raise ValueError("Invalid value '{0}' in '({0},{1})'. Must be either string or iterable".format(srcs, tgt))

            for src in sources:
                res[src] = tgt(src) if callable(tgt) else tgt

        return res

    def atomize(self, col, operation=None, withReplacing=True, cacheResult=True):
        """
        This method takes a column with names strings and atomizes them

        Parameters
        ----------

        col : pandas.Series
            A column containing names strings to be atomized.

        operation : function
            If an operation is passed in it is used to atomize the column instead
            of the instance's default operation.

        withReplacing : bool, default True
            If set to True the instance's replaces dict is applied to the column
            before atomization.

        cacheResult : bool, default True
            If set to True the column that was atomized (after replacing, if
            enabled) and the resulting series are cached for later use.

        Returns
        -------

        The series obtained by applying the operation to each element of the column.
        """
        if operation is None:
            operation = self._operation

        if withReplacing:
            col = col.replace(self._replaces)

        atomizedCol = col.apply(operation)
        if cacheResult:
            # The cached input is the column as it was fed to the operation
            self._cache = (col, atomizedCol)
        return atomizedCol

    def addReplaces(self, replacesList):
        """
        Adds replacements to the instance's replaces dict. The list uses the
        same formats accepted by the constructor; entries whose source string
        is already registered are overwritten.
        """
        replacesDict = self._buildReplaces(replacesList)
        self._replaces.update(replacesDict)

    def write_replaces(self, filename):
        """
        Writes replaces to a json file, under the '_replaces' key.
        """
        # Non-ASCII characters are written unescaped (ensure_ascii=False), so the
        # encoding is fixed to UTF-8 instead of depending on the platform default
        with open(filename, 'w', encoding='utf-8') as f:
            d = {'_replaces': self._replaces}
            json.dump(d, f, sort_keys=True, indent=4, ensure_ascii=False)

    def read_replaces(self, filepath, update=True):
        """
        Reads replaces from a json file

        Parameters
        ----------

        filepath : str
            Path of a json file holding the replaces dict under the '_replaces' key.

        update : bool, default True
            If True the loaded entries are merged into the current replaces dict;
            otherwise the current dict is discarded and substituted by the loaded one.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if update:
            self._replaces.update(data['_replaces'])
        else:
            self._replaces = data['_replaces']

    def getCachedNames(self, namesToFilter=('et al.',), sortingExp=lambda x: [len(x[0]),-x[2]]):
        """
        This method uses data in the instance's cache.
        Returns atomized names from the instance's cache. Names are associated to
        the names string they were atomized from as well as the number of records
        they appear in the dataset. Each item of the result is structured as a
        3-tuple, with elements in the same order stated above.

        Parameters
        ----------

        namesToFilter : list or tuple
            Names that should be ignored by the method. By default it ignores 'et al.'.

        sortingExp : function
            An expression to be passed as key to sort the final result. By default
            items are sorted by the length of the atomized name and then by
            descending count.

        Returns
        -------

        A sorted list of 3-tuples (u,v,w) where:
          u = atomized name;
          v = names string the name was atomized from, as cached by `atomize`
              (i.e. after replacing, if replacing was enabled);
          w = count of the total occurrences of an atomized name in the dataset.

        Raises
        ------

        RuntimeError
            If no result has been cached yet.
        """
        if self._cache is None:
            raise RuntimeError("No cached result available. Call atomize() with cacheResult=True first.")

        namesStrings, atomized = self._cache
        pairs = [
            (name, nstr)
            for nstr, names in zip(namesStrings, atomized)
            for name in names
            if name not in namesToFilter
        ]
        # Counts are taken before deduplication so that every record is counted
        counts = Counter(name for name, _ in pairs)
        return sorted([(u, v, counts[u]) for u, v in set(pairs)], key=sortingExp)

## Creating a Names Atomizer

In [5]:
# create atomizer object
# namesFromString (imported from modules.cleaning.names) becomes the default
# atomizing operation; no replacements are registered at this point
na = NamesAtomizer(namesFromString)

### Before the replacement list

In [6]:
# Atomize the collectors column (cast to str first) and keep the result in a new column
occs['recordedBy_1'] = na.atomize(occs['recordedBy'].astype(str))
# Display the record at row position 123889 with every field rendered as a string
occs.iloc[123889].astype(str)

collectionCode                     UB
recordedBy              Barbosa; M.G.
scientificName       Sematophyllaceae
recordedBy_1      ['Barbosa', 'M.G.']
Name: 123889, dtype: object

### Using the replacement list

In [7]:
# define the replacement list
# Each entry is a (source, target) tuple: source is a names string or a list of
# names strings, target is the corrected string or a callable that builds it.
rep = [ 
    ('Sr. Air, Sr. Milton, Rodrigo', "Sr. Air; Sr. Milton; Rodrigo"),
    ('Sônia / Josefina', "Sônia; Josefina"),
    ('Hatschbach, G; M.', "Hatschbach, G; Hatschbach, M"),
    ('Irwin, HS; Souza, R; Santos; RR', "Irwin, HS; Souza, R; Santos, RR"),
    ('Kirkbride Junior, JH; Ono; E.K.M; et al.', "Kirkbride Junior, JH; Ono, E.K.M; et al."),
    ('Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB', "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB"),
    ('Silva; D.R.; Colvéquia; L.P.T', "Silva, D.R.; Colvéquia, L.P.T"),
    (['Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.'], "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R."),
    
    # for the strings below the replacement is computed by swapping every ';' for ','
    (['Yushun.; K.', 
      'Barbosa; M.G.',
      'Hällström; E.',
      'Bueno; S.B.'], 
            lambda x: x.replace(';',',')
    ),   
]

# add replaces to the Names Atomizer
na.addReplaces(rep)

# store the result in a new column in the dataframe
# (atomize applies the registered replacements by default before atomizing)
occs['recordedBy_2'] = na.atomize(occs['recordedBy'].astype(str))

In [8]:
# Display the same record again, now including the recordedBy_2 column
occs.iloc[123889].astype(str)

collectionCode                     UB
recordedBy              Barbosa; M.G.
scientificName       Sematophyllaceae
recordedBy_1      ['Barbosa', 'M.G.']
recordedBy_2        ['Barbosa, M.G.']
Name: 123889, dtype: object

## Writing a Names Atomizer replacement dict to JSON

In [9]:
# Dump the atomizer's current replaces dict to a json file (path relative to the working directory)
na.write_replaces('ub_collectors_replaces.json')

In [17]:
%%bash
# Print the contents of the replacements json file
cat ub_collectors_replaces.json

{
    "_replaces": {
        "Barbosa; M.G.": "Barbosa, M.G.",
        "Bueno; S.B.": "Bueno, S.B.",
        "Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB": "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB",
        "Hatschbach, G; M.": "Hatschbach, G; Hatschbach, M",
        "Hällström; E.": "Hällström, E.",
        "Irwin, HS; Souza, R; Santos; RR": "Irwin, HS; Souza, R; Santos, RR",
        "Kirkbride Junior, JH; Ono; E.K.M; et al.": "Kirkbride Junior, JH; Ono, E.K.M; et al.",
        "Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.": "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R.",
        "Silva; D.R.; Colvéquia; L.P.T": "Silva, D.R.; Colvéquia, L.P.T",
        "Sr. Air, Sr. Milton, Rodrigo": "Sr. Air; Sr. Milton; Rodrigo",
        "Sônia / Josefina": "Sônia; Josefina",
        "Yushun.; K.": "Yushun., K."
    }
}

## Loading a replacement dict from JSON

In [11]:
# A second atomizer, with one replacement registered by hand
na2 = NamesAtomizer(namesFromString)
na2.addReplaces([('a','correct_a')]) # this will be kept if update is set to True in read_replaces
# update defaults to True, so the entries read from the file are merged into the existing dict
# NOTE(review): this reads './rep.json' while the earlier cell wrote 'ub_collectors_replaces.json' - confirm the intended file
na2.read_replaces('./rep.json')

In [12]:
# Inspect the resulting replaces dict (a private attribute, read here only for display)
na2._replaces

{'Barbosa; M.G.': 'Barbosa, M.G.',
 'Bueno; S.B.': 'Bueno, S.B.',
 'Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB': 'Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB',
 'Hatschbach, G; M.': 'Hatschbach, G; Hatschbach, M',
 'Hällström; E.': 'Hällström, E.',
 'Irwin, HS; Souza, R; Santos; RR': 'Irwin, HS; Souza, R; Santos, RR',
 'Kirkbride Junior, JH; Ono; E.K.M; et al.': 'Kirkbride Junior, JH; Ono, E.K.M; et al.',
 'Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.': 'Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R.',
 'Silva; D.R.; Colvéquia; L.P.T': 'Silva, D.R.; Colvéquia, L.P.T',
 'Sr. Air, Sr. Milton, Rodrigo': 'Sr. Air; Sr. Milton; Rodrigo',
 'Sônia / Josefina': 'Sônia; Josefina',
 'Yushun.; K.': 'Yushun., K.',
 'a': 'correct_a'}

## Getting the cached result from the last atomizing operation

In [13]:
# Print the first 100 cached entries, one per line, as "<name> in <names string>: <count>"
print(''.join( u+" in "+v+": "+str(w)+'\n' for u,v,w in na.getCachedNames()[:100] ))

? in ?: 130
S in Martins, DS; Câmara, PEAS; Amorim, PRF; Costa, DP; Faria, JEQ; Carvalho, AM; Gonzaga, RMO; S: 80
O in Oliveira, RC; Moura, CO; Cardoso, AGT; Sonsin, J; Cordeiro, AOO; Million, JL; Antunes, LLC; O: 1
P in Sasaki, D; Pedroga, JA; Corrêa, TR; P; Piva, JH: 1
R in Farias, R; Carvalho, AM; Carvalho, JA; Fonsêca, LM; Proença, CEB; Potzernheim, ML; R: 1
. in Faria, JEQ; Carvalho-Silva, M; Câmara, PEAS; .; Soares, AER; Teixeira Júnior, AQ; Benedete: 1
F in F: 1
Nu in Faria, JEQ; Campos, LZO; Ibrahim, M; Martins, RC; Caires, CS; Meneguzzo, TEC; Souza, LF; Nu: 20
FO in FO: 1
Fl in Lucas, EJ; Mazine-Capelo, FF; Kollmann, L; Brummitt, NA; Campos, OR; Fl: 1
Si in Ratter, JA; Bridgewater, S; Cardoso, E; Lima, V; Munhoz, CBR; Oliveira, NR; Ribeiro, JF; Si: 1
Iri in Iri: 93
Ben in Faria, JEQ; Carvalho-Silva, M; Câmara, PEAS; Gama, R; Soares, AER; Teixeira Júnior, AQ; Ben: 68
Car in Faria, JEQ; Câmara, PEAS; Costa, DP; Martins, DS; Amorim, PRF; Sousa, RV; Gonzaga, RMO; Car: 44
Ana in Ba