# The Names Atomizer Class

In [1]:
import numpy as np
import pandas as pd

from modules.cleaning.names import namesFromString

## Loading the dataset

In [2]:
# Absolute path to the occurrence file on the author's machine; point it at your local copy.
dsetPath = '/home/pedro/datasets/ub_herbarium/occurrence.txt'

In [3]:
# Read the tab-separated occurrence file, loading only the three columns listed in `cols`.
cols = ['recordedBy', 'scientificName', 'collectionCode']
occs = pd.read_csv(dsetPath, sep='\t',usecols=cols)

## The NamesAtomizer class

In [4]:
import json
class NamesAtomizer:
    """
    Atomizes names strings, optionally replacing known-bad strings first.

    An instance holds a default atomizing operation (a callable applied to
    every element of a column) and a dict mapping names strings to the
    strings that must replace them before the operation is applied.
    """

    def __init__(self, atomizeOp, replaces=None):
        """
        The NamesAtomizer is built with an atomizing operation to be defined
        as the instance's default and an optional list with names to be replaced.
        Names to be replaced must be passed in a list of tuples, in any of the
        following ways:

        >> rep = [('n1', 'correct_n1'), ('n2', 'correct_n2')]
        or
        >> rep = [(['n1', 'n1_2'], 'correct_n1'), (['n2'], 'correct_n2')]
        or
        >> expr1 = lambda x: x.replace(';', '_')
        >> expr2 = lambda x: x.replace('&', '_')
        >> rep = [(['n1;1', 'n1;2'], expr1), (['n2&1', 'n2&2'], expr2)]

        Note that if you pass an expression as the second item of the tuple,
        this expression must evaluate to a string!

        Parameters
        ----------
        atomizeOp : callable
            Default operation applied to each names string by `atomize`.

        replaces : list of tuples, optional
            Initial replacement list, in any of the formats shown above.
        """
        self._replaces = self._buildReplaces(replaces)
        self._operation = atomizeOp

    def _buildReplaces(self, replacesList):
        """
        Builds a replaces dict from a list. The input list must contain tuples
        in which the first element is either a single names string or a
        list/tuple/set of names strings that must be replaced by the string in
        the tuple's second element. The second element can alternatively be a
        callable, in which case it is called with each source string and its
        result is used as the replacement.

        Returns
        -------
        dict
            Maps each source string to its replacement. Empty if
            `replacesList` is None.

        Raises
        ------
        ValueError
            If the first element of a tuple is neither a string nor a
            list, tuple or set.
        """
        res = {}

        if replacesList is None:
            return res

        for srcs, tgt in replacesList:
            # Normalize the single-string form so both forms share one code path.
            if isinstance(srcs, str):
                names = (srcs,)
            elif isinstance(srcs, (list, tuple, set)):
                names = srcs
            else:
                raise ValueError("Invalid value '{0}' in '({0},{1})'. Must be either string or iterable".format(srcs, tgt))

            for src in names:
                res[src] = tgt(src) if callable(tgt) else tgt

        return res

    def atomize(self, col, operation=None, withReplacing=True):
        """
        This method takes a column with names strings and atomizes them

        Parameters
        ----------
        col : pandas.Series
            A column containing names strings to be atomized.

        operation : function
            If an operation is passed in it is used to atomize the column instead
            of the instance's default operation.

        withReplacing : bool, default True
            If set to true some names replacing is performed before atomization

        Returns
        -------
        The result of applying the operation to each element of the
        (optionally replaced) column.
        """
        if operation is None:
            operation = self._operation

        # With no replaces registered the replacement is a no-op, so skip it.
        if withReplacing and self._replaces:
            col = col.replace(self._replaces)

        return col.apply(operation)

    def addReplaces(self, replacesList):
        """
        Adds replaces to the instance's replaces dict. The list uses the same
        format accepted by the constructor; entries whose source string is
        already registered are overwritten.
        """
        self._replaces.update(self._buildReplaces(replacesList))

    def write_replaces(self, filename):
        """
        Writes replaces to a json file
        """
        # ensure_ascii=False emits non-ASCII names verbatim, so the encoding is
        # pinned to UTF-8 rather than left to the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            d = {'_replaces': self._replaces}
            json.dump(d, f, sort_keys=True, indent=4, ensure_ascii=False)

    def read_replaces(self, filepath, update=True):
        """
        Reads replaces from a json file

        Parameters
        ----------
        filepath : str
            Path to a json file holding the replaces under the '_replaces' key,
            as written by `write_replaces`.

        update : bool, default True
            If true the loaded replaces are merged into the current ones;
            otherwise the current replaces are discarded and substituted.
        """
        # Same encoding as write_replaces so accented names round-trip.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if update:
            self._replaces.update(data['_replaces'])
        else:
            self._replaces = data['_replaces']

    def suggest(self):
        """
        Placeholder: not implemented yet, does nothing and returns None.
        """
        pass

## Creating a Names Atomizer

In [5]:
# create atomizer object
# namesFromString becomes the default atomizing operation; no replaces are registered yet
na = NamesAtomizer(namesFromString)

### Before the replacement list

In [6]:
# Atomize the collectors column with the default operation; the replaces dict is still empty here
occs['recordedBy_1'] = na.atomize(occs['recordedBy'].astype(str))
# Display the record at position 123889 to inspect how its names string was atomized
occs.iloc[123889].astype(str)

collectionCode                     UB
recordedBy              Barbosa; M.G.
scientificName       Sematophyllaceae
recordedBy_1      ['Barbosa', 'M.G.']
Name: 123889, dtype: object

### Using the replacement list

In [7]:
# define the replacement list
# Each tuple pairs the names string(s) as found in the dataset with the
# corrected string that should be atomized instead.
rep = [ 
    ('Sr. Air, Sr. Milton, Rodrigo', "Sr. Air; Sr. Milton; Rodrigo"),
    ('Sônia / Josefina', "Sônia; Josefina"),
    ('Hatschbach, G; M.', "Hatschbach, G; Hatschbach, M"),
    ('Irwin, HS; Souza, R; Santos; RR', "Irwin, HS; Souza, R; Santos, RR"),
    ('Kirkbride Junior, JH; Ono; E.K.M; et al.', "Kirkbride Junior, JH; Ono, E.K.M; et al."),
    ('Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB', "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB"),
    ('Silva; D.R.; Colvéquia; L.P.T', "Silva, D.R.; Colvéquia, L.P.T"),
    (['Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.'], "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R."),
    
    # For these strings the correction is computed: every ';' becomes ','
    (['Yushun.; K.', 
      'Barbosa; M.G.',
      'Hällström; E.',
      'Bueno; S.B.'], 
            lambda x: x.replace(';',',')
    ),   
]

# add replaces to the Names Atomizer
na.addReplaces(rep)

# store the result in a new column in the dataframe
occs['recordedBy_2'] = na.atomize(occs['recordedBy'].astype(str))

In [8]:
# Display the same record again to compare recordedBy_1 (no replaces) with recordedBy_2 (with replaces)
occs.iloc[123889].astype(str)

collectionCode                     UB
recordedBy              Barbosa; M.G.
scientificName       Sematophyllaceae
recordedBy_1      ['Barbosa', 'M.G.']
recordedBy_2        ['Barbosa, M.G.']
Name: 123889, dtype: object

## Writing a Names Atomizer replacement dict to json

In [9]:
# Dump the atomizer's current replaces dict to 'rep.json' (relative path)
na.write_replaces('rep.json')

In [10]:
%%bash
# Print the file written by na.write_replaces('rep.json') in the previous cell
cat ./rep.json

{
    "_replaces": {
        "Barbosa; M.G.": "Barbosa, M.G.",
        "Bueno; S.B.": "Bueno, S.B.",
        "Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB": "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB",
        "Hatschbach, G; M.": "Hatschbach, G; Hatschbach, M",
        "Hällström; E.": "Hällström, E.",
        "Irwin, HS; Souza, R; Santos; RR": "Irwin, HS; Souza, R; Santos, RR",
        "Kirkbride Junior, JH; Ono; E.K.M; et al.": "Kirkbride Junior, JH; Ono, E.K.M; et al.",
        "Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.": "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R.",
        "Silva; D.R.; Colvéquia; L.P.T": "Silva, D.R.; Colvéquia, L.P.T",
        "Sr. Air, Sr. Milton, Rodrigo": "Sr. Air; Sr. Milton; Rodrigo",
        "Sônia / Josefina": "Sônia; Josefina",
        "Yushun.; K.": "Yushun., K."
    }
}

## Loading a replacement dict from json

In [13]:
# Build a second atomizer, register one replace by hand, then merge in the ones stored in rep.json
na2 = NamesAtomizer(namesFromString)
na2.addReplaces([('a','correct_a')]) # this will be kept if update is set to True in read_replaces
na2.read_replaces('./rep.json')

In [14]:
# Display the replaces dict held by na2 after loading from json
na2._replaces

{'Barbosa; M.G.': 'Barbosa, M.G.',
 'Bueno; S.B.': 'Bueno, S.B.',
 'Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB': 'Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB',
 'Hatschbach, G; M.': 'Hatschbach, G; Hatschbach, M',
 'Hällström; E.': 'Hällström, E.',
 'Irwin, HS; Souza, R; Santos; RR': 'Irwin, HS; Souza, R; Santos, RR',
 'Kirkbride Junior, JH; Ono; E.K.M; et al.': 'Kirkbride Junior, JH; Ono, E.K.M; et al.',
 'Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.': 'Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R.',
 'Silva; D.R.; Colvéquia; L.P.T': 'Silva, D.R.; Colvéquia, L.P.T',
 'Sr. Air, Sr. Milton, Rodrigo': 'Sr. Air; Sr. Milton; Rodrigo',
 'Sônia / Josefina': 'Sônia; Josefina',
 'Yushun.; K.': 'Yushun., K.'}