# The NamesMap class

In [1]:
import json
from collections import Counter
from copy import deepcopy
from warnings import warn

class NamesMap:
    """
    Maps name primitives to normalized (and optionally remapped) names.

    Primitives are the "original names", as they are given as input to the
    class constructor. When the class is instantiated each primitive is mapped
    to its normalized form through a normalization function. Normalized names
    can then be redirected to other names through a remapping index: a dict
    ``{source: target}`` whose entries may chain (``a -> b -> c``).
    """

    def __init__(self, names, normalizationFunc, remappingIndex=None, *args, **kwargs):
        """
        The NamesMap constructor

        Parameters
        ----------
        names : list or None
            A list with names to be normalized. ``None`` is treated as an
            empty list. Ignored when a ``_map_prim_norm`` keyword is given.

        normalizationFunc : function
            A function or expression for names normalization

        remappingIndex : dict
            A dictionary with mapped names to initialize the instance's
            remapping index. The dictionary is copied, so later calls to
            `remap` never modify the caller's object.

        Other Parameters
        ----------------
        _map_prim_norm : dict
            A ready-made primitive-to-normalized map (used when loading from
            json). When given, `names` is not normalized again.

        _remappingIndex : dict
            A ready-made remapping index (used when loading from json). When
            given it takes precedence over `remappingIndex`.
        """
        self._normalizationFunc = normalizationFunc

        load_map_prim_norm = kwargs.get('_map_prim_norm', None)
        load_remappingIndex = kwargs.get('_remappingIndex', None)

        if load_map_prim_norm is not None:
            self._map_prim_norm = load_map_prim_norm
        else:
            source_names = () if names is None else names
            self._map_prim_norm = {n: normalizationFunc(n) for n in source_names}

        index = remappingIndex if load_remappingIndex is None else load_remappingIndex
        # Copy so that remap()/setEndpoint() never mutate a dict owned by the caller.
        # The index stays None (not {}) when nothing was supplied, which keeps the
        # non-flattened json output unchanged.
        self._remappingIndex = None if index is None else dict(index)


    def _getRef(self, n):
        """
        Follows all chained references for a name in the remapping index

        Parameters
        ----------
        n : str
            The name to be de-referenced

        Returns
        -------
        The last name of the chain starting at `n`; `n` itself when it is not
        remapped (or when there is no remapping index yet).

        Raises
        ------
        RuntimeError
            If the chain loops back onto itself. The exception args are
            ``("Loopback detected", start, chain)``.
        """
        index = self._remappingIndex
        if index is None:
            return n

        start = n
        chain = []
        while n in index:
            chain.append(n)
            n = index[n]
            if n in chain:
                # close the chain with the repeated name so the loop is visible
                chain.append(n)
                raise RuntimeError("Loopback detected", start, chain)
        return n


    def _get_loopback_inconsistencies(self):
        """
        Detects loopbacks in mapping chains

        Returns
        -------
        ``None`` when no chain loops; otherwise a dict with the parallel lists
        ``'mes'``, ``'key'`` and ``'chain'`` (one entry per looping key).
        """
        if not self._remappingIndex:
            return None

        messages, keys, chains = [], [], []
        for k in self._remappingIndex:
            try:
                self._getRef(k)
            except RuntimeError as e:
                mes, key, chain = e.args
                messages.append(mes)
                keys.append(key)
                chains.append(chain)

        if not keys:
            return None
        return {'mes': messages, 'key': keys, 'chain': chains}


    def _remove_selfloops(self):
        """
        Removes from the remapping index every entry which maps a name to itself.
        """
        index = self._remappingIndex
        if index is None:
            return
        # collect first: a dict cannot be resized while it is being iterated
        for k in [k for k, v in index.items() if k == v]:
            del index[k]


    def getInconsistencies(self, prettyPrint=True):
        """
        Reports the inconsistencies found in the remapping index.

        Parameters
        ----------
        prettyPrint : bool, default True
            If set to True the report is returned as a formatted string.
            Otherwise a dict ``{'loopback_inconsistencies': {...}}`` is returned.

        Returns
        -------
        ``None`` if there are no inconsistencies, else the report.
        """
        loopbacks = self._get_loopback_inconsistencies()
        report = {'loopback_inconsistencies': loopbacks}

        if all(v is None for v in report.values()):
            return None
        if not prettyPrint:
            return report

        parts = ["INCONSISTENCIES\n===============\n"]
        if loopbacks is not None:
            parts.append("Loopback Inconsistencies\n")
            rows = zip(loopbacks['mes'], loopbacks['key'], loopbacks['chain'])
            parts.extend("  > {}: Starting from key '{}' got chain {}\n".format(*row) for row in rows)
            parts.append('---------------')
        return ''.join(parts)


    def getMap(self, remap=True):
        """
        Returns a COPY of the names map.

        Parameters
        ----------
        remap : bool, default True
            If set to True, the names map is built by first de-referencing
            remaps in the remapping index. Otherwise all remaps will not
            be considered for building the names map.

        Raises
        ------
        RuntimeError
            If `remap` is True and a remapping chain loops back onto itself.
        """
        res = deepcopy(self._map_prim_norm)
        if remap and self._remappingIndex is not None:
            for s, t in self._map_prim_norm.items():
                res[s] = self._getRef(t)
        return res


    def remap(self, remaps, fromScratch=False):
        """
        Updates the remapping dictionary using a list of tuples as input.

        Parameters
        ----------
        remaps : iterable of tuples
            Remaps values from tuples (s,t), where a normalized name s remaps to a
            normalized name t.

        fromScratch : bool
            If set to True the remapping dict becomes the one passed in. All other previous
            remaps are discarded.

        Returns
        -------
        The pretty-printed inconsistencies report (see `getInconsistencies`), or
        ``None`` if the updated index is consistent.

        Note
        ----
        If the list of tuples passed in contains duplicated keys a warning is issued, and the
        latest (key,value) pair is the one which will persist.
        """
        # `remaps` is walked twice below; materialize it so one-shot iterables work too
        remaps = list(remaps)

        # check for duplicated keys
        duplicatedKeys = [s for s, cnts in Counter(s for s, _ in remaps).items() if cnts > 1]
        if duplicatedKeys:
            warn("Some keys from input are duplicated: {}.".format(str(duplicatedKeys)))

        # update remapping index
        if fromScratch or self._remappingIndex is None:
            self._remappingIndex = {}

        for s, t in remaps:
            self._remappingIndex[s] = t

        self._remove_selfloops()
        return self.getInconsistencies()


    def setEndpoint(self, key):
        """
        This method sets a name as the endpoint of a chain. This method is used for resolving loopbacks
        in the remapping chain. The name is set to be the latest reference in the chain, and therefore
        does not map to any other name.

        Parameter
        ---------
        key : str
            The name to be set as the latest reference.

        Returns
        -------
        The name `key` was remapped to before the call.

        Raises
        ------
        KeyError
            If `key` is not a source name in the remapping index.
        """
        if self._remappingIndex is None:
            raise KeyError(key)
        return self._remappingIndex.pop(key)


    def write_toJson(self, filename, flatten=False):
        """
        Creates a json file to store a NamesMap's primitive-to-normalized names map and remapping index.
        Data is stored as json object arguments `_map_prim_norm` and `_remappingIndex`. The file is
        always written as UTF-8.

        Parameters
        ----------
        filename : str
            Path to the file to be created.

        flatten : bool, default False
            If set to true, all remappings are consolidated into the `_map_prim_norm` map. In other words,
            the remapping index is used to assign each name primitive to its final reference. All references
            are then removed from the remapping index.

        Raises
        ------
        RuntimeError
            If `flatten` is True and a remapping chain loops back onto itself.
        """
        # Build the payload before opening the file, so a loopback error does not
        # leave an empty file behind.
        if flatten:
            json_dict = {'_map_prim_norm': self.getMap(), '_remappingIndex': {}}
        else:
            json_dict = {'_map_prim_norm': self._map_prim_norm,
                         '_remappingIndex': self._remappingIndex}

        # ensure_ascii=False emits raw non-ASCII characters, so the encoding must be
        # pinned instead of depending on the platform default.
        with open(filename, 'w', encoding='utf-8') as output_file:
            json.dump(json_dict, output_file, sort_keys=True, indent=4, ensure_ascii=False)



def read_NamesMap_fromJson(filepath, normalizationFunc=None):
    """
    Creates a NamesMap instance from a json file containing both a primitive-to-normalized names map
    and a remapping index. The json object must have both attributes `_map_prim_norm` and
    `_remappingIndex`, which store the data used to instantiate the NamesMap class.

    Parameters
    ----------
    filepath : str
        Path to the json file containing the map

    normalizationFunc : function
        A normalization function to be passed to the NamesMap constructor. If it is
        not set a warning is issued, as the NamesMap will not be assigned to any
        normalization rule.

    Raises
    ------
    KeyError
        If the json object lacks `_map_prim_norm` or `_remappingIndex`.
    """
    if normalizationFunc is None:
        warn("A names map was created without a normalization function!")

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            d = json.load(f)
    except UnicodeDecodeError:
        # Files written before the encoding was pinned used the platform default.
        with open(filepath, 'r') as f:
            d = json.load(f)

    return NamesMap(names=None, normalizationFunc=normalizationFunc,
                    _map_prim_norm=d['_map_prim_norm'],
                    _remappingIndex=d['_remappingIndex'])
    
    

---

## Examples

### Creating a NamesMap instance

First define a list with names and a normalization function

In [2]:
# list with names
names = ['F.G. Alis', 'F. Alis', 
         'Mendonza', 'M.C. Mendonza', 'M.Mendonça', 'P.M.C. Mendonça',
         'J.C. Junior', 'C. Júnior', 'C. Junio','J.Junio', 'J.Junior', 
         'Clara', 
         'Pedro']

# normalization function
import unicodedata, string
from modules.cleaning.names import namesFromString
def normalize(name):
    """
    Builds the normalized form ``<initials>_<rest>`` of a name.

    The name is lower-cased and split on dots. Single-character parts are
    taken as initials and concatenated in order; the remaining parts are
    joined with '-', tokenized by `namesFromString` (delimiters ' ' and '-'),
    reduced to ASCII letters and joined with '-' again.
    """
    def ascii_letters_only(text):
        # NFKD separates base letters from their combining marks, so keeping
        # only ASCII letters drops the accents (and any other character)
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(ch for ch in decomposed if ch in string.ascii_letters)

    # dot-separated parts, trimmed, with empty parts discarded
    parts = []
    for chunk in name.lower().split('.'):
        chunk = chunk.strip()
        if chunk != '':
            parts.append(chunk)

    initials = []
    longer_parts = []
    for part in parts:
        if len(part) == 1:
            initials.append(part)
        else:
            longer_parts.append(part)

    tokens = namesFromString('-'.join(longer_parts), delim=[' ', '-'])
    cleaned = '-'.join(ascii_letters_only(tok) for tok in tokens)

    return "{}_{}".format(''.join(initials), cleaned)

Now create a `NamesMap` instance

In [3]:
nm = NamesMap(names,normalize)
nm.getMap()

{'C. Junio': 'c_junio',
 'C. Júnior': 'c_junior',
 'Clara': '_clara',
 'F. Alis': 'f_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'j_junio',
 'J.Junior': 'j_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

### Remapping

We can pass names to be remapped as tuples (s,t), where s is remapped to t. Notice that we only specify **normalized names** to be remapped.

In [4]:
nm.remap([ ('f_alis','fg_alis'), ('j_junio', 'j_junior'), ('c_junio','c_junior') ])
nm._remappingIndex

{'c_junio': 'c_junior', 'f_alis': 'fg_alis', 'j_junio': 'j_junior'}

And now we can check the map and verify that the name primitive `'F. Alis'` (normalized as `'f_alis'`) is now mapped to `'fg_alis'`, the same name as the primitive `'F.G. Alis'`. Notice that remaps are always specified between **normalized names**; each **name primitive** is then resolved under the hood by following the remapping index, starting from its normalized name.

In [5]:
nm.getMap()

{'C. Junio': 'c_junior',
 'C. Júnior': 'c_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'j_junior',
 'J.Junior': 'j_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

Let's remap one more name. Notice that this time two names mapped to `'c_junior'`, and therefore there will be two updates.

In [6]:
nm.remap([('c_junior','jc_junior')])

In [7]:
nm.getMap()

{'C. Junio': 'jc_junior',
 'C. Júnior': 'jc_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'j_junior',
 'J.Junior': 'j_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

What if we wanted to rename a normalized name to a non-existent name, for example `'ne'`? For that we can simply remap to the new desired name:

In [8]:
nm.remap([('j_junior','ne')])

In [9]:
nm.getMap()

{'C. Junio': 'jc_junior',
 'C. Júnior': 'jc_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'ne',
 'J.Junior': 'ne',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

Now let's fix this and remap those names mismapped to `'ne'` to their correct version (`'jc_junior'`): 

In [10]:
nm.remap([('ne','jc_junior')])

In [11]:
nm.getMap()

{'C. Junio': 'jc_junior',
 'C. Júnior': 'jc_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'jc_junior',
 'J.Junior': 'jc_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

If we update the remapping dict using repeated keys in the same operation, a warning is issued. Only the latest mapping using that key persists.

In [12]:
nm.remap([('_mendonza','mc_mendonza'),('_mendonza', 'm_mendonca')])



In [13]:
nm._remappingIndex

{'_mendonza': 'm_mendonca',
 'c_junio': 'c_junior',
 'c_junior': 'jc_junior',
 'f_alis': 'fg_alis',
 'j_junio': 'j_junior',
 'j_junior': 'ne',
 'ne': 'jc_junior'}

### Inconsistencies

#### Loopbacks

The simplest possible loopback is the case where `n1` remaps to `n2` and `n2` remaps to `n1`. If loopbacks are formed after a remapping operation they are reported to the user:

In [14]:
print(nm.remap([('j_junior','c_junior'),('c_junior','j_junior')]))

INCONSISTENCIES
Loopback Inconsistencies
  > Loopback detected: Starting from key 'j_junio' got chain ['j_junio', 'j_junior', 'c_junior', 'j_junior']
  > Loopback detected: Starting from key 'c_junio' got chain ['c_junio', 'c_junior', 'j_junior', 'c_junior']
  > Loopback detected: Starting from key 'c_junior' got chain ['c_junior', 'j_junior', 'c_junior']
  > Loopback detected: Starting from key 'j_junior' got chain ['j_junior', 'c_junior', 'j_junior']
---------------


An exception is thrown if we call the `getMap()` method while there's a loopback inconsistency:

In [15]:
try:
    nm.getMap()
except RuntimeError as e:
    print(e.args[0], ": key {} in {}".format(e.args[1],e.args[2]))

Loopback detected : key c_junior in ['c_junior', 'j_junior', 'c_junior']


and we must resolve this issue by setting one of the names as the endpoint of the loopback chain:

In [16]:
nm.setEndpoint('j_junior')
print(nm.getInconsistencies())

None


And now we can retrieve the map:

In [17]:
nm.getMap()

{'C. Junio': 'j_junior',
 'C. Júnior': 'j_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'j_junior',
 'J.Junior': 'j_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': 'm_mendonca',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

There may be **transitive loopbacks**, in which the looping chain is composed of more than two elements. Consider, for example, remapping `'c_junior' -> 'j_junior' -> 'jc_junior' -> 'c_junior'`

In [18]:
print(nm.remap([('c_junior','j_junior'),('j_junior', 'jc_junior'),('jc_junior', 'c_junior')]))

INCONSISTENCIES
Loopback Inconsistencies
  > Loopback detected: Starting from key 'j_junio' got chain ['j_junio', 'j_junior', 'jc_junior', 'c_junior', 'j_junior']
  > Loopback detected: Starting from key 'c_junio' got chain ['c_junio', 'c_junior', 'j_junior', 'jc_junior', 'c_junior']
  > Loopback detected: Starting from key 'c_junior' got chain ['c_junior', 'j_junior', 'jc_junior', 'c_junior']
  > Loopback detected: Starting from key 'ne' got chain ['ne', 'jc_junior', 'c_junior', 'j_junior', 'jc_junior']
  > Loopback detected: Starting from key 'j_junior' got chain ['j_junior', 'jc_junior', 'c_junior', 'j_junior']
  > Loopback detected: Starting from key 'jc_junior' got chain ['jc_junior', 'c_junior', 'j_junior', 'jc_junior']
---------------


#### Inconsistencies to a dataframe

From an inconsistencies report the user can also build a data frame, for cleaner visualization:

In [19]:
import pandas as pd

loopback_incs = nm.getInconsistencies(prettyPrint=False)['loopback_inconsistencies']
pd.DataFrame(loopback_incs)

Unnamed: 0,chain,key,mes
0,"[j_junio, j_junior, jc_junior, c_junior, j_jun...",j_junio,Loopback detected
1,"[c_junio, c_junior, j_junior, jc_junior, c_jun...",c_junio,Loopback detected
2,"[c_junior, j_junior, jc_junior, c_junior]",c_junior,Loopback detected
3,"[ne, jc_junior, c_junior, j_junior, jc_junior]",ne,Loopback detected
4,"[j_junior, jc_junior, c_junior, j_junior]",j_junior,Loopback detected
5,"[jc_junior, c_junior, j_junior, jc_junior]",jc_junior,Loopback detected


Let's now set `'jc_junior'` as the ending point and verify that the inconsistency is resolved.

In [20]:
nm.setEndpoint('jc_junior')
print(nm.getInconsistencies())

None


In [21]:
nm.getMap()

{'C. Junio': 'jc_junior',
 'C. Júnior': 'jc_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'jc_junior',
 'J.Junior': 'jc_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': 'm_mendonca',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

---

### Writing names map to JSON

We can save a names map in a json file

In [22]:
nm.write_toJson('testNamesMap.json')

In [23]:
%%bash 
cat testNamesMap.json

{
    "_map_prim_norm": {
        "C. Junio": "c_junio",
        "C. Júnior": "c_junior",
        "Clara": "_clara",
        "F. Alis": "f_alis",
        "F.G. Alis": "fg_alis",
        "J.C. Junior": "jc_junior",
        "J.Junio": "j_junio",
        "J.Junior": "j_junior",
        "M.C. Mendonza": "mc_mendonza",
        "M.Mendonça": "m_mendonca",
        "Mendonza": "_mendonza",
        "P.M.C. Mendonça": "pmc_mendonca",
        "Pedro": "_pedro"
    },
    "_remappingIndex": {
        "_mendonza": "m_mendonca",
        "c_junio": "c_junior",
        "c_junior": "j_junior",
        "f_alis": "fg_alis",
        "j_junio": "j_junior",
        "j_junior": "jc_junior",
        "ne": "jc_junior"
    }
}

In [24]:
nm.write_toJson('testNamesMap_flattened.json', flatten=True)

In [25]:
%%bash
cat testNamesMap_flattened.json

{
    "_map_prim_norm": {
        "C. Junio": "jc_junior",
        "C. Júnior": "jc_junior",
        "Clara": "_clara",
        "F. Alis": "fg_alis",
        "F.G. Alis": "fg_alis",
        "J.C. Junior": "jc_junior",
        "J.Junio": "jc_junior",
        "J.Junior": "jc_junior",
        "M.C. Mendonza": "mc_mendonza",
        "M.Mendonça": "m_mendonca",
        "Mendonza": "m_mendonca",
        "P.M.C. Mendonça": "pmc_mendonca",
        "Pedro": "_pedro"
    },
    "_remappingIndex": {}
}

---

### Reading names map from JSON

We can also create a names map instance by reading a JSON file. Below I will read both the flattened and non-flattened names maps.

In [26]:
nm_r = read_NamesMap_fromJson('./testNamesMap.json')
nm_rf = read_NamesMap_fromJson('./testNamesMap_flattened.json')



Both instances output equal names maps:

In [27]:
nm_r.getMap()==nm_rf.getMap()

True

However, here are the attributes for each instance:

##### Non-flattened names map

In [28]:
nm_r._map_prim_norm

{'C. Junio': 'c_junio',
 'C. Júnior': 'c_junior',
 'Clara': '_clara',
 'F. Alis': 'f_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'j_junio',
 'J.Junior': 'j_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': '_mendonza',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

In [29]:
nm_r._remappingIndex

{'_mendonza': 'm_mendonca',
 'c_junio': 'c_junior',
 'c_junior': 'j_junior',
 'f_alis': 'fg_alis',
 'j_junio': 'j_junior',
 'j_junior': 'jc_junior',
 'ne': 'jc_junior'}

##### Flattened names map

In [30]:
nm_rf._map_prim_norm

{'C. Junio': 'jc_junior',
 'C. Júnior': 'jc_junior',
 'Clara': '_clara',
 'F. Alis': 'fg_alis',
 'F.G. Alis': 'fg_alis',
 'J.C. Junior': 'jc_junior',
 'J.Junio': 'jc_junior',
 'J.Junior': 'jc_junior',
 'M.C. Mendonza': 'mc_mendonza',
 'M.Mendonça': 'm_mendonca',
 'Mendonza': 'm_mendonca',
 'P.M.C. Mendonça': 'pmc_mendonca',
 'Pedro': '_pedro'}

In [31]:
nm_rf._remappingIndex

{}

---

In [32]:
%%bash
rm ./testNamesMap.json ./testNamesMap_flattened.json