In [79]:
import numpy as np
import pandas as pd
import networkx as nx

import matplotlib.pyplot as plt

import modules.names_cleaning as nc

%matplotlib inline

In [80]:
dsetPath = '/home/pedro/datasets/ub_herbarium/occurrence.txt'

cols = [ 'recordedBy','scientificName','taxonRank', 'collectionCode']
occs = pd.read_csv(dsetPath, sep='\t', usecols=cols)
occs=occs[occs['recordedBy'].notnull()]

In [81]:
occs.head()

Unnamed: 0,collectionCode,recordedBy,scientificName,taxonRank
0,UB,"Irwin, HS",Annona monticola Mart.,SPECIES
1,UB,"Ratter, JA; et al.",Myracrodruon urundeuva Allem.,SPECIES
2,UB,"Heringer, EP",Myracrodruon urundeuva Allem.,SPECIES
3,UB,"Coelho, JP",Myracrodruon urundeuva Allem.,SPECIES
4,UB,"Eiten, G; Eiten, LT",Myracrodruon urundeuva Allem.,SPECIES


## Atomizing names

In [82]:
atomizingFunction = lambda x: nc.namesFromString(x,delim=';') 
atomized_recordedBy = nc.atomizeNames(occs['recordedBy'],operation=atomizingFunction)
occs['recordedBy_atomized'] = atomized_recordedBy

In [83]:
from collections import Counter

def getNamesList( col, with_counts=False, orderBy=None ):
    """
    Gets a list of names from an atomized names column.
    
    ordered: default = None
        alphabetic
        counts
    
    with_counts
    """
    
    if orderBy not in [ None, "alphabetic", "counts"]:
        raise ValueError("Invalid argument for 'orderBy': {}".format(orderBy))
    
    if with_counts or orderBy=="counts":
        l = [ (n,c) for (n,c) in Counter( n for nlst in col for n in nlst ).items() ] 
        if orderBy=="alphabetic":
            return sorted( l, key=lambda x: x[0] )
        elif orderBy=="counts":
            sorted_l = sorted( l, key=lambda x: x[1], reverse=True )
            if with_counts:
                return sorted_l
            else:
                return [ n for (n,c) in sorted_l ]
        else:
            return l
                
    else:
        if orderBy=="alphabetic":
            return sorted(list(set( n for nlst in col for n in nlst )))
        else:
            return list(set( n for nlst in col for n in nlst ))

In [84]:
def getRecordsBy(df, colName, name):
    return df.loc[df[colName].apply( lambda x: name in x )]

In [85]:
def atomizeNames( col, operation=None, replaces=None ):
    """
    Applies an atomization operation on a names column, which must be a pandas Series object. 
    The atomized names at each row are stored as a list.
    
    Parameters
    ----------
    col : pd.Series
        Names column to be atomized.
        
    operation : function
        The atomization operation to be applied to the names column
        
    replaces:
        A list of 2-tuples (srclst, tgt), where srclst is a list of names to be replaced by tgt.
        The element tgt can be either a string or a function which results in a string.
        
    Returns
    -------
        A pandas Series with lists of atomized names.
    """
    replacesDict = dict( (src, tgt(src)) if callable(tgt) else (src,tgt) for (srclst, tgt) in replaces for src in srclst )
    col = col.replace( replacesDict )
    col_atomized = col.apply( operation )
    return col_atomized

In [86]:
rep = [ 
    (['Sr. Air, Sr. Milton, Rodrigo'], "Sr. Air; Sr. Milton; Rodrigo"),
    (['Sônia / Josefina'], "Sônia; Josefina"),
    (['Hatschbach, G; M.'], "Hatschbach, G; Hatschbach, M"),
    (['Hällström; E.'], "Hällström, E"),
    (['Irwin, HS; Souza, R; Santos; RR'], "Irwin, HS; Souza, R; Santos, RR"),
    (['Kirkbride Junior, JH; Ono; E.K.M; et al.'], "Kirkbride Junior, JH; Ono, E.K.M; et al."),

    
    (['Carboni, M; Faraco, AG; Soares; P.G.; Sampaio, D; Breier, TB'], "Carboni, M; Faraco, AG; Soares, P.G.; Sampaio, D; Breier, TB"),
    (['Silva; D.R.; Colvéquia; L.P.T'], "Silva, D.R.; Colvéquia, L.P.T"),
    (['Quintiliano; F.J.; Colvéquia; L.P.T; Silva; D.R.'], "Quintiliano, F.J.; Colvéquia, L.P.T; Silva, D.R."),
    
    (['Yushun.; K.', 
      'Barbosa; M.G.'], 
            lambda x: x.replace(';',',')
    ),   
]

atomizingFunction = lambda x: nc.namesFromString(x,delim=';') 
atomized_recordedBy = atomizeNames(occs['recordedBy'],operation=atomizingFunction, replaces=rep)
occs['recordedBy_atomized'] = atomized_recordedBy
occs_collaborative = occs[occs['recordedBy_atomized'].apply(lambda x: len(x)>1)]

In [87]:
occs_collaborative.head()

Unnamed: 0,collectionCode,recordedBy,scientificName,taxonRank,recordedBy_atomized
1,UB,"Ratter, JA; et al.",Myracrodruon urundeuva Allem.,SPECIES,"[Ratter, JA, et al.]"
4,UB,"Eiten, G; Eiten, LT",Myracrodruon urundeuva Allem.,SPECIES,"[Eiten, G, Eiten, LT]"
7,UB,"Proença, CEB; Oliveira, RS; Pinto, AS",Ichthyothere Mart.,GENUS,"[Proença, CEB, Oliveira, RS, Pinto, AS]"
9,UB,"Gomes, BM; Brito, DS; Mendes, VC",Bromelia exigua Mez,SPECIES,"[Gomes, BM, Brito, DS, Mendes, VC]"
11,UB,"Montgomery, F; Blake, D",Nyssa sylvatica Marshall,SPECIES,"[Montgomery, F, Blake, D]"


## Building names map

In [88]:
nl = getNamesList(occs['recordedBy_atomized'], orderBy="alphabetic")
nm = nc.NamesMap(nl, nc.normalize)

In [89]:
nm.getMap()

{'.': '',
 '1980 Sino-Amer Exped.': 'sinoamerexped',
 '?': '',
 'A.J.N.V.': 'ajnv',
 'A.M.': 'am',
 'Abbas, B': 'abbas,b',
 'Abdala, GC': 'abdala,gc',
 'Abdo, MSA': 'abdo,msa',
 'Abdon': 'abdon',
 'Abe, LB': 'abe,lb',
 'Abe, LM': 'abe,lm',
 'Abrahim, MA': 'abrahim,ma',
 'Abreu, CG': 'abreu,cg',
 'Abreu, GX': 'abreu,gx',
 'Abreu, I': 'abreu,i',
 'Abreu, LC': 'abreu,lc',
 'Abreu, LCR': 'abreu,lcr',
 'Abreu, M': 'abreu,m',
 'Abreu, MC': 'abreu,mc',
 'Abreu, MS': 'abreu,ms',
 'Abreu, NL': 'abreu,nl',
 'Abreu, NR': 'abreu,nr',
 'Abreu, TLS': 'abreu,tls',
 'Accioly': 'accioly',
 'Accorsi, WR': 'accorsi,wr',
 'Acera, V': 'acera,v',
 'Acevedo-Rodriguez, P': 'acevedorodriguez,p',
 'Acuña, BC': 'acuna,bc',
 'Adams, WP': 'adams,wp',
 'Adcock, S': 'adcock,s',
 'Adderley, LS': 'adderley,ls',
 'Addison, G': 'addison,g',
 'Addor, EE': 'addor,ee',
 'Adler': 'adler',
 'Adorno, H': 'adorno,h',
 'Adão': 'adao',
 'Aedo, C': 'aedo,c',
 'Affonso, P': 'affonso,p',
 'Afonso, AA': 'afonso,aa',
 'Afonso, AP': '

In [90]:
remap = {
    #'proenca,c': 'proenca,ceb',
    
    'abreu,lc': 'abreu,lcr',
    'abreu,m': 'abreu,mc',
    'abreu,ms':'abreu,mc',
    'aguiar,ac': 'aguiar,aca',
    'allem,a':'allem,ac',
    
    }

nm.remap(remap)

## Names index

In [91]:
ni = nc.getNamesIndexes(occs, 'recordedBy_atomized',namesMap=nm.getMap())

## Assembling the network

In [92]:
import networkx
import itertools
from collections import Counter

class CoworkingNetwork(networkx.Graph):
    """
    Class for coworking networks. Extends networkx Graph class.
    
    Parameters
    ----------
    namesSets : iterable
        An iterable of iterables containing names used to compose cliques 
        in the network.

    weighted : bool, default False
        If set to True the resulting network will have weighted edges
        
    namesMap : NamesMap
        A NamesMap object for normalizing nodes names.
        
    Examples
    --------
    >>> namesSets = [ ['a','b','c'], ['d','e'], ['a','c'] ]
    >>> CoworkingNetwork( namesSets, weighted=True).edges(data=True)
    [('b', 'a', {'weight': 1}),
     ('b', 'c', {'weight': 1}),
     ('a', 'c', {'weight': 2}),
     ('e', 'd', {'weight': 1})]
    
    >>> CoworkingNetwork( namesSets ).edges(data=True)
    [('b', 'a', {}), 
     ('b', 'c', {}), 
     ('a', 'c', {}), 
     ('e', 'd', {})]
    """
    def __init__(self, namesSets, weighted=False, namesMap=None):
        super().__init__()
        
        if namesMap:
            nmap = namesMap.getMap()
            namesSets = [ [ nmap[n] for n in nset ] for nset in namesSets ]
            
        cliques = map( lambda n: itertools.combinations(n,r=2), namesSets )
        edges = [ e for edges in cliques for e in edges ]
        self.add_edges_from(edges)
        
        if weighted:
            edges_weights = Counter(edges)

            for (u,v),w in edges_weights.items():
                try:
                    self[u][v]['weight'] += w
                except:
                    self[u][v]['weight'] = w
        
        return
    
        

In [452]:
m = nm.getMap()
ni = nc.getNamesIndexes(occs, 'recordedBy_atomized',namesMap=nm.getMap())
G = CoworkingNetwork( occs_collaborative['recordedBy_atomized'], weighted=True, namesMap=nm )
G.remove_node('etal')
nx.write_gexf(G, './graph.gexf')

In [453]:
nx.set_node_attributes(G, 'n_records', dict( (n, len(ni[n])) for n in G.nodes() ))

# Next:

To find potential names variations:
1. Find close matches for the names of important nodes;
2. For each close match, calculate the similarity of pairs of nodes in terms of their neighbors (cosine similarity?);

$$sim(n_h, n_l) = \frac{1}{k_l}  \sum_{i=1}^{k_l} g(v_i) , \quad 
g(x) = \begin{cases} 1 \quad x \in S_h \\ 0 \quad x \notin S_h \end{cases}, \qquad \textit{where}$$

* $n_h$ is the higher-degree node;
* $n_l$ is the lower-degree node;
* $k_h$ is the degree of node $n_h$;
* $k_l$ is the degree of node $n_l$;
* $v_i$ are neighbors of node $n_l$;
* $S_h$ is the set of neighbors of node $n_h$.

In [200]:
def neighborhoodSimilarity( G, n1, n2 ):
    if len(G[n1]) > len(G[n2]):
        n_h,n_l = n1,n2
    else:
        n_h,n_l = n2,n1
    
    S = G.neighbors(n_h)
    k_h = G.degree(n_h)
    k_l = G.degree(n_l)
    if k_l > 0:
        return sum( 1 if v in S else 0 for v in G.neighbors(n_l) )/k_l
    else: 
        return 0
    
neighborhoodSimilarity(G, 'proenca,ceb', 'proenca,c')

0.8260869565217391

In [479]:
names = [ n for n,d in sorted( G.nodes(data=True), key=lambda x: x[1]['n_records'], reverse=True ) ][:1000]

l=[]
similarity_threshold=0.1
for n1 in names:
    for n2 in [ n2 for n2 in dfl.get_close_matches(n1, names) if n2!=n1 ]:
        sim = neighborhoodSimilarity(G,n1,n2)
        ordered_nodes = sorted([ n1, n2 ])
        if sim >= similarity_threshold:
            l += [ (ordered_nodes[0],ordered_nodes[1],sim) ]
l

[('faria,jeq', 'farias,r', 0.16666666666666666),
 ('heringer,ep', 'krieger,pl', 0.125),
 ('concha,c', 'proenca,ceb', 0.6),
 ('souza,e', 'souza,rr', 0.375),
 ('souza,mc', 'souza,mgm', 0.14285714285714285),
 ('eiten,g', 'eiten,lt', 0.7857142857142857),
 ('zanatta,gv', 'zanatta,mrv', 0.5),
 ('carvalho,vm', 'carvalhosilva,m', 0.2222222222222222),
 ('carvalho,', 'carvalhosilva,m', 0.8333333333333334),
 ('harley,r', 'harley,rm', 0.6),
 ('fonseca,fs', 'fonseca,s', 1.0),
 ('kirkbride,mcg', 'kirkbride-junior,jh', 0.2857142857142857),
 ('eiten,g', 'eiten,lt', 0.7857142857142857),
 ('pires,jf', 'pires,jn', 1.0),
 ('gottsberger,g', 'gottsberger,is', 0.6666666666666666),
 ('hatschbach,g', 'hatschbach,m', 1.0),
 ('villar,ts', 'villarroel,d', 0.16666666666666666),
 ('carvalho,avm', 'carvalho,vm', 0.6666666666666666),
 ('carvalho,avm', 'carvalho,ja', 0.41935483870967744),
 ('duartesilva,ag', 'soares-silva,lh', 0.125),
 ('gama,r', 'zampa,l', 1.0),
 ('ramos,j', 'ramos,jf', 0.14285714285714285),
 ('marti

In [480]:
sorted(l, key=lambda x: x[2],reverse=True)

[('fonseca,fs', 'fonseca,s', 1.0),
 ('pires,jf', 'pires,jn', 1.0),
 ('hatschbach,g', 'hatschbach,m', 1.0),
 ('gama,r', 'zampa,l', 1.0),
 ('mimura,i', 'miranda,is', 1.0),
 ('macedo,a', 'macedo,m', 1.0),
 ('almeida,sms', 'almeida,sp', 1.0),
 ('leite,fq', 'leite,lq', 1.0),
 ('carvalho,sf', 'fankdecarvalho,sm', 1.0),
 ('hatschbach,g', 'hatschbach,m', 1.0),
 ('leite,fq', 'leite,lq', 1.0),
 ('almeida,sms', 'almeida,sp', 1.0),
 ('campos,jmp', 'campos,sm', 1.0),
 ('campos,jmp', 'campos,sm', 1.0),
 ('pires,jf', 'pires,jn', 1.0),
 ('macedo,a', 'macedo,m', 1.0),
 ('fonseca,fs', 'fonseca,s', 1.0),
 ('gama,r', 'zampa,l', 1.0),
 ('maguire,b', 'maguire,ck', 0.9),
 ('maguire,b', 'maguire,ck', 0.9),
 ('duartesilva,ag', 'silva,agd', 0.875),
 ('fagg,cw', 'fagg,jmf', 0.8571428571428571),
 ('fagg,cw', 'fagg,jmf', 0.8571428571428571),
 ('carvalho,', 'carvalhosilva,m', 0.8333333333333334),
 ('silva,ss', 'souzasilva,s', 0.8333333333333334),
 ('souza,jp', 'souza,vc', 0.8260869565217391),
 ('alvarenga,a', 'alva

In [478]:
G['barbosa,e']

{'abe,lm': {'weight': 10},
 'cervi,ca': {'weight': 3},
 'cordeiro,e': {'weight': 1},
 'cordeiro,i': {'weight': 1},
 'cordeiro,j': {'weight': 24},
 'costa,ef': {'weight': 8},
 'cruz,jm': {'weight': 8},
 'fagundes,ja': {'weight': 3},
 'ferreira,la': {'weight': 6},
 'goldenberg,r': {'weight': 2},
 'hatschbach,g': {'weight': 57},
 'hatschbach,m': {'weight': 37},
 'mansano,vs': {'weight': 1},
 'pereira,lbs': {'weight': 1},
 'poliquesi,cb': {'weight': 11},
 'pott,a': {'weight': 3},
 'pott,v': {'weight': 3},
 'ribas,os': {'weight': 14},
 'schinini,a': {'weight': 1},
 'silva,jm': {'weight': 43},
 'silva,lbp': {'weight': 1},
 'spichiger,r': {'weight': 1},
 'vaz,s': {'weight': 1}}

In [385]:
occs.loc[ni['munhoz,ca']]

Unnamed: 0,collectionCode,recordedBy,scientificName,taxonRank,recordedBy_atomized
4344,UB,"Barros, MAG; Filgueiras, TS; Silva, PEN; Munho...",Prescottia oligantha (Sw.) Lindl.,SPECIES,"[Barros, MAG, Filgueiras, TS, Silva, PEN, Munh..."
113553,UB,"Barros, M; Filgueiras, TS; Munhoz, CA; Nascime...",Byrsonima verbascifolia (L.) DC.,SPECIES,"[Barros, M, Filgueiras, TS, Munhoz, CA, Nascim..."


In [444]:
remap = {
    'gawryszewski,fm': 'grawryszewski,fm',
    'coradin,l': 'coradin,lc',
    'jennings,': 'jennings,lvs',
    'kolmann,l': 'kollmann,l',
    'vera,l': 'veralucia',
    'clemente,c': 'clemente,cm',
    'bertolda,j': 'bertoldo,j',
    'smith,g': 'smith,gl',
    'vaz,a': 'vaz,amsf',
    'sena,l': 'senna,l',
    'sanaiotti,t': 'sanaiotti,tm',
    'klein,vl': 'klein,vlg',
    'casto,ws': 'castro,ws',
    'dias,ej': 'dias,jb',
    'torres,dc': 'torres,dsc',
    'landim,m': 'landim,mf',
    'silva,lh': 'soares-silva,lh', 
    'soaressilva,lh': 'soares-silva,lh',
    'silva,lhs': 'soares-silva,lh',
    'oliveira,ma': 'oliveira,ms',
    'borges,r': 'borges,rax',
    'oliveira,s': 'oliveira,scc',
    'lage,jl': 'hage,jl',
    'maas,h': 'maas,pjm',
    'cardoso,e': 'cardoso,es',
    'proenca,c': 'proenca,ceb',
    'noleto,l': 'noletto,lg',
    'rudall,p': 'ruddall,p',
    'chiea,sac': 'chiea,sc',
    'cielofilho,r':'cielo-filho,r', 
    'filho,rc':'cielo-filho,r',
    'cid,ca': 'cid,cac',
    'nascimento,e': 'nascimento,ea',f
    'jardim,j': 'jardim,jg',
    'villaroel,d': 'villarroel,d',
    'wagner': 'wagner,hl',
    'dias,bj': 'dias,jb',
    'amorim,p': 'amorim,pr',
    'flores': 'flores,tb',
    'lucas,e': 'lucas,ej',
    'morbeck': 'morbeck,a',
    'castro,r': 'castro,ra',
    'passon,l': 'passon,lm',
    'simpson,pl': 'simpson-junior,pl', 
    'simpsonjunior,pl': 'simpson-junior,pl',
    'paulasouza,j': 'souza,jp',
    'coveny,r': 'coveny,rg',
    'crosby': 'crosby,mr',
    'souza,cv': 'souza,vc',
    'moreira,al': 'moreira,alc',
    'nobs,ma': 'noles,ma',
    'kuehn,e': 'kuhn,e',
    'davidsen,c': 'davidson,c',
    'estabrook': 'estabrook,gf',
    'sousa,tc': 'souza,tc',
    'verwimp': 'verwimp,i',
    'campos,jmf': 'campos,jmp',
    'silva,lm': 'silva,lam',
    'smith,l': 'smith,lb',
    'yamomoto,m': 'yamamoto,m',
    'verveloet,rr': 'vervloet,rr',
    'rocha,rm': 'rocha,rn',
    'oliveira,nr': 'oliveira,nro',
    'haas,jh': 'hass,jh',
    'whalen,a': 'whalen,aj',
    'sa,spp': 'sa,spps',
    'bensusan,n': 'bensusan,nr',
    'borgato,df': 'borgatto,df',
    'mendes,jn': 'mendes,jm',
    'fontella,j': 'fontella,jp',
    'staggemeier,vg': 'staggmeier,vg',
    'campos,mtv': 'campos,mtva',
    'benton,f': 'benton,fp',
    'marchioni,jm': 'marchiori,jn',
    'juchum': 'juchum,f',
    'peocopio,lc': 'procopio,lc',
    'romeroc': 'romero,c', 
    'marques,c': 'marques,cf',
    'cardoso,ef': 'cardoso,f',
    'pereira,ba': 'pereira,bas',
    'carvalho,sl': 'carvalho-leite,sl', 
    'carvalholeite,sl': 'carvalho-leite,sl',
    'benedete': 'benedete,al',
    'harley,gm': 'harley,rm',
    'diasmelo,r': 'dias-melo,r', 
    'melo,rd': 'dias-melo,r',
    'schiesinki': 'schiesinski,d',
    'porto,jr': 'porto,jlr',
    'argent,g': 'argent,gcg',
    'argentgcgin': 'argent,gcg',
    'rodri': 'rodrig',
    'araujo,g': 'araujo,gm',
    'carvalho,amv': 'carvalho,avm',
    'carvalho,am': 'carvalho,avm', 
    'carvalho,amv': 'carvalho,avm',
    'careno,s': 'carreno,s',
    'mazine,f': 'mazine-capelo,ff', 
    'mazinecapelo,ff': 'mazine-capelo,ff',
    'lopes,i': 'lopes,isn',
    'irvine,gc': 'irwine,cg',
    'taroda,n': 'tarroda,n',
    'isejima': 'isejima,em',
    'dario,f': 'dario,fr',
    'onishi': 'onishi,e',
    'sidney': 'sidney,gf',
    'reitz': 'reitz,pr',
    'devogel,ef': 'vogel,hvf',
    'borges,j': 'borges,jwm',
    'brade': 'brade,ac',
    'nicacio': 'nicacio,jn',
    'anapaula': 'ana-paula', 
    'paula,a': 'ana-paula',
    'dell,d': 'odell,d',
    'rondon': 'rondon,c',
    'santos,h': 'santos,hcf',
    'furla': 'furlan,a',
    'grasser,g': 'grasser,ga',
    'pedrosa,ma': 'pedroso,ma',
    'pedrosa,n': 'pedrosa,ns',
    'zoccoli,d': 'zoccoli,dm',
    'arroyo,mtk': 'kallin-arroyo,mt', 
    'kallinarroyo,mt': 'kallin-arroyo,mt', 
    'brito,ic': 'britto,ic',
    'degrande,da': 'grande,da',
    'sales,sc': 'salles,sc',
    'souza,r': 'souza,rr',
    'guedes,j': 'guedes,jc',
    'herlan,j': 'herlanio,j',
    'nascimento,a': 'nascimento,ae',
    'siva,ma': 'silva,ma',
    'bucci,f': 'bucci,ffb',
    'santana,bdi': 'santana,bid',
    'giordano,lc': 'giordano,lcs',
    'meyer': 'meyer,fs',
    'franca': 'franca,f', 
    'koekemoer,m': 'koekomoer,m',
    'souza,rt': 'souza,rtc',
    'pereira,t': 'pereira,ta',
    'jose,m': 'jose,maria',
    'carmo,j': 'carmo,jj',
    'fernandes,a': 'fernandes,ag',
    'moraes,plr': 'moraes,prl',
    'maia,w': 'maia,wd',
    'martins,ca': 'martins,can',
    'polite,l': 'politi,l',
    'almeida,f': 'almeida,fc',
    'borges,jw': 'borges,jwm',
    'kuhlman,m': 'kuhlmann,m',
    'silva,mb': 'silva,mib', 
    'sousa,rv': 'souza,rv',
    'koczichi': 'koczicki,c',
    'leite,jr': 'leite,jrs',
    'silva,pit': 'tanno-silva,pi',
    'tannosilva,pi': 'tanno-silva,pi',
    'mendonca,r': 'mendonca,rc',
    'mendonca,rr': 'mendonca,rc',
    'schumke,j': 'schunke,j',
    'abe,lb': 'abe,lm',
    'reitz': 'reitz,r',
    'stieber,m': 'stieber,mt',
    'sieber,m': 'stieber,mt',
    'chagasesilva,fc': 'chagas-e-silva,fc', 
    'chagasesilva,f': 'chagas-e-silva,fc',
    'silva,fc': 'chagas-e-silva,fc',
    'prance,g': 'prance,gt',
    'melo,trb': 'mello,trb', 
    'sena,pac': 'senna,pac',
    'pereira,l': 'pereira,la',
    'caneiro,j': 'carneiro,j',
    'munhoz,ca': 'munhoz,cbr',
    'jimenez': 'jimenez,ja',
    'castellanos': 'castellanos,a',
    'cristobal,l': 'cristobal,cl', 
    'sousa,ng': 'souza,ng',
    'westra,lyt': 'westra,lyth',
    'luna,ta': 'luna,ti',
    'pilger': 'pilges',
    'silva,mi': 'silva,mib',
    'mitzi': 'mitzi,g',
    'santos,rr': 'santos,rrb',
    'morales,r': 'morales,rav',
    'sidney': 'sidnei,rm', 
    'siva,ja': 'silva,ja', 
    'vitti,f': 'vitti,fx',
    'oliveira,fac': 'oliveira,fca',
    'landrum,s': 'landrum,ss',
    'rodrigues,wa': 'rodrigues,wm',
    'bromley,g': 'bomley,gl', 
    'constable,ef': 'constable,ej',
    'armando,m': 'armando,ms',
    'falcao,ji': 'falcao,jia',
    'maroccolo,jf': 'marocollo,jf',
    'maxwell,h': 'maxwell,hh',
    'zaruchi': 'zarucchi,j', 
    'santos,mcv': 'vilela-santos,mc', 
    'vilelasantos,mc': 'vilela-santos,mc',
    'silva,eb': 'silva,ebm',
    'jumbo,s': 'jimbo,s', 
    'medeiros,l': 'medeiros,lb',
    'johnson,l': 'johnson,las',
    'franciso,em': 'francisco,em', 
    'melo': 'melo,e',
    'nogueira,lm': 'nogueira,lmg',
    'aquino,f': 'aquino,fg',
    'cairus,rjr': 'cairos,rjr',
    'santos,aj': 'santos,ajv',
    'cerqueira,ls': 'cerqueira,lsc',
    'fonseca,rr': 'fonseca,s',
    'fonseca,sg': 'fonseca,s',
    'fonseca,rr': 'fonseca,s',
    'fonseca,sf': 'fonseca,s',
    'fonseca,fs': 'fonseca,s',
    'fonseca,fj': 'fonseca,j',
    'fonseca,l': 'fonseca,lm',
    'vilar,ts': 'villar,ts',
    'colleta,gd': 'colletta,gd',
    'barker,r': 'barker,rm',
    'fragg,c': 'fagg,cw', 
    'fagg,c': 'fagg,cw',
    'pennigton,td': 'pennington,td',
    'rivera,v': 'rivera,vl',
    'smya,s': 'sumya,s',
    'versiane,af': 'versiane,afa',
    'castelo,aj': 'castro,aj',
    'richards': 'richardspwin',
    'richards,m': 'richardspwin',
    'richards,pw': 'richardspwin',
    'pinheiro,s': 'pinheiros,s',
    'pinheiro,em': 'pinheiros,em',
    'lima,e': 'lima,es',
    'hind,dj': 'hind,djn',
    'hind,n': 'hind,djn', 
    'soares,g': 'soares,gf',
    'ferreira,map': 'pereira,map',
    'dobereiner': 'dobereiner,j',
    'scariot,a': 'scariot,ao',
    'monteiro,r': 'monteiro,rn',
    'leal,cg': 'leal,g',
    'garcia,mcm': 'garcia,mgm',
    'lopes,wdp': 'lopes,wp',
    'oliveira,m': 'oliveira,ms',
    'vinha,s': 'vinha,sg',
    'santos,g': 'santos,gb',
    'filgueira,ts': 'filgueiras,ts',
    'stutte,jg': 'stutts,jg',
    'stutte,j': 'stutts,jg',
    'stutts,j': 'stutts,jg',
    'shutts,jg': 'stutts,jg',
    'garcia,pb': 'garcia,pbc',
    'joaovicente': 'vicente,j',
    'mariz,g': 'mariza,g',
    'fierros,af': 'freire-fierros,a', 
    'freirefierros,a': 'freire-fierros,a',
    'rodrigues,ce': 'rodrigues-junior,ce', 
    'rodriguesjunior,ce': 'rodrigues-junior,ce',
    'kirkbridejunior,j': 'kirkbride-junior,jh', 
    'kirkbridejunior,jh': 'kirkbride-junior,jh',
    'kirkbride,jh': 'kirkbride-junior,jh',
    'souza,rs': 'sousa,rs',
    'belizario,m': 'belisario,m',
    'gibles,p': 'gibbs,pe', 
    'black': 'black,ga',
    'black,g': 'black,ga',
    'black,ca': 'black,ga',
    'szechy,mtm': 'sechy,mts', 
    'ladeira,j': 'ladeira,jl',
    'kauseilmari':'kause,i', 
    'rodrigues,w': 'rodrigues,wm',
    'marines,g': 'marinis,g',
    'maranis,g': 'marinis,g',
    'bromley,g': 'bromley,gl',
    'barbosa,ea': 'barbosa,e', 
    'leitaofilho,h':'leitao-filho,h',
    'leitao,hf': 'leitao-filho,h',
    'leitaofilho,hf':'leitao-filho,h',
    'nilsson,s': 'nilson,s', 
    'mirizawa,m': 'kirizawa,m',
    'gomes,v': 'gomes,vl',
    'leite,jra': 'leite,jrs',
    'philcox': 'philcox,d',
    'dutil,jh': 'dutilh,jha',
    'arraes,mgm': 'arrais,mgm',
    'soderstrom': 'soderstrom,tr',
    'raulino,t': 'raulino,taf',
    'cisnero,la': 'cisneros,la',
    'santos,f': 'santos,gf',
    'santos,fm': 'santos,fam',
    'santos,ffm': 'santos,fam',
    'wasshusen,dc': 'wasshausen,dc', 
    'fereira,a': 'ferreira,a',
    'freitas,g': 'freitas,gs',
    'oliveira,fca': 'oliveira,fcao',
    'cervi,ac': 'cervi,ca',
    'wilsonbrowne,g': 'browne,gw', 
    'ribeiro,mm': 'ribeiro,mmv',
    'boom,b': 'boom,bm',
    'amorim,pr': 'amorim,prf',
    'amorim,p': 'amorim,prf',
    'lannasobrinho,jp': 'lana-sobrinho,jp',
    'lanasobrinho,jp': 'lana-sobrinho,jp',
    'sobrinho,jpl': 'lana-sobrinho,jp',
    'pires,jm': 'pires,jn',
    'pires,jf': 'pires,jn',
    'fereira,a': 'ferreira,a',
    'bomley,gl': 'bromley,gl',
    'dusi,rl': 'dusi,rlm',
    'assuncao,pacl': 'assuncao,pac',
    'assunsao,paci': 'assuncao,pac',
    'assuncao,pacs': 'assuncao,pac',
    'assuncao,pa': 'assuncao,pac',
    'hathome,w': 'hawthorne,w',
    'sendullsky,t': 'sendulsky,t',
    'caixetadedeus,w': 'caixeta,w',
    'vicente,j': 'vicente,jc',
    'alavarenga,d': 'alvarenga,d',
    'fontella,jp': 'fontella,pj',
    'andrade,p': 'andrade,pm',
    'rizzini': 'rizzini,ct',
    'barbosa,ma': 'barboza,ma',
}

In [449]:
nm.remap(remap)
nm.getMap()

{'.': '',
 '1980 Sino-Amer Exped.': 'sinoamerexped',
 '?': '',
 'A.J.N.V.': 'ajnv',
 'A.M.': 'am',
 'Abbas, B': 'abbas,b',
 'Abdala, GC': 'abdala,gc',
 'Abdo, MSA': 'abdo,msa',
 'Abdon': 'abdon',
 'Abe, LB': 'abe,lm',
 'Abe, LM': 'abe,lm',
 'Abrahim, MA': 'abrahim,ma',
 'Abreu, CG': 'abreu,cg',
 'Abreu, GX': 'abreu,gx',
 'Abreu, I': 'abreu,i',
 'Abreu, LC': 'abreu,lcr',
 'Abreu, LCR': 'abreu,lcr',
 'Abreu, M': 'abreu,mc',
 'Abreu, MC': 'abreu,mc',
 'Abreu, MS': 'abreu,mc',
 'Abreu, NL': 'abreu,nl',
 'Abreu, NR': 'abreu,nr',
 'Abreu, TLS': 'abreu,tls',
 'Accioly': 'accioly',
 'Accorsi, WR': 'accorsi,wr',
 'Acera, V': 'acera,v',
 'Acevedo-Rodriguez, P': 'acevedorodriguez,p',
 'Acuña, BC': 'acuna,bc',
 'Adams, WP': 'adams,wp',
 'Adcock, S': 'adcock,s',
 'Adderley, LS': 'adderley,ls',
 'Addison, G': 'addison,g',
 'Addor, EE': 'addor,ee',
 'Adler': 'adler',
 'Adorno, H': 'adorno,h',
 'Adão': 'adao',
 'Aedo, C': 'aedo,c',
 'Affonso, P': 'affonso,p',
 'Afonso, AA': 'afonso,aa',
 'Afonso, AP':

In [31]:
len(G['silva,t'])

2

In [78]:
names = list(G.nodes())
dfl.get_close_matches('proenca', names)

['proenca,sl', 'proenca,ceb', 'rocha']

In [19]:
l1 = ['a','b','c','d']
l2 = ['a','b','c','d', 'e']

def similarity(l1,l2):
    if len(l1)>len(l2):
        mainList, testList = l1,l2
    else: 
        mainList, testList = l2,l1
    
    score = sum( 1 if i in mainList else 0 for i in testList )/len(testList)
    
    return score

def nodesSimilarity( G, node1, node2 ):
    return similarity(list(G[node1]), list(G[node2]))

In [20]:
sorted( nx.degree(G).items(), key=lambda x: x[1], reverse=True)

[('proenca,ceb', 249),
 ('ratter,ja', 131),
 ('faria,jeq', 128),
 ('munhoz,cbr', 125),
 ('carvalho,am', 107),
 ('silva,ma', 106),
 ('oliveira,rc', 98),
 ('souza,vc', 96),
 ('mendonca,rc', 95),
 ('mendes,vc', 92),
 ('harley,rm', 92),
 ('walter,bmt', 92),
 ('franca,f', 86),
 ('melo,e', 77),
 ('cardoso,e', 77),
 ('silva,gp', 74),
 ('martins,rc', 66),
 ('carvalhosilva,m', 64),
 ('santana,sc', 63),
 ('jardim,jg', 63),
 ('fonsecafilho,j', 62),
 ('pirani,jr', 62),
 ('hatschbach,g', 59),
 ('bridgewater,s', 59),
 ('silvajunior,mc', 59),
 ('camara,peas', 58),
 ('cordeiro,i', 54),
 ('shevock,jr', 54),
 ('filgueiras,ts', 53),
 ('zanatta,mrv', 53),
 ('souza,r', 52),
 ('irwin,hs', 51),
 ('oliveira,m', 51),
 ('mellosilva,r', 49),
 ('silva,jm', 49),
 ('oliveira,nr', 48),
 ('forzza,rc', 48),
 ('souza,jp', 47),
 ('silva,js', 46),
 ('heringer,ep', 46),
 ('santos,rr', 46),
 ('alvarenga,d', 46),
 ('cavalcanti,tb', 46),
 ('francener,a', 45),
 ('santos,ts', 44),
 ('novelino,rf', 44),
 ('ivanauskas,nm', 44),


In [21]:
dfl.get_close_matches('harley,rm', names)

['harley,rm', 'harley,r', 'marley,rm']

In [22]:
similarity('harley,rm','marley,rm')

0.8888888888888888

In [23]:
G['marley,rm']

KeyError: 'marley,rm'