In [2]:
import pandas as pd
import re

In [3]:
!pip install ftfy
from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.replace('.', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    string = string.strip()
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


print(ngrams('Klazenga, N.'))
print(ngrams('Beauglehole, A.C.'))


['Kla', 'laz', 'aze', 'zen', 'eng', 'nga', 'ga ', 'a N']
['Bea', 'eau', 'aug', 'ugl', 'gle', 'leh', 'eho', 'hol', 'ole', 'le ', 'e A', ' A ', 'A C']


In [4]:
# Create Wikidata items data frame
df_wikidata = pd.read_csv('data/wikidata_persons.csv')
df_wikidata = df_wikidata.iloc[:, 1:]

df_wikidata.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,item,itemLabel,surname,initials,canonical_string,orcid,viaf,isni,harv,ipni,abbr,bloodhound_id,enc_au_sc_id,yob,yod,wyb,wye
0,http://www.wikidata.org/entity/Q67329,Carl Gustav Jablonsky,Jablonsky,C.G.,"Jablonsky, C.G.",,24944037,0000 0000 5526 3001,,,,,,1756.0,1787.0,,
1,http://www.wikidata.org/entity/Q68738,Hermann Müller,Müller,H.,"Müller, H.",,2532803,0000 0001 0837 1728,36129.0,6735-1,H.Müll.,,,1829.0,1883.0,,
2,http://www.wikidata.org/entity/Q66902,Hermann Lebert,Lebert,H.,"Lebert, H.",,27833384,0000 0001 2100 1924,67704.0,22162-1,Lebert,,,1813.0,1878.0,,
3,http://www.wikidata.org/entity/Q69552,Carl Julius Bernhard Börner,Börner,C.J.B.,"Börner, C.J.B.",,118457204,0000 0001 1085 9124,1763.0,12350-1,Börner,,,1880.0,1953.0,,
4,http://www.wikidata.org/entity/Q66379,Ludwig Bechstein,Bechstein,L.,"Bechstein, L.",,36914329,0000 0001 2278 3438,,,,,,1801.0,1860.0,,


In [5]:
# Create data frame with unique canonical strings
df_wikidata_unique = df_wikidata.groupby('canonical_string').agg({'item': ['count']}).reset_index()
cols = df_wikidata_unique.columns.tolist()

# df_wikidata_unique
print(df_wikidata_unique.values)

[['(-Walraevens), O.H.' 2]
 ['(1), G.Z.' 1]
 ['(Entomologist), C.K.' 1]
 ...
 ['Șuster, P.M.' 1]
 ['Șık, L.' 1]
 ['Țopa, E.' 1]]


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize Wikidata name (use fit_transform())
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(df_wikidata_unique['canonical_string'])

In [9]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tf_idf_matrix_clean) # tfidf contains the vectorized wikidata names from the previous step

# matching query
def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices

In [72]:
def get_matches_df(name_vector, test_df, distances, indices, upper_bound=0.9):
    matches = []
    for i,j in enumerate(indices):
        left_side = name_vector[i]
        right_side = test_df.values[j][0][0]
        distance = round(distances[i][0],2)
        temp = [left_side, right_side, distance]
        matches.append(temp)

    matches = pd.DataFrame(matches, columns=['left_side','right_side','distance'])
    matches = matches[matches['distance'] <= upper_bound]

    matches = matches.sort_values(['distance'])
    matches = matches.reset_index()
    return matches

In [10]:
# Load AVH collectors data set
df_avh = pd.read_csv('data/avh_collectors.csv')
df_avh = df_avh.iloc[:, 1:]
df_avh.drop(columns=['i18nCode', 'fq'], inplace=True)

df_avh.head()

Unnamed: 0,label,count,start_date,end_date,activity_span
0,"Beauglehole, A.C.",90942,1865.0,2005.0,140.0
1,"Forster, P.I.",64649,1955.0,2018.0,63.0
2,"Hyland, B.",57265,1952.0,2008.0,56.0
3,"Latz, P.",51230,1875.0,2019.0,144.0
4,"Streimann, H.",45346,1896.0,2001.0,105.0


In [73]:
avh_names = set(df_avh['label'].values)

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(avh_names)
duration = time.time() - start
print('Completed in:', duration, 's')

avh_names = list(avh_names)

df_matches = get_matches_df(avh_names, df_wikidata_unique, distances, indices)

print(df_matches.sample(20))

Getting nearest neighbours...
Completed in: 2.0293214321136475 s
     index                    left_side         right_side  distance
623    855                 Harris, P.L.         Harris, P.      0.54
706    640                  Smith, N.M.        Smith, N.P.      0.63
790    792                 Stacey, C.I.       Stacey, J.W.      0.76
228    696                    Paust, S.          Paust, S.      0.00
695    870                  Aston, H.I.          Aston, H.      0.62
409    162                   Leiper, G.         Leiper, G.      0.00
724    768               Eldridge, D.J.     Eldridge, G.H.      0.65
789    621  Gates, G.M.|Ratkowsky, D.A.  Ratkowsky, D.(.).      0.76
768    121                   Cronin, M.       Cronin, T.M.      0.72
119    470                 Rodway, F.A.       Rodway, F.A.      0.00
729    839                    May, T.W.            May, T.      0.66
701     47           Leichhardt, F.W.L.     Leichhardt, L.      0.63
684    961               Stephens, K.M

In [74]:
# Join matches data frame to AVH collectors data frame
df_avh_matches = pd.merge(df_avh, df_matches, left_on='label', right_on='left_side', how='left')

df_avh_matches

# Join Wikidata items
df_avh_matches_wikidata = pd.merge(df_avh_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_avh_matches_wikidata = pd.merge(df_avh_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_avh_matches_wikidata.rename(columns={df_avh_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# Remove unwanted columns
df_avh_wikidata_nn = df_avh_matches_wikidata[['label', 'count', 'start_date', 'end_date', 'activity_span', 
                                                  'left_side', 'right_side', 'distance', 'dup_count', 
                                                  'item', 'itemLabel', 'surname', 'initials', 'canonical_string', 
                                                  'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                  'enc_au_sc_id', 'yob', 'yod', 'wyb', 'wye']]

# Order by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_avh_wikidata_nn.sort_values(by=['distance', 'dup_count', 'count'], ascending=[True, True, False], inplace=True)

df_avh_wikidata_nn

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,left_side,right_side,distance,dup_count,item,...,isni,harv,ipni,abbr,bloodhound_id,enc_au_sc_id,yob,yod,wyb,wye
0,"Beauglehole, A.C.",90942,1865.0,2005.0,140.0,"Beauglehole, A.C.","Beauglehole, A.C.",0.0,1.0,http://www.wikidata.org/entity/Q16744919,...,,,,,Q16744919,P000214b,1920.0,2002.0,,
1,"Forster, P.I.",64649,1955.0,2018.0,63.0,"Forster, P.I.","Forster, P.I.",0.0,1.0,http://www.wikidata.org/entity/Q9057027,...,,,18907-1,P.I.Forst.,,,1961.0,,,
2,"Hyland, B.",57265,1952.0,2008.0,56.0,"Hyland, B.","Hyland, B.",0.0,1.0,http://www.wikidata.org/entity/Q4893242,...,,,4262-1,B.Hyland,,,1937.0,,,
4,"Streimann, H.",45346,1896.0,2001.0,105.0,"Streimann, H.","Streimann, H.",0.0,1.0,http://www.wikidata.org/entity/Q21339679,...,0000 0001 1573 7118,2053,15669-1,Streimann,,P005606b,1938.0,2001.0,,
5,"Elix, J.A.",39702,1878.0,2020.0,142.0,"Elix, J.A.","Elix, J.A.",0.0,1.0,http://www.wikidata.org/entity/Q21339171,...,0000 0000 8084 828X,93027,18445-1,Elix,,P005725b,1941.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,"Abbott, B. | Burton, E.",705,2008.0,2013.0,5.0,,,,,,...,,,,,,,,,,
1113,"Wauchope, J.",695,1923.0,1987.0,64.0,,,,,,...,,,,,,,,,,
1120,"Ollerenshaw, N.",692,1874.0,1990.0,116.0,,,,,,...,,,,,,,,,,
1125,"Menkins, I.L.",687,1998.0,2012.0,14.0,,,,,,...,,,,,,,,,,


In [75]:
df_avh_wikidata_nn.to_csv('data/avhcoll_wikidata_nn.csv')

In [76]:
# Load MEL Collectors data set
df_melcoll = pd.read_csv('data/mel_collectors.csv')
df_melcoll['full_name'] = df_melcoll['family_name'] + ', ' + df_melcoll['initials']
df_melcoll = df_melcoll[df_melcoll['num_coll'] >= 1000]

df_melcoll

Unnamed: 0,agent_id,family_name,initials,given_names,num_coll,start_year,end_year,full_name
0,1297,Beauglehole,A.C.,,69198,1929.0,2001.0,"Beauglehole, A.C."
1,14283,Mueller,F.,Ferdinand Jacob Heinrich,25429,1812.0,1895.0,"Mueller, F."
2,19313,Stone,I.G.,,25428,1960.0,1999.0,"Stone, I.G."
3,21883,Willis,J.H.,James,20637,1885.0,1996.0,"Willis, J.H."
4,6458,Filson,R.B.,Rex,15193,1933.0,2000.0,"Filson, R.B."
...,...,...,...,...,...,...,...,...
142,8624,Hartmann,C.H.,Carl,1017,1856.0,1889.0,"Hartmann, C.H."
143,1609,Betche,E.,Ernst,1016,1865.0,1986.0,"Betche, E."
144,8823,Healey,K.,,1014,1953.0,1963.0,"Healey, K."
145,19882,Thies,A.W.,Arthur,1002,1978.0,1998.0,"Thies, A.W."


In [78]:
mel_names = set(df_melcoll['full_name'].values)

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(avh_names)
duration = time.time() - start
print('Completed in:', duration, 's')

mel_names = list(avh_names)

df_matches = get_matches_df(mel_names, df_wikidata_unique, distances, indices)

print(df_matches.sample(20))

Getting nearest neighbours...
Completed in: 2.1944732666015625 s
     index                    left_side         right_side  distance
133    425                 Muspratt, J.       Muspratt, J.      0.00
778    233              van Leeuwen, S.    Leeuwen, S.J.v.      0.74
283    130                     Dahl, E.           Dahl, E.      0.00
572    462                   Sharpe, P.       Sharpe, P.R.      0.48
529    869                Johnson, D.C.        Johnson, D.      0.45
222    707               Camfield, J.H.     Camfield, J.H.      0.00
409    162                   Leiper, G.         Leiper, G.      0.00
23     495                   Carr, G.W.         Carr, G.W.      0.00
610     53                   Wilson, K.         Wilson, [.      0.52
250    794                  Ornduff, R.        Ornduff, R.      0.00
685    851               Champion, I.G.       Champion, J.      0.61
411    289                  Duval, D.J.        Duval, D.J.      0.00
758    781                Brooker, M.I

In [81]:
# Join matches data frame to MEL collectors data frame
df_melcoll_matches = pd.merge(df_melcoll, df_matches, left_on='full_name', right_on='left_side', how='left')

# Join Wikidata items
df_melcoll_matches_wikidata = pd.merge(df_melcoll_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_melcoll_matches_wikidata = pd.merge(df_melcoll_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_melcoll_matches_wikidata.rename(columns={df_melcoll_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# Remove unwanted columns
df_melcoll_wikidata_nn = df_melcoll_matches_wikidata[['agent_id', 'family_name', 'initials_x', 'given_names', 
                                                          'num_coll', 'start_year', 'end_year', 'full_name', 
                                                          'left_side', 'right_side', 'distance', 'dup_count', 
                                                          'item', 'itemLabel', 'surname', 'initials_y', 'canonical_string', 
                                                          'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                          'enc_au_sc_id', 'yob', 'yod', 'wyb', 'wye']]

# Sort by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_melcoll_wikidata_nn.sort_values(by=['distance', 'dup_count', 'num_coll'], ascending=[True, True, False], inplace=True)

df_melcoll_wikidata_nn

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,agent_id,family_name,initials_x,given_names,num_coll,start_year,end_year,full_name,left_side,right_side,...,isni,harv,ipni,abbr,bloodhound_id,enc_au_sc_id,yob,yod,wyb,wye
0,1297,Beauglehole,A.C.,,69198,1929.0,2001.0,"Beauglehole, A.C.","Beauglehole, A.C.","Beauglehole, A.C.",...,,,,,Q16744919,P000214b,1920.0,2002.0,,
2,19313,Stone,I.G.,,25428,1960.0,1999.0,"Stone, I.G.","Stone, I.G.","Stone, I.G.",...,0000 0001 1247 4965,40174,27255-1,I.G.Stone,,,1913.0,2001.0,,
5,6458,Filson,R.B.,Rex,15193,1933.0,2000.0,"Filson, R.B.","Filson, R.B.","Filson, R.B.",...,,76762,18803-1,Filson,,P002347b,1930.0,,,
6,6771,Forster,P.I.,Paul Irwin,12226,1955.0,2118.0,"Forster, P.I.","Forster, P.I.","Forster, P.I.",...,,,18907-1,P.I.Forst.,,,1961.0,,,
10,20954,Walsh,N.G.,Neville,9122,1971.0,2020.0,"Walsh, N.G.","Walsh, N.G.","Walsh, N.G.",...,0000 0000 6385 4733,,14171-1,N.G.Walsh,,P005329b,1956.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,7041,Fullagar,J.P.,James,1121,1860.0,1877.0,"Fullagar, J.P.",,,...,,,,,,,,,,
163,2729,Bufton,J.,,1061,1853.0,1898.0,"Bufton, J.",,,...,,,,,,,,,,
164,9524,Hooker,J.D.,Joseph Dalton,1046,1830.0,1902.0,"Hooker, J.D.",,,...,,,,,,,,,,
165,21755,Wilhelmi,J.F.C.,,1035,1847.0,1887.0,"Wilhelmi, J.F.C.",,,...,,,,,,,,,,


In [82]:
# Save to CSV file
df_melcoll_wikidata_nn.to_csv('data/melcoll_wikidata_nn.csv')