# Cosine similarity

In [1]:
import pandas as pd, numpy as np, re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sparse_dot_topn import awesome_cossim_topn

In [2]:
def get_matches_df(sparse_matrix, A, B, top=100):
    non_zeros = sparse_matrix.nonzero()

    sparserows = non_zeros[0]
    sparsecols = non_zeros[1]

    if top:
        nr_matches = top
    else:
        nr_matches = sparsecols.size

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similarity = np.zeros(nr_matches)

    for index in range(0, nr_matches):
        left_side[index] = A[sparserows[index]]
        right_side[index] = B[sparsecols[index]]
        similarity[index] = round(sparse_matrix.data[index], 3)

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similarity': similarity})

In [3]:
!pip install ftfy
from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.replace('.', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    string = string.strip()
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


print(ngrams('Klazenga, N.'))
print(ngrams('Beauglehole, A.C.'))


['Kla', 'laz', 'aze', 'zen', 'eng', 'nga', 'ga ', 'a N']
['Bea', 'eau', 'aug', 'ugl', 'gle', 'leh', 'eho', 'hol', 'ole', 'le ', 'e A', ' A ', 'A C']


In [4]:
# Create Wikidata items data frame
df_wikidata = pd.read_csv('data/wikidata_persons.csv')
df_wikidata = df_wikidata.iloc[:, 1:]

df_wikidata.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,item,itemLabel,surname,initials,canonical_string,orcid,viaf,isni,harv,ipni,...,yod,wyb,wye,wikidata_link,orcid_link,harv_link,ipni_link,bloodhound_link,enc_au_sc_link,au_dict_bio_link
0,http://www.wikidata.org/entity/Q67329,Carl Gustav Jablonsky,Jablonsky,C.G.,"Jablonsky, C.G.",,24944037,0000 0000 5526 3001,,,...,1787.0,,,http://www.wikidata.org/wiki/Q67329,,,,,,
1,http://www.wikidata.org/entity/Q68738,Hermann Müller,Müller,H.,"Müller, H.",,2532803,0000 0001 0837 1728,36129.0,6735-1,...,1883.0,,,http://www.wikidata.org/wiki/Q68738,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/6735-1,,,
2,http://www.wikidata.org/entity/Q66902,Hermann Lebert,Lebert,H.,"Lebert, H.",,27833384,0000 0001 2100 1924,67704.0,22162-1,...,1878.0,,,http://www.wikidata.org/wiki/Q66902,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/22162-1,,,
3,http://www.wikidata.org/entity/Q69552,Carl Julius Bernhard Börner,Börner,C.J.B.,"Börner, C.J.B.",,118457204,0000 0001 1085 9124,1763.0,12350-1,...,1953.0,,,http://www.wikidata.org/wiki/Q69552,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/12350-1,,,
4,http://www.wikidata.org/entity/Q66379,Ludwig Bechstein,Bechstein,L.,"Bechstein, L.",,36914329,0000 0001 2278 3438,,,...,1860.0,,,http://www.wikidata.org/wiki/Q66379,,,,,,


In [5]:
# Create data frame with unique canonical strings
df_wikidata_unique = df_wikidata.groupby('canonical_string').agg({'item': ['count']}).reset_index()
cols = df_wikidata_unique.columns.tolist()

df_wikidata_unique

Unnamed: 0_level_0,canonical_string,item
Unnamed: 0_level_1,Unnamed: 1_level_1,count
0,"(-Walraevens), O.H.",2
1,"(1), G.Z.",1
2,"(Bert), A.A.H.",1
3,"(Entomologist), C.K.",1
4,"(Gilyazetdinova), D.G.",1
...,...,...
107831,"Ǒwaki, K.",1
107832,"Șerbanescu, I.",1
107833,"Șuster, P.M.",1
107834,"Șık, L.",1


In [6]:
# Vectorize Wikidata name (use fit_transform())
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
tf_idf_matrix_clean = vectorizer.fit_transform(df_wikidata_unique['canonical_string'])

In [7]:
# Load AVH collectors data set
df_avh = pd.read_csv('data/avh_collectors.csv')
df_avh = df_avh.iloc[:, 1:]
df_avh.drop(columns=['i18nCode', 'fq'], inplace=True)

df_avh.head()

Unnamed: 0,label,count,start_date,end_date,activity_span
0,"Beauglehole, A.C.",90942,1865.0,2005.0,140.0
1,"Forster, P.I.",64649,1955.0,2018.0,63.0
2,"Hyland, B.",57265,1952.0,2008.0,56.0
3,"Latz, P.",51230,1875.0,2019.0,144.0
4,"Streimann, H.",45346,1896.0,2001.0,105.0


In [8]:
# Vectorize AVH collector names (use transform())
tf_idf_matrix_dirty = vectorizer.transform(df_avh['label'])

# Calculate Cosine Similarity; keep only the best match (ntop=1) and only if the similarity is greater than 0.5 (lower_bound=0.5)
import time
t1 = time.time()
matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), ntop=1, lower_bound=0.5)
t = time.time()-t1
print("SELFTIMED:", t)

# Create the data frame
matches_df = get_matches_df(matches, df_avh['label'], df_wikidata_unique['canonical_string'], top=0)

matches_df.head()

SELFTIMED: 0.13402223587036133


Unnamed: 0,left_side,right_side,similarity
0,"Beauglehole, A.C.","Beauglehole, A.C.",1.0
1,"Forster, P.I.","Forster, P.I.",1.0
2,"Hyland, B.","Hyland, B.",1.0
3,"Latz, P.","Latz, P.K.",0.848
4,"Streimann, H.","Streimann, H.",1.0


In [9]:
# Join matches data frame to AVH collectors data frame
df_avh_matches = pd.merge(df_avh, matches_df, left_on='label', right_on='left_side', how='left')

df_avh_matches

# Join Wikidata items
df_avh_matches_wikidata = pd.merge(df_avh_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_avh_matches_wikidata = pd.merge(df_avh_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_avh_matches_wikidata.rename(columns={df_avh_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# Remove unwanted columns
df_avh_wikidata_cossim = df_avh_matches_wikidata[['label', 'count', 'start_date', 'end_date', 'activity_span', 
                                                  'left_side', 'right_side', 'similarity', 'dup_count', 
                                                  'item', 'itemLabel', 'surname', 'initials', 'canonical_string', 
                                                  'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                  'enc_au_sc_id', 'yob', 'yod', 'wyb', 'wye', 
                                                  'wikidata_link', 'orcid_link', 'harv_link', 'ipni_link', 
                                                  'bloodhound_link', 'enc_au_sc_link', 'au_dict_bio_link']]

# Order by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_avh_wikidata_cossim.sort_values(by=['similarity', 'dup_count', 'count'], ascending=[False, True, False], inplace=True)

df_avh_wikidata_cossim


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,left_side,right_side,similarity,dup_count,item,...,yod,wyb,wye,wikidata_link,orcid_link,harv_link,ipni_link,bloodhound_link,enc_au_sc_link,au_dict_bio_link
0,"Beauglehole, A.C.",90942,1865.0,2005.0,140.0,"Beauglehole, A.C.","Beauglehole, A.C.",1.0,1.0,http://www.wikidata.org/entity/Q16744919,...,2002.0,,,http://www.wikidata.org/wiki/Q16744919,,,,https://bloodhound-tracker.net/Q16744919,http://www.eoas.info/biogs/P000214b,
1,"Forster, P.I.",64649,1955.0,2018.0,63.0,"Forster, P.I.","Forster, P.I.",1.0,1.0,http://www.wikidata.org/entity/Q9057027,...,,,,http://www.wikidata.org/wiki/Q9057027,,,https://www.ipni.org/a/18907-1,,,
2,"Hyland, B.",57265,1952.0,2008.0,56.0,"Hyland, B.","Hyland, B.",1.0,1.0,http://www.wikidata.org/entity/Q4893242,...,,,,http://www.wikidata.org/wiki/Q4893242,,,https://www.ipni.org/a/4262-1,,,
4,"Streimann, H.",45346,1896.0,2001.0,105.0,"Streimann, H.","Streimann, H.",1.0,1.0,http://www.wikidata.org/entity/Q21339679,...,2001.0,,,http://www.wikidata.org/wiki/Q21339679,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/15669-1,,http://www.eoas.info/biogs/P005606b,
5,"Elix, J.A.",39702,1878.0,2020.0,142.0,"Elix, J.A.","Elix, J.A.",1.0,1.0,http://www.wikidata.org/entity/Q21339171,...,,,,http://www.wikidata.org/wiki/Q21339171,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/18445-1,,http://www.eoas.info/biogs/P005725b,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,"Taws, N.",771,1993.0,1995.0,2.0,,,,,,...,,,,,,,,,,
1184,"Sparshott, K.M.",731,1993.0,2002.0,9.0,,,,,,...,,,,,,,,,,
1192,"Whaite, T.M. | Whaite, J.L.",714,1949.0,1985.0,36.0,,,,,,...,,,,,,,,,,
1208,"Wauchope, J.",695,1923.0,1987.0,64.0,,,,,,...,,,,,,,,,,


In [10]:
# Save as CSV file
df_avh_wikidata_cossim.to_csv('data/avhcoll_wikidata_cossim.csv')

In [11]:
# Load MEL Collectors data set
df_melcoll = pd.read_csv('data/mel_collectors.csv')
df_melcoll['full_name'] = df_melcoll['family_name'] + ', ' + df_melcoll['initials']
df_melcoll = df_melcoll[df_melcoll['num_coll'] >= 1000]

df_melcoll

Unnamed: 0,agent_id,family_name,initials,given_names,num_coll,start_year,end_year,full_name
0,1297,Beauglehole,A.C.,,69198,1929.0,2001.0,"Beauglehole, A.C."
1,14283,Mueller,F.,Ferdinand Jacob Heinrich,25429,1812.0,1895.0,"Mueller, F."
2,19313,Stone,I.G.,,25428,1960.0,1999.0,"Stone, I.G."
3,21883,Willis,J.H.,James,20637,1885.0,1996.0,"Willis, J.H."
4,6458,Filson,R.B.,Rex,15193,1933.0,2000.0,"Filson, R.B."
...,...,...,...,...,...,...,...,...
142,8624,Hartmann,C.H.,Carl,1017,1856.0,1889.0,"Hartmann, C.H."
143,1609,Betche,E.,Ernst,1016,1865.0,1986.0,"Betche, E."
144,8823,Healey,K.,,1014,1953.0,1963.0,"Healey, K."
145,19882,Thies,A.W.,Arthur,1002,1978.0,1998.0,"Thies, A.W."


In [12]:
# Vectorize MEL collector names (use transform())
tf_idf_matrix_dirty = vectorizer.transform(df_melcoll['full_name'])

# Calculate Cosine Similarity
import time
t1 = time.time()
matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0.5)
t = time.time()-t1
print("SELFTIMED:", t)

# Get matches and create data frame
matches_df = get_matches_df(matches, df_melcoll['full_name'], df_wikidata_unique['canonical_string'], top=0)

matches_df.head()

SELFTIMED: 0.03132915496826172


Unnamed: 0,left_side,right_side,similarity
0,"Beauglehole, A.C.","Beauglehole, A.C.",1.0
1,"Mueller, F.","Mueller, F.v.",0.885
2,"Stone, I.G.","Stone, I.G.",1.0
3,"Willis, J.H.","Willis, J.H.",1.0
4,"Filson, R.B.","Filson, R.B.",1.0


In [13]:
# Join matches data frame to MEL collectors data frame
df_melcoll_matches = pd.merge(df_melcoll, matches_df, left_on='full_name', right_on='left_side', how='left')

# Join Wikidata items
df_melcoll_matches_wikidata = pd.merge(df_melcoll_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_melcoll_matches_wikidata = pd.merge(df_melcoll_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_melcoll_matches_wikidata.rename(columns={df_melcoll_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# Remove unwanted columns
df_melcoll_wikidata_cossim = df_melcoll_matches_wikidata[['agent_id', 'family_name', 'initials_x', 'given_names', 
                                                          'num_coll', 'start_year', 'end_year', 'full_name', 
                                                          'left_side', 'right_side', 'similarity', 'dup_count', 
                                                          'item', 'itemLabel', 'surname', 'initials_y', 'canonical_string', 
                                                          'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                          'enc_au_sc_id', 'yob', 'yod', 'wyb', 'wye', 'wikidata_link', 
                                                          'orcid_link', 'harv_link', 'ipni_link', 'bloodhound_link', 
                                                          'enc_au_sc_link', 'au_dict_bio_link']]

# Sort by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_melcoll_wikidata_cossim.sort_values(by=['similarity', 'dup_count', 'num_coll'], ascending=[False, True, False], inplace=True)

df_melcoll_wikidata_cossim

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,agent_id,family_name,initials_x,given_names,num_coll,start_year,end_year,full_name,left_side,right_side,...,yod,wyb,wye,wikidata_link,orcid_link,harv_link,ipni_link,bloodhound_link,enc_au_sc_link,au_dict_bio_link
0,1297,Beauglehole,A.C.,,69198,1929.0,2001.0,"Beauglehole, A.C.","Beauglehole, A.C.","Beauglehole, A.C.",...,2002.0,,,http://www.wikidata.org/wiki/Q16744919,,,,https://bloodhound-tracker.net/Q16744919,http://www.eoas.info/biogs/P000214b,
2,19313,Stone,I.G.,,25428,1960.0,1999.0,"Stone, I.G.","Stone, I.G.","Stone, I.G.",...,2001.0,,,http://www.wikidata.org/wiki/Q6000720,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/27255-1,,,
5,6458,Filson,R.B.,Rex,15193,1933.0,2000.0,"Filson, R.B.","Filson, R.B.","Filson, R.B.",...,,,,http://www.wikidata.org/wiki/Q21339388,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/18803-1,,http://www.eoas.info/biogs/P002347b,
6,6771,Forster,P.I.,Paul Irwin,12226,1955.0,2118.0,"Forster, P.I.","Forster, P.I.","Forster, P.I.",...,,,,http://www.wikidata.org/wiki/Q9057027,,,https://www.ipni.org/a/18907-1,,,
10,20954,Walsh,N.G.,Neville,9122,1971.0,2020.0,"Walsh, N.G.","Walsh, N.G.","Walsh, N.G.",...,,,,http://www.wikidata.org/wiki/Q6041421,,,https://www.ipni.org/a/14171-1,,http://www.eoas.info/biogs/P005329b,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,19007,St John,P.R.H.,,4402,1881.0,1999.0,"St John, P.R.H.",,,...,,,,,,,,,,
90,10624,Karunajeewa,N.G.,Nimal,2052,2002.0,2020.0,"Karunajeewa, N.G.",,,...,,,,,,,,,,
129,19624,Tadgell,A.J.,,1425,1886.0,1966.0,"Tadgell, A.J.",,,...,,,,,,,,,,
144,1176,Batianoff,G.N.,,1278,1955.0,2007.0,"Batianoff, G.N.",,,...,,,,,,,,,,


In [14]:
# Save to CSV file
df_melcoll_wikidata_cossim.to_csv('data/melcoll_wikidata_cossim.csv')

In [20]:
# Australian Collectors and illustrators
dtypes = {'coll_index': str, 
          'surname': str,
          'initials': str,
          'canonical_string': str,
          'name': str, 
          'active': str, 
          'comment': str, 
          'info_link': str, 
          'portrait_link': str}
df_chahcoll = pd.read_csv('data/chah_collectors_clean.csv', dtype=dtypes);
df_chahcoll.head()

Unnamed: 0.1,Unnamed: 0,coll_index,surname,initials,canonical_string,name,active,comment,info_link,portrait_link
0,0,c00001,ABBOTT,F.,"ABBOTT, F.","ABBOTT, Francis, Jnr",1834–1903,Gardens Superintendent,notes,portrait
1,1,c00002,ABID,M.A.,"ABID, M.A.","ABID, Munir. A., See MUNIR",,,,
2,2,c00003,ABRAHAMS,L.,"ABRAHAMS, L.","ABRAHAMS, L.",fl. 1910,,,
3,3,c00004,ABRAHAMSON,A.,"ABRAHAMSON, A.","ABRAHAMSON, Ada",fl. 1890s,"Amateur seaweed collector, WA",notes,
4,4,c00005,ACKLAND,J.J.,"ACKLAND, J.J.","ACKLAND, Judith Joan",fl. 1960s,Botanist,,


In [23]:
# Vectorize CHAH collector names (use transform())
tf_idf_matrix_dirty = vectorizer.transform(df_chahcoll['canonical_string'])

# Calculate Cosine Similarity
import time
t1 = time.time()
matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0.5)
t = time.time()-t1
print("SELFTIMED:", t)

# Get matches and create data frame
matches_df = get_matches_df(matches, df_chahcoll['canonical_string'], df_wikidata_unique['canonical_string'], top=0)

matches_df.head()

SELFTIMED: 0.25528907775878906


Unnamed: 0,left_side,right_side,similarity
0,"ABBOTT, F.","Abbott, F.",1.0
1,"ABID, M.A.","Abid, R.",0.764
2,"ABRAHAMS, L.","Abrahams, L.J.",0.918
3,"ABRAHAMSON, A.","Abrahamson, W.",0.928
4,"ACKLAND, J.J.","Ackland, E.",0.815


In [28]:
# Join matches data frame to MEL collectors data frame
df_chahcoll_matches = pd.merge(df_chahcoll, matches_df, left_on='canonical_string', right_on='left_side', how='left')

# Join Wikidata items
df_chahcoll_matches_wikidata = pd.merge(df_chahcoll_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_chahcoll_matches_wikidata = pd.merge(df_chahcoll_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_chahcoll_matches_wikidata.rename(columns={df_chahcoll_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

print(df_chahcoll_matches_wikidata.columns.tolist())

# Remove unwanted columns
df_chahcoll_wikidata_cossim = df_chahcoll_matches_wikidata[['coll_index', 'surname_x', 'initials_x', 'canonical_string_x', 'name', 'active', 'comment', 
                                                          'left_side', 'right_side', 'similarity', 'dup_count', 
                                                          'item', 'itemLabel', 'surname_y', 'initials_y', 'canonical_string_y', 
                                                          'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                          'enc_au_sc_id', 'yob', 'yod', 'wyb', 'wye', 'wikidata_link', 
                                                          'orcid_link', 'harv_link', 'ipni_link', 'bloodhound_link', 
                                                          'enc_au_sc_link', 'au_dict_bio_link']]

# Sort by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_chahcoll_wikidata_cossim.sort_values(by=['similarity', 'dup_count'], ascending=[False, True], inplace=True)

df_chahcoll_wikidata_cossim

['Unnamed: 0', 'coll_index', 'surname_x', 'initials_x', 'canonical_string_x', 'name', 'active', 'comment', 'info_link', 'portrait_link', 'left_side', 'right_side', 'similarity', 'item', 'itemLabel', 'surname_y', 'initials_y', 'canonical_string_y', 'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 'enc_au_sc_id', 'au_dict_bio', 'yob', 'yod', 'wyb', 'wye', 'wikidata_link', 'orcid_link', 'harv_link', 'ipni_link', 'bloodhound_link', 'enc_au_sc_link', 'au_dict_bio_link', ('canonical_string', ''), 'dup_count']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,coll_index,surname_x,initials_x,canonical_string_x,name,active,comment,left_side,right_side,similarity,...,yod,wyb,wye,wikidata_link,orcid_link,harv_link,ipni_link,bloodhound_link,enc_au_sc_link,au_dict_bio_link
0,c00001,ABBOTT,F.,"ABBOTT, F.","ABBOTT, Francis, Jnr",1834–1903,Gardens Superintendent,"ABBOTT, F.","Abbott, F.",1.0,...,1883.0,,,http://www.wikidata.org/wiki/Q4529517,,,,,http://www.eoas.info/biogs/P000065b,http://adb.anu.edu.au/biography/abbott-francis-4
9,c00010,ADAMS,L.G.(.,"ADAMS, L.G.(.","ADAMS, Laurence George (Laurie)*",1929–2014,"Herbarium botanist, Canberra","ADAMS, L.G.(.","Adams, L.G.",1.0,...,2014.0,,,http://www.wikidata.org/wiki/Q10316562,,,https://www.ipni.org/a/12081-1,,http://www.eoas.info/biogs/P005518b,
11,c00012,ADAMS,M.,"ADAMS, M.","ADAMS, Miss",fl. 1889,Collected on the River Murray,"ADAMS, M.","Adams, M.",1.0,...,,,,http://www.wikidata.org/wiki/Q22110151,,,,,,
12,c00013,ADAMS,N.,"ADAMS, N.","ADAMS, Nilavan",fl. 2000s,"Botanical artist, ACT","ADAMS, N.","Adams, N.",1.0,...,2007.0,,,http://www.wikidata.org/wiki/Q3297913,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/32209-1,https://bloodhound-tracker.net/Q3297913,,
13,c00014,ADAMS,R.,"ADAMS, R.","ADAMS, Robin*",1953-,Botany Teacher,"ADAMS, R.","Adams, R.",1.0,...,1911.0,,,http://www.wikidata.org/wiki/Q7348773,,,,,,http://adb.anu.edu.au/biography/adams-robert-p...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4953,c03537,YOUNG Andrew fl.1990-2010s Botanist,,YOUNG Andrew fl.1990-2010s Botanist,YOUNG Andrew fl.1990-2010s Botanist,,,,,,...,,,,,,,,,,
4961,c03541,ZACHARIN Robert Fyfe,,ZACHARIN Robert Fyfe,ZACHARIN Robert Fyfe,,,,,,...,,,,,,,,,,
4964,c03544,ZECK Emil Hermann,,ZECK Emil Hermann,ZECK Emil Hermann,1891-,?Entomologist,,,,...,,,,,,,,,,
4965,c03545,ZICH Frank Anthony,,ZICH Frank Anthony,ZICH Frank Anthony *,1968-,Herbarium curator,,,,...,,,,,,,,,,


In [29]:
# Save to CSV file
df_chahcoll_wikidata_cossim.to_csv('data/chahcoll_wikidata_cossim.csv')

In [40]:
# BG Meise collectors
dtypes = {
    'COLL_ID': str, 
    'NAME': str,
    'LNAME': str, 
    'FNAME': str,
    'MNAME': str, 
    'NAME_TO_PRINT': str, 
    'TITLE': str, 
    'POSITION': str, 
    'BIRTH_DT': str, 
    'DEATH_DT': str, 
    'COUNTRY_ONE_LINE': str, 
    'BIRTH_DT': str, 
    'DEATH_DT': str, 
    'COUNTRY_CODE': str
}
df_bgmcoll = pd.read_csv('data/COLLECTORS_28OCT19.TXT', sep='\t', dtype=dtypes)

match_name = []
for i, row in df_bgmcoll.iterrows():
    nstr = str(row['LNAME'])
    fname = str(row['FNAME'])
    mname = str(row['MNAME'])
    if len(fname) > 0:
        words = re.split('[ .]', fname)
        words = [string for string in words if string != ""]
        init = ".".join([x[0] for x in words if len(x) > 0]) + '.'
        nstr = nstr + ', ' + init
        if len(mname) > 0:
            words = re.split('[ .]', mname)
            words = [string for string in words if string != ""]
            init = ".".join([x[0] for x in words if len(x) > 0]) + '.'
            nstr = nstr + init
    match_name.append(nstr)
    
df_bgmcoll['match_name'] = match_name
df_bgmcoll = df_bgmcoll[['COLL_ID', 'NAME', 'LNAME', 'FNAME', 'MNAME', 'match_name']]
df_bgmcoll

Unnamed: 0,COLL_ID,NAME,LNAME,FNAME,MNAME,match_name
0,L789,L. Leclercq,Leclercq,L.,,"Leclercq, L.n."
1,PLZGE,Pilz G.E.,Pilz,George,E.,"Pilz, G.E."
2,CLIPP,De Clippele,De Clippele,,,"De Clippele, n.n."
3,GRNWL,Greenstock W.,Greenstock,William,,"Greenstock, W.n."
4,R150,Renauld F.,Renauld,F.,,"Renauld, F.n."
...,...,...,...,...,...,...
9911,CARLF,Carlier F.-X.,Carlier,François-Xavier,,"Carlier, F.n."
9912,PANIG,Panigrahi G.,Panigrahi,G.,,"Panigrahi, G.n."
9913,FELFTGI,Feltgen I.,Feltgen,,I.,"Feltgen, n.I."
9914,DEFLD,Defleur D.,Defleur,David,,"Defleur, D.n."


In [41]:
# Vectorize CHAH collector names (use transform())
tf_idf_matrix_dirty = vectorizer.transform(df_bgmcoll['match_name'])

# Calculate Cosine Similarity
import time
t1 = time.time()
matches = awesome_cossim_topn(tf_idf_matrix_dirty, tf_idf_matrix_clean.transpose(), 1, 0.5)
t = time.time()-t1
print("SELFTIMED:", t)

# Get matches and create data frame
matches_df = get_matches_df(matches, df_bgmcoll['match_name'], df_wikidata_unique['canonical_string'], top=0)

matches_df.head()

SELFTIMED: 0.8342916965484619


Unnamed: 0,left_side,right_side,similarity
0,"Leclercq, L.n.","Leclercq, S.",0.757
1,"Pilz, G.E.","Pilz, G.E.",1.0
2,"Greenstock, W.n.","Bostock, W.",0.546
3,"Renauld, F.n.","Renauld, F.F.G.",0.801
4,"Maula, C.n.","Paula, C.C.d.",0.594


In [44]:
# Join matches data frame to MEL collectors data frame
df_bgmcoll_matches = pd.merge(df_bgmcoll, matches_df, left_on='match_name', right_on='left_side', how='left')

# Join Wikidata items
df_bgmcoll_matches_wikidata = pd.merge(df_bgmcoll_matches, df_wikidata, left_on='right_side', right_on='canonical_string', how='left')
df_bgmcoll_matches_wikidata = pd.merge(df_bgmcoll_matches_wikidata, df_wikidata_unique, left_on='right_side', right_on='canonical_string', how='left')
df_bgmcoll_matches_wikidata.rename(columns={df_bgmcoll_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# Remove unwanted columns
df_bgmcoll_wikidata_cossim = df_bgmcoll_matches_wikidata[['COLL_ID', 'NAME', 'LNAME', 'FNAME', 'MNAME', 'match_name', 
                                                          'left_side', 'right_side', 'similarity', 'dup_count', 
                                                          'item', 'itemLabel', 'surname', 'initials', 'canonical_string', 
                                                          'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'bloodhound_id', 
                                                          'enc_au_sc_id', 'au_dict_bio', 'yob', 'yod', 'wyb', 'wye', 
                                                          'wikidata_link', 'orcid_link', 'harv_link', 'ipni_link', 
                                                          'bloodhound_link', 'enc_au_sc_link', 'au_dict_bio_link']]

# Sort by similarity (desc), number of Wikidata items (asc) and number of collections (desc)
df_bgmcoll_wikidata_cossim.sort_values(by=['similarity', 'dup_count'], ascending=[False, True], inplace=True)

df_bgmcoll_wikidata_cossim

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,COLL_ID,NAME,LNAME,FNAME,MNAME,match_name,left_side,right_side,similarity,dup_count,...,yod,wyb,wye,wikidata_link,orcid_link,harv_link,ipni_link,bloodhound_link,enc_au_sc_link,au_dict_bio_link
2,PLZGE,Pilz G.E.,Pilz,George,E.,"Pilz, G.E.","Pilz, G.E.","Pilz, G.E.",1.0,1.0,...,2017.0,,,http://www.wikidata.org/wiki/Q21389330,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/24871-1,,,
8,D121,Debeaux O.,Debeaux,Jean,Odon,"Debeaux, J.O.","Debeaux, J.O.","Debeaux, J.O.",1.0,1.0,...,1910.0,,,http://www.wikidata.org/wiki/Q3173773,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/2058-1,,,
21,MLLNG,Miller N.G.,Miller,Norton,George,"Miller, N.G.","Miller, N.G.","Miller, N.G.",1.0,1.0,...,2011.0,,,http://www.wikidata.org/wiki/Q21521514,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/23405-1,,,
24,MEIKR,Meikle R.D.,Meikle,Robert,Desmond,"Meikle, R.D.","Meikle, R.D.","Meikle, R.D.",1.0,1.0,...,,,,http://www.wikidata.org/wiki/Q1309063,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/6316-1,,,
46,L952,Laurer J.F.,Laurer,Johann,Friedrich,"Laurer, J.F.","Laurer, J.F.","Laurer, J.F.",1.0,1.0,...,1873.0,,,http://www.wikidata.org/wiki/Q21341241,,https://kiki.huh.harvard.edu/databases/botanis...,https://www.ipni.org/a/5328-1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11345,K050,Kristof L.,Kristof,Lorenz,,"Kristof, L.n.",,,,,...,,,,,,,,,,
11353,L108,Laessoe Th.,Laessoe,Th.,,"Laessoe, T.n.",,,,,...,,,,,,,,,,
11354,L098,Lemyel,Lemyel,,,"Lemyel, n.n.",,,,,...,,,,,,,,,,
11359,VDZOA,Van der Zon A.P.M.,Van der Zon,A.P.M.,,"Van der Zon, A.P.M.n.",,,,,...,,,,,,,,,,


In [45]:
# Save as CSV file
df_bgmcoll_wikidata_cossim.to_csv('data/bgmcoll_wikidata_cossim.csv')