# Match AVH collectors to Wikidata items

In [1]:
import pandas as pd

### Load Wikidata data set

[Jupyter Notebook for creating the Wikidata data set](./create_wikidata_dataset.ipynb)

Out of the Wikidata items data set we create a data frame with unique canonical name strings and their counts.

In [2]:
wikidata = pd.read_csv('data/wikidata_persons.csv')
wikidata = wikidata.iloc[:, 1:]

wd_test = wikidata.groupby('canonical_string').agg({'item': ['count']}).reset_index()

print(wd_test.tail())

# colls = list(wikidata.columns)
# wikidata = wikidata[[colls[-1]] + colls[0:-1]]

  interactivity=interactivity, compiler=compiler, result=result)


      canonical_string  item
                       count
95344        Ǒwaki, K.     1
95345   Șerbanescu, I.     1
95346     Șuster, P.M.     1
95347          Șık, L.     1
95348         Țopa, E.     1


### Load AVH collectors data set

[Jupyter Notebook for creating the AVH collectors data set](./create_avh_collectors_dataset.ipynb)

In [3]:
avh = pd.read_csv('data/avh_collectors.csv')
avh = avh.iloc[:, 1:]

print(avh.head())

               label                     i18nCode  count  \
0  Beauglehole, A.C.  collector.Beauglehole, A.C.  90942   
1      Forster, P.I.      collector.Forster, P.I.  64649   
2         Hyland, B.         collector.Hyland, B.  57265   
3           Latz, P.           collector.Latz, P.  51230   
4      Streimann, H.      collector.Streimann, H.  45346   

                              fq  start_date  end_date  activity_span  
0  collector:"Beauglehole, A.C."      1865.0    2005.0          140.0  
1      collector:"Forster, P.I."      1955.0    2018.0           63.0  
2         collector:"Hyland, B."      1952.0    2008.0           56.0  
3           collector:"Latz, P."      1875.0    2019.0          144.0  
4      collector:"Streimann, H."      1896.0    2001.0          105.0  


### Set up the text search

See https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536

The ngrams function is used as an analyzer in the text search later.

In [4]:
import re
!pip install ftfy # amazing text cleaning for decode issues..
from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


print(ngrams('Klazenga, N.'))
print(ngrams(avh.loc[0, 'label']))
print(ngrams(wd_test.loc[0, 'canonical_string'].values[0]))


[' Kl', 'Kla', 'laz', 'aze', 'zen', 'eng', 'nga', 'ga ', 'a N', ' N ']
[' Be', 'Bea', 'eau', 'aug', 'ugl', 'gle', 'leh', 'eho', 'hol', 'ole', 'le ', 'e A', ' Ac', 'Ac ']
[' Wa', 'Wal', 'alr', 'lra', 'rae', 'aev', 'eve', 'ven', 'ens', 'ns ', 's O', ' Oh', 'Oh ']


Vectorize Wikidata names...

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

wikidata_names = wd_test['canonical_string']

# vectorize wikidata names
print('Vectorizing data. This may take a while...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(wikidata_names)
print('Vectorizing completed')


Vectorizing data. This may take a while...
Vectorizing completed


Set up the function that performs the nearest neighbour matches...

In [6]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) # tfidf contains the vectorized wikidata names from the previous step

# matching query
def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices


### Perform the matching

Perform the NN matches on the AVH collector names and create a data frame with matches...

In [7]:
avh_names = set(avh['label'].values) # convert list to set for better performance

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(avh_names)
duration = time.time() - start
print('Completed in:', duration, 's')

avh_names = list(avh_names) # convert back to list

print('Finding matches...')
matches = []
for i,j in enumerate(indices):
  temp = [avh_names[i], wd_test.values[j][0][0], round(distances[i][0],2)]
  matches.append(temp)

print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['name','matched_name','distance'])
print('Done') 

matches = matches.sort_values(['distance'])
matches = matches.reset_index()

matches.head()

Getting nearest neighbours...
Completed in: 1.8628828525543213 s
Finding matches...
Building data frame...
Done
   index                     name      matched_name  distance
0      0  Cooper, W. | Cooper, W.        Cooper, W.       0.0
1    393            Wallace, B.J.     Wallace, B.J.       0.0
2    392                Ashby, E.         Ashby, E.       0.0
3    391               Hunter, J.        Hunter, J.       0.0
4    390         Waterhouse, J.T.  Waterhouse, J.T.       0.0


### Create output

Link the matches data frame back to the AVH collectors and Wikidata items data frames...

In [8]:
# join matches data frame back to avh dataframe 
avh_matches = pd.merge(avh, matches, left_on='label', right_on='name')

avh_matches.head()

Unnamed: 0,label,i18nCode,count,fq,start_date,end_date,activity_span,index,name,matched_name,distance
0,"Beauglehole, A.C.","collector.Beauglehole, A.C.",90942,"collector:""Beauglehole, A.C.""",1865.0,2005.0,140.0,184,"Beauglehole, A.C.","Beaglehole, E.",0.86
1,"Forster, P.I.","collector.Forster, P.I.",64649,"collector:""Forster, P.I.""",1955.0,2018.0,63.0,41,"Forster, P.I.","Forster, P.I.",0.0
2,"Hyland, B.","collector.Hyland, B.",57265,"collector:""Hyland, B.""",1952.0,2008.0,56.0,43,"Hyland, B.","Hyland, B.",0.0
3,"Latz, P.","collector.Latz, P.",51230,"collector:""Latz, P.""",1875.0,2019.0,144.0,609,"Latz, P.","Latz, P.K.",0.69
4,"Streimann, H.","collector.Streimann, H.",45346,"collector:""Streimann, H.""",1896.0,2001.0,105.0,123,"Streimann, H.","Streimann, H.",0.0


With grouped Wikidata items:

In [24]:
# link counts of wikidata items with canonical name string
avh_matches_g1 = pd.merge(avh_matches, wd_test, left_on='matched_name', right_on='canonical_string')
avh_matches_g1.rename(columns = {list(avh_matches_g1)[-1]: 'item_count'}, inplace=True)

# link wikidata items with canonical name string (pipe separated if more than one)
print('Creating items aggregate...')
wikidata_uniq_items = wikidata.groupby(['canonical_string'])['item'].apply('|'.join).reset_index()
print('Done.')
avh_matches_g2 = pd.merge(avh_matches_g1, wikidata_uniq_items, left_on='matched_name', right_on='canonical_string')
avh_matches_g2.rename(columns = {list(avh_matches_g2)[-1]: 'items'}, inplace=True)

# link wikidata items with canonical name string (pipe separated if more than one)
print('Creating item labels aggregate...')
wikidata_uniq_itemlabels = wikidata.groupby(['canonical_string'])['itemLabel'].apply('|'.join).reset_index()
print('Done.')
avh_matches_g3 = pd.merge(avh_matches_g2, wikidata_uniq_itemlabels, left_on='matched_name', right_on='canonical_string')
avh_matches_g3.rename(columns = {list(avh_matches_g3)[-1]: 'item_labels'}, inplace=True)

# Remove superfluous columns
avh_matches_group = avh_matches_g3[['label', 'count', 'start_date', 'end_date', 'activity_span', 
                                   'name', 'matched_name', 'distance', 'item_count', 
                                   'items', 'item_labels']]
avh_matches_group.sort_values(by=['distance', 'item_count', 'count'], inplace=True)

avh_matches_group.head()

Creating items aggregate...
Done.
Creating item labels aggregate...
Done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,name,matched_name,distance,item_count,items,item_labels
992,"Hamilton, A.G.",692,1882.0,1992.0,110.0,"Hamilton, A.G.","Hamilton, A.G.",0.0,1,http://www.wikidata.org/entity/Q21514592,Alexandra Greenlaw Hamilton
990,"Bayer, R.J.",693,1944.0,2009.0,65.0,"Bayer, R.J.","Bayer, R.J.",0.0,1,http://www.wikidata.org/entity/Q7291564,Randall James Bayer
991,"Catcheside, D.G.",693,1951.0,1991.0,40.0,"Catcheside, D.G.","Catcheside, D.G.",0.0,1,http://www.wikidata.org/entity/Q21165808,David Guthrie Catcheside
989,"Wright, G.T.",694,1998.0,2019.0,21.0,"Wright, G.T.","Wright, G.T.",0.0,1,http://www.wikidata.org/entity/Q59606231,Genevieve T. Wright
674,"Scarlett, N.",697,1947.0,1998.0,51.0,"Scarlett, N.","Scarlett, N.",0.0,1,http://www.wikidata.org/entity/Q19004122,N.H. Scarlett


Save the results...

In [25]:
avh_matches_group.to_csv('data/avhcoll_wikidata_matches_group.csv')

[CSV file with match results](./data/avhcoll_wikidata_matches_group.csv) (This downloads the file to your computer.)

With individual Wikidata items:

In [32]:
# join wikidata items to avh collectors matches
avh_matches_t1 = pd.merge(avh_matches, wikidata, left_on='matched_name', right_on='canonical_string')
# avh_matches_t1.drop(columns=['canonical_string'])

# link counts of wikidata items with same canonical name string
avh_matches_t2 = pd.merge(avh_matches_t1, wd_test, left_on="matched_name", right_on="canonical_string")
avh_matches_t2.rename(columns = {list(avh_matches_t2)[-1]: 'dup_count'}, inplace=True)

# Clean up data frame by removing duplicated columns
# # print(list(avh_matches_t2.columns))
cols = ['label', 'count', 'start_date', 'end_date', 'activity_span', 
        'name', 'matched_name', 'distance', 'dup_count', 
        'item', 'itemLabel', 'surname', 'initials', 'canonical_string', 
        'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 
        'yob', 'yod', 'wyb', 'wye']
avh_matches_indiv = avh_matches_t2[cols]

# Order rows by NN distance and dup. count
avh_matches_indiv.sort_values(['distance', 'dup_count', 'count'], ascending=[True, True, False], inplace=True)
avh_matches_indiv.reset_index(inplace=True)

avh_matches_indiv = avh_matches_indiv.iloc[:,1:]

avh_matches_indiv.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,name,matched_name,distance,dup_count,item,...,orcid,viaf,isni,harv,ipni,abbr,yob,yod,wyb,wye
0,"Forster, P.I.",64649,1955.0,2018.0,63.0,"Forster, P.I.","Forster, P.I.",0.0,1,http://www.wikidata.org/entity/Q9057027,...,,,,,18907-1,P.I.Forst.,1961.0,,,
1,"Hyland, B.",57265,1952.0,2008.0,56.0,"Hyland, B.","Hyland, B.",0.0,1,http://www.wikidata.org/entity/Q4893242,...,,,,,4262-1,B.Hyland,1937.0,,,
2,"Streimann, H.",45346,1896.0,2001.0,105.0,"Streimann, H.","Streimann, H.",0.0,1,http://www.wikidata.org/entity/Q21339679,...,,69178760.0,0000 0001 1573 7118,2053.0,15669-1,Streimann,1938.0,2001.0,,
3,"Elix, J.A.",39702,1878.0,2020.0,142.0,"Elix, J.A.","Elix, J.A.",0.0,1,http://www.wikidata.org/entity/Q21339171,...,,5178780.0,0000 0000 8084 828X,93027.0,18445-1,Elix,1941.0,,,
4,"Blake, S.T.",33031,1846.0,1996.0,150.0,"Blake, S.T.","Blake, S.T.",0.0,1,http://www.wikidata.org/entity/Q2984580,...,,304022808.0,0000 0000 6521 9151,13863.0,838-1,S.T.Blake,1910.0,1973.0,,


Save the results...

In [33]:
avh_matches_indiv.to_csv('data/avhcoll_wikidata_matches_indiv.csv')

[CSV file with match results](./data/avhcoll_wikidata_matches_indiv.csv) (This downloads the file to your computer.)

Explanation of columns:

Column | Description
-|-
**AVH collectors** |
label | Collector name string from AVH
count | Number of records with this collector name string in AVH
start_date | Year of first collection
end_date | Year of last collection
activity_span | Number of years between first and last collection
**Name matching** |
name | input name; = AVH collector name string
matched_name | matched name; = Wikidata item label name is matched to
distance | Nearest Neighbour distance between the name and matched name; the lower the value, the better the match
**Wikidata**
item | Wikidata Item ID (URL)
itemLabel | Wikidata Item label
surname	| Surname; derived from item label
initials | Initials; derived from item label
canonical_string | Canonical name string; derived from item label, used for matching
orcid | ORCID ([P496](https://www.wikidata.org/wiki/Property:P496))
viaf | VIAF ID ([P214](https://www.wikidata.org/wiki/Property:P214))
isni | ISNI ID ([P213](https://www.wikidata.org/wiki/Property:P496))	
harv | Harvard Index of Botanists ID ([P6264](https://www.wikidata.org/wiki/Property:P6264))
ipni | IPNI author ID ([P586](https://www.wikidata.org/wiki/Property:P586))
abbr | botanist author abbreviation (standard form) ([P428](https://www.wikidata.org/wiki/Property:P428))
yob	| Year of birth (derived from [P569](https://www.wikidata.org/wiki/Property:P569))
yod	| Year of death (derived from [P496](https://www.wikidata.org/wiki/Property:P570))
wyb	| Start year of work period ([P2031](https://www.wikidata.org/wiki/Property:P2031))
wye | End year of work period ([P2032](https://www.wikidata.org/wiki/Property:P2032))
