# Match AVH collectors to Wikidata items

In [1]:
import pandas as pd

### Load Wikidata data set

[Jupyter Notebook for creating the Wikidata data set](./create_wikidata_dataset.ipynb)

Out of the Wikidata items data set we create a data frame with unique canonical name strings and their counts.

In [2]:
wikidata = pd.read_csv('data/wikidata_persons.csv')
wikidata = wikidata.iloc[:, 1:]

wd_test = wikidata.groupby('canonical_string').agg({'item': ['count']}).reset_index()

print(wd_test.tail())

# colls = list(wikidata.columns)
# wikidata = wikidata[[colls[-1]] + colls[0:-1]]

  interactivity=interactivity, compiler=compiler, result=result)


      canonical_string  item
                       count
98085        Ǒwaki, K.     1
98086   Șerbanescu, I.     1
98087     Șuster, P.M.     1
98088          Șık, L.     1
98089         Țopa, E.     1


### Load AVH collectors data set

[Jupyter Notebook for creating the AVH collectors data set](./create_avh_collectors_dataset.ipynb)

In [3]:
avh = pd.read_csv('data/avh_collectors.csv')
avh = avh.iloc[:, 1:]

print(avh.head())

               label                     i18nCode  count  \
0  Beauglehole, A.C.  collector.Beauglehole, A.C.  90942   
1      Forster, P.I.      collector.Forster, P.I.  64649   
2         Hyland, B.         collector.Hyland, B.  57265   
3           Latz, P.           collector.Latz, P.  51230   
4      Streimann, H.      collector.Streimann, H.  45346   

                              fq  start_date  end_date  activity_span  
0  collector:"Beauglehole, A.C."      1865.0    2005.0          140.0  
1      collector:"Forster, P.I."      1955.0    2018.0           63.0  
2         collector:"Hyland, B."      1952.0    2008.0           56.0  
3           collector:"Latz, P."      1875.0    2019.0          144.0  
4      collector:"Streimann, H."      1896.0    2001.0          105.0  


### Set up the text search

See https://towardsdatascience.com/fuzzy-matching-at-scale-84f2bfd0c536

The ngrams function is used as an analyzer in the text search later.

In [4]:
import re
!pip install ftfy # amazing text cleaning for decode issues..
from ftfy import fix_text

def ngrams(string, n=3):
    string = fix_text(string) # fix text
    string = string.encode("ascii", errors="ignore").decode() #remove non ascii chars
    string = string.lower()
    chars_to_remove = [")","(",".","|","[","]","{","}","'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    string = re.sub(rx, '', string)
    string = string.replace('&', 'and')
    string = string.replace(',', ' ')
    string = string.replace('-', ' ')
    string = string.title() # normalise case - capital at start of each word
    string = re.sub(' +',' ',string).strip() # get rid of multiple spaces and replace with a single
    string = ' '+ string +' ' # pad names for ngrams...
    string = re.sub(r'[,-./]|\sBD',r'', string)
    ngrams = zip(*[string[i:] for i in range(n)])
    return [''.join(ngram) for ngram in ngrams]


print(ngrams('Klazenga, N.'))
print(ngrams(avh.loc[0, 'label']))
print(ngrams(wd_test.loc[0, 'canonical_string'].values[0]))


[' Kl', 'Kla', 'laz', 'aze', 'zen', 'eng', 'nga', 'ga ', 'a N', ' N ']
[' Be', 'Bea', 'eau', 'aug', 'ugl', 'gle', 'leh', 'eho', 'hol', 'ole', 'le ', 'e A', ' Ac', 'Ac ']
[' Wa', 'Wal', 'alr', 'lra', 'rae', 'aev', 'eve', 'ven', 'ens', 'ns ', 's O', ' Oh', 'Oh ']


Vectorize Wikidata names...

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

wikidata_names = wd_test['canonical_string']

# vectorize wikidata names
print('Vectorizing data. This may take a while...')
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams, lowercase=False)
tfidf = vectorizer.fit_transform(wikidata_names)
print('Vectorizing completed')


Vectorizing data. This may take a while...
Vectorizing completed


Set up the function that performs the nearest neighbour matches...

In [6]:
from sklearn.neighbors import NearestNeighbors

nbrs = NearestNeighbors(n_neighbors=1, n_jobs=-1).fit(tfidf) # tfidf contains the vectorized wikidata names from the previous step

# matching query
def getNearestN(query):
  queryTFIDF_ = vectorizer.transform(query)
  distances, indices = nbrs.kneighbors(queryTFIDF_)
  return distances, indices


### Perform the matching

Perform the NN matches on the AVH collector names and create a data frame with matches...

In [7]:
avh_names = set(avh['label'].values) # convert list to set for better performance

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(avh_names)
duration = time.time() - start
print('Completed in:', duration, 's')

avh_names = list(avh_names) # convert back to list

print('Finding matches...')
matches = []
for i,j in enumerate(indices):
  temp = [avh_names[i], wd_test.values[j][0][0], round(distances[i][0],2)]
  matches.append(temp)

print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['name','matched_name','distance'])
print('Done') 

matches = matches.sort_values(['distance'])
matches = matches.reset_index()

matches.head()

Getting nearest neighbours...
Completed in: 1.9424328804016113 s
Finding matches...
Building data frame...
Done


Unnamed: 0,index,name,matched_name,distance
0,500,"Young, A.","Young, A.",0.0
1,507,"Norris, D.H.","Norris, D.H.",0.0
2,517,"Hall, N.","Hallé, N.",0.0
3,520,"Jackson, W.D.","Jackson, W.D.",0.0
4,521,"Baker, M.L.","Baker, M.L.",0.0


### Create output

Link the matches data frame back to the AVH collectors and Wikidata items data frames...

In [8]:
# join matches data frame back to avh dataframe 
avh_matches = pd.merge(avh, matches, left_on='label', right_on='name')

avh_matches.head()

Unnamed: 0,label,i18nCode,count,fq,start_date,end_date,activity_span,index,name,matched_name,distance
0,"Beauglehole, A.C.","collector.Beauglehole, A.C.",90942,"collector:""Beauglehole, A.C.""",1865.0,2005.0,140.0,936,"Beauglehole, A.C.","Beauglehole, A.C.",0.0
1,"Forster, P.I.","collector.Forster, P.I.",64649,"collector:""Forster, P.I.""",1955.0,2018.0,63.0,912,"Forster, P.I.","Forster, P.I.",0.0
2,"Hyland, B.","collector.Hyland, B.",57265,"collector:""Hyland, B.""",1952.0,2008.0,56.0,781,"Hyland, B.","Hyland, B.",0.0
3,"Latz, P.","collector.Latz, P.",51230,"collector:""Latz, P.""",1875.0,2019.0,144.0,919,"Latz, P.","Latz, P.K.",0.68
4,"Streimann, H.","collector.Streimann, H.",45346,"collector:""Streimann, H.""",1896.0,2001.0,105.0,608,"Streimann, H.","Streimann, H.",0.0


With grouped Wikidata items:

In [9]:
# link counts of wikidata items with canonical name string
avh_matches_g1 = pd.merge(avh_matches, wd_test, left_on='matched_name', right_on='canonical_string')
avh_matches_g1.rename(columns = {list(avh_matches_g1)[-1]: 'item_count'}, inplace=True)

# link wikidata items with canonical name string (pipe separated if more than one)
print('Creating items aggregate...')
wikidata_uniq_items = wikidata.groupby(['canonical_string'])['item'].apply('|'.join).reset_index()
print('Done.')
avh_matches_g2 = pd.merge(avh_matches_g1, wikidata_uniq_items, left_on='matched_name', right_on='canonical_string')
avh_matches_g2.rename(columns = {list(avh_matches_g2)[-1]: 'items'}, inplace=True)

# link wikidata items with canonical name string (pipe separated if more than one)
print('Creating item labels aggregate...')
wikidata_uniq_itemlabels = wikidata.groupby(['canonical_string'])['itemLabel'].apply('|'.join).reset_index()
print('Done.')
avh_matches_g3 = pd.merge(avh_matches_g2, wikidata_uniq_itemlabels, left_on='matched_name', right_on='canonical_string')
avh_matches_g3.rename(columns = {list(avh_matches_g3)[-1]: 'item_labels'}, inplace=True)

# Remove superfluous columns
avh_matches_group = avh_matches_g3[['label', 'count', 'start_date', 'end_date', 'activity_span', 
                                   'name', 'matched_name', 'distance', 'item_count', 
                                   'items', 'item_labels']]
avh_matches_group.sort_values(by=['distance', 'item_count', 'count'], inplace=True)

avh_matches_group.head()



Creating items aggregate...
Done.
Creating item labels aggregate...
Done.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,name,matched_name,distance,item_count,items,item_labels
993,"Forrest, A.",687,1818.0,1994.0,176.0,"Forrest, A.","Forrest, A.",0.0,1,http://www.wikidata.org/entity/Q706734,Alexander Forrest
991,"Hamilton, A.G.",692,1882.0,1992.0,110.0,"Hamilton, A.G.","Hamilton, A.G.",0.0,1,http://www.wikidata.org/entity/Q21514592,Alexandra Greenlaw Hamilton
989,"Bayer, R.J.",693,1944.0,2009.0,65.0,"Bayer, R.J.","Bayer, R.J.",0.0,1,http://www.wikidata.org/entity/Q7291564,Randall James Bayer
990,"Catcheside, D.G.",693,1951.0,1991.0,40.0,"Catcheside, D.G.","Catcheside, D.G.",0.0,1,http://www.wikidata.org/entity/Q21165808,David Guthrie Catcheside
988,"Wright, G.T.",694,1998.0,2019.0,21.0,"Wright, G.T.","Wright, G.T.",0.0,1,http://www.wikidata.org/entity/Q59606231,Genevieve T. Wright


Save the results...

In [10]:
avh_matches_group.to_csv('data/avhcoll_wikidata_matches_group.csv')

[CSV file with match results](./data/avhcoll_wikidata_matches_group.csv)

With individual Wikidata items:

In [11]:
# join wikidata items to avh collectors matches
avh_matches_t1 = pd.merge(avh_matches, wikidata, left_on='matched_name', right_on='canonical_string')
# avh_matches_t1.drop(columns=['canonical_string'])

# link counts of wikidata items with same canonical name string
avh_matches_t2 = pd.merge(avh_matches_t1, wd_test, left_on="matched_name", right_on="canonical_string")
avh_matches_t2.rename(columns = {list(avh_matches_t2.columns)[-1]: 'dup_count'}, inplace=True)

# Clean up data frame by removing duplicated columns
# # print(list(avh_matches_t2.columns))
cols = ['label', 'count', 'start_date', 'end_date', 'activity_span', 
        'name', 'matched_name', 'distance', 'dup_count', 
        'item', 'itemLabel', 'surname', 'initials', 'canonical_string', 
        'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 
        'yob', 'yod', 'wyb', 'wye']
avh_matches_indiv = avh_matches_t2[cols]

# Order rows by NN distance and dup. count
avh_matches_indiv.sort_values(['distance', 'dup_count', 'count'], ascending=[True, True, False], inplace=True)
avh_matches_indiv.reset_index(inplace=True)

avh_matches_indiv = avh_matches_indiv.iloc[:,1:]

avh_matches_indiv.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,count,start_date,end_date,activity_span,name,matched_name,distance,dup_count,item,...,orcid,viaf,isni,harv,ipni,abbr,yob,yod,wyb,wye
0,"Beauglehole, A.C.",90942,1865.0,2005.0,140.0,"Beauglehole, A.C.","Beauglehole, A.C.",0.0,1,http://www.wikidata.org/entity/Q16744919,...,,,,,,,1920.0,2002.0,,
1,"Forster, P.I.",64649,1955.0,2018.0,63.0,"Forster, P.I.","Forster, P.I.",0.0,1,http://www.wikidata.org/entity/Q9057027,...,,,,,18907-1,P.I.Forst.,1961.0,,,
2,"Hyland, B.",57265,1952.0,2008.0,56.0,"Hyland, B.","Hyland, B.",0.0,1,http://www.wikidata.org/entity/Q4893242,...,,,,,4262-1,B.Hyland,1937.0,,,
3,"Streimann, H.",45346,1896.0,2001.0,105.0,"Streimann, H.","Streimann, H.",0.0,1,http://www.wikidata.org/entity/Q21339679,...,,69178760.0,0000 0001 1573 7118,2053.0,15669-1,Streimann,1938.0,2001.0,,
4,"Elix, J.A.",39702,1878.0,2020.0,142.0,"Elix, J.A.","Elix, J.A.",0.0,1,http://www.wikidata.org/entity/Q21339171,...,,5178780.0,0000 0000 8084 828X,93027.0,18445-1,Elix,1941.0,,,


Save the results...

In [12]:
avh_matches_indiv.to_csv('data/avhcoll_wikidata_matches_indiv.csv')

[CSV file with match results](./data/avhcoll_wikidata_matches_indiv.csv)

Explanation of columns:

Column | Description
-|-
**AVH collectors** |
label | Collector name string from AVH
count | Number of records with this collector name string in AVH
start_date | Year of first collection
end_date | Year of last collection
activity_span | Number of years between first and last collection
**Name matching** |
name | input name; = AVH collector name string
matched_name | matched name; = Wikidata item label name is matched to
distance | Nearest Neighbour distance between the name and matched name; the lower the value, the better the match
**Wikidata** |
item | Wikidata Item ID (URL)
itemLabel | Wikidata Item label
surname	| Surname; derived from item label
initials | Initials; derived from item label
canonical_string | Canonical name string; derived from item label, used for matching
orcid | ORCID ([P496](https://www.wikidata.org/wiki/Property:P496))
viaf | VIAF ID ([P214](https://www.wikidata.org/wiki/Property:P214))
isni | ISNI ID ([P213](https://www.wikidata.org/wiki/Property:P496))	
harv | Harvard Index of Botanists ID ([P6264](https://www.wikidata.org/wiki/Property:P6264))
ipni | IPNI author ID ([P586](https://www.wikidata.org/wiki/Property:P586))
abbr | botanist author abbreviation (standard form) ([P428](https://www.wikidata.org/wiki/Property:P428))
yob	| Year of birth (derived from [P569](https://www.wikidata.org/wiki/Property:P569))
yod	| Year of death (derived from [P496](https://www.wikidata.org/wiki/Property:P570))
wyb	| Start year of work period ([P2031](https://www.wikidata.org/wiki/Property:P2031))
wye | End year of work period ([P2032](https://www.wikidata.org/wiki/Property:P2032))


## CHAH collectors

In [13]:
chah = pd.read_csv('data/chah_collectors_clean.csv')
chah = chah.iloc[:, 1:6]
chah.rename(columns={'name': 'raw_name', 'canonical_string': 'clean_name'}, inplace=True)
chah.head()

Unnamed: 0,coll_index,surname,initials,clean_name,raw_name
0,c00001,ABBOTT,F.,"ABBOTT, F.","ABBOTT, Francis, Jnr"
1,c00002,ABID,M.A.,"ABID, M.A.","ABID, Munir. A., See MUNIR"
2,c00003,ABRAHAMS,L.,"ABRAHAMS, L.","ABRAHAMS, L."
3,c00004,ABRAHAMSON,A.,"ABRAHAMSON, A.","ABRAHAMSON, Ada"
4,c00005,ACKLAND,J.J.,"ACKLAND, J.J.","ACKLAND, Judith Joan"


In [14]:
chah_names = set(chah['clean_name'].values) # convert list to set for better performance

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(chah_names)
duration = time.time() - start
print('Completed in:', duration, 's')

chah_names = list(chah_names) # convert back to list

print('Finding matches...')
matches = []
for i,j in enumerate(indices):
  temp = [chah_names[i], wd_test.values[j][0][0], round(distances[i][0],2)]
  matches.append(temp)

print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['name','matched_name','distance'])
print('Done') 

matches = matches.sort_values(['distance'])
matches = matches.reset_index()

matches.head()

Getting nearest neighbours...
Completed in: 6.594850540161133 s
Finding matches...
Building data frame...
Done


Unnamed: 0,index,name,matched_name,distance
0,1748,"BRITTAN, N.H.","Brittan, N.H.",0.0
1,1745,"WALKER, M.","Walker, M.",0.0
2,1741,"BRIGGS, B.G.","Briggs, B.G.",0.0
3,1739,"HOOKER, J.D.","Hooker, J.D.",0.0
4,1738,"COSTERMANS, L.F.","Costermans, L.F.",0.0


In [15]:
chah_matches = pd.merge(chah, matches, left_on='clean_name', right_on='name')

chah_matches_t1 = pd.merge(chah_matches, wikidata, left_on='matched_name', right_on='canonical_string')

chah_matches_t2 = pd.merge(chah_matches_t1, wd_test, left_on='matched_name', right_on='canonical_string')
chah_matches_t2.rename(columns={chah_matches_t2.columns.tolist()[-1]: 'dup_count'}, inplace=True)

chah_matches_indiv = chah_matches_t2[['coll_index', 'surname_x', 'initials_x', 'clean_name', 'raw_name', 'name', 'matched_name', 'distance', 'dup_count', 'item', 'itemLabel', 'surname_y', 'initials_y', 'canonical_string', 'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 'yob', 'yod', 'wyb', 'wye']]
chah_matches_indiv.sort_values(['distance', 'dup_count'], inplace=True)
chah_matches_indiv.reset_index(inplace=True)
chah_matches_indiv.drop(columns=['index'], inplace=True)

chah_matches_indiv.to_csv('data/chahcoll_wikidata_matches_indiv.csv')
chah_matches_indiv.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,coll_index,surname_x,initials_x,clean_name,raw_name,name,matched_name,distance,dup_count,item,...,orcid,viaf,isni,harv,ipni,abbr,yob,yod,wyb,wye
0,c00001,ABBOTT,F.,"ABBOTT, F.","ABBOTT, Francis, Jnr","ABBOTT, F.","Abbott, F.",0.0,1,http://www.wikidata.org/entity/Q4529517,...,,,,,,,1799.0,1883.0,,
1,c00010,ADAMS,L.G.(.,"ADAMS, L.G.(.","ADAMS, Laurence George (Laurie)*","ADAMS, L.G.(.","Adams, L.G.",0.0,1,http://www.wikidata.org/entity/Q10316562,...,,,,,12081-1,L.G.Adams,1929.0,2014.0,,
2,c00012,ADAMS,M.,"ADAMS, M.","ADAMS, Miss","ADAMS, M.","Adams, M.",0.0,1,http://www.wikidata.org/entity/Q22110151,...,,,,,,,,,,
3,c00013,ADAMS,N.,"ADAMS, N.","ADAMS, Nilavan","ADAMS, N.","Adams, N.",0.0,1,http://www.wikidata.org/entity/Q3297913,...,,36627065.0,0000 0000 6400 0318,20403.0,32209-1,N.M.Adams,1926.0,2007.0,,
4,c00018,ADAMSON,R.S.,"ADAMSON, R.S.","ADAMSON, Robert Stephen","ADAMSON, R.S.","Adamson, R.S.",0.0,1,http://www.wikidata.org/entity/Q3436355,...,,24268246.0,0000 0000 4587 6189,40480.0,68-1,Adamson,1885.0,1965.0,,


### MEL collectors

Filter on collectors with 1000 or more collections

In [16]:
import numpy as np

melcoll = pd.read_csv('data/mel_collectors.csv')
melcoll = melcoll.loc[melcoll['num_coll'] >= 1000]
print(len(melcoll.index))

# add string to match
match_string = []
for i, row in melcoll.iterrows():
    match_string.append(row['family_name'] + ', ' + row['initials'])
melcoll['match_string'] = match_string

# Set data type of start and end year to integer
melcoll.start_year = melcoll.start_year.astype(np.int64)
melcoll.end_year = melcoll.end_year.astype(np.int64)

melcoll.head()

147


Unnamed: 0,agent_id,family_name,initials,given_names,num_coll,start_year,end_year,match_string
0,1297,Beauglehole,A.C.,,69198,1929,2001,"Beauglehole, A.C."
1,14283,Mueller,F.,Ferdinand Jacob Heinrich,25429,1812,1895,"Mueller, F."
2,19313,Stone,I.G.,,25428,1960,1999,"Stone, I.G."
3,21883,Willis,J.H.,James,20637,1885,1996,"Willis, J.H."
4,6458,Filson,R.B.,Rex,15193,1933,2000,"Filson, R.B."


In [17]:
mel_names = set(melcoll['match_string'].values) # convert list to set for better performance

import time
start = time.time()
print('Getting nearest neighbours...')
distances, indices = getNearestN(mel_names)
duration = time.time() - start
print('Completed in:', duration, 's')

mel_names = list(mel_names) # convert back to list

print('Finding matches...')
matches = []
for i,j in enumerate(indices):
  temp = [mel_names[i], wd_test.values[j][0][0], round(distances[i][0],2)]
  matches.append(temp)

print('Building data frame...')  
matches = pd.DataFrame(matches, columns=['name','matched_name','distance'])
print('Done') 

matches = matches.sort_values(['distance'])
matches = matches.reset_index()

matches.head()

Getting nearest neighbours...
Completed in: 0.37619733810424805 s
Finding matches...
Building data frame...
Done


Unnamed: 0,index,name,matched_name,distance
0,73,"Hooker, J.D.","Hooker, J.D.",0.0
1,100,"Gray, B.","Gray, B.",0.0
2,97,"Stajsic, V.","Stajsic, V.",0.0
3,94,"Milne, J.","Milne, J.",0.0
4,92,"Jones, D.L.","Jones, D.L.",0.0


In [18]:
melcoll_matches = pd.merge(melcoll, matches, left_on='match_string', right_on='name')

melcoll_matches_wikidata = pd.merge(melcoll_matches, wikidata, left_on='matched_name', right_on='canonical_string')

melcoll_matches_wikidata = pd.merge(melcoll_matches_wikidata, wd_test, left_on='matched_name', right_on='canonical_string')

melcoll_matches_wikidata.rename(columns={melcoll_matches_wikidata.columns.tolist()[-1]: 'dup_count'}, inplace=True)

# print(melcoll_matches_wikidata.columns.tolist())
melcoll_wikidata_matches = melcoll_matches_wikidata[['agent_id', 'family_name', 'initials_x', 'given_names', 
                          'num_coll', 'start_year', 'end_year', 'match_string', 
                          'name', 'matched_name', 'distance', 'dup_count', 
                          'item', 'itemLabel', 'surname', 'initials_y', 'canonical_string', 
                          'orcid', 'viaf', 'isni', 'harv', 'ipni', 'abbr', 
                          'yob', 'yod', 'wyb', 'wye']]

melcoll_wikidata_matches.sort_values(['distance', 'dup_count', 'num_coll'], ascending=[True, True, False], inplace=True)
melcoll_wikidata_matches.reset_index(drop=True, inplace=True)

melcoll_wikidata_matches.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,agent_id,family_name,initials_x,given_names,num_coll,start_year,end_year,match_string,name,matched_name,...,orcid,viaf,isni,harv,ipni,abbr,yob,yod,wyb,wye
0,1297,Beauglehole,A.C.,,69198,1929,2001,"Beauglehole, A.C.","Beauglehole, A.C.","Beauglehole, A.C.",...,,,,,,,1920.0,2002.0,,
1,19313,Stone,I.G.,,25428,1960,1999,"Stone, I.G.","Stone, I.G.","Stone, I.G.",...,,163909337.0,0000 0001 1247 4965,40174.0,27255-1,I.G.Stone,1913.0,2001.0,,
2,6458,Filson,R.B.,Rex,15193,1933,2000,"Filson, R.B.","Filson, R.B.","Filson, R.B.",...,,,,76762.0,18803-1,Filson,1930.0,,,
3,6771,Forster,P.I.,Paul Irwin,12226,1955,2118,"Forster, P.I.","Forster, P.I.","Forster, P.I.",...,,,,,18907-1,P.I.Forst.,1961.0,,,
4,20954,Walsh,N.G.,Neville,9122,1971,2020,"Walsh, N.G.","Walsh, N.G.","Walsh, N.G.",...,,18877199.0,0000 0000 6385 4733,,14171-1,N.G.Walsh,1956.0,,,


In [19]:
melcoll_wikidata_matches.to_csv('data/melcoll_wikidata_matches_indiv.csv')