In [377]:
import json
import unicodedata
from pprint import pprint
import textdistance as td
import pandas as pd
import itertools

In [145]:
def normalize_all(s):
    return unicodedata.normalize('NFD', s).encode('ASCII','ignore').decode()

def normalize_german(s):
    uml = {'Ä' : 'Ae', 'Ö' : 'Oe', 'Ü' : 'Ue', 'ä' : 'ae', 'ö' : 'oe', 'ü' : 'ue'}
    for uu in uml:
        s = s.replace(uu, uml[uu])
    return(s)

In [3]:
with open("name-match/round_1.json", "r") as fh:
    res = json.load(fh)

In [262]:
ex1a = ['L.', "L'Her."]
ex1b = ['L.', "L'Hér."]

ex2a = ['A. Schumach.', 'H. E. Weber']
ex2b = ['Schumach.', 'H. E. Weber']

In [373]:
output = []
for m in res['Matches']:
    s = m['sourceRecord']
    try:
        r = max([j for j in m['referenceRecords'] if j['matchType'] == "Exact"], key=lambda x: x['score'])
        if r and 'authors' in s and 'authors' in r:
            rr = r['authors']
            ss = s['authors']
            #if len(rr) != len(ss):
            if rr == ss:
                case = "match"
            elif rr != ss:
                if [normalize_all(x) for x in rr] == [normalize_all(y) for y in ss]:
                    case = "match_no_diacritics"
                elif [normalize_german(x) for x in rr] == [normalize_german(y) for y in ss]:
                    case = "match_normalized_umlaut"
                elif len(rr) == len(ss):
                    # Find longest common subsequences
                    # Difference is usually in how author names are abbreviated
                    # so lcsseq is the most appropriate distance measure.
                    # Strip the trailing . from abbreviated names
                    lsseq_scores = {}
                    for a1, a2 in zip(rr, ss):
                        lsseq = td.lcsseq(a1.rstrip('.'), a2.rstrip('.'))
                        score = len(lsseq) / min([len(a1.rstrip('.')), len(a2.rstrip('.'))])
                        lsseq_scores[(a1,a2)] = {'lsseq' : lsseq, 'score' : score}
                    if all([lsseq_scores[k]['score'] == 1.0 for k in lsseq_scores]):
                        case = "subseq_contained"
                    elif any([lsseq_scores[k]['score'] == 0 for k in lsseq_scores]):
                        case = "mismatch"
                    else:
                        case = "check"
                else:
                    case = "author_count_mismatch"
        output.append({
            'authors1' : rr,
            'authors2' : ss,
            'case' : case,
        })
    except ValueError: # no referenceRecord with matchType "Exact"
        pass

In [352]:
[i for i in output if i['case'] == 'mismatch']

[{'authors1': ['Sievers', 'O. Appel'],
  'authors2': ['L.', 'O. Appel'],
  'case': 'mismatch'},
 {'authors1': ['L.', 'Peterm.'],
  'authors2': ['L.', 'Link'],
  'case': 'mismatch'},
 {'authors1': ['L.', 'Peterm.'],
  'authors2': ['L.', 'Link'],
  'case': 'mismatch'},
 {'authors1': ['Jacq.', 'DC.'], 'authors2': ['L.', 'DC.'], 'case': 'mismatch'},
 {'authors1': ['Laest.', 'Rchb. fil.', 'Soo'],
  'authors2': ['Laest.', 'Hartm.', 'Soó'],
  'case': 'mismatch'},
 {'authors1': ['L.', 'Willd.'],
  'authors2': ['L.', 'Desf.'],
  'case': 'mismatch'},
 {'authors1': ['Nathh.', 'DC.'],
  'authors2': ['L.', 'DC.'],
  'case': 'mismatch'},
 {'authors1': ['Nathh.', 'DC.'],
  'authors2': ['L.', 'DC.'],
  'case': 'mismatch'},
 {'authors1': ['L.', 'Rchb.'],
  'authors2': ['L.', 'Sweet'],
  'case': 'mismatch'},
 {'authors1': ['Sims', 'DC.'],
  'authors2': ['Sims', 'Sweet'],
  'case': 'mismatch'},
 {'authors1': ['Mill.', 'Fuss'],
  'authors2': ['Mill.', 'A. W. Hill'],
  'case': 'mismatch'},
 {'authors1': ['

In [376]:
pd.DataFrame(output).value_counts('case', dropna=False)

case
match                      3761
author_count_mismatch       169
subseq_contained            103
match_no_diacritics          50
check                        49
mismatch                     26
match_normalized_umlaut       6
Name: count, dtype: int64

In [374]:
# Get all the names and look them up in Wikidata
# botanist author abbreviation https://www.wikidata.org/wiki/Property:P428
# author citation (zoology) https://www.wikidata.org/wiki/Property:P835
allnames = list(set(itertools.chain.from_iterable([i['authors1'] + i['authors2'] for i in output])))

In [375]:
# for botany, remove spaces after the full-stop in initials
" ".join([("\"" + i.replace('. ', '.') + "\"") for i in allnames[0:10]])

'"Celak." "Jérôme" "A.Chev." "Rchb.fil." "Prokudin" "Hogg" "W.C.R.Watson" "Murray" "W.Koch" "Döll"'

Example SPARQL query for Wikidata

```
SELECT ?item ?itemLabel ?QPROP WHERE {
  VALUES ?QPROP {"Celak." "Jérôme" "A.Chev." "Rchb.fil." "Prokudin" "Hogg" "W.C.R.Watson" "Murray" "W.Koch" "Döll"}
  ?item wdt:P428 ?QPROP.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
LIMIT 100
```