# Disambiguation of WoS Institutions

In [None]:
import requests, pandas as pd, re, sqlite3, math, src
from collections import Counter

# Raw Data (Web of Science)

In [None]:
# Open the sample SQLite database and read the complete wos_institutions
# table into a DataFrame, one step at a time.
db = sqlite3.connect(src.PATH / "data/sample.db")
cur = db.cursor()
wos_rows = cur.execute("""select * from wos_institutions""").fetchall()
# cursor.description holds one entry per result column; element 0 is the column name.
wos_columns = [column[0] for column in cur.description]
df = pd.DataFrame(wos_rows, columns=wos_columns)
df.head()

Unnamed: 0,index,PK_INSTITUTIONS,ORGANIZATION1,ORGANIZATION2,ORGANIZATION3,ORGANIZATION4,INSTITUTION_FULL,POSTALCODE,CITY,ADDRESS_FULL,COUNTRYCODE
0,0,24,Kings Coll London,,,,Kings Coll London,,London,"Kings Coll London, London, England",GBR
1,1,93,Kings Coll London,Dept Psychosis Studies,Inst Psychiat,,"Kings Coll London, Dept Psychosis Studies, Inst Psychiat",,London,"Kings Coll London, Inst Psychiat, Dept Psychosis Studies, London, England",GBR
2,2,159,Univ Konstanz,Dept Psychol,,,"Univ Konstanz, Dept Psychol",,Constance,"Univ Konstanz, Dept Psychol, Constance, Germany",DEU
3,3,530,Deutsch Herzzentrum Munich,,,,Deutsch Herzzentrum Munich,,Munich,"Deutsch Herzzentrum Munich, Munich, Germany",DEU
4,4,1047,Friedrich Loeffler Inst,,,,Friedrich Loeffler Inst,,Insel Riems,"Friedrich Loeffler Inst, Insel Riems, Germany",DEU


# Metadata (WikiData)

WikiData-Entity: https://www.wikidata.org/wiki/Q245247

WikiData-Query:
https://query.wikidata.org/#%23King%27s%20College%20Q245247%0ASELECT%20%3Flabel%20%3Faka%20%3Flocation%20%3FlocationLabel%20%3Fcountry%20%3FcountryLabel%20%3Fioc%20%3Fparent%20%3FparentLabel%0AWHERE%20%0A%7B%0A%20%20VALUES%20%3Fitem%20%7B%20wd%3AQ245247%20%7D%0A%20%20OPTIONAL%7B%3Fitem%20rdfs%3Alabel%20%3Flabel.%20FILTER%20%28langMatches%28%20lang%28%3Flabel%29%2C%20%22EN%22%20%29%20%29%7D%0A%20%20%23OPTIONAL%7B%3Fitem%20skos%3AaltLabel%20%3Faka.%7D%20%23%20also%20known%20as%0A%20%20OPTIONAL%7B%3Fitem%20wdt%3AP276%20%3Flocation.%7D%20%23%20location%0A%20%20OPTIONAL%7B%3Fitem%20wdt%3AP17%20%3Fcountry.%7D%20%23%20country%0A%20%20OPTIONAL%7B%3Fcountry%20wdt%3AP984%20%3Fioc.%7D%20%23%20country%0A%20%20OPTIONAL%7B%3Fitem%20wdt%3AP361%20%3Fparent.%7D%20%23%20parent%20of%0A%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22%5BAUTO_LANGUAGE%5D%2Cen%22.%20%7D%0A%7D

In [None]:
#fetching data from Wikidata

##catching empty labels
def try2unpack(x):
    """Reduce one cell of a SPARQL JSON binding to a plain value.

    If ``x`` supports ``x['value']`` the contained value is used, otherwise
    ``x`` itself is used. When that value contains a Wikidata entity URI
    prefix (``.../entity/Q`` or ``.../entity/P``) only the part after the
    last ``/`` is returned. Values that do not support the ``in`` test
    (for example ``None`` or a float NaN) are returned unchanged.
    """
    entity_prefixes = (
        'http://www.wikidata.org/entity/Q',
        'http://www.wikidata.org/entity/P',
    )

    # Unwrap {'value': ...}; anything that cannot be unwrapped is kept as is.
    try:
        value = x['value']
    except (KeyError, TypeError):
        value = x

    try:
        if any(prefix in value for prefix in entity_prefixes):
            return value.rsplit('/', 1)[1]
    except TypeError:
        # Not a container/string: hand the value back untouched.
        pass
    return value

##convert fetched data to dataframe
def json2pandas(data):
    """Convert a parsed SPARQL JSON response into a DataFrame of plain values.

    Parameters
    ----------
    data : dict
        Parsed response; ``data['head']['vars']`` supplies the column names
        and ``data['results']['bindings']`` the rows.

    Returns
    -------
    pandas.DataFrame
        One row per binding and one column per variable, with every cell
        passed through ``try2unpack``.
    """
    frame = pd.DataFrame(data['results']['bindings'], columns=data['head']['vars'])
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and is
    # deprecated since then; check the class so a column named "map" cannot
    # interfere with the lookup.
    if hasattr(pd.DataFrame, 'map'):
        return frame.map(try2unpack)
    return frame.applymap(try2unpack)

##example query    
url = 'https://query.wikidata.org/sparql'


query = """
#King's College Q245247
SELECT ?label ?aka ?location ?locationLabel ?country ?countryLabel ?ioc ?parent ?parentLabel
WHERE 
{
  VALUES ?item { wd:Q245247 }
  OPTIONAL{?item rdfs:label ?label. FILTER (langMatches( lang(?label), "EN" ) )}
  #OPTIONAL{?item skos:altLabel ?aka.} # also known as
  OPTIONAL{?item wdt:P276 ?location.} # location
  OPTIONAL{?item wdt:P17 ?country.} # country
  OPTIONAL{?country wdt:P984 ?ioc.} # country
  OPTIONAL{?item wdt:P361 ?parent.} # parent of
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
""".strip()
r = requests.get(url, params={'format': 'json', 'query': query})
# 414 (URI Too Long) / 431 (Request Header Fields Too Large) mean the query does
# not fit into a GET request. The previous test `a == 414 | a == 431` was parsed
# as the chained comparison `a == (414 | a) == 431` and could never be true.
# The retry sends the query in the request body so the URL stays short.
if r.status_code in (414, 431):
    r = requests.post(url, params={'format': 'json'}, data={'query': query})
status = r.status_code
# Fail with the HTTP error instead of a confusing JSON decode error.
r.raise_for_status()
data = json2pandas(r.json())
data

Unnamed: 0,label,aka,location,locationLabel,country,countryLabel,ioc,parent,parentLabel
0,King's College London,,Q84,London,Q145,United Kingdom,GBR,Q170027,University of London
1,King's College London,,Q84,London,Q145,United Kingdom,GBR,Q170027,University of London
2,King's College London,,Q84,London,Q145,United Kingdom,GBR,Q170027,University of London


In [None]:
# get all items in specific classes (ordered by organizational size / hierarchy)
get_classes={"Q1075106":"1102_UNI_system", # e.g: University of Massachusetts
             "Q3918":"1101_UNI", # e.g. University of Massachusetts Amherst
             "Q21028957":"1100_UNI_hs", # Hochschule e.g: Bucerius Law School
             "Q178706" :"0009_inst", # all kind of institute, e.g.: Faculty of Law of the Trnava University in Trnava
            }

# Collect one frame per class and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
class_frames = []
for Q, label in get_classes.items():
    print(label)
    query=f"""SELECT DISTINCT ?item ?instance WHERE {{?item wdt:P31/wdt:P279* wd:{Q}; wdt:P31 ?instance.}}"""

    ##request the data
    r = requests.get(url, params={'format': 'json', 'query': query})
    # `a == 414 | a == 431` was parsed as `a == (414 | a) == 431` (never true),
    # so the fallback never ran. The retry sends the query in the request body.
    if r.status_code in (414, 431):
        r = requests.post(url, params={'format': 'json'}, data={'query': query})
    status = r.status_code
    # Fail with the HTTP error instead of a confusing JSON decode error.
    r.raise_for_status()
    data = json2pandas(r.json())
    data['label'] = label
    class_frames.append(data)

# Row indices of the individual frames are kept, as DataFrame.append did.
all_Qs = pd.concat(class_frames)

# keep the higher class: uni_system > uni > hochschule
# (the label prefixes 1102 > 1101 > 1100 > 0009 sort in that order as strings)
all_Qs = all_Qs.sort_values('label', ascending=False)

all_Qs = all_Qs.drop_duplicates("item", keep='first')
print(len(all_Qs))

all_Qs.head()

1102_UNI_system
1101_UNI
1100_UNI_hs
0009_inst
444088


Unnamed: 0,item,instance,label
0,Q217439,Q3918,1102_UNI_system
28,Q2002043,Q1075106,1102_UNI_system
30,Q623581,Q45400320,1102_UNI_system
31,Q2140391,Q1075106,1102_UNI_system
32,Q2331177,Q1075106,1102_UNI_system


# Data Example: Germany

In [None]:
# Keep only the German records (COUNTRYCODE == 'DEU') and the three columns
# needed for matching, selected in a single .loc call.
wos_sample = df.loc[
    df['COUNTRYCODE'] == 'DEU',
    ['PK_INSTITUTIONS', 'ORGANIZATION1', 'CITY'],
]
wos_sample.head()

Unnamed: 0,PK_INSTITUTIONS,ORGANIZATION1,CITY
2,159,Univ Konstanz,Constance
8,530,Deutsch Herzzentrum Munich,Munich
10,1047,Friedrich Loeffler Inst,Insel Riems
12,131559,Tech Univ Munich,Am Coulombwall
13,131797,Univ Cologne,Cologne


In [None]:
#wikidata
query="""
SELECT DISTINCT ?label_de ?label_en ?item ?instance ?location1Label ?location2Label ?location3Label ?parent WHERE {?item wdt:P31/wdt:P279* wd:Q3918; wdt:P17 wd:Q183; wdt:P31 ?instance.
    
      OPTIONAL{?item rdfs:label ?label_de. FILTER (langMatches( lang(?label_de), "de" ) )}
      OPTIONAL{?item rdfs:label ?label_en. FILTER (langMatches( lang(?label_en), "en" ) )}
      OPTIONAL{?item wdt:P131 ?location1.} # location1
      OPTIONAL{?item wdt:P276 ?location2.} # location2
      OPTIONAL{?item wdt:P159 ?location3.} # location3
      OPTIONAL{?item wdt:P361 ?parent.} # parent of
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],de". }
    }
""".strip()

r = requests.get(url, params={'format': 'json', 'query': query})
# `a == 414 | a == 431` was parsed as `a == (414 | a) == 431` (never true),
# so the fallback never ran. The retry sends the query in the request body
# so the URL stays short.
if r.status_code in (414, 431):
    r = requests.post(url, params={'format': 'json'}, data={'query': query})
status = r.status_code
# Fail with the HTTP error instead of a confusing JSON decode error.
r.raise_for_status()
data = json2pandas(r.json())
# The query yields one row per (item, instance, location, ...) combination;
# keep only the first row for each item.
data = data.drop_duplicates('item')
data.head()

Unnamed: 0,label_de,label_en,item,instance,location1Label,location2Label,location3Label,parent
0,Universität Bielefeld,Bielefeld University,Q24382,Q1767829,Bielefeld,,Bielefeld,
2,Hochschule Bremen,Bremen University of Applied Sciences,Q1622092,Q875538,Bremen,,,
5,Technische Universität Hamburg,Technical University of Hamburg,Q1060,Q1371037,Bezirk Harburg,,Hamburg,
6,Hochschule Bonn-Rhein-Sieg,Bonn-Rhein-Sieg University of Applied Sciences,Q1622083,Q875538,Rheinbach,Mülldorf,,
9,Hochschule Bremerhaven,University of Applied Sciences Bremerhaven,Q1622093,Q875538,Bremerhaven,,,


In [None]:
def one_location(loc1, loc2, loc3):
    """Return the first usable location among the three candidates.

    A candidate is skipped when it is ``None``, a float NaN (how pandas
    represents a missing cell) or otherwise falsy such as ``''``. NaN is
    truthy in Python, so a plain ``if loc1:`` test would return a missing
    ``loc1`` instead of falling through to ``loc2`` / ``loc3``.

    Parameters
    ----------
    loc1, loc2, loc3 : object
        Candidate location values in order of preference.

    Returns
    -------
    object or None
        The first usable candidate, or ``None`` when there is none.
    """
    for candidate in (loc1, loc2, loc3):
        if candidate is None:
            continue
        # numpy.float64 subclasses float, so pandas NaN cells are covered too.
        if isinstance(candidate, float) and math.isnan(candidate):
            continue
        if candidate:
            return candidate
    return None

# Collapse the three candidate location columns into one 'location' column.
# Zipping the columns avoids the row-wise apply with x[0]/x[1]/x[2], which
# relied on the deprecated positional fallback of Series.__getitem__ for
# label-indexed rows, and it also works when the frame is empty.
data['location'] = [
    one_location(loc1, loc2, loc3)
    for loc1, loc2, loc3 in zip(
        data['location1Label'], data['location2Label'], data['location3Label']
    )
]
data = data[['label_de','label_en', 'item', 'location']]
data.head()

Unnamed: 0,label_de,label_en,item,location
0,Universität Bielefeld,Bielefeld University,Q24382,Bielefeld
2,Hochschule Bremen,Bremen University of Applied Sciences,Q1622092,Bremen
5,Technische Universität Hamburg,Technical University of Hamburg,Q1060,Bezirk Harburg
6,Hochschule Bonn-Rhein-Sieg,Bonn-Rhein-Sieg University of Applied Sciences,Q1622083,Rheinbach
9,Hochschule Bremerhaven,University of Applied Sciences Bremerhaven,Q1622093,Bremerhaven


In [None]:
# Stack the German and English labels into a single 'label' column so each
# item can be matched against either label.
data_de = data[['label_de','item','location']].rename(columns={'label_de': 'label'})
data_en = data[['label_en','item','location']].rename(columns={'label_en': 'label'})
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the original
# row indices exactly as append did.
data = pd.concat([data_de, data_en])
# Drop rows for which the item has no label in that language.
data = data[data['label'].notna()]
data.head()

Unnamed: 0,label,item,location
0,Universität Bielefeld,Q24382,Bielefeld
2,Hochschule Bremen,Q1622092,Bremen
5,Technische Universität Hamburg,Q1060,Bezirk Harburg
6,Hochschule Bonn-Rhein-Sieg,Q1622083,Rheinbach
9,Hochschule Bremerhaven,Q1622093,Bremerhaven


# Matching Example: Germany

In [None]:
def ngrams(string, n=3):
    """Split ``string`` into overlapping character n-grams.

    Before splitting, the characters matched by the pattern below are
    removed: the class ``[,-./]`` (the range from ``,`` to ``.`` plus ``/``)
    and any whitespace character followed by ``BD``.

    Returns a list of n-gram strings (empty when the cleaned text is
    shorter than ``n``), or ``None`` when a ``TypeError`` occurs, e.g.
    because ``string`` is not a string.
    """
    try:
        cleaned = re.sub(r'[,-./]|\sBD',r'', string)
        # One suffix per offset; zipping them walks a window of width n.
        suffixes = (cleaned[offset:] for offset in range(n))
        return list(map(''.join, zip(*suffixes)))
    except TypeError:
        return None
    
def get_cosine(vec1, vec2):
    """Cosine similarity of two sparse vectors given as key -> weight mappings.

    Returns ``0.0`` when either vector has zero length (for example an
    empty mapping); otherwise the dot product over the shared keys divided
    by the product of the two Euclidean norms.
    """
    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    denominator = norm1 * norm2
    if not denominator:
        return 0.0

    shared_keys = set(vec1) & set(vec2)
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)
    return float(dot_product) / denominator
    
def matching(org, city):
    """Rank every row of the module-level ``data`` frame against one record.

    For each row of ``data`` the trigram cosine similarity is computed
    between ``org`` and the row's ``label`` and between ``city`` and the
    row's ``location``.

    Returns a DataFrame with the columns ``qid``, ``label``, ``org_sim``
    and ``city_sim``, sorted by ``org_sim`` and then ``city_sim``, both
    descending.
    """
    def _similarity(left, right):
        # A TypeError raised while comparing counts as "no similarity".
        try:
            return get_cosine(left, right)
        except TypeError:
            return 0

    org_grams = Counter(ngrams(org))
    city_grams = Counter(ngrams(city))

    scored = []
    for _, candidate in data.iterrows():
        candidate_label = candidate['label']
        candidate_city = candidate['location']

        label_grams = Counter(ngrams(candidate_label))
        location_grams = Counter(ngrams(candidate_city))

        scored.append((
            candidate['item'],
            candidate_label,
            _similarity(org_grams, label_grams),
            _similarity(city_grams, location_grams),
        ))

    ranked = pd.DataFrame(scored, columns=['qid','label','org_sim', 'city_sim'])
    return ranked.sort_values(['org_sim', 'city_sim'], ascending=[False, False])

In [None]:
# Show the first ten rows of the German sample.
wos_sample.head(10)

Unnamed: 0,PK_INSTITUTIONS,ORGANIZATION1,CITY
2,159,Univ Konstanz,Constance
8,530,Deutsch Herzzentrum Munich,Munich
10,1047,Friedrich Loeffler Inst,Insel Riems
12,131559,Tech Univ Munich,Am Coulombwall
13,131797,Univ Cologne,Cologne
16,655368,Univ Heidelberg,Mannheim
17,656411,Univ Bayreuth,Bayreuth
21,1049374,Tech Univ Dresden,Dresden
24,1181416,Univ Leipzig,Leipzig
26,1311442,Univ Bremerhaven,Bremerhaven


In [None]:
# For each of the first ten sample rows, print the top-ranked Wikidata candidate.
for _, record in wos_sample.head(10).iterrows():
    org = record['ORGANIZATION1']
    city = record['CITY']

    result = matching(org=org, city=city)
    # matching() returns its rows sorted best-first, so the first row is the best hit.
    top_hit = result.head(1).values[0]
    print(org, city , ' ---> ', top_hit)

Univ Konstanz Constance  --->  ['Q835440' 'Universität Konstanz' 0.6396021490668314 0.6172133998483676]
Deutsch Herzzentrum Munich Munich  --->  ['Q157808' 'Technical University Munich' 0.23570226039551584 0.0]
Friedrich Loeffler Inst Insel Riems  --->  ['Q17123243' 'Friedrichs-Polytechnikum' 0.3333333333333333 0.0]
Tech Univ Munich Am Coulombwall  --->  ['Q157808' 'Technical University Munich' 0.5657789498610036 0.0]
Univ Cologne Cologne  --->  ['Q54096' 'University of Cologne' 0.5803810000880093 0.0]
Univ Heidelberg Mannheim  --->  ['Q151510' 'Heidelberg University' 0.6362847629757777 0.0]
Univ Bayreuth Bayreuth  --->  ['Q702482' 'Universität Bayreuth' 0.6396021490668314 1.0000000000000002]
Tech Univ Dresden Dresden  --->  ['Q158158' 'Technische Universität Dresden' 0.5367450401216932
 0.9999999999999998]
Univ Leipzig Leipzig  --->  ['Q154804' 'Universität Leipzig' 0.6135719910778963 0.9999999999999998]
Univ Bremerhaven Bremerhaven  --->  ['Q1622093' 'Hochschule Bremerhaven' 0.597614

# Tweaked Matcher used in Project
Showcase only: this section is not part of the workshop package and requires the `abd` package imported below.

In [None]:
import pandas as pd, sqlite3

# Reload the German WoS sample for the project matcher.
# The database path is resolved through src.PATH, as in the first loading cell
# of this notebook, instead of an absolute path into one user's home directory
# that exists on a single machine only.
db = sqlite3.connect(src.PATH / "data/sample.db")
cur = db.cursor()
df = pd.DataFrame(cur.execute("""select * from wos_institutions""").fetchall(), columns=[c[0] for c in cur.description])
# Single .loc selection instead of chained indexing.
wos_sample = df.loc[df['COUNTRYCODE']=='DEU', ['PK_INSTITUTIONS','ORGANIZATION1','CITY']]

In [None]:
# InstitutionMatcher comes from the `abd` package, which is not defined in this notebook.
from abd.match.institutions.matcher import InstitutionMatcher as IM
# NOTE(review): the meaning of wos_db=False is defined by InstitutionMatcher and
# is not visible here; confirm against that class. The recorded output of this
# cell shows a "Loaded dataset: wos_b_2020_matcher.db" log line.
matcher=IM(wos_db=False)

[17:52:42.307] Loaded dataset: wos_b_2020_matcher.db


In [None]:
# Resolve the first 100 German sample rows with the project matcher and print
# the Wikidata id and main label found for each one.
for _, record in wos_sample.head(100).iterrows():
    org = record['ORGANIZATION1']
    # Hand the matcher an empty string when the city is missing or otherwise falsy.
    city = record['CITY'] or ''

    try:
        result = matcher.find_wiki_id_by_name(organization1=org, city=city, countrycode='DEU')
        wiki = matcher.get_wiki_datapoint(result)

        # The attribute reads stay inside the try block: an AttributeError raised
        # here is reported as an unknown institution.
        print(org, city , ' ---> ', wiki.wiki_id, wiki.main_label)
    except AttributeError:
        print('Error unknown Institution:', org,city)

Univ Konstanz Constance  --->  Q835440 University of Konstanz
Deutsch Herzzentrum Munich Munich  --->  Q1205693 German Heart Center Munich
Friedrich Loeffler Inst Insel Riems  --->  Q1457808 Friedrich Loeffler Institute
Tech Univ Munich Am Coulombwall  --->  Q157808 Technical University of Munich
Univ Cologne Cologne  --->  Q54096 University of Cologne
Univ Heidelberg Mannheim  --->  Q151510 Heidelberg University
Univ Bayreuth Bayreuth  --->  Q702482 University of Bayreuth
Tech Univ Dresden Dresden  --->  Q158158 TU Dresden
Univ Leipzig Leipzig  --->  Q154804 Leipzig University
Univ Bremerhaven Bremerhaven  --->  Q1622093 University of Applied Sciences Bremerhaven
Tech Univ Bergakad Freiberg Freiberg  --->  Q689854 Freiberg University of Mining and Technology
Univ Tubingen Tubingen  --->  Q153978 University of Tübingen
Hahn Meitner Inst Berlin GmbH Berlin  --->  Q314578 Helmholtz-Zentrum Berlin
Univ Stuttgart Stuttgart  --->  Q122453 University of Stuttgart
Univ Gottingen Gottingen  --