In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_proc
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.sql import text as sql_text
from collections import defaultdict
import re

In [2]:
class ComboFuzzer:
    """Blend several fuzzy scorers into a single root-mean-square score.

    Each scorer is called as ``scorer(target, candidate)`` and its result is
    divided by 100 before being squared, so scorers are assumed to report
    on a 0-100 scale (TODO confirm for any custom scorer passed in).
    """

    def __init__(self, fuzzers):
        """Store the scorers and pre-compute the normalisation factor.

        Parameters
        ----------
        fuzzers : sequence of callables
            Scoring functions taking ``(target, candidate)``.
        """
        self.fuzzers = fuzzers
        # 1/sqrt(N), computed once so combo_fuzz only has to multiply by it
        self.norm = 1 / np.sqrt(len(fuzzers))

    def combo_fuzz(self, target, candidate):
        """Return sqrt(sum((score_i / 100)^2)) / sqrt(N) over all scorers."""
        squared = [(scorer(target, candidate) / 100) ** 2
                   for scorer in self.fuzzers]
        return np.sqrt(sum(squared)) * self.norm


class LatLonGetter:
    """Look up latitude/longitude for institute names in a GRID-style table.

    A name is first matched exactly against the lower-cased names and
    aliases, and otherwise fuzzily using ``scorer``. Organisations listed as
    ``<org name> (<country>)`` with more than one distinct bracketed value
    are treated as multinationals: their entries are collapsed into the bare
    ``<org name>`` and a match returns the coordinates of every such entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Table providing ``Name``, ``alias``, ``lat`` and ``lng`` columns.
    scorer : callable
        Scoring function handed to ``fuzzy_proc.extractOne``.
    """

    # Matches a (...) or [...] group and captures the text inside it
    _BRACKETED = re.compile(r"[\(\[](.*?)[\)\]]")

    def __init__(self, df, scorer):
        self.scorer = scorer
        self.df = df

        # Find the multinational organisations which have the form:
        # <org name> (<country>)
        # Note: iterate over the `df` argument, not a notebook global.
        bracketed = {}
        for name in df.Name:
            if "(" in name and ")" in name:
                # The bare name keeps whatever whitespace surrounded the
                # removed bracket, e.g. "Orange (France)" -> "Orange "
                org = self._BRACKETED.sub("", name)
                info = bracketed.setdefault(org, dict(names=[], countries=set()))
                info["countries"].update(self._BRACKETED.findall(name))
                info["names"].append(name)

        # If more than one country appears, then assume to be a multi-national
        self.multinationals = {}   # full name -> bare org name
        self._mn_names = {}        # bare org name -> its distinct full names
        for org, info in bracketed.items():
            if len(info["countries"]) > 1:
                for name in info["names"]:
                    self.multinationals[name] = org
                self._mn_names[org] = list(dict.fromkeys(info["names"]))
        self.mn_lu = set(self.multinationals.values())

        # Generate the list of all names + non-null aliases, replacing each
        # multinational entry by its bare org name
        aliases = df.loc[~pd.isnull(df.alias), "alias"]
        candidates = list(df.Name) + list(aliases)
        replaced = (self.multinationals.get(x, x) for x in candidates)
        # De-duplicate while keeping first-seen order, so the candidate order
        # (and hence fuzzy tie-breaking) does not depend on string hashing
        self.all_possible_values = list(dict.fromkeys(replaced))

        # Lower-cased copy, index-aligned with all_possible_values
        self.lower_possible_values = [x.lower() for x in
                                      self.all_possible_values]
        # Lower-cased value -> original value, for constant-time exact
        # matching; on a collision the first value wins (as list.index would)
        self._exact_lookup = {}
        for lowered, value in zip(self.lower_possible_values,
                                  self.all_possible_values):
            self._exact_lookup.setdefault(lowered, value)

        # Cache of fuzzy results: query -> (match, score)
        self.fuzzy_matches = {}

    def get_latlon(self, mak_name):
        """Return the coordinates of the best match for ``mak_name``.

        Parameters
        ----------
        mak_name : str
            Institute name to look up; must not be the empty string. It is
            compared as-is against the lower-cased candidates, so only an
            already lower-cased name can match exactly.

        Returns
        -------
        tuple
            ``(lats, lons, score, match)`` where ``lats``/``lons`` are
            parallel lists of the distinct coordinate pairs of the matched
            rows, ``score`` is ``1.`` for an exact match and otherwise the
            score returned by the fuzzy matcher, and ``match`` is the
            matched name, alias or bare multinational name.
        """
        assert mak_name != ""
        # Constant-time check to see if there is an exact match
        match = self._exact_lookup.get(mak_name)
        if match is not None:
            score = 1.
        # Otherwise fuzzy match, reusing a cached result where possible
        elif mak_name in self.fuzzy_matches:
            match, score = self.fuzzy_matches[mak_name]
        else:
            match, score = fuzzy_proc.extractOne(query=mak_name,
                                                 choices=self.all_possible_values,
                                                 scorer=self.scorer)
            self.fuzzy_matches[mak_name] = (match, score)

        # A multinational match selects every row carrying one of its names
        if match in self.mn_lu:
            condition = self.df.Name.isin(self._mn_names.get(match, []))
        # Otherwise the match is a Name, or failing that an alias
        else:
            condition = self.df.Name == match
            if condition.sum() == 0:
                condition = self.df.alias == match
        _df = self.df.loc[condition]

        # Collect each distinct (lat, lon) pair once, in row order. Pairs are
        # compared as a whole so that a location sharing only its latitude
        # with one row and only its longitude with another is still kept.
        lats = []
        lons = []
        seen = set()
        for lat, lon in zip(_df["lat"].values, _df["lng"].values):
            if (lat, lon) in seen:
                continue
            seen.add((lat, lon))
            lats.append(lat)
            lons.append(lon)
        return (lats, lons, score, match)

    def process_latlons(self, mak_institutes):
        """Run :meth:`get_latlon` on every name in ``mak_institutes``.

        Returns an empty list when ``mak_institutes`` is a null scalar, is
        empty, or contains only null entries; otherwise a list with one
        ``get_latlon`` result per entry.
        """
        # np.all copes with both the scalar and the array result of pd.isnull
        if np.all(pd.isnull(mak_institutes)):
            return []
        return [self.get_latlon(mak_name)
                for mak_name in mak_institutes]


def get_grid_data(file_name, set_index=False):
    """Read a GRID CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str or file-like
        Location of the CSV, passed straight to ``pandas.read_csv``.
    set_index : bool, default False
        When true, the ``grid_id`` column becomes the index of the result.

    Returns
    -------
    pandas.DataFrame
    """
    table = pd.read_csv(file_name, low_memory=False)
    return table.set_index(keys=["grid_id"]) if set_index else table

In [5]:
# Build the grid data from its components: the core grid table is joined
# (on its "ID" column) with the address and alias tables, which are both
# indexed by grid_id, and only the columns needed for matching are kept.
_df_grid_address = get_grid_data("data/grid/full_tables/addresses.csv", set_index=True)
_df_grid_alias = get_grid_data("data/grid/full_tables/aliases.csv", set_index=True)
df_grid = (
    get_grid_data("data/grid/grid.csv")
    .join(_df_grid_address, on="ID")
    .join(_df_grid_alias, on="ID")
    [["Name", "lat", "lng", "ID", "alias"]]
)

In [6]:
# Get the MAG-arXiv links and the corex topics, both stored as JSON records.
# NOTE(review): df_corex is not referenced by any cell visible here — confirm
# it is needed further down before keeping this load.
df_mag_arxiv = pd.read_json("data/magapi_oag_arxiv_match.json", orient="records")
df_corex = pd.read_json("data/topics_corex.json", orient="records")

In [7]:
# Instantiate the Fuzzer and LatLonGetter.
# The combined scorer blends token_sort_ratio and partial_ratio; it is passed
# to LatLonGetter as the scorer for fuzzy matching against the GRID table.
cf = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio])
llg = LatLonGetter(df=df_grid, scorer=cf.combo_fuzz)

In [8]:
# Preview the institute-name lists (one list per row) that will be matched
# against the GRID names
df_mag_arxiv.mag_institutes.head()

0      [university of amsterdam, university of sussex]
1    [instituto de estudios superiores de administr...
2    [university of toronto, university of primorsk...
3    [yale university, georgia institute of technol...
4                                              [sagem]
Name: mag_institutes, dtype: object

In [9]:
# Match every institute of every row against GRID, keeping one record per
# accepted (institute, match) pair.
matches = []
for _, row in df_mag_arxiv.iterrows():
    institutes = row.mag_institutes
    # Rows without any institute have nothing to match
    if len(institutes) == 0:
        continue
    results = llg.process_latlons(institutes)
    for u, (lat, lon, score, match) in zip(institutes, results):
        # Scores below 0.85 are rejected outright
        if score < 0.85:
            continue
        # Scores in [0.85, 0.9) are kept, but printed for manual inspection
        if score < 0.9:
            print(u, lat, lon, score, match)
            print()
        matches.append({"oag_id": row["oag_id"],
                        "grid_lat": lat,
                        "grid_lon": lon,
                        "match_score": score,
                        "match_value": match})

credit suisse [47.376653000000005] [8.538197] 0.855102333057278 Credit Suisse (Switzerland)

technische universitat munchen [50.813611] [12.929167] 0.8699999999999999 Technische Universität Chemnitz

technische universitat munchen [50.813611] [12.929167] 0.8699999999999999 Technische Universität Chemnitz

centrum wiskunde informatica [52.356389] [4.951944] 0.8956840960963859 Centrum Wiskunde and Informatica

technische universitat munchen [50.813611] [12.929167] 0.8699999999999999 Technische Universität Chemnitz

university of dusseldorf [51.322774] [9.507562] 0.8609297300012353 University of Kassel

orange s a [48.835023, 52.222794] [2.292472, 21.016947000000002] 0.8838834764831843 Orange 

universite du quebec [45.426334000000004] [-75.717061] 0.8705170877128144 Université du Québec à Hull

university of nice sophia antipolis [43.717658] [7.2677179999999995] 0.8791188770581597 Nice Sophia Antipolis University

national research foundation of south africa [-25.746948, 1.304062] [28.27

In [10]:
# Number of match records collected by the matching loop
len(matches)

98861

In [11]:
# Collect the match records into a DataFrame and persist them as JSON records
df_matches = pd.DataFrame(matches)
df_matches.to_json("data/grid_oag_match.json", orient="records")

In [20]:
# Eyeball the first 100 match records as a sanity check
df_matches.head(100)

Unnamed: 0,grid_lat,grid_lon,match_score,match_value,oag_id
0,[52.355792],[4.954782],1.000000,University of Amsterdam,f65c2f5a-648f-4dcc-a974-f841a7d59f4b
1,[50.865278],[-0.085556],1.000000,University of Sussex,f65c2f5a-648f-4dcc-a974-f841a7d59f4b
2,[47.376653000000005],[8.538197],0.855102,Credit Suisse (Switzerland),ed6b3532-1de5-40a4-b811-5f05d9e618c3
3,[43.661667],[-79.395],1.000000,University of Toronto,d1c359e7-bcb1-4433-b77b-a3e393eec667
4,[45.547924],[13.729504],1.000000,University of Primorska,d1c359e7-bcb1-4433-b77b-a3e393eec667
5,[50.813519],[4.381621],0.975013,Université Libre de Bruxelles,d1c359e7-bcb1-4433-b77b-a3e393eec667
6,[41.3020932],[-72.9306542],1.000000,Yale University,f78e174e-8566-451d-8113-7ee891750c18
7,[33.775886],[-84.39629599999999],1.000000,Georgia Institute of Technology,f78e174e-8566-451d-8113-7ee891750c18
8,[35.905164],[-79.046945],1.000000,University of North Carolina at Chapel Hill,d49ef4ec-21ec-4f72-9dd5-dc8144e202fa
9,[42.35982],[-71.09210999999999],1.000000,Massachusetts Institute of Technology,7a165e25-b2ce-442b-8ff1-4381f16f192d
