In [1]:
# import libraries
import re
import pandas as pd
import recordlinkage
from recordlinkage.index import Full

## Preprocessing

In [2]:
# Load the affiliation table and derive a normalised `institution` key that is
# later compared against the ranking table.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# project data directory before running on another machine.
path = "/Users/zhangyicheng/Library/CloudStorage/Dropbox/My Mac (张苡铖’s MacBook Pro (2))/Desktop/MACS122/Project/"
affiliations_df = pd.read_csv(path +"affiliation.csv")

# Normalisation steps (the order matters and mirrors the ranking-table cell so
# that both sides produce comparable keys):
#   1. keep only the text before the first comma (drops the location part),
#   2. lower-case and strip surrounding whitespace,
#   3. remove the substrings "university" and "of" so that the similarity
#      score is driven by the distinctive part of the name.
# Vectorised `.str` methods are used instead of writing to the rows yielded by
# `iterrows()`: those rows may be copies, in which case the write never reaches
# the frame and step 1 would silently be skipped.
affiliations_df["institution"] = (
    affiliations_df["name"]
    .str.split(",").str[0]
    .str.lower()
    .str.strip()
    .str.replace("university", "", regex=False)
    .str.replace("of", "", regex=False)
)

affiliations_df.head(10)

Unnamed: 0,affiliationid,name,email,institution
0,au.aalto.fi,Aalto University,aalto.fi,aalto
1,kriot.kth.se,KTH Royal Institute of Technology,kth.se,kth royal institute technology
2,uocla.ucla.edu,"University of California, Los Angeles",ucla.edu,california
3,aei.aei.org,American Enterprise Institute,aei.org,american enterprise institute
4,miot.mit.edu,Massachusetts Institute of Technology,mit.edu,massachusetts institute technology
5,ciot.caltech.edu,California Institute of Technology,caltech.edu,california institute technology
6,hu.harvard.edu,Harvard University,harvard.edu,harvard
7,su.stanford.edu,Stanford University,stanford.edu,stanford
8,uop.upenn.edu,University of Pennsylvania,upenn.edu,pennsylvania
9,ai.amazon.com,Amazon Inc.,amazon.com,amazon inc.


In [3]:
# Load the university ranking table and derive the same normalised
# `institution` key that is built for the affiliation table.
# NOTE(review): this is an absolute, machine-specific path; point it at the
# project code/data directory before running on another machine.
path = "/Users/zhangyicheng/Library/CloudStorage/Dropbox/My Mac (张苡铖’s MacBook Pro (2))/Desktop/MACS122/Project/code/"
university_df = pd.read_csv(path +"university_ranking.csv")
university_df = university_df.rename(columns={"rank": "Rank"})

# Lower-case, strip, remove "university"/"of", then drop any parenthesised
# text. The pattern is a raw string so that `\(` and `\)` reach the regex
# engine as written instead of being treated as (invalid) string escapes.
university_df["institution"] = (
    university_df["University"]
    .str.lower()
    .str.strip()
    .str.replace("university", "", regex=False)
    .str.replace("of", "", regex=False)
    .str.replace(r"\(.*\)", "", regex=True)
)

# Reset the index after de-duplicating so row labels stay contiguous
# (0..n-1) and any later positional lookup sees label == position.
# NOTE(review): the displayed frame carries an `Unnamed: 0` column, so
# whole-row de-duplication may never drop anything; confirm whether
# de-duplicating on `University` was intended.
university_df = university_df.drop_duplicates().reset_index(drop=True)
university_df.head(10)

Unnamed: 0.1,Unnamed: 0,University,Rank,institution
0,0,London School of Economics,1,london school economics
1,1,World Bank Group,2,world bank group
2,2,National Bureau of Economic Research,3,national bureau economic research
3,3,International Monetary Fund,4,international monetary fund
4,4,Harvard University,5,harvard
5,5,European Central Bank,6,european central bank
6,6,University of California-Berkeley,7,california-berkeley
7,7,Massachusetts Institute of Technology,8,massachusetts institute technology
8,8,University of Chicago,9,chicago
9,9,Federal Reserve Board,10,federal reserve board


### Indexing and comparison

In [4]:
# Build the candidate pairs between the affiliation table and the ranking
# table using recordlinkage's Full index.
index_full = Full()
candidate_links = index_full.index(affiliations_df, university_df)

# Score each candidate pair on a single feature: the Jaro-Winkler similarity
# of the two normalised `institution` strings.
our_comparison = recordlinkage.Compare()
our_comparison.string(
    "institution",
    "institution",
    method="jarowinkler",
    label="jw_uni_name",
)
features = our_comparison.compute(candidate_links, affiliations_df, university_df)

# The threshold is 0, so every pair with a non-negative score is kept; the
# best match per affiliation is picked in a later cell.
has_score = features["jw_uni_name"].ge(0)
thr_matches = features.loc[has_score]
print("Threshold-based approach: {} matches".format(thr_matches.shape[0]))
thr_matches

Threshold-based approach: 520608 matches


Unnamed: 0,Unnamed: 1,jw_uni_name
0,0,0.541667
0,1,0.555556
0,2,0.614379
0,3,0.629630
0,4,0.625000
...,...,...
1391,369,0.366667
1391,370,0.465079
1391,371,0.561440
1391,372,0.584874


In [5]:
# Pick, for every affiliation (first index level of `thr_matches`), the
# candidate university (second index level) with the highest Jaro-Winkler
# score.
#
# This replaces a dict-of-lists built with `iterrows()`, which
#   * dropped the first candidate of every affiliation (the list was created
#     but the pair was not appended), so that candidate could never win, and
#   * reused the previous affiliation's `index` when no score exceeded 0.

# Flatten the (affiliation, university) MultiIndex into named columns.
pairs = (
    thr_matches["jw_uni_name"]
    .rename_axis(["author_index", "university_index"])
    .reset_index(name="match_score")
    .dropna(subset=["match_score"])  # a missing score cannot be a best match
)

# `idxmax` returns the row label of the first maximum in each group, so ties
# resolve to the earliest candidate; `sort=False` keeps affiliations in their
# order of first appearance.
best_rows = pairs.groupby("author_index", sort=False)["match_score"].idxmax()

# One row per affiliation: author_index, university_index, match_score.
links_df = pairs.loc[best_rows].reset_index(drop=True)
links_df

Unnamed: 0,author_index,university_index,match_score
0,0,227,0.796296
1,1,60,0.747008
2,2,32,0.933333
3,3,160,0.759988
4,4,7,1.000000
...,...,...,...
1387,1387,299,0.824561
1388,1388,247,0.686379
1389,1389,52,0.771717
1390,1390,138,0.826667


In [6]:
# Attach the matched names and the rank to every link.
#
# `author_index` / `university_index` hold index *labels* of
# `affiliations_df` / `university_df`, so the lookups use `.loc` rather than
# positional `.iloc`. Whole columns are assigned at once instead of writing
# cell by cell through `links_df[col].at[r] = ...`, a chained assignment that
# is not guaranteed to reach `links_df` and that forced strings into
# integer placeholder columns.
author_labels = links_df["author_index"].astype(int)
university_labels = links_df["university_index"].astype(int)

# `.to_numpy()` drops the source index so values are assigned by position,
# i.e. row i of `links_df` receives the lookup result for its own labels.
links_df["author_ins"] = affiliations_df.loc[author_labels, "institution"].to_numpy()
links_df["univer_ins"] = university_df.loc[university_labels, "University"].to_numpy()
links_df["Rank"] = university_df.loc[university_labels, "Rank"].to_numpy()
links_df.head()

Unnamed: 0,author_index,university_index,match_score,author_ins,univer_ins,Rank
0,0,227,0.796296,aalto,Carleton University,309
1,1,60,0.747008,kth royal institute technology,Beijing Institute of Technology,78
2,2,32,0.933333,california,University of California-Davis,39
3,3,160,0.759988,american enterprise institute,Economic and Social Research Institute,212
4,4,7,1.0,massachusetts institute technology,Massachusetts Institute of Technology,8


In [8]:
# Combine every affiliation with its best-matching ranked institution.
#
# The link rows are joined on `author_index` (the affiliation's index label)
# instead of being concatenated side by side: concatenation pairs rows by
# their row label, which is only correct when row i of `links_df` happens to
# describe affiliation i. Affiliations without a link keep NaN in the link
# columns. `drop=False` keeps `author_index` available as a column.
pd_con = affiliations_df.join(links_df.set_index("author_index", drop=False))

# Keep only the columns needed for the exported result.
affilication_ranking_linked = pd_con[["affiliationid", "name", "email", "match_score", "univer_ins", "Rank"]]
affilication_ranking_linked.head(10)

Unnamed: 0,affiliationid,name,email,match_score,univer_ins,Rank
0,au.aalto.fi,Aalto University,aalto.fi,0.796296,Carleton University,309
1,kriot.kth.se,KTH Royal Institute of Technology,kth.se,0.747008,Beijing Institute of Technology,78
2,uocla.ucla.edu,"University of California, Los Angeles",ucla.edu,0.933333,University of California-Davis,39
3,aei.aei.org,American Enterprise Institute,aei.org,0.759988,Economic and Social Research Institute,212
4,miot.mit.edu,Massachusetts Institute of Technology,mit.edu,1.0,Massachusetts Institute of Technology,8
5,ciot.caltech.edu,California Institute of Technology,caltech.edu,0.867773,Beijing Institute of Technology,78
6,hu.harvard.edu,Harvard University,harvard.edu,1.0,Harvard University,5
7,su.stanford.edu,Stanford University,stanford.edu,1.0,Stanford University,14
8,uop.upenn.edu,University of Pennsylvania,upenn.edu,1.0,University of Pennsylvania,55
9,ai.amazon.com,Amazon Inc.,amazon.com,0.693182,Chapman University,448


In [64]:
# Write the linked affiliation/ranking table to the working directory without
# the row index. "utf-8-sig" prefixes the file with a UTF-8 byte-order mark
# (presumably so spreadsheet tools detect the encoding).
affilication_ranking_linked.to_csv(
    "real_full_links.csv",
    encoding="utf-8-sig",
    index=False,
)