In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import string
import html
import Levenshtein

In [2]:
def preprocess_titles(df):
    """Return the ``title`` column of ``df`` normalized for string comparison.

    Each title is lower-cased, every character in ``string.punctuation`` is
    replaced by a space, and runs of whitespace are collapsed to single
    spaces (leading/trailing whitespace is dropped).

    Missing titles are passed through as missing values instead of raising,
    so the result always has the same index as ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``title`` column.

    Returns
    -------
    pandas.Series
        Normalized titles, indexed like ``df``.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return (
        df.title.str.lower() # normalize to lower case
                .str.translate(punctuation_to_space) # replace punctuation characters with spaces
                .str.split() # split on whitespace runs (NaN-safe, unlike a plain lambda)
                .str.join(' ') # re-join with single spaces; missing titles stay missing
    )

def preprocess_authors(df):
    """Return canonical author strings for the rows of ``df`` that have authors.

    Only rows whose ``authors`` value is not missing are returned (the result
    keeps their original index labels). For each of them, HTML character
    references are decoded and the ``', '``-separated names are re-joined in
    alphabetical order, so the same set of authors always yields the same string.
    """
    def _canonical(raw_authors):
        # decode HTML-escaped characters, then order the names alphabetically
        names = html.unescape(raw_authors).split(', ')
        names.sort()
        return ', '.join(names)

    has_authors = df.authors.notna()
    return df.loc[has_authors, 'authors'].map(_canonical)

In [3]:
def title_match_accuracy(title_A, title_B):
    """Word-level overlap between two titles, in the range [0, 1].

    Both titles are split on single spaces into sets of words; the score is
    the number of shared words divided by the size of the smaller set, so a
    title whose words are all contained in the other scores 1.0.
    """
    words_a, words_b = (set(title.split(" ")) for title in (title_A, title_B))
    shared_count = len(words_a & words_b)
    smaller_size = min(len(words_a), len(words_b))
    return shared_count / smaller_size

def authors_match_accuracy(authors_A, authors_B):
    """Overlap between two ``', '``-separated author lists, in the range [0, 1].

    The score is the number of author names present in both lists divided by
    the size of the smaller list.

    A missing author list (``None``, ``NaN``, ``pd.NA``) gives no evidence
    for a match, so the score is 0.0 whenever either argument is missing.
    Without this guard the ``str()`` coercion below turns two missing values
    into the identical string ``'nan'`` and reports a perfect match.

    Parameters
    ----------
    authors_A, authors_B : str or missing value
        Author lists with names separated by ``', '``. Other non-missing
        values are coerced with ``str()``.

    Returns
    -------
    float
        Overlap score; 0.0 if either list is missing.
    """
    for authors in (authors_A, authors_B):
        # is_scalar guard keeps pd.isna from returning an array for list-like input
        if pd.api.types.is_scalar(authors) and pd.isna(authors):
            return 0.0

    authors_A = set(str(authors_A).split(", "))
    authors_B = set(str(authors_B).split(", "))

    intersection = authors_A & authors_B
    normalization_factor = min(len(authors_A), len(authors_B))
    return len(intersection) / normalization_factor

In [4]:
# Load the ACM records and use their `id` column as the index
dfa = pd.read_csv('DBLP-ACM/ACM.csv').set_index('id')
# Normalize titles for every row; author strings only where authors are present
dfa.loc[:, 'title'] = preprocess_titles(dfa)
dfa.loc[dfa.authors.notna(), 'authors'] = preprocess_authors(dfa)
dfa

Unnamed: 0_level_0,title,authors,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
304586,the wasa2 object oriented workflow management ...,"Gottfried Vossen, Mathias Weske",International Conference on Management of Data,1999
304587,a user centered interface for querying distrib...,"Isabel F. Cruz, Kimberly M. James",International Conference on Management of Data,1999
304589,world wide database integrating the web corba ...,"Athman Bouguettaya, Boualem Benatallah, James ...",International Conference on Management of Data,1999
304590,xml based information mediation with mix,"Amarnath Gupta, Bertram Ludäscher, Chaitan Bar...",International Conference on Management of Data,1999
304582,the ccube constraint object oriented database ...,"Alexander Brodsky, Jia Chen, Paval A. Exarkhop...",International Conference on Management of Data,1999
...,...,...,...,...
672977,dual buffering strategies in object bases,"Alfons Kemper, Donald Kossmann",Very Large Data Bases,1994
950482,guest editorial,"Philip A. Bernstein, Raghu Ramakrishnan, Yanni...",The VLDB Journal &mdash; The International Jou...,2003
672980,graphdb modeling and querying graphs in databases,Ralf Hartmut Güting,Very Large Data Bases,1994
945741,review of the data warehouse toolkit the compl...,Alexander A. Anisimov,ACM SIGMOD Record,2003


In [5]:
# Load the DBLP records (file is read as latin_1) and use their `id` column as the index
dfb = pd.read_csv('DBLP-ACM/DBLP2.csv', encoding='latin_1').set_index('id')
# Normalize titles for every row; author strings only where authors are present
dfb.loc[:, 'title'] = preprocess_titles(dfb)
dfb.loc[dfb.authors.notna(), 'authors'] = preprocess_authors(dfb)
dfb

Unnamed: 0_level_0,title,authors,venue,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
journals/sigmod/Mackay99,semantic integration of environmental models f...,D. Scott Mackay,SIGMOD Record,1999
conf/vldb/PoosalaI96,estimation of query result distribution and it...,"Viswanath Poosala, Yannis E. Ioannidis",VLDB,1996
conf/vldb/PalpanasSCP02,incremental maintenance for non distributive a...,"Hamid Pirahesh, Richard Sidle, Roberta Cochran...",VLDB,2002
conf/vldb/GardarinGT96,cost based selection of path expression proces...,"Georges Gardarin, Jean-Robert Gruser, Zhao-Hui...",VLDB,1996
conf/vldb/HoelS95,benchmarking spatial join operations with spat...,"Erik G. Hoel, Hanan Samet",VLDB,1995
...,...,...,...,...
journals/tods/KarpSP03,a simple algorithm for finding frequent elemen...,"Christos H. Papadimitriou, Richard M. Karp, Sc...",ACM Trans. Database Syst.,2003
conf/vldb/LimWV03,sash a self adaptive histogram set for dynamic...,"Jeffrey Scott Vitter, Lipyeow Lim, Min Wang",VLDB,2003
journals/tods/ChakrabartiKMP02,locally adaptive dimensionality reduction for ...,"Eamonn J. Keogh, Kaushik Chakrabarti, Michael ...",ACM Trans. Database Syst.,2002
journals/sigmod/Snodgrass01,chair s message,Richard T. Snodgrass,SIGMOD Record,2001


In [6]:
# Baseline kept for reference: exact title equality within each year block.
#matches = []
#for year in dfa.year.unique(): # blocking the data by year
#    dfa_block = dfa.loc[dfa.year == year, 'title']
#    dfb_block = dfb.loc[dfb.year == year, 'title']
#    for index, title_a in dfa_block.items(): # with all records from one block
#        for match in dfb_block.index[dfb_block == title_a].tolist(): # finding perfect matches within the other block
#            matches.append([match, index])

# Decision thresholds for the fuzzy matcher below.
TITLE_ACCEPT = 0.95             # title similarity alone is enough
TITLE_WITH_SOME_AUTHORS = 0.85  # title similarity that also needs AUTHORS_SOME overlap
TITLE_WITH_ALL_AUTHORS = 0.75   # lowest usable title similarity; needs AUTHORS_ALL overlap
AUTHORS_SOME = 0.5
AUTHORS_ALL = 1.0

matches = []
for year in dfa.year.unique(): # blocking the data by year
    dfa_block = dfa.loc[dfa.year == year, ["title", "authors"]]
    dfb_block = dfb.loc[dfb.year == year, ["title", "authors"]]
    # Materialize the second block once per year instead of re-running
    # iterrows() on it for every record of the first block.
    dfb_rows = list(zip(dfb_block.index, dfb_block["title"], dfb_block["authors"]))
    for index_a, title_a, author_a in zip(dfa_block.index, dfa_block["title"], dfa_block["authors"]): # with all records from first block
        for index_b, title_b, author_b in dfb_rows: # compare to all records from second block
            # score_cutoff makes ratio() return 0 for anything below the lowest usable threshold
            title_match = Levenshtein.ratio(title_a, title_b, score_cutoff=TITLE_WITH_ALL_AUTHORS)
            if title_match < TITLE_WITH_ALL_AUTHORS:
                continue # no rule can accept this pair, so skip the author comparison

            if title_match >= TITLE_ACCEPT:
                matches.append([index_b, index_a])
                continue

            # Weaker title similarity must be backed by author overlap.
            author_match = authors_match_accuracy(author_a, author_b)
            if title_match >= TITLE_WITH_SOME_AUTHORS and author_match >= AUTHORS_SOME:
                matches.append([index_b, index_a])
            elif author_match >= AUTHORS_ALL:
                matches.append([index_b, index_a])

In [7]:
# One row per predicted (DBLP, ACM) pair; every predicted pair is labelled pred = 1
df = pd.DataFrame(matches, columns=['idDBLP', 'idACM']).assign(pred=1)
df

Unnamed: 0,idDBLP,idACM,pred
0,conf/sigmod/VossenW99,304586,1
1,conf/sigmod/CruzJ99,304587,1
2,conf/sigmod/BouguettayaBH99,304589,1
3,conf/sigmod/BaruGLMPVC99,304590,1
4,conf/sigmod/BrodskySCE99,304582,1
...,...,...,...
2230,journals/vldb/ChanGR03,950484,1
2231,journals/vldb/RamamurthyDS03,950483,1
2232,journals/vldb/AtluriJY03,950482,1
2233,journals/vldb/BernsteinIR03,950482,1


In [8]:
# Reference mapping of matching pairs; every listed pair is labelled true = 1
pm = pd.read_csv('DBLP-ACM/DBLP-ACM_perfectMapping.csv').assign(true=1)
pm

Unnamed: 0,idDBLP,idACM,true
0,conf/sigmod/SlivinskasJS01,375678,1
1,conf/sigmod/ChaudhuriDN01,375694,1
2,conf/sigmod/RinfretOO01,375669,1
3,conf/sigmod/BreunigKKS01,375672,1
4,conf/sigmod/JagadishJOT01,375687,1
...,...,...,...
2219,journals/sigmod/Scholl01,604275,1
2220,journals/sigmod/Rosneblatt94,190649,1
2221,journals/sigmod/Winslett02b,601871,1
2222,journals/sigmod/Labrinidis01,604283,1


In [9]:
# Outer join of predictions and reference pairs on their shared columns.
# A pair found in only one frame gets NaN in the other frame's label column;
# filling with 0 turns those rows into false positives (true = 0) and
# false negatives (pred = 0).
full = df.merge(pm, how='outer').fillna(0)
full

Unnamed: 0,idDBLP,idACM,pred,true
0,conf/sigmod/VossenW99,304586,1.0,1.0
1,conf/sigmod/CruzJ99,304587,1.0,1.0
2,conf/sigmod/BouguettayaBH99,304589,1.0,1.0
3,conf/sigmod/BaruGLMPVC99,304590,1.0,1.0
4,conf/sigmod/BrodskySCE99,304582,1.0,1.0
...,...,...,...,...
2281,journals/sigmod/Scholl01,604275,0.0,1.0
2282,journals/sigmod/Rosneblatt94,190649,0.0,1.0
2283,journals/sigmod/Winslett02b,601871,0.0,1.0
2284,journals/sigmod/Labrinidis01,604283,0.0,1.0


In [10]:
# Precision, recall and F1 of the predicted pairs against the reference labels
pr, re, f1 = (
    metric(full.true, full.pred)
    for metric in (precision_score, recall_score, f1_score)
)
pr, re, f1

(0.9722595078299776, 0.977068345323741, 0.9746579950661584)

Baseline with exact title match (precision, recall, F1): (0.9796806966618288, 0.9105215827338129, 0.9438359356793289)