In [1]:
import pandas as pd
import numpy as np

# Data Preparation

## Schema Mapping

In [143]:
df1 = pd.read_csv('ACM_1995_2004.csv', sep='|')
print(len(df1))
df1.dropna(ignore_index=True, inplace=True)
print(len(df1))
df1.head()

2902
2846


Unnamed: 0,PaperID,Title,Authors,Venue,Year
0,5390972920f70186a0dfac85,The next database revolution,Jim Gray,SIGMOD '04 Proceedings of the 2004 ACM SIGMOD ...,2004
1,5390972920f70186a0dfac86,The role of cryptography in database security,Ueli Maurer,SIGMOD '04 Proceedings of the 2004 ACM SIGMOD ...,2004
2,5390972920f70186a0dfac8d,Tree logical classes for efficient evaluation ...,"Stelios Paparizos, Yuqing Wu, Laks V. S. Laksh...",SIGMOD '04 Proceedings of the 2004 ACM SIGMOD ...,2004
3,5390972920f70186a0dfac88,Adaptive stream resource management using Kalm...,"Ankur Jain, Edward Y. Chang, Yuan-Fang Wang",SIGMOD '04 Proceedings of the 2004 ACM SIGMOD ...,2004
4,5390972920f70186a0dfac8a,Holistic UDAFs at streaming speeds,"Graham Cormode, Theodore Johnson, Flip Korn, S...",SIGMOD '04 Proceedings of the 2004 ACM SIGMOD ...,2004


In [144]:
df2 = pd.read_csv('DBLP_1995_2004.csv', sep='|')
print(len(df2))
df2.dropna(ignore_index=True, inplace=True)
print(len(df2))
df2.head()

2153
2130


Unnamed: 0,PaperID,Title,Authors,Venue,Year
0,53e9a515b7602d9702e350a0,An initial study of overheads of eddies.,Amol Deshpande,SIGMOD Record,2004
1,53e9b275b7602d9703d174f6,Engineering Federated Information Systems: Rep...,"Stefan Conrad, Wilhelm Hasselbring, Uwe Hohens...",SIGMOD Record,1999
2,53e9a5beb7602d9702eea180,Information Finding in a Digital Library: The ...,"Tak W. Yan, Hector Garcia-Molina",SIGMOD Record,1995
3,53e99800b7602d970200b618,Editor's Notes.,Jennifer Widom,SIGMOD Record,1995
4,53e9a718b7602d970304d814,Report on the 5th international workshop on th...,"Hans-Joachim Lenz, Panos Vassiliadis, Manfred ...",SIGMOD Record,2003


In [145]:
def preprocessing(df):
    df['Title'] = (df['Title'].str.lower()
                    .replace("[^a-z0-9]", " ", regex=True)  
                    .replace(" +", " ", regex=True)         
                    .str.strip())
    df['Authors'] = (df['Authors']
                     .str.lower()
                     .replace("[^a-z0-9]", " ", regex=True)
                     .replace(" +", " ", regex=True)
                     .str.strip())  

    df['Venue'] = (df['Venue']
                   .str.lower()
                   .replace(" +", " ", regex=True)
                   .str.strip())

In [146]:
preprocessing(df1)
df1.index.names=['index_acm']
df1.columns = ['paperID_acm', 'title_acm', 'authors_acm', 'venue_acm', 'year_acm']
preprocessing(df2)
df2.columns = ['paperID_dblp', 'title_dblp', 'authors_dblp', 'venue_dblp', 'year_dblp']
df2.index.names=['index_dblp']

- Contradictions & wrong values, Missing Values, Ref. Integrity, Typos
- data validation

# Blocking

In [147]:
from collections import defaultdict
import itertools

def blocking_w_pairs(df1, df2, cols=['year_acm', 'year_dblp']):

    b1 = defaultdict(list)
    b2 = defaultdict(list)
    
    for idx, key in df1[cols[0]].items():
        if key:
            b1[key].append(idx)
    
    for idx, key in df2[cols[1]].items():
        if key:
            b2[key].append(idx)
            
    pairs = [list(pair) for key in b1.keys() for pair in list(itertools.product(b1[key], b2[key]))]
    
    return b1, b2, np.array(pairs)

In [153]:
from nltk.tokenize import word_tokenize

def token_blocking(df1, df2, stop_words: set):

    blocks1 = defaultdict(list)
    blocks2 = defaultdict(list)
    
    for idx, row in enumerate(df1.itertuples()):

        string = " ".join([str(value) for value in row if not pd.isna(value)])
        tokens = set(
            [word for word in word_tokenize(string) if word not in stop_words]
        )

        for token in tokens:
            blocks1[token].append(idx)
            
    for idx, row in enumerate(df2.itertuples()):

        string = " ".join([str(value) for value in row if not pd.isna(value)])
        tokens = set(
            [word for word in word_tokenize(string) if word not in stop_words]
        )

        for token in tokens:
            blocks2[token].append(idx)
            

    blocks1 = {
        key: indices
        for key, indices in blocks1.items()
        if len(indices) < 1000 and len(indices) > 1
    }
    blocks2 = {
        key: indices
        for key, indices in blocks2.items()
        if len(indices) < 1000 and len(indices) > 1
    }


    pairs = [list(pair) for key in (blocks1.keys()& blocks2.keys()) for pair in list(itertools.product(blocks1[key], blocks2[key]))]

    return np.array(pairs)

In [154]:
import string
import nltk
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english') + list(string.punctuation))
token_blocks = token_blocking(df1[['title_acm', 'authors_acm']], df2[['title_dblp', 'authors_dblp']], stop_words)

In [234]:
len(np.unique(token_blocks, axis=0))

981190

In [25]:
b1, b2, pairs_year = blocking_w_pairs(df1, df2)
pairs_year.shape

(638200, 2)

In [157]:
df_blocking = pd.concat([df1.loc[pairs_year[:, 0]].reset_index(), 
                      df2.loc[pairs_year[:, 1]].reset_index()], axis=1)
df_blocking.head()

Unnamed: 0,index,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index.1,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp
0,389,53909f8c20f70186a0e3fe31,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb '03 proceedings of the 29th international...,2003,1268,53e9bae6b7602d97047129f6,efficient algorithms for minimizing tree patte...,prakash ramanan,sigmod conference,2002
1,389,53909f8c20f70186a0e3fe31,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb '03 proceedings of the 29th international...,2003,2033,53e99e6ab7602d9702732379,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb,2003
2,1784,5390882d20f70186a0d8dabb,efficient algorithms for minimizing tree patte...,prakash ramanan,proceedings of the 2002 acm sigmod internation...,2002,1268,53e9bae6b7602d97047129f6,efficient algorithms for minimizing tree patte...,prakash ramanan,sigmod conference,2002
3,1784,5390882d20f70186a0d8dabb,efficient algorithms for minimizing tree patte...,prakash ramanan,proceedings of the 2002 acm sigmod internation...,2002,2033,53e99e6ab7602d9702732379,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb,2003
4,940,539087dd20f70186a0d634a1,extraction of object oriented structures from ...,shekar ramanathan julia hodges,acm sigmod record,1997,98,53e9b40eb7602d9703f03182,extraction of object oriented structures from ...,shekar ramanathan julia e hodges,sigmod record,1997


In [34]:
df_blocking.to_csv('pairs_by_year.csv')

# Matching

In [237]:
import re

def matching(df, sim='jaccard', weights=[0.33, 0.33, 0.33]):
    if sim == 'jaccard':
        df['jaccard_sim'] = df.apply(lambda x: weights[0] * jaccard_sim(x.title_acm, x.title_dblp) 
                                     + weights[1] * jaccard_sim(x.authors_acm, x.authors_dblp)
                                     + weights[2] * int(x.year_acm==x.year_dblp)
                                        , axis=1)
    elif sim == 'trigram':
        df['trigram_sim'] = df.apply(lambda x: weights[0] * trigram_sim(get_ngrams(x.title_acm), get_ngrams(x.title_dblp))
                                     + weights[1] * trigram_sim(get_ngrams(x.authors_acm), get_ngrams(x.authors_dblp))
                                     + weights[2] * int(x.year_acm==x.year_dblp)
                                        , axis=1)
def jaccard_sim(s1, s2):
    s_intersection = set(set(s1.split()).intersection(set(s2.split())))
    s_union = set(s1.split()).union(set(s2.split()))
    return len(s_intersection) / len(s_union)


def trigram_sim(ngram_1, ngram_2):
    return 2 * len(ngram_1.intersection(ngram_2)) / (len(ngram_1)+len(ngram_2))


def get_ngrams(text, number=3):
    if not text:
        return set()
    text = ' ' * (number-1) + text + ' ' * (number-1)
    ngrams = set()
    for x in range(0, len(text) - number + 1):
        ngrams.add(text[x:x+number])
    return ngrams

In [257]:
matching(df_blocking)
# matching(df_blocking, 'trigram')
df_blocking.head()

Unnamed: 0,index_acm,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index_dblp,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp,jaccard_sim,trigram_sim
0,389,53909f8c20f70186a0e3fe31,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb '03 proceedings of the 29th international...,2003,1268,53e9bae6b7602d97047129f6,efficient algorithms for minimizing tree patte...,prakash ramanan,sigmod conference,2002,0.380769,0.394545
1,389,53909f8c20f70186a0e3fe31,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb '03 proceedings of the 29th international...,2003,2033,53e99e6ab7602d9702732379,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb,2003,0.99,1.0
2,1784,5390882d20f70186a0d8dabb,efficient algorithms for minimizing tree patte...,prakash ramanan,proceedings of the 2002 acm sigmod internation...,2002,1268,53e9bae6b7602d97047129f6,efficient algorithms for minimizing tree patte...,prakash ramanan,sigmod conference,2002,0.99,1.0
3,1784,5390882d20f70186a0d8dabb,efficient algorithms for minimizing tree patte...,prakash ramanan,proceedings of the 2002 acm sigmod internation...,2002,2033,53e99e6ab7602d9702732379,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb,2003,0.380769,0.394545
4,940,539087dd20f70186a0d634a1,extraction of object oriented structures from ...,shekar ramanathan julia hodges,acm sigmod record,1997,98,53e9b40eb7602d9703f03182,extraction of object oriented structures from ...,shekar ramanathan julia e hodges,sigmod record,1997,0.924,0.981818


In [28]:
all_pairs = np.array([list(pair) for pair in itertools.product(df1.index, df2.index)])
all_pairs_df = pd.concat([df1.loc[all_pairs[:, 0]].reset_index(), 
                      df2.loc[all_pairs[:, 1]].reset_index()], axis=1)
all_pairs_df.head()

Unnamed: 0,index_acm,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index_dblp,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp
0,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,0,53e9a515b7602d9702e350a0,an initial study of overheads of eddies,amol deshpande,sigmod record,2004
1,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,1,53e9b275b7602d9703d174f6,engineering federated information systems repo...,stefan conrad wilhelm hasselbring uwe hohenste...,sigmod record,1999
2,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,2,53e9a5beb7602d9702eea180,information finding in a digital library the s...,tak w yan hector garcia molina,sigmod record,1995
3,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,3,53e99800b7602d970200b618,editor s notes,jennifer widom,sigmod record,1995
4,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,4,53e9a718b7602d970304d814,report on the 5th international workshop on th...,hans joachim lenz panos vassiliadis manfred a ...,sigmod record,2003


In [258]:
matching(all_pairs_df)
# matching(all_pairs_df, 'trigram', weights=[0.4, 0.3, 0.3])
all_pairs_df.head()

Unnamed: 0,index_acm,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index_dblp,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp,jaccard_sim,trigram_sim
0,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,0,53e9a515b7602d9702e350a0,an initial study of overheads of eddies,amol deshpande,sigmod record,2004,0.33,0.3
1,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,1,53e9b275b7602d9703d174f6,engineering federated information systems repo...,stefan conrad wilhelm hasselbring uwe hohenste...,sigmod record,1999,0.0,0.032
2,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,2,53e9a5beb7602d9702eea180,information finding in a digital library the s...,tak w yan hector garcia molina,sigmod record,1995,0.0275,0.051064
3,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,3,53e99800b7602d970200b618,editor s notes,jennifer widom,sigmod record,1995,0.0,0.023077
4,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,4,53e9a718b7602d970304d814,report on the 5th international workshop on th...,hans joachim lenz panos vassiliadis manfred a ...,sigmod record,2003,0.019412,0.073711


In [168]:
all_pairs_df.to_csv('all_pairs.csv')

# Evaluation

In [259]:
def f1_evaluation(df, bs_df):
    
    tp = len(pd.merge(df, bs_df, how ='inner', on =['index_acm', 'index_dblp']))
    fp = len(df) - tp
    fn = len(bs_df) - tp
    print(tp, len(bs_df), len(df))
    precision = (tp / (tp+fp) ) if (tp + fp) > 0 else 0
    recall = (tp / (tp+fn) ) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall)
    
    return f1, precision, recall

In [260]:
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
for threshold in thresholds:

    # bs_df_trigram = all_pairs_df[all_pairs_df.trigram_sim>threshold].sort_values(by=['trigram_sim'], ascending=False)
    # match_df_trigram = df_blocking[df_blocking.trigram_sim>threshold].sort_values(by=['trigram_sim'], ascending=False)
    # f1, prec, rec = f1_evaluation(match_df_trigram, bs_df_trigram)
    # print(f'Trigram: threshold: {threshold}, f1: {f1}, precision: {prec}, recall: {rec} \n')

    bs_df_jaccard = all_pairs_df[all_pairs_df.jaccard_sim>threshold].sort_values(by=['jaccard_sim'], ascending=False)
    match_df_jaccard = df_blocking[df_blocking.jaccard_sim>threshold].sort_values(by=['jaccard_sim'], ascending=False)
    f1, prec, rec = f1_evaluation(match_df_jaccard, bs_df_jaccard)
    print(f'Jaccard: threshold: {threshold}, f1: {f1}, precision: {prec}, recall: {rec} \n')


2402 2403 2402
Jaccard: threshold: 0.5, f1: 0.9997918834547346, precision: 1.0, recall: 0.9995838535164377 

2132 2132 2132
Jaccard: threshold: 0.55, f1: 1.0, precision: 1.0, recall: 1.0 

2032 2032 2032
Jaccard: threshold: 0.6, f1: 1.0, precision: 1.0, recall: 1.0 

1952 1952 1952
Jaccard: threshold: 0.65, f1: 1.0, precision: 1.0, recall: 1.0 

1769 1769 1769
Jaccard: threshold: 0.7, f1: 1.0, precision: 1.0, recall: 1.0 

1699 1699 1699
Jaccard: threshold: 0.75, f1: 1.0, precision: 1.0, recall: 1.0 

1620 1620 1620
Jaccard: threshold: 0.8, f1: 1.0, precision: 1.0, recall: 1.0 

1544 1544 1544
Jaccard: threshold: 0.85, f1: 1.0, precision: 1.0, recall: 1.0 

1435 1435 1435
Jaccard: threshold: 0.9, f1: 1.0, precision: 1.0, recall: 1.0 

1207 1207 1207
Jaccard: threshold: 0.95, f1: 1.0, precision: 1.0, recall: 1.0 



In [240]:
df_blocking.drop_duplicates(inplace=True, ignore_index=True)

In [227]:
df_blocking[df_blocking.trigram_sim>0.8].sort_values(by=['trigram_sim'], ascending=False)

Unnamed: 0,index_acm,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index_dblp,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp,jaccard_sim,trigram_sim
1,389,53909f8c20f70186a0e3fe31,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb '03 proceedings of the 29th international...,2003,2033,53e99e6ab7602d9702732379,covering indexes for xml queries bisimulation ...,prakash ramanan,vldb,2003,1.000000,1.000000
83419,2322,53908a9620f70186a0da4e2d,using referential integrity to easily define c...,brad hammond,vldb '96 proceedings of the 22th international...,1996,1448,53e9b295b7602d9703d3ac79,using referential integrity to easily define c...,brad hammond,vldb,1996,1.000000,1.000000
84148,674,539087cf20f70186a0d5b535,mapping extended entity relationship model to ...,joseph fong,acm sigmod record,1995,1527,53e9a55cb7602d9702e83be7,mapping extended entity relationship model to ...,joseph fong,sigmod record,1995,1.000000,1.000000
84132,2208,53908a9620f70186a0da47eb,telcordia s database reconciliation and data q...,francesco caruso munir cochinwala uma ganapath...,vldb '00 proceedings of the 26th international...,2000,2031,53e9b65bb7602d97041bb03f,telcordia s database reconciliation and data q...,francesco caruso munir cochinwala uma ganapath...,vldb,2000,1.000000,1.000000
84125,1175,539087e720f70186a0d6968d,olympic records for data at the 1998 nagano games,edwin r lassettre,sigmod '98 proceedings of the 1998 acm sigmod ...,1998,766,53e9bc21b7602d970488dcdb,olympic records for data at the 1998 nagano games,edwin r lassettre,sigmod conference,1998,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62875,899,539087d920f70186a0d60e56,mining optimized association rules for numeric...,takeshi fukuda yasuhido morimoto shinichi mori...,pods '96 proceedings of the fifteenth acm siga...,1996,180,53e9b5b6b7602d97040fa1e4,constructing efficient decision trees by using...,takeshi fukuda yasuhiko morimoto shinichi mori...,vldb,1996,0.648077,0.806108
82595,608,53909f8c20f70186a0e40216,progressive optimization in action,vijayshankar raman volker markl david simmen g...,vldb '04 proceedings of the thirtieth internat...,2004,1989,53e9abbfb7602d97035713f9,robust query processing through progressive op...,volker markl vijayshankar raman david e simmen...,sigmod conference,2004,0.650000,0.804533
55634,2724,53908d6520f70186a0dd1a40,a system for watermarking relational databases,rakesh agrawal peter j haas jerry kiernan,proceedings of the 2003 acm sigmod internation...,2003,1330,53e9b130b7602d9703baf9c8,watermarking relational data framework algorit...,rakesh agrawal peter j haas jerry kiernan,vldb j.,2003,0.672727,0.803636
57215,2779,5390958a20f70186a0def8e2,watermarking relational data framework algorit...,rakesh agrawal peter j haas jerry kiernan,the vldb journal — the international journal o...,2003,1478,53e9bb0fb7602d970474b559,a system for watermarking relational databases,rakesh agrawal peter j haas jerry kiernan,sigmod conference,2003,0.672727,0.803636


In [244]:
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
print('Trigram matching \n')
for threshold in thresholds:

    bs_df_trigram = all_pairs_df[all_pairs_df.trigram_sim>threshold].sort_values(by=['trigram_sim'], ascending=False)
    match_df_trigram = df_blocking[df_blocking.trigram_sim>threshold].sort_values(by=['trigram_sim'], ascending=False)
    f1, prec, rec = f1_evaluation(match_df_trigram, bs_df_trigram)
    print(f'Threshold: {threshold}, f1: {f1}, precision: {prec}, recall: {rec} \n')

Trigram matching 

3586 3592 3586
Threshold: 0.5, f1: 0.9991641125661744, precision: 1.0, recall: 0.9983296213808464 

2547 2548 2547
Threshold: 0.55, f1: 0.9998037291462217, precision: 1.0, recall: 0.999607535321821 

2163 2163 2163
Threshold: 0.6, f1: 1.0, precision: 1.0, recall: 1.0 

1981 1981 1981
Threshold: 0.65, f1: 1.0, precision: 1.0, recall: 1.0 

1839 1839 1839
Threshold: 0.7, f1: 1.0, precision: 1.0, recall: 1.0 

1782 1782 1782
Threshold: 0.75, f1: 1.0, precision: 1.0, recall: 1.0 

1732 1732 1732
Threshold: 0.8, f1: 1.0, precision: 1.0, recall: 1.0 

1701 1701 1701
Threshold: 0.85, f1: 1.0, precision: 1.0, recall: 1.0 

1635 1635 1635
Threshold: 0.9, f1: 1.0, precision: 1.0, recall: 1.0 

1509 1509 1509
Threshold: 0.95, f1: 1.0, precision: 1.0, recall: 1.0 



In [245]:
thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
print('Jaccard matching \n')
for threshold in thresholds:

    bs_df_jaccard = all_pairs_df[all_pairs_df.jaccard_sim>threshold].sort_values(by=['jaccard_sim'], ascending=False)
    match_df_jaccard = df_blocking[df_blocking.jaccard_sim>threshold].sort_values(by=['jaccard_sim'], ascending=False)
    f1, prec, rec = f1_evaluation(match_df_jaccard, bs_df_jaccard)
    print(f'Threshold: {threshold}, f1: {f1}, precision: {prec}, recall: {rec} \n')

Jaccard matching 

2214 2215 2214
Threshold: 0.5, f1: 0.9997742153985097, precision: 1.0, recall: 0.999548532731377 

2077 2077 2077
Threshold: 0.55, f1: 1.0, precision: 1.0, recall: 1.0 

1931 1931 1931
Threshold: 0.6, f1: 1.0, precision: 1.0, recall: 1.0 

1839 1839 1839
Threshold: 0.65, f1: 1.0, precision: 1.0, recall: 1.0 

1740 1740 1740
Threshold: 0.7, f1: 1.0, precision: 1.0, recall: 1.0 

1705 1705 1705
Threshold: 0.75, f1: 1.0, precision: 1.0, recall: 1.0 

1633 1633 1633
Threshold: 0.8, f1: 1.0, precision: 1.0, recall: 1.0 

1574 1574 1574
Threshold: 0.85, f1: 1.0, precision: 1.0, recall: 1.0 

1447 1447 1447
Threshold: 0.9, f1: 1.0, precision: 1.0, recall: 1.0 

1257 1257 1257
Threshold: 0.95, f1: 1.0, precision: 1.0, recall: 1.0 



In [249]:
all_pairs_df[all_pairs_df.trigram_sim>0.8].sort_values(by=['trigram_sim'], ascending=False)

Unnamed: 0,index_acm,paperID_acm,title_acm,authors_acm,venue_acm,year_acm,index_dblp,paperID_dblp,title_dblp,authors_dblp,venue_dblp,year_dblp,jaccard_sim,trigram_sim
1984,0,5390972920f70186a0dfac85,the next database revolution,jim gray,sigmod '04 proceedings of the 2004 acm sigmod ...,2004,1984,53e99aecb7602d9702373cec,the next database revolution,jim gray,sigmod conference,2004,1.000000,1.000000
4544433,2133,53908a7420f70186a0da4672,relational databases for querying xml document...,jayavel shanmugasundaram kristin tufte chun zh...,vldb '99 proceedings of the 25th international...,1999,1143,53e9b137b7602d9703bbcc97,relational databases for querying xml document...,jayavel shanmugasundaram kristin tufte chun zh...,vldb,1999,1.000000,1.000000
4486290,2106,53908a7420f70186a0da4675,industrial panel on data warehousing technolog...,umeshwar dayal,vldb '99 proceedings of the 25th international...,1999,510,53e9acf0b7602d97036cda05,industrial panel on data warehousing technolog...,umeshwar dayal,vldb,1999,1.000000,1.000000
4484441,2105,53908a7420f70186a0da4670,xml repository and active views demonstration,serge abiteboul vincent aguilera s bastien ail...,vldb '99 proceedings of the 25th international...,1999,791,53e9bb6cb7602d97047ac179,xml repository and active views demonstration,serge abiteboul vincent aguilera s bastien ail...,vldb,1999,1.000000,1.000000
4482005,2104,53908a7420f70186a0da4669,in cyber space no one can hear you scream,chris pound,vldb '99 proceedings of the 25th international...,1999,485,53e99df7b7602d97026b7679,in cyber space no one can hear you scream,chris pound,vldb,1999,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915050,899,539087d920f70186a0d60e56,mining optimized association rules for numeric...,takeshi fukuda yasuhido morimoto shinichi mori...,pods '96 proceedings of the fifteenth acm siga...,1996,180,53e9b5b6b7602d97040fa1e4,constructing efficient decision trees by using...,takeshi fukuda yasuhiko morimoto shinichi mori...,vldb,1996,0.648077,0.806108
1297029,608,53909f8c20f70186a0e40216,progressive optimization in action,vijayshankar raman volker markl david simmen g...,vldb '04 proceedings of the thirtieth internat...,2004,1989,53e9abbfb7602d97035713f9,robust query processing through progressive op...,volker markl vijayshankar raman david e simmen...,sigmod conference,2004,0.650000,0.804533
5803450,2724,53908d6520f70186a0dd1a40,a system for watermarking relational databases,rakesh agrawal peter j haas jerry kiernan,proceedings of the 2003 acm sigmod internation...,2003,1330,53e9b130b7602d9703baf9c8,watermarking relational data framework algorit...,rakesh agrawal peter j haas jerry kiernan,vldb j.,2003,0.672727,0.803636
5920748,2779,5390958a20f70186a0def8e2,watermarking relational data framework algorit...,rakesh agrawal peter j haas jerry kiernan,the vldb journal — the international journal o...,2003,1478,53e9bb0fb7602d970474b559,a system for watermarking relational databases,rakesh agrawal peter j haas jerry kiernan,sigmod conference,2003,0.672727,0.803636
