In [3]:
# Source: https://alchemy.cs.washington.edu/mlns/er/
# Entity Resolution with Markov Logic
# Parag Singla, Pedro Domingos
'''
"The hand-labeled Cora dataset is provided by McCallum2
and has previously been used by Bilenko and Mooney [3]
and others. This dataset is a collection of 1295 different citations
to computer science research papers from the Cora
Computer Science Research Paper Engine. The original
dataset contains only unsegmented citation strings. Bilenko
and Mooney [3] segmented each citation into fields (author,
venue, title, publisher, year, etc.) using an information
extraction system. We used this processed version of
Cora. We further cleaned it up by correcting some labels.
This cleaned version contains references to 132 different research
papers. We used only the three most informative
fields: first author, title and venue (with venue including
conferences, journals, workshops, etc.). We compared the
performance of the algorithms for the task of de-duplicating
citations, authors and venues. For training and testing purposes,
we hand-labeled the field pairs. The labeled data
contains references to 50 authors and 103 venues. After
forming canopies, the total number of match decisions was
61,177."

[3] M. Bilenko and R. Mooney. Adaptive duplicate detection
using learnable string similarity measures. In Proc. KDD-
03, pages 39–48, 2003
'''
print()




In [11]:
# Working directory
wdir = 'data/cora-uwash'

# Read and convert the dataset into a friendlier format

import xml.etree.ElementTree as ET
import os.path
from xml.etree.ElementTree import ParseError

database= []
exit = False
for fpath in [os.path.join(wdir,fname) for fname in ['cora0.txt','cora1.txt','cora2.txt']]:
    d = {}
    f = open(fpath)
    for line in f:
        if line.strip() == '':
            database.append(d)
            d = dict()
        else:
            try:
                root = ET.fromstring('<root>'+line.strip().replace('&','&amp;')+'</root>')
                for child in root:
                    if child.tag == 'meta':
                        for attrname,attrvalue in child.attrib.items():
                            d[attrname] = attrvalue
                    else:
                        d[child.tag] = child.text
            except ParseError as perr:
                print(perr)
                print(line)
                exit = True
                break
    if exit:
        break

In [22]:
# Validation1: "This dataset is a collection of 1295 different citations"
len(database)

1295

In [15]:
import json
json.dump(database,open(os.path.join(wdir,'cora0+1+2.json'),'w'))

In [23]:
import pandas as pd
df = pd.DataFrame(database)

In [47]:
df

Unnamed: 0,address,author,bib_no,class_no,editor,institution,month,note,pages,publisher,ref_no,title,venue,volume,year
0,,"h. drucker, r. schapire, and p. simard.",0001,drucker1992,"in steven j. hanson, jack d. cowan, and c. le...",,,,pages 42-49.,"morgan kaufmann,",0095,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",1993
1,,"h. drucker, r. schapire, and p. simard.",0002,drucker1992,"in steven j. hanson, jack d. cowan, and c. le...",,,,pages 42-49.,"morgan kaufmann,",0096,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",1993
2,san francisco:,"drucker h., schapire r., & simard p.",0003,drucker1992,,,,,,morgan kaufmann.,0097,improving performance in neural networks usin...,in advances in neural information processing ...,,(1992).
3,"palo alto, ca.","drucker h., schapire r., & simard p.",0004,drucker1992,"in hanson, j., cowan, j., & giles, c., editors,",,,,"(pp. 42-49),",morgan kaufmann.,0098,improving performance in neural networks usin...,advances in neural information processing sys...,,(1992).
4,"san mateo, ca.","drucker h., schapire r., & simard p.",0005,drucker1992,"in hanson, s. j., cowan, j. d., & giles, c. l...",,,,,"morgan kaufmann,",0099,improving performance in neural networks usin...,advances in neural information processing sys...,,(1993).
5,,"drucker h., schapire r., & simard p.",0006,drucker1992,,,,,42-49.,,0100,improving the performance in neural networks ...,advances in neural information processing sys...,5,(1993).
6,,"drucker h., schapire r., and simard p.",0007,drucker1992,"in hanson, s. j., cowan, j. d., and giles, c....",,,,pages 42-49.,morgan kaufmann.,0101,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",(1993).
7,"san mateo, ca.","drucker h., schapire r., and simard r.",0008,drucker1992,"in hanson, s. j., cowan, j. d., and giles, c....",,,,"pages 42-49,",morgan kaufmann.,0102,improving performance in neural networks usin...,advances in neural information processing sys...,,1993
8,"san ma-teo, ca.","drucker harris, schapire, robert, and simard p...",0009,drucker1992,,,,,42-49.,morgan kaufmann.,0103,improving performance in neural networks using...,in advances in neural informations processing...,,1993
9,"42-49,","drucker h., schapire r., simard p.",0010,drucker1992,"s. j. hanson, j. cowan, l. giles (eds.),",,,,,morgan kaufmann.,0104,'improving performance in neural networks usin...,in advances in neural information processing ...,,1993


In [28]:
# Validation2: "This cleaned version contains references to 132 different research papers."
# Maybe the cleaning that they talk about eliminates two papers?
print(len(df.class_no.unique()))
df.class_no.unique()

134


array(['drucker1992', 'drucker1992C1', 'freund0000a', 'freund1992a',
       'freund1993cC', 'freund1995e', 'freund1996c', 'freund1997a',
       'freund1997d', 'goldman1990a', 'goldman1993', 'haussler1988C',
       'haussler1994a', 'haussler1994aC2', 'kautz1993', 'kautz1995',
       'kearns1994cryp', 'kearns1994d', 'kearns1994e', 'kearns1996a',
       'kearns1996b', 'kearns1997b', 'lewis1996', 'littlestone',
       'rivest1987d', 'rivest1989', 'rivest1992', 'rivest1992C',
       'rivest1994', 'schapire', 'schapire1988', 'schapire1989',
       'schapire1990', 'schapire1991C2', 'schapire1992C', 'schapire1994',
       'schapire1996', 'schapire1996p', 'schapire1997C', 'schapire1997u',
       'schapire1997um', 'schapire1998', 'schapire1998mm', 'auer1995a',
       'blum1993', 'cesa', 'cesaR', 'cesaJ', 'cohen1998', 'drucker1992C2',
       'feder1995', 'freund1992b', 'freund1993a', 'freund1993b',
       'freund1993c', 'freund1995aC', 'freund1995d', 'freund1995f',
       'freund1996a', 'freund19

In [41]:
'''
"The labeled data
contains references to 50 authors and 103 venues. After
forming canopies, the total number of match decisions was
61,177."
'''
# I'm going to see if the .db files help confirm this
import importlib
import UnionFind
importlib.reload(UnionFind)

In [197]:
uf_authors = UnionFind.UnionFind()
with open(wdir+'/SameAuthor.txt') as f:
    for line in f:
        name1, name2 = line.strip().split(',')
        uf_authors.union(name1,name2)
        
uf_authors.printDisjointSets()
# So this gives only 21 names, which I presume are the first authors only. It's a start. 
# Why wouldn't they give the labeled data containing 50 authors though, if they did indeed label it :/


Disjoint sets: 
{'a_blum_', 'blum_a_', 'avrim_blum_'}
{'schapire_r_e_', 'shapire_r_', 'r_schapire_', 'schapire_', 'robert_schapire_', 'r_', 'robert_e_schapire_', 'schapire_r_', 'schapire_robert_e_', 'shapire_r_e_', 'r_e_schapire_'}
{'n_cesa_bianchi_', 'cesa_bianchi_n_', 'nicolo_cesa_bianchi_'}
{'goldman_s_', 's_a_goldman_', 'goldman_s_a_', 'sally_a_goldman_', 's_goldman_'}
{'kautz_henry_'}
{'d_p_helmbold_', 'david_p_helmhold_', 'david_p_helmbold_', 'helmbold_d_p_', 'd_helmbold_'}
{'p_auer_'}
{'r_l_rivest_', 'rivest_ronald_l_', 'rivest_r_l_', 'rivest_r_', 'r_rivest_', 'ronald_l_rivest_'}
{'littlestone_n_'}
{'freud_y_', 'yoav_freund_', 'freund_y_', 'freund_v_', 'y_freund_', 'freund_yoav_'}
{'d_haussler_', 'haussler_david_', 'haussler_d_', 'david_haussler_'}
{'d_d_lewis_', 'david_d_lewis_', 'lewis_d_d_', 'lewis_d_', 'david_lewis_', 'd_lewis_'}
{'j_p_kearns_'}
{'ehrenfeucht_a_', 'ehrenfeucht_andrzej_', 'andrzej_ehrenfeucht_', 'a_ehrenfeucht_'}
{'eric_bauer_'}
{'kearns_m_', 'micahel_kearns

In [46]:
uf_venues = UnionFind.UnionFind()
with open(wdir+'/SameVenue.txt') as f:
    for line in f:
        name1, name2 = line.strip().split(',')
        uf_venues.union(name1,name2)
        
print("Number of venues: %d" % len(list(uf_venues.getDisjointSets())))
uf_venues.printDisjointSets()
# Ok, this looks better. Promised 103, got 117... Perhaps there are some sets that need merging within these
# Several of these don't make a lot of sense though. 
# Also, how are there singleton sets, given pairs in the file? Checked: some pairs repeat an element twice :/

Number of venues: 117

Disjoint sets: 
{'an_extended_abstract_appeared_proceedings_30th_symposium_foundations_computer_science_'}
{'to_appear_proceedings_eleventh_conference_computational_learning_theory_'}
{'technical_report_ucsc_crl_94_33_'}
{'inform_comput_', 'inf_computation_', 'information_computation_', 'information_control_', 'information_compuation_'}
{'preprint_'}
{'proc_thirteenth_international_conference_machine_learning_', 'proc_13th_int_conf_machine_learning_', 'proc_thirteenth_conf_machine_learning_', 'machine_learning_proceedings_thirteenth_international_conference_', 'proc_thirteenth_international_conference_machine_learning_pp_', 'machine_learning_proceedings_13th_international_conference_', 'proceedings_thirteenth_international_conference_machine_learning_', 'proc_thirteenth_intl_conf_machine_learning_pp_', 'proc_13th_international_conference_machine_learning_', 'proceedings_13rd_international_conference_machine_learning_pp_', 'machine_learning_proceedings_thirteenth_

In [58]:
df[['author','class_no']]
## Applying the following transformation
## 1. remove ' and '
## 2. remove &
## 3. split by ','
authors = df[['author','class_no']]['author'].str.replace(' and ','').str.replace('&','').str.split(',')

In [82]:
authors_with_class = pd.DataFrame({'class_no':df['class_no'],'extracted_author_string':df['author'],'author_list':authors})

In [83]:
authors_with_class

Unnamed: 0,author_list,class_no,extracted_author_string
0,"[h. drucker, r. schapire, p. simard.]",drucker1992,"h. drucker, r. schapire, and p. simard."
1,"[h. drucker, r. schapire, p. simard.]",drucker1992,"h. drucker, r. schapire, and p. simard."
2,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., & simard p."
3,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., & simard p."
4,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., & simard p."
5,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., & simard p."
6,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., and simard p."
7,"[drucker h., schapire r., simard r.]",drucker1992,"drucker h., schapire r., and simard r."
8,"[drucker harris, schapire, robert, simard pa...",drucker1992,"drucker harris, schapire, robert, and simard p..."
9,"[drucker h., schapire r., simard p.]",drucker1992,"drucker h., schapire r., simard p."


In [84]:
titles_with_class = df[['class_no','title']]
titles_with_class

Unnamed: 0,class_no,title
0,drucker1992,improving performance in neural networks usin...
1,drucker1992,improving performance in neural networks usin...
2,drucker1992,improving performance in neural networks usin...
3,drucker1992,improving performance in neural networks usin...
4,drucker1992,improving performance in neural networks usin...
5,drucker1992,improving the performance in neural networks ...
6,drucker1992,improving performance in neural networks usin...
7,drucker1992,improving performance in neural networks usin...
8,drucker1992,improving performance in neural networks using...
9,drucker1992,'improving performance in neural networks usin...


In [87]:
# So, taking stock of the situation, we have 1295 records, each is a citation: a mention of a publication
# The citation refers to the same underlying entity, the publication
# Perhaps we can think of the publication as the (not so) latent entity, whose attributes we are trying to resolve.
# We are trying to resolve the authors, and titles. 
# There are several authors mentions per publication mention, and one title mention per 
# So, we must create predicates for:
# a) The titles  - The ground truth for title1 == title2 is class_no1 == class_no2
# b) The authors - This is trickier. We don't have the ground truth for this explicitly.
# So we attempt to derive it from the authors_with_class dataframe.

In [90]:
#First, let's at least get a complete list of authors:
authors_set = set()
for index, row in authors_with_class.iterrows():
    for author in row['author_list']:
        authors_set.add(author.strip())
print(len(authors_set))
#print(authors_set)

326

In [93]:
# Looks alright. We need to keep a list of references to the citations. The row index in our case.
authors_set = set()
for index, row in authors_with_class.iterrows():
    for author in row['author_list']:
        authors_set.add((index, author.strip()))
len(authors_set)

3524

In [100]:
# Since we will use year to standard block:
years = df[['class_no','year']]
#But looks like it could use some cleaning up
years['year'] = years['year'].str.replace(r'\D+', '').astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [101]:
years

Unnamed: 0,class_no,year
0,drucker1992,1993
1,drucker1992,1993
2,drucker1992,1992
3,drucker1992,1992
4,drucker1992,1993
5,drucker1992,1993
6,drucker1992,1993
7,drucker1992,1993
8,drucker1992,1993
9,drucker1992,1993


In [108]:
'''
Do we have everything we need now? Let's see:
R1: SIMTITLE(P1,P2) ⇒ SAMEPUB(P1,P2)
R2: SAMEAUTHOR(P1,P2) ⇒ SAMEPUB(P1,P2)
R3: SIMNAME(A1,A2) ⇒ SAMEAUTHOR(A1,A2)
R4: COAUTHORS(A1,A2)∧COAUTHORS(A2,A3) ⇒ SAMEAUTHOR(A1,A3)
R5: SAMEAUTHOR(A1,A2)∧SAMEAUTHOR(A2,A3) ⇒SAMEAUTHOR(A1,A3)

I see several problems already.

SAMEAUTHOR(P1,P2) vs SAMEAUTHOR(A1,A2). 
What I wanted to say was haveSameAuthors(P1, P2), not SAMEAUTHOR(P1,P2)

Well, how would you say haveSameAuthors(P1, P2)?
SameAuthors(P1,+A)  -> haveSameAuthors(P1, P2), maybe?

Secondly, rule two seems to be wrong, outright. Having the same authors makes the publication the same? 
Of course not.

There needs to be a predicate tying an author and the publication/citation, no?

This seems a better way to write the model:
// If two publications have similar titles, they are likely the same publication
haveSimTitle(P1,P2) ⇒ SamePub(P1,P2)

// If two publications have similar author strings, they are likely the same publication
haveSimAuthors(P1,P2) ⇒ SamePub(P1,P2)

// If two publications are the same, they must have the same authors
hasAuthor(P1, A1) ^ hasAuthor(P2, A2) ^ SamePub(P1,P2) ^ haveSimilarNames(A1, A2) -> SameAuthor(A1, A2)

// If two publications are the same, their authors must be the same
hasAuthor(P1, A1) ^ hasAuthor(P2, A2) ^ SamePub(P1,P2) ^ haveSimilarNames(A1, A2) -> SameAuthor(A1, A2)

// The Co-occurrence rule: If authors who are known to be the same occur with unresolved authors with similar names,
//     the unresolved authors are likely the SameAuthor
areCoAuthors(A1, A2) ^ areCoAuthors(A3, A4) ^ SameAuthor(A1, A3) -> SameAuthor(A2, A4)

// The transitivity rule
SameAuthor(A1, A2) ^ SameAuthor(A2, A3) -> SameAuthor(A1, A3)

Can we come up with some negative rules?

areCoAuthors -> !SameAuthor
haveDissimilarNames -> !SameAuthor
haveDissimilarTitles -> !SamePub

Notes:
It seems wrong to say that if two publications aren't the same,
then the cross product of the authors of the two publications aren't the same...

Well, come to think of it, it seems wrong to use similar name to say the authors are the same
    because entity disambiguation :/
'''
'''
So, the list of predicates we want is:
Blocking:
Block

Obs:
haveSimTitle
hasAuthor
haveSimilarNames
haveSimilarAuthors
areCoAuthors

Targets and GroundTruth:
SamePub
SameAuthor
'''
print("Continued below...")

Continued below...


In [97]:
df

Unnamed: 0,address,author,bib_no,class_no,editor,institution,month,note,pages,publisher,ref_no,title,venue,volume,year
0,,"h. drucker, r. schapire, and p. simard.",0001,drucker1992,"in steven j. hanson, jack d. cowan, and c. le...",,,,pages 42-49.,"morgan kaufmann,",0095,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",1993
1,,"h. drucker, r. schapire, and p. simard.",0002,drucker1992,"in steven j. hanson, jack d. cowan, and c. le...",,,,pages 42-49.,"morgan kaufmann,",0096,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",1993
2,san francisco:,"drucker h., schapire r., & simard p.",0003,drucker1992,,,,,,morgan kaufmann.,0097,improving performance in neural networks usin...,in advances in neural information processing ...,,(1992).
3,"palo alto, ca.","drucker h., schapire r., & simard p.",0004,drucker1992,"in hanson, j., cowan, j., & giles, c., editors,",,,,"(pp. 42-49),",morgan kaufmann.,0098,improving performance in neural networks usin...,advances in neural information processing sys...,,(1992).
4,"san mateo, ca.","drucker h., schapire r., & simard p.",0005,drucker1992,"in hanson, s. j., cowan, j. d., & giles, c. l...",,,,,"morgan kaufmann,",0099,improving performance in neural networks usin...,advances in neural information processing sys...,,(1993).
5,,"drucker h., schapire r., & simard p.",0006,drucker1992,,,,,42-49.,,0100,improving the performance in neural networks ...,advances in neural information processing sys...,5,(1993).
6,,"drucker h., schapire r., and simard p.",0007,drucker1992,"in hanson, s. j., cowan, j. d., and giles, c....",,,,pages 42-49.,morgan kaufmann.,0101,improving performance in neural networks usin...,advances in neural information processing sys...,"volume 5,",(1993).
7,"san mateo, ca.","drucker h., schapire r., and simard r.",0008,drucker1992,"in hanson, s. j., cowan, j. d., and giles, c....",,,,"pages 42-49,",morgan kaufmann.,0102,improving performance in neural networks usin...,advances in neural information processing sys...,,1993
8,"san ma-teo, ca.","drucker harris, schapire, robert, and simard p...",0009,drucker1992,,,,,42-49.,morgan kaufmann.,0103,improving performance in neural networks using...,in advances in neural informations processing...,,1993
9,"42-49,","drucker h., schapire r., simard p.",0010,drucker1992,"s. j. hanson, j. cowan, l. giles (eds.),",,,,,morgan kaufmann.,0104,'improving performance in neural networks usin...,in advances in neural information processing ...,,1993


In [103]:
dfrelevant = pd.DataFrame({'class_no':df['class_no'], 'authors':df['author'], 
                           'authors_as_list':authors_with_class['author_list'],
                           'title':df['title'],'year':years['year']})
dfrelevant['pubid'] = dfrelevant.index
dfrelevant

Unnamed: 0,authors,authors_as_list,class_no,title,year,pubid
0,"h. drucker, r. schapire, and p. simard.","[h. drucker, r. schapire, p. simard.]",drucker1992,improving performance in neural networks usin...,1993,0
1,"h. drucker, r. schapire, and p. simard.","[h. drucker, r. schapire, p. simard.]",drucker1992,improving performance in neural networks usin...,1993,1
2,"drucker h., schapire r., & simard p.","[drucker h., schapire r., simard p.]",drucker1992,improving performance in neural networks usin...,1992,2
3,"drucker h., schapire r., & simard p.","[drucker h., schapire r., simard p.]",drucker1992,improving performance in neural networks usin...,1992,3
4,"drucker h., schapire r., & simard p.","[drucker h., schapire r., simard p.]",drucker1992,improving performance in neural networks usin...,1993,4
5,"drucker h., schapire r., & simard p.","[drucker h., schapire r., simard p.]",drucker1992,improving the performance in neural networks ...,1993,5
6,"drucker h., schapire r., and simard p.","[drucker h., schapire r., simard p.]",drucker1992,improving performance in neural networks usin...,1993,6
7,"drucker h., schapire r., and simard r.","[drucker h., schapire r., simard r.]",drucker1992,improving performance in neural networks usin...,1993,7
8,"drucker harris, schapire, robert, and simard p...","[drucker harris, schapire, robert, simard pa...",drucker1992,improving performance in neural networks using...,1993,8
9,"drucker h., schapire r., simard p.","[drucker h., schapire r., simard p.]",drucker1992,'improving performance in neural networks usin...,1993,9


In [107]:
#Number of blocks:
print(len(dfrelevant['year'].unique()))
dfrelevant['year'].unique()
# Clearly there will be errors - see 0, 11995

17


array([ 1993,  1992,  1973,     0,  1995,  1996,  1997,  1990,  1988,
        1991,  1994,  1987,  1989,  1998,  2000, 11995,  1999])

In [137]:
'''
So, the list of predicates we want is:
Blocking:
Block Author        x
Block Publication   x

Obs:
haveSimTitle
hasAuthor           x
haveSimilarNames
haveSimilarAuthors
areCoAuthors        x

Targets and GroundTruth:
SamePub
SameAuthor
'''

'''
 DataFrame.to_csv(path_or_buf=None, sep=', ', na_rep='', float_format=None, columns=None, 
                  header=True, index=True, index_label=None, mode='w', encoding=None, 
                  compression=None, quoting=None, quotechar='"', line_terminator='\n', 
                  chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, 
                  escapechar=None, decimal='.')[source]
'''

# Block publications
dfrelevant[['pubid','year']].to_csv(wdir+'/psl/BlocksPubs.txt',sep='\t',header=False,index=False)

In [175]:
# hasAuthor
# areCoAuthors

# Note that in the absence of an author identifier, and in the presence of multiple authors per publication
#    we must use a unique author id for every mention in the publication, because PSL treats the same literal 
#    value to mean the same entity
# We can make use of this to also segregate the names into first, middle, last names and initials
# We also need to block authors

# First come up with identifiers for the Authors. An authorid -> authorname file: AuthorNames.txt
# Also, it will be useful to block authors. An authorid -> blockid file: BlocksAuthors.txt
authid = 0 
from itertools import combinations

with open(wdir+'/psl/AuthorNames.txt','w') as file_authors, \
     open(wdir+'/psl/HasAuthor.txt','w') as file_hasAuthor, \
     open(wdir+'/psl/AreCoAuthors.txt','w') as file_areCoAuthors, \
     open(wdir+'/psl/AuthorTrace.txt','w') as file_authorTrace, \
     open(wdir+'/psl/BlocksAuthors.txt','w') as file_blocksAuthors:
    for index, row in dfrelevant.iterrows():
        authids_for_this_pub = list()
        for author in row['authors_as_list']:
            if author.strip() == '':
                continue
            authids_for_this_pub.append(authid)
            file_authors.write('{}\t{}\n'.format(authid, author.strip()))
            file_hasAuthor.write('{}\t{}\n'.format(index, authid))
            file_blocksAuthors.write('{}\t{}\n'.format(authid, row['year']))
            file_authorTrace.write('{}\t{}\t{}\n'.format(authid, author.strip(), index))
            authid += 1
            
        for auth1, auth2 in combinations(authids_for_this_pub,2):
            file_areCoAuthors.write('{}\t{}\n'.format(auth1, auth2))
            file_areCoAuthors.write('{}\t{}\n'.format(auth2, auth1))

In [123]:
# Similarities.
# Note that this is where we will have cross products
# areCoAuthors should technically have a cross product of authors,
#     but let's see if we can imply that as a closed predicate, not-mentioned pairs are false.

# haveSimTitle
# haveSimilarAuthors
# haveSimilarNames (for authors)

In [190]:
# Let's begin with the smallest of the cross products: haveSimilarNames
# First read the AuthorNames files as a dataframe
dfauthors_names = pd.read_csv(wdir+'/psl/AuthorNames.txt', sep='\t', names=['author_id','name'], header=None)

# Then read their block information
dfauthors_blocks = pd.read_csv(wdir+'/psl/BlocksAuthors.txt', sep='\t', names=['author_id','block'], header=None)

# Combine the dataframes by author_id
# Note that this can be done at this stage 
#    because we assume every mention is a unique author
#    until further evidence
dfauthors = dfauthors_names.merge(dfauthors_blocks, on='author_id')

# Then get the cross product.
dfauthors_cross = dfauthors.merge(dfauthors, on='block')

In [157]:
# Now you can add a column for similarity between name_x and name_y
# TODO: Just realized - Blocking on year makes sense for publications,
#       but not really for authors. If we compare authors only for 
#       publications in the same year, we may not even hit the same 
#       author twice :/
#       Also, this flouts the meaning of a block - a cluster of mentions
#             that are likely the same entity
#       Suggestion: use a different blocking scheme, and different blocks

In [428]:
dfauthors = dfauthors_names.merge(dfauthors_blocks, on='author_id')
dfauthors

Unnamed: 0,author_id,name,block
0,0,h. drucker,1993
1,1,r. schapire,1993
2,2,p. simard.,1993
3,3,h. drucker,1993
4,4,r. schapire,1993
5,5,p. simard.,1993
6,6,drucker h.,1992
7,7,schapire r.,1992
8,8,simard p.,1992
9,9,drucker h.,1992


In [401]:
dfauthors

Unnamed: 0,author_id,name,block,bc1
0,0,h. drucker,1993,"{h, dru}"
1,1,r. schapire,1993,"{r, sch}"
2,2,p. simard.,1993,"{p, sim}"
3,3,h. drucker,1993,"{h, dru}"
4,4,r. schapire,1993,"{r, sch}"
5,5,p. simard.,1993,"{p, sim}"
6,6,drucker h.,1992,"{h, dru}"
7,7,schapire r.,1992,"{r, sch}"
8,8,simard p.,1992,"{p, sim}"
9,9,drucker h.,1992,"{h, dru}"


In [194]:
# Add blocking condition(s) as (a) column(s)
# Remember, it needs to be inexpensive to compare
dfauthors['blockcondition1_3letters'] = dfauthors['name'].apply(lambda x : set([part_of_name.replace('.','')[:3] for part_of_name in x.split(' ')]))
dfauthors['blockcondition2_sortedInitials'] = dfauthors['name'].apply(lambda x : ''.join(sorted([part_of_name[0] for part_of_name in x.replace('.','').split(' ') if part_of_name !=''])))
# Let's say 2 author mentions are in the same block if 
# 1. intersection of their blockcondition1 has at least one element
# 2. their blockcondition2 has a substring-superstring relationship
# 3. there is at least one 3 letter match in the intersection of blockcondition1

In [195]:
dfauthors

Unnamed: 0,author_id,name,block,blockcondition1_3letters,blockcondition2_sortedInitials
0,0,h. drucker,1993,"{h, dru}",dh
1,1,r. schapire,1993,"{r, sch}",rs
2,2,p. simard.,1993,"{p, sim}",ps
3,3,h. drucker,1993,"{h, dru}",dh
4,4,r. schapire,1993,"{r, sch}",rs
5,5,p. simard.,1993,"{p, sim}",ps
6,6,drucker h.,1992,"{h, dru}",dh
7,7,schapire r.,1992,"{r, sch}",rs
8,8,simard p.,1992,"{p, sim}",ps
9,9,drucker h.,1992,"{h, dru}",dh


In [198]:
# TODO: Compare the blocking schemes
# TODO: Write a universal blocking stats calculating function
#       Signature will likely be - blockstats(blocks, groundtruth)
#       Which brings me to my moment of truth - where is the ground truth for same authors :/

# Let's try deriving it from the databases shared by the UW group
# As mentioned, we have a seed set of 21 authors:
uf_authors.printDisjointSets()
# That's a pretty good set of sets.
# We must now look at 
# 1) authors who are not first author in any publication 
#      (according to the statistic shared in the paper, these should be about 29 in number)
# 2) author mentions that belong to one of these sets
# Let's look at the number of distinct names that are not one of the names in these 21 sets


Disjoint sets: 
{'a_blum_', 'blum_a_', 'avrim_blum_'}
{'schapire_r_e_', 'shapire_r_', 'r_schapire_', 'schapire_', 'robert_schapire_', 'r_', 'robert_e_schapire_', 'schapire_r_', 'schapire_robert_e_', 'shapire_r_e_', 'r_e_schapire_'}
{'n_cesa_bianchi_', 'cesa_bianchi_n_', 'nicolo_cesa_bianchi_'}
{'goldman_s_', 's_a_goldman_', 'goldman_s_a_', 'sally_a_goldman_', 's_goldman_'}
{'kautz_henry_'}
{'d_p_helmbold_', 'david_p_helmhold_', 'david_p_helmbold_', 'helmbold_d_p_', 'd_helmbold_'}
{'p_auer_'}
{'r_l_rivest_', 'rivest_ronald_l_', 'rivest_r_l_', 'rivest_r_', 'r_rivest_', 'ronald_l_rivest_'}
{'littlestone_n_'}
{'freud_y_', 'yoav_freund_', 'freund_y_', 'freund_v_', 'y_freund_', 'freund_yoav_'}
{'d_haussler_', 'haussler_david_', 'haussler_d_', 'david_haussler_'}
{'d_d_lewis_', 'david_d_lewis_', 'lewis_d_d_', 'lewis_d_', 'david_lewis_', 'd_lewis_'}
{'j_p_kearns_'}
{'ehrenfeucht_a_', 'ehrenfeucht_andrzej_', 'andrzej_ehrenfeucht_', 'a_ehrenfeucht_'}
{'eric_bauer_'}
{'kearns_m_', 'micahel_kearns

In [206]:
# First normalize both sets to the form where the name is just the parts separated by spaces:
authors_known_ground_truth = set()
for author_set in uf_authors.getDisjointSets():
    for author in author_set:
        authors_known_ground_truth.add(author.replace('_',' ').strip())
print(len(authors_known_ground_truth))
print(authors_known_ground_truth)

authors_all = set()
for index, row in dfauthors.iterrows():
    authors_all.add(row['name'].replace('.',''))
print(len(authors_all))
print(authors_all)

# There are 85 out of 88 overlaps. Why not 88?
print(authors_known_ground_truth - authors_all)
# Interesting... it has to do with cesa bianchi in the ground truth dataset vs cesa-bianchi in the raw names

88
{'nicolo cesa bianchi', 'kautz henry', 'david d lewis', 'andrzej ehrenfeucht', 'shapire r e', 'blum a', 'tom dietterich', 'a blum', 'freund yoav', 'd d lewis', 'freund y', 'lewis d', 'freund v', 'david p helmbold', 'kautz h a', 'haussler david', 'h kautz', 'sally a goldman', 'dietterich t g', 'h a kautz', 'kearns', 'freud y', 'y freund', 'drucker h', 't dietterich', 'r rivest', 'kautz h', 'henry kautz', 'm kearns', 'david haussler', 'shapire r', 'j p kearns', 'r l rivest', 'kearns michael', 'r schapire', 'yoav freund', 'rivest r', 'goldman s', 'ehrenfeucht a', 'h drucker', 'robert schapire', 'eric bauer', 'd helmbold', 'd haussler', 't g dietterich', 'david p helmhold', 'michael j kearns', 'ehrenfeucht andrzej', 'drucker', 'drucker harris', 'd lewis', 'haussler d', 'r', 'm feder', 'r e schapire', 'a ehrenfeucht', 'michael kearns', 'schapire robert e', 'p auer', 'robert e schapire', 'william w cohen', 's goldman', 's a goldman', 'h druker', 'helmbold d p', 'dietterich t', 'rivest ron

In [237]:
# Anyway, 264 shouldn't be hard to hand cluster...
# But... Perhaps a good time to run dedupe and see how well it does...
import dedupe
fields = [{'field': 'name', 'type': 'String'}]
deduper = dedupe.Dedupe(fields)

data_d = {}
for index, row in dfauthors.iterrows():
    data_d[row['author_id']] = {'name': row['name']}

deduper.sample(data_d,)

author_name_clusters = list(uf_authors.getDisjointSets())

match = []
for author_name_cluster in author_name_clusters:
    for author_inst1 in author_name_cluster:
        for author_inst2 in author_name_cluster:
            match.append(({'name':author_inst1.replace('_',' ')},{'name':author_inst2.replace('_',' ')}))

from itertools import combinations
distinct = []
for author_name_cluster1,author_name_cluster2 in combinations(author_name_clusters,2):
    for author1 in author_name_cluster1:
        for author2 in author_name_cluster2:
            distinct.append(({'name':author1.replace('_',' ')},{'name':author2.replace('_',' ')}))
            
deduper.markPairs({'match':match, 'distinct':distinct})

In [238]:
deduper.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.100000, score 0.644809872524
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (2, name)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, name)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (3, name)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (4, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.8, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.4, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.2, name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.4, name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.2, name)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.6, name)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPr

In [239]:
with open(wdir+'/deduper.training.txt','w') as tf:
    deduper.writeTraining(tf)
with open(wdir+'/deduper.settings.txt','w') as ts:
    deduper.writeTraining(ts)
threshold = deduper.threshold(data_d, recall_weight=1)
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold)
print('# duplicate sets', len(clustered_dupes))

INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.820
INFO:dedupe.api:precision: 0.828
INFO:dedupe.api:With threshold: 0.423


clustering...
# duplicate sets 87


In [253]:
import csv
cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data_d[c] for c in id_set]
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(wdir+'/deduper.output.txt', 'w') as f_output, open(wdir+'/psl/AuthorNames.txt') as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input, delimiter='\t')

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

In [254]:
!head 'data/cora-uwash/deduper.output.txt'

Cluster ID,confidence_score,id,name,canonical_name
39,0.7173422575,0,h. drucker,b'harris drucker'
45,0.762571705487,1,r. schapire,b'r. e. schapire.'
61,0.977741758029,2,p. simard.,b'p. simard.'
39,0.7173422575,3,h. drucker,b'harris drucker'
45,0.762571705487,4,r. schapire,b'r. e. schapire.'
61,0.977741758029,5,p. simard.,b'p. simard.'
4,0.978106518586,6,drucker h.,b'drucker h.'
38,0.97618725402,7,schapire r.,b'schapire r.'
9,0.975046575069,8,simard p.,b'simard p.'


In [258]:
from collections import defaultdict
dfclusters = pd.read_csv(wdir+'/deduper.output.txt')
clusters = defaultdict(set)
for index,row in dfclusters.iterrows():
    clusters[row['Cluster ID']].add(row['name'])

In [262]:
with open(wdir+'/author.clusters','w') as f:
    for cluster in clusters.values():
        f.write('\t'.join(cluster)+'\n\n')

In [289]:
# After some corrections on dedupes clusters, I was able to create 51 clusters of names
#    i.e. reduce the number of clusters from 87 to 51
# Beware: Pitfalls that have not been accounted for include the possibilty that
#    two different entities with the same name may be in the same cluster.
#    Think of this as a cluster of name similarities rather than entities.

# Does the number of entities in this clustering equal 264
clusters_corrected = {}
cluster_id = 1
num_entities = 0
with open(wdir+'/authors.clusters.corrected') as f:
    for line in f:
        clusters_corrected[cluster_id] = set(line.strip().split('\t'))
        num_entities += len(clusters_corrected[cluster_id])
        cluster_id += 1
        
print(num_entities)
# Perhaps this is because of the non-replacement of the periods?

326


In [290]:
# With replacement of periods
clusters_corrected_alt = {}
cluster_id = 1
num_entities = 0
with open(wdir+'/authors.clusters.corrected') as f:
    for line in f:
        clusters_corrected_alt[cluster_id] = set(line.replace('.','').strip().split('\t'))
        num_entities += len(clusters_corrected_alt[cluster_id])
        cluster_id += 1
        
print(num_entities)

265


In [309]:
# Good. nearly there. what's the difference?
clusters_corrected_combo = set()
for cluster in clusters_corrected_alt.values():
    for authorname in cluster:
        clusters_corrected_combo.add(authorname)
        
clusters_corrected_combo - authors_all

{'simard patrice up'}

In [316]:
# Legit. It should really have showed up in the authors_all set. What's up?
dfrelevant.iloc[442]['authors']
# It's the extra space at the end. Nothing to worry about.

'drucker harris, schapire robert, and simard patrice up .'

In [320]:
# So, now that  we have (possibly noisy) ground truth for authors, 
#    we can get to writing our blocking stats function.
def get_blocking_stats(blocks_dict, ground_truth_set, num_total_elements):
    '''
    Req: ground_truth_set is a set of tuples (el1,el2) s.t. el1 < el2
         blocks_dict      is a dict with block_key : set(el1, el2, ...)
    Modifies ground_truth_set.
    '''
    num_total_comparisons = num_total_elements * (num_total_elements - 1) / 2
    num_ground_truth_total = len(ground_truth_set)
    num_comparisons_after_blocking = 0
    for block in blocks_dict.values():
        num_comparisons_after_blocking += len(block)*(len(block)-1)/2
        for el1 in block:
            for el2 in block:
                if el1 == el2:
                    continue
                elif el1 > el2:
                    ground_truth_set.discard((el2, el1))
                else:
                    ground_truth_set.discard((el1, el2))
    num_blocked_ground_truth = num_ground_truth_total - len(ground_truth_set)
    recall = num_blocked_ground_truth / num_ground_truth_total
    reduction_ratio = num_comparisons_after_blocking / num_total_comparisons
    return recall, reduction_ratio

In [325]:
num_total_elements = len(dfauthors)

ground_truth_set = set()
for cluster in clusters_corrected:
    for author1,author2 in combinations(cluster,2):
        ground_truth_set.add((author1,author2))

recall, reduction_ratio = get_blocking_stats(blocks1, ground_truth_set, )

{1: {'p. auer'},
 2: {'bartlett p.', 'p. bartlett', 'peter bartlett'},
 3: {'eric bauer'},
 4: {'a. blum', 'avrim blum', 'blum a.'},
 5: {'callan j.',
  'callan j. p.',
  'j. callan',
  'j. p. callan',
  'james callan',
  'james p. callan'},
 6: {'cesa-bianchi n.', 'n. cesa-bianchi', 'nicolo cesa-bianchi'},
 7: {'w. w. cohen', 'william w. cohen'},
 8: {'d. ron', 'd. ron.', 'dana ron', 'dana ron.', 'ron d.'},
 9: {'dietterich t.',
  'dietterich t. g.',
  't. dietterich',
  't. g. dietterich',
  'tom dietterich'},
 10: {'drucker',
  'drucker h.',
  'drucker harris',
  'h. drucker',
  'h. druker',
  'harris drucker'},
 11: {'a. ehrenfeucht',
  'andrzej ehrenfeucht',
  'ehrenfeucht a.',
  'ehrenfeucht andrzej'},
 12: {'m. feder'},
 13: {'freud y.',
  'freund v.',
  'freund y',
  'freund y.',
  'freund yoav',
  'y. freund',
  'y. freund.',
  'yoav freund',
  'yoav freund.'},
 14: {'furst m.', 'm. furst', 'merrick furst'},
 15: {'goldman s.',
  'goldman s. a.',
  'goldman s.a.',
  's. a. gol

In [None]:
# 2. their blockcondition2 has a substring-superstring relationship
# 3. there is at least one 3 letter match in the intersection of blockcondition1

In [363]:
print(len(blocks))
num_comparisons = 0
num_elements_in_block = 0
for block in blocks.values():
    num_comparisons += len(block) * len(block) - 1
    num_of_elements_in_block += 

245

In [344]:
authorname_to_ids = defaultdict(set)
for index, row in dfauthors.iterrows():
    authorname_to_ids[row['name']].add(row['author_id'])

In [346]:
clusters_by_id = defaultdict(set)
for key,cluster in clusters_corrected.items():
    for author in cluster:
        if author not in authorname_to_ids:
            print(author+" not found")
            continue
        clusters_by_id[key].update(authorname_to_ids[author])

In [422]:
def get_blocking_statistics(blocks, ground_truth, num_elements):
    
    # Get number of comparisons without blocking
    num_comparisons = num_elements * (num_elements - 1) / 2
    # Get number of comparisons after blocking
    num_comparisons_blocked = get_num_comparisons_blocked(blocks)
    compression = num_comparisons_blocked / num_comparisons
    
    num_ground_truth = len(ground_truth)
    num_ground_truth_blocked = get_num_ground_truth_blocked(blocks, ground_truth)
    recall = num_ground_truth_blocked / num_ground_truth
    
    print("Recall:", recall)
    print("Compression:", compression)
    
    return recall, compression

In [418]:
def get_num_comparisons_blocked(blocks):

    num_blocked_pairs = 0
    for block in blocks.values():
        l = len(blocks)
        num_blocked_pairs += l * (l-1) / 2

    return num_blocked_pairs

In [419]:
def get_num_ground_truth_blocked(blocks, ground_truth):
    
    num_ground_truth_blocked = 0
    for e1, e2 in ground_truth:
        for block in blocks.values():
            if e1 in block and e2 in block:
                num_ground_truth_blocked += 1
                break
                
    return num_ground_truth_blocked

In [449]:
ground_truth = set()
for cluster in clusters_by_id.values():
    for authid1,authid2 in combinations(cluster,2):
        ground_truth.add((authid1,authid2))
        
len(ground_truth)

599646

In [432]:
%%time
# Two author mentions are in the same block if 
# 1a. intersection of their blockcondition1 has at least two elements
dfauthors['bc'] = dfauthors['name'].apply(lambda x : set(part_of_name.replace('.','')[:3] for part_of_name in x.replace('.','. ').split()))

blockkeys = set()
for set1,set2 in combinations(dfauthors['bc'],2):
    if len(set1 & set2) > 1:
        blockkeys.add(tuple(set1 & set2))
        
blocks1 = defaultdict(set)
for blockkey in blockkeys:
    for index, row in dfauthors.iterrows():
        if row['bc'] & set(blockkey) == set(blockkey):
            blocks1[blockkey].add(row['author_id'])
            
recall, compression = get_blocking_statistics(blocks1, ground_truth, len(dfauthors))
print("Number of blocks:",len(blocks1))

Recall: 0.699476024187604
Compression: 0.19682050443882765
Number of blocks: 135
CPU times: user 31 s, sys: 0 ns, total: 31 s
Wall time: 31 s


In [433]:
# What's wrong about this?
get_blocking_stats(blocks1, ground_truth, len(dfauthors))

(0.4143844868472399, 0.10864920600457478)

In [435]:
%%time
# Two author mentions are in the same block if 
# 1b. intersection of their blockcondition1 has at least one element
# TODO:
# I'm pretty sure there's a way to reduce redundancy of num comparisons here... 
# Authors with {a','b','c'} appear in both {'a','b'} as well as {'a','c'} etc.
dfauthors['bc'] = dfauthors['name'].apply(lambda x : set(part_of_name.replace('.','')[:3] for part_of_name in x.replace('.','. ').split()))

blockkeys = set()
for set1,set2 in combinations(dfauthors['bc'],2):
    if len(set1 & set2):
        blockkeys.add(tuple(set1 & set2))
        
blocks2 = defaultdict(set)
for blockkey in blockkeys:
    for index, row in dfauthors.iterrows():
        if row['bc'] & set(blockkey) == set(blockkey):
            blocks2[blockkey].add(row['author_id'])
            
recall, compression = get_blocking_statistics(blocks2, ground_truth, len(dfauthors))
print("Number of blocks:",len(blocks2))

Recall: 0.9877179193648515
Compression: 0.6037327834947855
Number of blocks: 196
CPU times: user 41.3 s, sys: 0 ns, total: 41.3 s
Wall time: 41.3 s


In [448]:
# What's wrong about this?
get_blocking_stats(blocks2, ground_truth, len(dfauthors))

(0.3103781160831753, 0.33406689197281175)

In [447]:
%%time
# Two author mentions are in the same block if 
# 2. Their initials have a substring superstring relationship
dfauthors['bc'] = dfauthors['name'].apply(lambda x : ''.join(sorted([part_of_name[0] for part_of_name in x.replace('.','. ').split() if (part_of_name !='' and part_of_name !='.')])))

blockkeys = set()
for key in  dfauthors['bc'].unique():
    blockkeys.add(key)
        
blocks3 = defaultdict(set)
for blockkey in blockkeys:
    for index, row in dfauthors.iterrows():
        if row['bc'] in blockkey:
            blocks3[blockkey].add(row['author_id'])
            
recall, compression = get_blocking_statistics(blocks3, ground_truth, len(dfauthors))
print("Number of blocks:",len(blocks3))

Recall: 0.9879485821358803
Compression: 0.04230172035700176
Number of blocks: 81
CPU times: user 16.1 s, sys: 0 ns, total: 16.1 s
Wall time: 16.1 s


In [465]:
with open('data/cora-uwash/psl/BlocksAuthors.txt','w') as f:
    for blockid,block in blocks3.items():
        for authid in block:
            f.write('{}\t{}\n'.format(authid,blockid))

In [466]:
!wc -l data/cora-uwash/psl/BlocksAuthors.txt

7388 data/cora-uwash/psl/BlocksAuthors.txt
