# 基于多属性注意力机制的匹配模型

In [1]:
# Import modules 导入模块
import pandas as pd
import numpy as np
import csv
import py_entitymatching as em
import os

## 预处理

### 读取数据

In [2]:
filepath1 = "datasets/DBLP-Scholar/DBLP1.csv"
filepath2 = "datasets/DBLP-Scholar/Scholar.csv"
filepath3 = "datasets/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv"

**注意文件编码格式**:其中`DBLP-Scholar/Scholar.csv`文件的编码尝试为`iso8859-1`,可能是其中的`authors`一栏中有作者的名字为法语,编码不是常见的`utf-8`

文件的读取直接使用`pandan`的`read_csv`函数读出,然后使用`.head()`查看部分数据

In [3]:
dblp = pd.read_csv(filepath1, encoding="iso8859-1")
scholar = pd.read_csv(filepath2)
dblp_scholar_map = pd.read_csv(filepath3)

### 查看数据格式

In [4]:
dblp.head()

Unnamed: 0,id,title,authors,venue,year
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The Cooperative Activity Model,"M Rusinkiewicz, W Klas, T Tesch, J Wï¿½sch, P Muth",VLDB,1995
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions,"P Ammann, S Jajodia, I Ray",VLDB,1995
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001),,,2002


In [5]:
scholar.head()

Unnamed: 0,id,title,authors,venue,year
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase N in,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates and Releasees From Correctional Facilities,"TM Hammett, P Harmon, W Rhodes",see,
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and Engineering,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0


In [6]:
dblp_scholar_map.head()

Unnamed: 0,idDBLP,idScholar
0,conf/sigmod/AbadiC02,f2Lea-RN8dsJ
1,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,eBnT7lhV2LwJ
2,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,gBVNSFeS4P8J
3,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,VuY9Y49GqXgJ
4,conf/sigmod/AbiteboulBCMM03,AxpQwgyRyLgJ


In [7]:
# set metadata/设置 metadata
em.set_key(dblp, 'id')
em.set_key(scholar, 'id')

# set title and artists to lower case/将'title'和'artists'转换为小写
dblp["title"] = dblp["title"].str.lower()
scholar["title"] = scholar["title"].str.lower()

dblp["authors"] = dblp["authors"].str.lower()
scholar["authors"] = scholar["authors"].str.lower()

# preprocessing -- set all year to be interger/预处理,将年份变为整数
def short_year(x):
    """ 
    if x is nan, then `x == x` will be False
    如果`x`是nan类型的,那么x==x这个比较会得到False, Interesting!
    """
    if x == x:
        return int(x)
    return 0

scholar["year"] = scholar["year"].apply(short_year)
dblp["year"] = dblp["year"].apply(short_year)

check the two set after preprocessing/在预处理之后检查两个数据集

In [8]:
dblp.head()

Unnamed: 0,id,title,authors,venue,year
0,conf/vldb/RusinkiewiczKTWM95,towards a cooperative transaction model - the cooperative activity model,"m rusinkiewicz, w klas, t tesch, j wï¿½sch, p muth",VLDB,1995
1,journals/sigmod/EisenbergM02,sql/xml is making good progress,"a eisenberg, j melton",SIGMOD Record,2002
2,conf/vldb/AmmannJR95,using formal methods to reason about semantics-based decompositions of transactions,"p ammann, s jajodia, i ray",VLDB,1995
3,journals/sigmod/Liu02,editor's notes,l liu,SIGMOD Record,2002
4,journals/sigmod/Hammer02,report on the acm fourth international workshop on data warehousing and olap (dolap 2001),,,2002


In [9]:
scholar.head()

Unnamed: 0,id,title,authors,venue,year
0,aKcZKwvwbQwJ,11578 sorrento valley road,qd inc,"San Diego,",0
1,ixKfiTHoaDoJ,initiation of crazes in polystyrene,"as argon, jg hannoosh","Phil. Mag,",0
2,3BxllB4wwcIJ,immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase n in,"gh hansen, ll wetterberg, h sjã¶strã¶m, o norã©n","The Histochemical Journal,",1992
3,d2WWxwKMex4J,the burden of infectious disease among inmates and releasees from correctional facilities,"tm hammett, p harmon, w rhodes",see,0
4,cZCX-AQpjccJ,the role of faculty advising in science and engineering,jr cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995


## 寻找一个候选集(Blocking)

### 1. block with title/用title栏组成混合集

In [10]:
ab1 = em.AttrEquivalenceBlocker()
C1 = ab1.block_tables(dblp, scholar, 
                      l_block_attr='title', r_block_attr='title',
                      l_output_attrs=['title', 'authors', 'year'],
                      r_output_attrs=['title', 'authors', 'year'])
print(len(C1))
C1.head()

3324


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_authors,ltable_year,rtable_title,rtable_authors,rtable_year
0,0,journals/sigmod/EisenbergM02,wgK6p4mDSIMJ,sql/xml is making good progress,"a eisenberg, j melton",2002,sql/xml is making good progress,"a eisenberg, j melton",2002
1,1,conf/vldb/AmmannJR95,x-H7BqZ0Hw8J,using formal methods to reason about semantics-based decompositions of transactions,"p ammann, s jajodia, i ray",1995,using formal methods to reason about semantics-based decompositions of transactions,"p ammann, s jajodia, i ray",1995
2,2,journals/sigmod/Liu02,ntqMqfgRXM4J,editor's notes,l liu,2002,editor's notes,r goldstein,1996
3,3,journals/sigmod/Liu02,url:http://www.roc.noaa.gov/news/vol1is4.pdf,editor's notes,l liu,2002,editor's notes,nl smith,1981
4,4,journals/sigmod/Liu02,TUOBVMb4PBsJ,editor's notes,l liu,2002,editor's notes,dw leslie,0


### 2. block with author/用author栏组成混合集 

In [11]:
ab2 = em.AttrEquivalenceBlocker()
C2 = ab2.block_tables(dblp, scholar,
                      l_block_attr='authors', r_block_attr='authors',
                      l_output_attrs=['title', 'authors', 'year'],
                      r_output_attrs=['title', 'authors', 'year'])
print(len(C2))
C2.head()

3554


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_authors,ltable_year,rtable_title,rtable_authors,rtable_year
0,0,journals/sigmod/EisenbergM02,e3s4OFTeBqwJ,sql/xml is making good progress,"a eisenberg, j melton",2002,sqlj part 1: sql routines using the java tm programming language,"a eisenberg, j melton",1999
1,1,journals/sigmod/EisenbergM02,-RAYAJbKLNUJ,sql/xml is making good progress,"a eisenberg, j melton",2002,standards in practice,"a eisenberg, j melton",1998
2,2,journals/sigmod/EisenbergM02,6uelfg3RgEoJ,sql/xml is making good progress,"a eisenberg, j melton",2002,an early look at xquery,"a eisenberg, j melton",2002
3,3,journals/sigmod/EisenbergM02,4mD1eFCHKKwJ,sql/xml is making good progress,"a eisenberg, j melton",2002,sql standardization: the next steps,"a eisenberg, j melton",2000
4,4,journals/sigmod/EisenbergM02,Z1P9QnSfDuAJ,sql/xml is making good progress,"a eisenberg, j melton",2002,"sql: 1999, formerly known as sql3","a eisenberg, j melton",1999


### 3. Overlap(重叠)
3.1 overlap at least 6 words of title/标题至少用六个词重复

In [12]:
ob3 = em.OverlapBlocker()
C3 = ob3.block_tables(dblp, scholar, 'title', 'title',
                      word_level=True, overlap_size=6,
                      l_output_attrs=['title', 'authors', 'year'],
                      r_output_attrs=['title', 'authors', 'year'],
                      show_progress=True)
print(len(C3))

0% [##############################] 100% | ETA: 00:00:005634

Total time elapsed: 00:00:07


3.2 same year/年份相同

In [13]:
a3 = em.AttrEquivalenceBlocker()
c3 = a3.block_candset(C3, 'year', 'year', show_progress=True)
print(len(C3))

0% [##############################] 100% | ETA: 00:00:005634

Total time elapsed: 00:00:00


### 4. Overlap(重叠)
4.1 overlap at least 5 words of authors/作者栏至少有五个单次重合

In [14]:
ob4 = em.OverlapBlocker()
C4 = ob4.block_tables(dblp, scholar, 'authors', 'authors',
                      word_level=True, overlap_size=5,
                      l_output_attrs=['title', 'authors', 'year'],
                      r_output_attrs=['title', 'authors', 'year'],
                      show_progress=True)
print(len(C4))

0% [##############################] 100% | ETA: 00:00:005958

Total time elapsed: 00:00:03


4.2 same year/年份相同

In [15]:
a4 = em.AttrEquivalenceBlocker()
C4 = a4.block_candset(C4, 'year', 'year',show_progress=True)
print(len(C4))

0% [##############################] 100% | ETA: 00:00:001145

Total time elapsed: 00:00:00


### Union results together to get final candiate set(合并之前得到的候选集合得到最终的候选集合))

In [16]:
G = em.combine_blocker_outputs_via_union([C1, C2, C3, C4])
print(len(G))

G.head()

9558


Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_authors,ltable_year,rtable_title,rtable_authors,rtable_year
0,0,conf/sigmod/2000,19VIHSiMAXcJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,carnot and infosleuth-database technology and the www. acm sigmod intern. conf. on management of,"n jacobs, r shea",0
1,1,conf/sigmod/2000,2DtY9exkFcgJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,"umform techniques for loop optimization, in proceedings of the acm international conference on",w pugh,1991
2,2,conf/sigmod/2000,5k-GwvznWRUJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,discover: keyword search in relational databases in proceedings of the international conference ...,"v hristidis, y papakonstantinou",0
3,3,conf/sigmod/2000,5wcgt7bNx7YJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,"shoring up persistent applications, acm sigmod international conference on management of data","mj carey, dj dewitt, mj franklinâ?¦",0
4,4,conf/sigmod/2000,AYtgczYwVnYJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,concept based design of data warehouses: the dwq demonstrators. in 2000 acm sigmod intl,"m jarke, c quix, d calvanese, m lenzerini, e",0


### Run debugger to make sure not missing any valid matches(测试是否遗漏匹配)

In [17]:
dbg = em.debug_blocker(G, dblp, scholar, output_size=50)
dbg

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_authors,ltable_venue,rtable_title,rtable_authors,rtable_venue
0,0,journals/tods/CliffordDIJS97,sflWMxkOH2cJ,on the semantics of ``now'' in databases,"j clifford, c dyreson, t isakowitz, c jensen, r snodgrass",ACM Trans. Database Syst.,on the semantics of,"j clifford, c dyreson, t isakowitz, cs jensen, r","Nowâ?? in databases,â?? ACM Transactions on Database Systems,"
1,1,conf/sigmod/HammerGNYBV97,2rysKgS6lugJ,template-based wrappers in the tsimmis system,"j hammer, h garcia-molina, s nestorov, r yerneni, m breunig, v vassalos",SIGMOD Conference,template-based wrappers in the tsimmis experience,"j hammer, h garcia-molina, s nestorov, r yerneni,",Proceedings of the ACM SIGMOD International Conference on
2,2,conf/vldb/KoudasMJ99,wzae2lO15U8J,mining deviants in a time series database,"h jagadish, n koudas, s muthukrishnan",VLDB,mining deviants in time series databases,"h jagdish, n koudas, s muthukrishnan","Proc. VLDB,"
3,3,conf/vldb/NishioSTTL01,S30IXg4ClFIJ,functional properties of information filtering,"r sawai, m tsukamoto, y loh, t terada, s nishio",VLDB,on functional properties of information filtering,"r sawai, m tsukamoto, t terada, yh loh, s nishio","Electronics and Communications in Japan(Part II Electronics),"
4,4,conf/sigmod/AgrawalASY97,7Of1HyK-VnkJ,efficient view maintenance at data warehouses,"d agrawal, a abbadi, a singh, t yurek",SIGMOD Conference,efficient view maintenance warehouses,"d agrawal, a el abbadi, a singh, t yurek","Proc. Of the 1997 ACM SIGMOD International Conference on &hellip;,"
5,5,conf/vldb/MadhavanBR01,66dWDpWjbbwJ,generic schema matching with cupid,"j madhavan, p bernstein, e rahm",VLDB,generic schema matching with cupid. 2001,"j madhavan, pa bernstein, e rahm",Microsoft
6,6,conf/sigmod/FernandezFKLS98,ek26aiEheesJ,catching the boat with strudel: experiences with a web-site management system,"m fernandez, d florescu, j kang, a levy, d suciu",SIGMOD Conference,strudel: a web-site management system,"m fernandez, j kang, a levy, d suciu",
7,7,conf/sigmod/AcharyaGPR99a,ahfoveFQC2sJ,join synopses for approximate query answering,"s acharya, p gibbons, v poosala, s ramaswamy",SIGMOD Conference,join synopses for improving approximate query answers,"s acharya, p gibbons, v poosala, s ramaswarmy","Proc. of ACM SIGMOD Conf,"
8,8,conf/vldb/HaasKWY97,yIRQRtTEzSAJ,optimizing queries across diverse data sources,"l haas, d kossmann, e wimmers, j yang",VLDB,optimizing queries across diverse sources,"l haas, d kossmann, e wimmers, j yan","&hellip; of the 23rd VLDB Conference, Athens, Greece"
9,9,conf/sigmod/MaddenSHR02,kjtuCATx_uoJ,continuously adaptive continuous queries over streams,"s madden, m shah, j hellerstein, v raman",SIGMOD Conference,continuously adaptive continuous queries,"s madden, m shah, jm hellerstein, v raman",Proc. of the ACM SIGMOD International Conference on


### Save dataset and set aside sample for labeling
保存数据集并且取部分样本作为标记

In [18]:
# save data/保存数据
file_name = 'datasets/DBLP-Scholar/candidates.csv'
G.to_csv(file_name, sep=",")

# Take sample to label/取样本为标签
s_file_name = "datasets/DBLP-Scholar/candidates_sample.csv"
S = em.sample_table(G, 1000)

In [19]:
S.head(10)

Unnamed: 0,_id,ltable_id,rtable_id,ltable_title,ltable_authors,ltable_year,rtable_title,rtable_authors,rtable_year
5,5,conf/sigmod/2000,CpwYam_LYyEJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,on predicting data cache behavior for real {time systems. in proceedings of the acm sigplan work...,"c ferdinand, r wilhelm",0
6,6,conf/sigmod/2000,F9KzvnDpCPUJ,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,similarity-based queries for time series data. in: proceedings of the acm sigmod conference on t...,"d raflei, a mendelzon",1997
16,16,conf/sigmod/2000,url:http://portal.acm.org/citation.cfm%3Fid%3D191839,"proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...",,2000,source international conference on management of data archive,"rt snodgrass, m winslett",1994
35,35,conf/sigmod/2002,QnMrKru1S1MJ,"proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...",,2002,"blocation-based spatial queries,^ in proceedings of the acm international conference on manageme...","j zhang, m zhu, d papadias, y tao, dl lee",2003
40,40,conf/sigmod/2002,url:http://portal.acm.org/citation.cfm%3Fcoll%3DGUIDE%26dl%3DGUIDE%26id%3D276377,"proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...",,2002,full text pdf format pdf (424 kb)source international conference on management of data archive,"c olston, a woodruff, a aiken, m chu, v ercegovac,",1998
42,42,conf/sigmod/2002,url:http://portal.acm.org/citation.cfm%3Fid%3D16894%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...,"proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...",,2002,source international conference on management of data archive,c zaniolo,1986
51,51,conf/sigmod/2002,url:http://portal.acm.org/citation.cfm%3Fid%3D38713.38742,"proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...",,2002,full text pdf format pdf (1.24 mb)source international conference on management of data archive,"h garcia-molina, k salem",1987
55,55,conf/sigmod/2002,vfTH6FzOpDAJ,"proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...",,2002,data modeling of time-based media. in: proceedings of the 1994 acm sigmod international conferen...,"s gibbs, c breiteneder, d tsichritzis",1994
81,81,conf/sigmod/2003,url:http://portal.acm.org/citation.cfm%3Fid%3D50202%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...,"proceedings of the 2003 acm sigmod international conference on management of data, san diego, ca...",,2003,source international conference on management of data archive,"h boral, pa larson",1988
93,93,conf/sigmod/AbadiCCCCEGHMRSSTXYZ03,eBnT7lhV2LwJ,aurora: a data stream management system,"d abadi, d carney, u ï¿½etintemel, m cherniack, c convey, c erwin, e galvez, m hatoun, a maskey,...",2003,aurora: a data stream management system (demo description),"d abadi, d carney, u cetintemel, m cherniack, c",0


In [20]:
# 插入labels
labels = []
# change DataFrame into np.array/将pandas转换为numpy,方便处理
map_arr = np.array(dblp_scholar_map)
s_arr = np.array(S) 
l_flag = False
for i in range(len(s_arr)):
    for j in range(len(map_arr)):
        if s_arr[i][1] == map_arr[j][0] and s_arr[i][2] == map_arr[j][1]:
            labels.append('1')
            l_flag = True
            # print("find compitable!")
            continue
    if not l_flag:
        labels.append('0')
    l_flag = False
print(len(labels))
S.insert(loc=9, column='label', value=labels)
S.to_csv(s_file_name, sep=",")

1000


In [21]:
i_file = "datasets/DBLP-Scholar/I.csv"
j_file = "datasets/DBLP-Scholar/J.csv"

if not os.path.isfile(i_file):
    IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
    I = IJ['train']
    J = IJ['test']
    I.to_csv(i_file, sep=",")
    J.to_csv(j_file, sep=",")
    print("Split samples into I and J")
else:
    I = em.read_csv_metadata(i_file, key="_id",
                             ltable=dblp, rtable=scholar,
                             fk_ltable="ltable_id", fk_rtable='rtable_id')
    J = em.read_csv_metadata(j_file, key="_id",
                             ltable=dblp, rtable=scholar,
                         fk_ltable='ltable_id', fk_rtable='rtable_id')
    print("Reading I and J from files")                         
print(len(I))
print(len(J))

Split samples into I and J
500
500


参考文献:

[1] Python-使用Magellan进行数据匹配总结.https://blog.csdn.net/weixin_34124939/article/details/86357847[OL].2020,11,12.
