# Material

## Libraries

In [1]:
# importing the necessary libraries
import scipy as sp
import numpy as np
import pandas as pd
import sklearn as sk
from collections import defaultdict

try:
    import py_entitymatching as em
except:
    !pip install py_entitymatching

try:
    import deepmatcher as dm
except:
    !pip uninstall preprocessing -y
    !pip uninstall fastai -y 
    !pip uninstall allennlp -y
    !pip install deepmatcher

# ensuring the current pandas version is 1.2.4
if pd.__version__ != "1.2.4":
 !pip install pandas==1.2.4
 import pandas as pd
 print(f"Verify pandas=={pd.__version__}")
else:
 print(f"Verify pandas=={pd.__version__}")

import sys
import tensorflow as tf
from tensorflow.python.client import device_lib
import os
from imblearn.under_sampling import RandomUnderSampler
import time

Verify pandas==1.2.4




## Data

In [2]:
# retrieving the data from my GitHub
!git clone https://github.com/ljhoh1/Dissertation_Data_2

Cloning into 'Dissertation_Data_2'...
remote: Enumerating objects: 105, done.[K
remote: Total 105 (delta 0), reused 0 (delta 0), pack-reused 105[K
Receiving objects: 100% (105/105), 124.06 MiB | 5.10 MiB/s, done.
Resolving deltas: 100% (46/46), done.
Checking out files: 100% (36/36), done.


In [3]:
# preparation for loading the data
# Cora data (with labelled)
path_Match = os.path.join('.', 'Dissertation_Data_2', 'DBLP-Scholar_Perfect_Mapping.csv')
path_Match_Cora = os.path.join('.', 'Dissertation_Data_2', 'cora_duplicates_data.tsv')
csv_table=pd.read_table(path_Match_Cora, sep='\t')
csv_table.to_csv(os.path.join('.', 'Dissertation_Data_2', 'cora_labelled.csv'),index=False)
path_Cora = os.path.join('.', 'Dissertation_Data_2', 'cora_data.tsv')
csv_table=pd.read_table(path_Cora,sep='\t')
csv_table.to_csv(os.path.join('.', 'Dissertation_Data_2', 'cora_data.csv'),index=False)

# Scholar and DBLP data (with labelled)
path_Scholar = os.path.join('.', 'Dissertation_Data_2', 'Scholar_data.csv')
path_DBLP = os.path.join('.', 'Dissertation_Data_2', 'DBLP1_data.csv')

Scholar = em.read_csv_metadata(path_Scholar, key='id')
DBLP = em.read_csv_metadata(path_DBLP, key='id')
# Load the labeled data into a dataframe.
labelled_DS = em.read_csv_metadata(path_Match, 
                         ltable=Scholar, rtable=DBLP, 
                         fk_ltable='idScholar', fk_rtable='idDBLP')



# Helper Functions

In [4]:
# transforming string features
def string_cleaner(data):
  '''
  Creating a function to remove non-numerical/alphabetical string characters and merges features
  Input:
  - data (pandas DataFrame): the input data containing at least one non-identifier column
  Output:
  - data(pandas DataFrame): the transformed dataset
  '''
  # remove special characters in all features but id and cast as lower characters
  for col in data.columns:
    if col != "id":    
      data[col] = data[col].astype(str).str.replace('[^A-Za-z0-9 ]+', '')
      data[col] = data[col].astype(str).str.lower()
      data.loc[data[col] == 'nan', [col]] = np.nan

  #merge columns into one and remove all others
  cols_data = [i for i in data.columns if i != "id"]
  data["title"] = data[cols_data].astype(str).agg(' '.join, axis=1)
  for i in cols_data:
    if i != "id" and i != "title":
      del data[i]

  return data 

In [5]:
# Loading labelled and non-labelled Cora data
Cora = em.read_csv_metadata(os.path.join('.', 'Dissertation_Data_2', 'cora_data.csv'), key='id')
cora_labelled = em.read_csv_metadata(os.path.join('.', 'Dissertation_Data_2', 'cora_labelled.csv'), 
                         ltable=Cora, rtable=Cora, 
                         fk_ltable='id2', fk_rtable='id1')

for col in Cora.columns:
  Cora[col] = Cora[col].astype(str).str.replace('[^A-Za-z0-9 ]+', '')
  Cora[col] = Cora[col].astype(str).str.lower()

Cora['date'] = Cora['date'].astype(str).str.replace('[^0-9]+', '')
Cora.year.fillna(Cora.date, inplace=True)
Cora.loc[Cora['year'] == '', ['year']] = np.nan
del Cora['date']

Cora.rename(columns = {'type':'new_type'}, inplace = True)

Cora = string_cleaner(Cora)
Scholar = string_cleaner(Scholar)
DBLP = string_cleaner(DBLP)

  
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [6]:
def labelling(train, labelled, key = "idDBLP", val = "idScholar"):
  '''
  Create dictionary for checking how many matches are retained and perform labelling
  Input:
  - train (pd.DataFrame): DataFrame object of blocked data used for training
  - lbelled (pd.DataFrame): The labelled DataFrame for labelling the train data
  - key (string): which table id in blocked "train" to use as key for labelling
  - val (string): which table id in blocked "train" to use as value for labelling
  Output:
  - train (pd.DataFrame): The labelled training dataframe
  '''
  # merge id columns in one to uniquely identify
  train["id_comb"] = train["rtable_id"].astype(str) + train["ltable_id"].astype(str)
  labelled["id_comb"] = labelled[key].astype(str) + labelled[val].astype(str)
  # merge the training and labelled dataset and assign labels
  K = train.merge(labelled, on='id_comb', how='left', indicator=True)
  K.loc[(K['_merge'] == "both"), 'label'] = 1
  K.loc[(K['_merge'] == "left_only") | (K['_merge'] == "right_only"), 'label'] = 0
  # delete unused columns
  del K['_merge']
  del K[key]
  del K[val]
  del K['id_comb']

  # cleaning the data
  for col in K.columns:
    if 'id' not in col and col != "label":
      K[col] = K[col].astype(str).str.replace('[^A-Za-z0-9 ]+', '')
  
  if key == "id1":
      K['_id'] = K['_id'].astype(int)
  K['label'] = K['label'].astype(int)
  return K

In [7]:
def scores(time, predictions, iteration, train_n, model, epochs=15):
  '''
  Report the most important metrics of performance from the predictions made by the model
  Input:
  - time (float): time the model ran for
  - predictions (pd.DataFrame): contains the output features and column "match_score", containing the prediction information
  - iteration (int64): the current iteration if applicable
  - train_n (int64): the number of samples used in training
  - model (string): the model that was used
  - epochs (int64): the number of epochs the model was trained for
  Output:
  - dict (dictionary): dictionary containing the performance information  
  '''
  # casting prediction scores as binary labels
  predictions['match_prediction'] = predictions['match_score'].apply(lambda score: 1 if score >= 0.5 else 0)
  # Reset index as Magellan requires the key to be a column in the table
  predictions.reset_index(inplace=True)
  # Update metadata in the catalog. This information can later be used by triggers to modify the labels from 
  # the learning-based matcher 
  em.set_key(predictions, '_id')
  em.set_fk_ltable(predictions, 'ltable_id')
  em.set_fk_rtable(predictions, 'rtable_id')
  # AUC-ROC metrics
  auc = sk.metrics.roc_auc_score(predictions['label'], predictions['match_score'], average = None)
  # Precision Metrics
  prec = em.eval_matches(predictions, 'label', 'match_prediction')
  # Creating dictionary of performance metrics to append to a DataFrame
  dict = {'Model':model, 'Time':time, 'Epochs':epochs,'Iteration':iteration, '#Training_samples':train_n,
                          'F1':prec['f1'], 'Precision':prec['precision'], 
                          'Recall':prec['recall'], 'AUC':auc}
  return dict
# Create a placeholder matrix to append precision scores to
prec_matrix = pd.DataFrame(columns=['Model', 'Time', 'Epochs', 'Iteration', '#Training_samples', 'F1', 'Precision', 'Recall', 'AUC'])

In [8]:
def subsampling(cand):
  '''
  Function to subsample to avoid unbalanced data
  Input:
  - cand (pd.DataFrame): contains the blocked (and transformed) data with "label" feature
  Output:
  - cand_resampled (pd.DataFrame): contains the resampled data to avoid imbalance
  '''
  # subsampling the candidate set for better balance
  # creating the explanatory and dependent feature data
  binary_X = cand.loc[:, cand.columns != "label"]
  binary_y = cand.loc[:, cand.columns == "label"]
  # initialising the RandomUnderSampler
  rus = RandomUnderSampler(random_state=0)
  # resampling to 0.5/0.5
  X_res, y_res = rus.fit_resample(binary_X, binary_y)
  res_labelled = pd.DataFrame(X_res)
  res_labelled.columns = cand.columns[0:-1]
  # take the index of resampled observations to subsample from the original
  sel_idx = list(res_labelled._id)
  cand_resampled = cand[cand["_id"].isin(sel_idx)]
  return cand_resampled

# Blocking

In [9]:
# feature blocking for Cora 
# Get tokenizers and similarity function
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()
# Get attributes
atypes1 = em.get_attr_types(Cora)
atypes2 = em.get_attr_types(Cora)
# Get correspondence
block_c = em.get_attr_corres(Cora, Cora)
#Get Features
block_f = em.get_features(Cora, Cora, atypes1, atypes2, block_c, block_t, block_s)

In [10]:
# Creating a rule based blocker with threshold 0.4
rb = em.RuleBasedBlocker()
cols_cora = [i for i in Cora.columns]
rule = ['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.3']
rb.add_rule(rule, feature_table=block_f)
K1 = rb.block_tables(Cora, Cora,
                   l_output_attrs=cols_cora, 
                   r_output_attrs=cols_cora,
                   n_jobs=1)

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finding pairs with missing value...


# Processing and Modell Training

## Cora

### Processing

In [11]:
cols = list(cora_labelled.columns)
a, b = cols.index('id1'), cols.index('id2')
cols[b], cols[a] = cols[a], cols[b]
cor_lab_rev = cora_labelled[cols]
cor_lab_rev.columns = ['id1', 'id2']
cora_labelled = cora_labelled.append(cor_lab_rev)
# Labelling the blocked data set
cand = labelling(K1, cora_labelled, key = "id1", val = "id2")
print("Proportion of matches retained after blocking:", round(sum(cand['label']) / cora_labelled.shape[0], 2))

#cand_resampled = subsampling(cand)



Proportion of matches retained after blocking: 1.0


In [12]:
#pred.to_csv(os.path.join('.', 'Dissertation_Data_2', 'pred_Cora.csv'),index=False)
# The directory where the data splits will be saved.
split_path = os.path.join('.', 'Dissertation_Data_2')
# Split labeled data into train, valid, and test csv files to disk, with the split ratio of 3:1:1.
dm.data.split(cand, split_path, 'train.csv', 'valid.csv', 'test.csv',
              [3, 1, 1])

In [13]:
cand.shape

(233183, 6)

In [14]:
sum(cand['label'])

129109

In [15]:
# The directory where the data splits will be saved.
split_path = os.path.join('.', 'Dissertation_Data_2')
# Split labeled data into train, valid, and test csv files to disk, with the split ratio of 3:1:1.
dm.data.split(cand, split_path, 'train.csv', 'valid.csv', 'test.csv',
              [3, 1, 1],
              stratified=True)

In [16]:
print("Proportion of matches retained after blocking:", round(sum(cand['label']) / cand.shape[0], 2))

Proportion of matches retained after blocking: 0.55


In [17]:
# Load the training data files from the disk. Ignore the "left_id" and "right_id" 
# columns for data preprocessing.
# The 'use_magellan_convention' parameter asks deepmatcher to use Magellan's 
# naming convention for the left and right table column name prefixes 
# ("ltable_", and "rtable_"), and also to consider "_id" as the ID column.
train, validation, test = dm.data.process(
    path=os.path.join('.', 'Dissertation_Data_2'),
    cache='train_cache_4.pth',
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    use_magellan_convention=True,
    ignore_columns=('ltable_id', 'rtable_id'))


Reading and processing data from "./Dissertation_Data_2/train.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/valid.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/test.csv"
0% [##############################] 100% | ETA: 00:00:00INFO:deepmatcher.data.field:Downloading vectors from https://drive.google.com/uc?export=download&id=1Vih8gAmgBnuYDxfblbT94P6WjB7s1ZSh to /root/.vector_cache/wiki.en.bin
INFO:deepmatcher.data.field:Unable to fetch cached English Word Embeddings from https://drive.google.com/uc?export=download&id=1Vih8gAmgBnuYDxfblbT94P6WjB7s1ZSh
Downloading embeddings from https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip to /root/.vector_cache/wiki.en.zip
  self.destination = self.backup_destination
INFO:deepmatcher.data.field:Extracting vectors into /root/.vector_cache

Building vocabulary
0% [#############################

### Model

In [18]:
# Create a hybrid model.
model_cor = dm.MatchingModel(attr_summarizer='hybrid')
model_cor.initialize(train)  # Initilization

# Train ing model on 10 epochs, batch size of 16, positive-to-negative 
# ratio to be 1:1. We save the best model (with the 
# highest F1 score on the validation set) to 'sifDiff_model.pth'.
startTime = time.time()
model_cor.run_train(
    train,
    validation,
    epochs=15,
    batch_size=128,
    best_save_path='hybrid_Trans_Cor.pth')
executionTime_Cora = (time.time() - startTime)/60
# Evaluate the accuracy on the test data.
model_cor.run_eval(test)



* Number of trainable parameters: 2798703
===>  TRAIN Epoch 1


  "reduction: 'mean' divides the total loss by both the batch size and the support size."
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 1 || Run Time:  141.7 | Load Time:  278.6 || F1:  97.87 | Prec:  97.15 | Rec:  98.61 || Ex/s: 332.84

===>  EVAL Epoch 1


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 1 || Run Time:   18.0 | Load Time:   86.8 || F1:  99.28 | Prec:  98.76 | Rec:  99.79 || Ex/s: 445.35

* Best F1: tensor(99.2757, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 2


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 2 || Run Time:  141.9 | Load Time:  278.7 || F1:  99.25 | Prec:  98.74 | Rec:  99.76 || Ex/s: 332.59

===>  EVAL Epoch 2


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 2 || Run Time:   17.9 | Load Time:   86.8 || F1:  99.26 | Prec:  99.04 | Rec:  99.48 || Ex/s: 445.29

---------------------

===>  TRAIN Epoch 3


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 3 || Run Time:  141.5 | Load Time:  278.9 || F1:  99.27 | Prec:  98.85 | Rec:  99.70 || Ex/s: 332.80

===>  EVAL Epoch 3


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 3 || Run Time:   18.0 | Load Time:   86.9 || F1:  99.28 | Prec:  99.19 | Rec:  99.38 || Ex/s: 444.80

* Best F1: tensor(99.2823, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 4


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 4 || Run Time:  141.5 | Load Time:  278.4 || F1:  99.31 | Prec:  98.96 | Rec:  99.67 || Ex/s: 333.17

===>  EVAL Epoch 4


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 4 || Run Time:   17.9 | Load Time:   86.3 || F1:  99.32 | Prec:  99.23 | Rec:  99.40 || Ex/s: 447.57

* Best F1: tensor(99.3171, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 5


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 5 || Run Time:  141.4 | Load Time:  278.5 || F1:  99.36 | Prec:  99.03 | Rec:  99.69 || Ex/s: 333.14

===>  EVAL Epoch 5


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 5 || Run Time:   17.9 | Load Time:   86.7 || F1:  99.31 | Prec:  99.20 | Rec:  99.43 || Ex/s: 445.51

---------------------

===>  TRAIN Epoch 6


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 6 || Run Time:  141.5 | Load Time:  278.5 || F1:  99.39 | Prec:  99.09 | Rec:  99.70 || Ex/s: 333.12

===>  EVAL Epoch 6


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 6 || Run Time:   18.0 | Load Time:   86.7 || F1:  99.32 | Prec:  99.20 | Rec:  99.43 || Ex/s: 445.83

---------------------

===>  TRAIN Epoch 7


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 7 || Run Time:  141.5 | Load Time:  278.2 || F1:  99.41 | Prec:  99.12 | Rec:  99.71 || Ex/s: 333.31

===>  EVAL Epoch 7


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 7 || Run Time:   17.9 | Load Time:   86.9 || F1:  99.34 | Prec:  99.16 | Rec:  99.53 || Ex/s: 444.77

* Best F1: tensor(99.3448, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 8


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 8 || Run Time:  141.6 | Load Time:  278.7 || F1:  99.43 | Prec:  99.14 | Rec:  99.72 || Ex/s: 332.91

===>  EVAL Epoch 8


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 8 || Run Time:   17.9 | Load Time:   86.8 || F1:  99.36 | Prec:  99.16 | Rec:  99.56 || Ex/s: 445.45

* Best F1: tensor(99.3585, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 9


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:57


Finished Epoch 9 || Run Time:  141.4 | Load Time:  278.3 || F1:  99.44 | Prec:  99.15 | Rec:  99.73 || Ex/s: 333.36

===>  EVAL Epoch 9


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 9 || Run Time:   18.0 | Load Time:   86.8 || F1:  99.36 | Prec:  99.15 | Rec:  99.57 || Ex/s: 444.99

* Best F1: tensor(99.3585, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 10


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 10 || Run Time:  141.7 | Load Time:  278.5 || F1:  99.47 | Prec:  99.19 | Rec:  99.74 || Ex/s: 332.94

===>  EVAL Epoch 10


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 10 || Run Time:   17.9 | Load Time:   86.7 || F1:  99.35 | Prec:  99.16 | Rec:  99.55 || Ex/s: 445.77

---------------------

===>  TRAIN Epoch 11


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:57


Finished Epoch 11 || Run Time:  141.4 | Load Time:  278.1 || F1:  99.48 | Prec:  99.21 | Rec:  99.75 || Ex/s: 333.51

===>  EVAL Epoch 11


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 11 || Run Time:   17.9 | Load Time:   86.6 || F1:  99.35 | Prec:  99.12 | Rec:  99.57 || Ex/s: 446.36

---------------------

===>  TRAIN Epoch 12


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 12 || Run Time:  141.5 | Load Time:  278.6 || F1:  99.49 | Prec:  99.22 | Rec:  99.77 || Ex/s: 333.05

===>  EVAL Epoch 12


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 12 || Run Time:   17.9 | Load Time:   86.8 || F1:  99.35 | Prec:  99.10 | Rec:  99.59 || Ex/s: 445.38

---------------------

===>  TRAIN Epoch 13


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 13 || Run Time:  141.5 | Load Time:  278.2 || F1:  99.50 | Prec:  99.22 | Rec:  99.78 || Ex/s: 333.32

===>  EVAL Epoch 13


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 13 || Run Time:   18.0 | Load Time:   86.7 || F1:  99.33 | Prec:  99.08 | Rec:  99.59 || Ex/s: 445.37

---------------------

===>  TRAIN Epoch 14


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 14 || Run Time:  141.7 | Load Time:  278.6 || F1:  99.51 | Prec:  99.23 | Rec:  99.80 || Ex/s: 332.86

===>  EVAL Epoch 14


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 14 || Run Time:   17.9 | Load Time:   87.1 || F1:  99.34 | Prec:  99.06 | Rec:  99.63 || Ex/s: 444.16

---------------------

===>  TRAIN Epoch 15


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:06:58


Finished Epoch 15 || Run Time:  141.5 | Load Time:  278.6 || F1:  99.52 | Prec:  99.23 | Rec:  99.80 || Ex/s: 333.11

===>  EVAL Epoch 15


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:44


Finished Epoch 15 || Run Time:   17.9 | Load Time:   86.9 || F1:  99.34 | Prec:  99.03 | Rec:  99.64 || Ex/s: 444.99

---------------------

Loading best model...
Training done.
===>  EVAL Epoch 9


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:50


Finished Epoch 9 || Run Time:   39.5 | Load Time:   71.3 || F1:  99.34 | Prec:  99.16 | Rec:  99.52 || Ex/s: 421.14



tensor(99.3389, device='cuda:0')

## Scholar-DBLP

### Process

In [19]:
# blocking for predictions on other data set
# Get tokenizers and similarity function
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()
# Get attributes
atypes1 = em.get_attr_types(Scholar)
atypes2 = em.get_attr_types(DBLP)
# Get correspondence
block_c = em.get_attr_corres(Scholar, DBLP)
#Get Features
block_f = em.get_features(Scholar, DBLP, atypes1, atypes2, block_c, block_t, block_s)

In [20]:
# Creating a rule based blocker for prediction
rb = em.RuleBasedBlocker()
# 0.3 achieves 49% balance negative to positive
rule = ['title_title_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.3']
rb.add_rule(rule, feature_table=block_f)
C = rb.block_tables(Scholar, DBLP,
                   l_output_attrs=['title'], 
                   r_output_attrs=['title'], 
                   n_jobs=1)

0% [##############################] 100% | ETA: 00:00:00

Finding pairs with missing value...



Total time elapsed: 00:02:25


In [21]:
# labelling data for prediction
pred = labelling(C, labelled_DS, key = "idDBLP", val = "idScholar")
print("Proportion of matches retained after blocking:", round(sum(pred['label']) / labelled_DS.shape[0], 2))

pred.to_csv(os.path.join('.', 'Dissertation_Data_2', 'pred_DS.csv'),index=False)
# The directory where the data splits will be saved.
split_path = os.path.join('.', 'Dissertation_Data_2')
# Split labeled data into train, valid, and test csv files to disk, with the split ratio of 3:1:1.
dm.data.split(pred, split_path, 'train_DS.csv', 'valid_DS.csv', 'test_DS.csv',
              [3, 1, 1])



Proportion of matches retained after blocking: 0.98


In [22]:
pred.shape

(11322, 6)

In [23]:
# processing the data for prediction
train_DS, validation_DS, test_DS = dm.data.process(
    path=os.path.join('.', 'Dissertation_Data_2'),
    cache='train_cache_2.pth',
    train='train_DS.csv',
    validation='valid_DS.csv',
    test='test_DS.csv',
    use_magellan_convention=True,
    ignore_columns=('ltable_id', 'rtable_id'))

train_DS, validation_DS, pred_DS = dm.data.process(
    path=os.path.join('.', 'Dissertation_Data_2'),
    cache='train_cache_3.pth',
    train='train_DS.csv',
    validation='valid_DS.csv',
    test='pred_DS.csv',
    use_magellan_convention=True,
    ignore_columns=('ltable_id', 'rtable_id'))


Reading and processing data from "./Dissertation_Data_2/train_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/valid_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/test_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Building vocabulary
0% [#######] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00

Computing principal components
0% [#######] 100% | ETA: 00:00:00
Total time elapsed: 00:00:01

Reading and processing data from "./Dissertation_Data_2/train_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/valid_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/pred_DS.csv"
0% [############################# ] 100% | ETA: 00:00:00
Building vocabulary
0% [#######] 100% | ETA: 00:00:00
Total 

### Model

In [24]:
# Create a hybrid model for DBLP-Scholar data
model_DS = dm.MatchingModel(attr_summarizer='hybrid')
model_DS.initialize(train_DS)  # Initilization

# Train ing model on 15 epochs, batch size of 16, positive-to-negative 
# ratio to be 1:1. We save the best model (with the 
# highest F1 score on the validation set) to 'sifDiff_model.pth'.
startTime = time.time()
model_DS.run_train(
    train_DS,
    validation_DS,
    epochs=15,
    batch_size=16,
    best_save_path='hybrid_Trans_DS.pth')
executionTime_DS = (time.time() - startTime)/60
# Evaluate the accuracy on the test data.
model_DS.run_eval(test_DS)

* Number of trainable parameters: 2798703
===>  TRAIN Epoch 1


  "reduction: 'mean' divides the total loss by both the batch size and the support size."
0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 1 || Run Time:   18.2 | Load Time:    6.0 || F1:  85.72 | Prec:  84.45 | Rec:  87.03 || Ex/s: 279.73

===>  EVAL Epoch 1


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 1 || Run Time:    3.0 | Load Time:    2.1 || F1:  92.39 | Prec:  91.23 | Rec:  93.59 || Ex/s: 443.37

* Best F1: tensor(92.3949, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 2


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 2 || Run Time:   18.1 | Load Time:    6.0 || F1:  94.28 | Prec:  94.22 | Rec:  94.34 || Ex/s: 281.87

===>  EVAL Epoch 2


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 2 || Run Time:    3.0 | Load Time:    2.1 || F1:  92.73 | Prec:  91.52 | Rec:  93.97 || Ex/s: 449.18

* Best F1: tensor(92.7290, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 3


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 3 || Run Time:   18.3 | Load Time:    6.1 || F1:  96.07 | Prec:  95.88 | Rec:  96.25 || Ex/s: 278.46

===>  EVAL Epoch 3


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 3 || Run Time:    3.0 | Load Time:    2.0 || F1:  93.13 | Prec:  91.04 | Rec:  95.31 || Ex/s: 451.18

* Best F1: tensor(93.1276, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 4


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 4 || Run Time:   18.1 | Load Time:    6.0 || F1:  97.41 | Prec:  97.48 | Rec:  97.33 || Ex/s: 281.62

===>  EVAL Epoch 4


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 4 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.58 | Prec:  91.65 | Rec:  95.60 || Ex/s: 446.30

* Best F1: tensor(93.5831, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 5


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 5 || Run Time:   18.2 | Load Time:    6.0 || F1:  97.98 | Prec:  98.24 | Rec:  97.71 || Ex/s: 280.31

===>  EVAL Epoch 5


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 5 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.81 | Prec:  92.71 | Rec:  94.93 || Ex/s: 447.07

* Best F1: tensor(93.8061, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 6


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 6 || Run Time:   18.1 | Load Time:    6.0 || F1:  98.89 | Prec:  99.07 | Rec:  98.70 || Ex/s: 281.39

===>  EVAL Epoch 6


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 6 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.95 | Prec:  92.17 | Rec:  95.79 || Ex/s: 450.34

* Best F1: tensor(93.9465, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 7


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 7 || Run Time:   18.2 | Load Time:    6.0 || F1:  99.28 | Prec:  99.55 | Rec:  99.01 || Ex/s: 280.51

===>  EVAL Epoch 7


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 7 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.78 | Prec:  93.03 | Rec:  94.55 || Ex/s: 445.87

---------------------

===>  TRAIN Epoch 8


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 8 || Run Time:   18.2 | Load Time:    6.0 || F1:  99.52 | Prec:  99.78 | Rec:  99.27 || Ex/s: 280.71

===>  EVAL Epoch 8


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 8 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.69 | Prec:  92.22 | Rec:  95.22 || Ex/s: 445.92

---------------------

===>  TRAIN Epoch 9


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:23


Finished Epoch 9 || Run Time:   18.0 | Load Time:    6.0 || F1:  99.59 | Prec:  99.84 | Rec:  99.33 || Ex/s: 283.56

===>  EVAL Epoch 9


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 9 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.57 | Prec:  93.09 | Rec:  94.07 || Ex/s: 441.40

---------------------

===>  TRAIN Epoch 10


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 10 || Run Time:   18.2 | Load Time:    6.0 || F1:  99.67 | Prec:  99.90 | Rec:  99.43 || Ex/s: 279.89

===>  EVAL Epoch 10


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 10 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.70 | Prec:  93.52 | Rec:  93.88 || Ex/s: 448.22

---------------------

===>  TRAIN Epoch 11


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 11 || Run Time:   18.2 | Load Time:    6.0 || F1:  99.68 | Prec:  99.94 | Rec:  99.43 || Ex/s: 280.41

===>  EVAL Epoch 11


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 11 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.53 | Prec:  93.00 | Rec:  94.07 || Ex/s: 451.71

---------------------

===>  TRAIN Epoch 12


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 12 || Run Time:   18.1 | Load Time:    6.0 || F1:  99.68 | Prec:  99.94 | Rec:  99.43 || Ex/s: 281.12

===>  EVAL Epoch 12


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 12 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.63 | Prec:  93.01 | Rec:  94.26 || Ex/s: 444.12

---------------------

===>  TRAIN Epoch 13


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 13 || Run Time:   18.2 | Load Time:    6.1 || F1:  99.68 | Prec:  99.94 | Rec:  99.43 || Ex/s: 280.04

===>  EVAL Epoch 13


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 13 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.64 | Prec:  92.93 | Rec:  94.35 || Ex/s: 449.66

---------------------

===>  TRAIN Epoch 14


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 14 || Run Time:   18.1 | Load Time:    6.0 || F1:  99.68 | Prec:  99.94 | Rec:  99.43 || Ex/s: 282.17

===>  EVAL Epoch 14


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 14 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.64 | Prec:  92.93 | Rec:  94.35 || Ex/s: 448.74

---------------------

===>  TRAIN Epoch 15


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:24


Finished Epoch 15 || Run Time:   18.1 | Load Time:    6.0 || F1:  99.68 | Prec:  99.94 | Rec:  99.43 || Ex/s: 281.34

===>  EVAL Epoch 15


0% [████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 15 || Run Time:    3.0 | Load Time:    2.1 || F1:  93.68 | Prec:  93.02 | Rec:  94.35 || Ex/s: 442.66

---------------------

Loading best model...
Training done.
===>  EVAL Epoch 6


0% [██████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


Finished Epoch 6 || Run Time:    1.6 | Load Time:    2.1 || F1:  86.73 | Prec:  90.22 | Rec:  83.51 || Ex/s: 609.79



tensor(86.7327, device='cuda:0')

In [25]:
sum(pred['label'])

5240

In [26]:
pred.shape

(11322, 6)

In [27]:
# create data for prediction on the second model
cand.to_csv(os.path.join('.', 'Dissertation_Data_2', 'pred_cora.csv'),index=False)
train, validation, pred_cora = dm.data.process(
    path=os.path.join('.', 'Dissertation_Data_2'),
    cache='train_cache_1.pth',
    train='train.csv',
    validation='valid.csv',
    test='pred_cora.csv',
    use_magellan_convention=True,
    ignore_columns=('ltable_id', 'rtable_id'))


Reading and processing data from "./Dissertation_Data_2/train.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/valid.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "./Dissertation_Data_2/pred_cora.csv"
0% [##############################] 100% | ETA: 00:00:00
Building vocabulary
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06

Computing principal components
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:53


# Predictions

In [28]:
# predicting with the cora trained model on DBLP-Scholar
train_n = em.read_csv_metadata(os.path.join('.', 'Dissertation_Data_2', 'pred_DS.csv'))
predictions_cor_ds = model_cor.run_prediction(test_DS, output_attributes=list(test_DS.get_raw_table().columns))
score_cor_ds = scores(executionTime_Cora, predictions_cor_ds, 15, train_n.shape[0], model = 'TransOnCora')




===>  PREDICT Epoch 9


0% [██████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:00:03


Finished Epoch 9 || Run Time:    1.8 | Load Time:    2.1 || F1:  81.50 | Prec:  73.84 | Rec:  90.94 || Ex/s: 588.50



In [29]:
prec_matrix = prec_matrix.append(score_cor_ds, ignore_index=True)

In [30]:
# making predictions
predictions_ds_cor = model_DS.run_prediction(test, output_attributes=list(test.get_raw_table().columns))


===>  PREDICT Epoch 6


0% [██████████████████████████████] 100% | ETA: 00:00:00
Total time elapsed: 00:01:52


Finished Epoch 6 || Run Time:   42.2 | Load Time:   71.0 || F1:  83.38 | Prec:  83.52 | Rec:  83.24 || Ex/s: 412.15



In [31]:
score_ds_cor = scores(executionTime_DS, predictions_ds_cor, 15, cand.shape[0], model = 'TransOnDS')
prec_matrix = prec_matrix.append(score_ds_cor, ignore_index=True)

In [32]:
path_DL_res = os.path.join('.', 'Dissertation_Data_2', 'DL_res.csv')
prec_matrix.to_csv(path_DL_res)

In [33]:
predictions_ds_cor['match_prediction'] = predictions_ds_cor['match_score'].apply(lambda score: 1 if score >= 0.5 else 0)
path_pred_ds_cor = os.path.join('.', 'Dissertation_Data_2', 'pred_ds_cor.csv')
predictions_ds_cor.to_csv(path_pred_ds_cor)

In [34]:
predictions_cor_ds['match_prediction'] = predictions_cor_ds['match_score'].apply(lambda score: 1 if score >= 0.5 else 0)
path_pred_cor_ds = os.path.join('.', 'Dissertation_Data_2', 'pred_cor_ds.csv')
predictions_cor_ds.to_csv(path_pred_cor_ds)

In [35]:
i = 0
try:
  while True:
    i += 1
  if not i % 1000:
    gc.collect()
except KeyboardInterrupt:
 print(i)

594396042160
