In [398]:
import py_entitymatching as em
import pandas as pd
import numpy as np
import os

In [399]:
# Locate the sample datasets under the py_entitymatching install path.
# os.path.join is used instead of manual os.sep concatenation so separators
# are handled by the standard library.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
path_A = os.path.join(datasets_dir, 'tracks_sample.csv')   # read into table A
path_B = os.path.join(datasets_dir, 'songs_sample.csv')    # read into table B
path_G = os.path.join(datasets_dir, 'labeled_data.csv')    # read into labeled set G

In [400]:
# Load the two input tables and the labeled candidate set from CSV.
# low_memory=False is passed for every file (per the original note, to speed
# up loading).
A = em.read_csv_metadata(path_A, key='id', low_memory=False)
B = em.read_csv_metadata(path_B, key='id', low_memory=False)
# G points back at A and B through its two foreign-key columns.
G = em.read_csv_metadata(
    path_G,
    key='id',
    low_memory=False,
    ltable=A,
    rtable=B,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [401]:
# Split the labeled set G into two halves, I and J, with a fixed random_state.
train_test = em.split_train_test(G, train_proportion=0.5, random_state=0)
I, J = train_test['train'], train_test['test']
# Save both halves as CSV files in the current working directory.
I.to_csv('train.csv')
J.to_csv('test.csv')

In [402]:
# Generate the candidate features for matching tuples of A against B.
F = em.get_features_for_matching(A, B)
print(F.feature_name)

# Features to discard, selected by name rather than by row position so the
# selection does not depend on the order in which features are generated:
#   * every feature computed on the 'id' columns
#   * three of the features computed on the 'year' columns
unwanted_features = [
    'id_id_exm', 'id_id_anm', 'id_id_lev_dist', 'id_id_lev_sim',
    'year_year_exm', 'year_year_anm', 'year_year_lev_dist',
]
# Fail loudly if the generated feature names are not the expected ones,
# instead of silently keeping features that were meant to be removed.
missing = set(unwanted_features) - set(F.feature_name)
assert not missing, 'expected features were not generated: %s' % sorted(missing)
F = F[~F.feature_name.isin(unwanted_features)]

0                                     id_id_exm
1                                     id_id_anm
2                                id_id_lev_dist
3                                 id_id_lev_sim
4                                 year_year_exm
5                                 year_year_anm
6                            year_year_lev_dist
7                             year_year_lev_sim
8         song_title_song_title_jac_qgm_3_qgm_3
9     song_title_song_title_cos_dlm_dc0_dlm_dc0
10    song_title_song_title_jac_dlm_dc0_dlm_dc0
11                    song_title_song_title_mel
12               song_title_song_title_lev_dist
13                song_title_song_title_lev_sim
14                    song_title_song_title_nmw
15                     song_title_song_title_sw
16              artists_artists_jac_qgm_3_qgm_3
17          artists_artists_cos_dlm_dc0_dlm_dc0
18          artists_artists_jac_dlm_dc0_dlm_dc0
19                          artists_artists_mel
20                     artists_artists_l

In [403]:
# Build the feature-vector table H from I with the features in F,
# carrying the 'gold_labels' column along (attrs_after).
H = em.extract_feature_vecs(I, feature_table=F, attrs_after='gold_labels', show_progress=False)

In [404]:
# Check whether the feature vectors contain missing values.
# A return value of True means that at least one cell of H is missing.
# The reduction runs over the cell values: calling the builtin any() on a
# DataFrame would only iterate over its column labels.
H.isnull().values.any()

True

In [405]:
# Replace every missing feature value with 0, modifying H in place.
# NOTE(review): the in-place form keeps H the same object; presumably the
# py_entitymatching catalog metadata is tied to that object - confirm before
# rewriting this as a reassignment.
H.fillna(0, inplace=True)

In [406]:
# Instantiate one learning-based matcher per algorithm.
# random_state is fixed to 0 wherever it is passed.
dt = em.DTMatcher(name='DecisionTree', max_depth=5, random_state=0)  # decision tree
svm = em.SVMMatcher(name='SVM', random_state=0)                      # SVM
rf = em.RFMatcher(name='RF', random_state=0)                         # random forest
lg = em.LogRegMatcher(name='LogReg', random_state=0)                 # logistic regression
ln = em.LinRegMatcher(name='LinReg')                                 # linear regression
nb = em.NBMatcher(name='NaiveBayes')                                 # naive Bayes

In [407]:
# Select the best matcher by precision using 5-fold cross validation on H.
# The identifier columns and the label column are excluded from the features.
result_precision = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
    target_attr='gold_labels',
    metric='precision',
    k=5,
    random_state=0,
)

# Per-fold and mean precision of every matcher.
result_precision['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x0000019E27671358>,5,0.833333,0.958333,1.0,0.956522,0.882353,0.926108
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x0000019E27671240>,5,0.888889,0.92,1.0,0.958333,1.0,0.953444
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x0000019E27671160>,5,0.607143,0.821429,0.870968,0.833333,0.678571,0.762289
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x0000019E27671278>,5,0.85,0.92,0.964286,0.962963,0.95,0.92945
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x0000019E27671080>,5,0.85,0.916667,0.931034,1.0,1.0,0.93954
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x0000019E27671198>,5,0.888889,0.92,0.962963,0.923077,1.0,0.938986


In [408]:
# Select the best matcher by recall using 5-fold cross validation on H.
# The identifier columns and the label column are excluded from the features.
result_recall = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
    target_attr='gold_labels',
    metric='recall',
    k=5,
    random_state=0,
)
# Per-fold and mean recall of every matcher.
result_recall['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x0000019E27671358>,5,0.882353,1.0,0.925926,0.846154,0.789474,0.888781
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x0000019E27671240>,5,0.941176,1.0,0.888889,0.884615,0.947368,0.93241
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x0000019E27671160>,5,1.0,1.0,1.0,0.961538,1.0,0.992308
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x0000019E27671278>,5,1.0,1.0,1.0,1.0,1.0,1.0
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x0000019E27671080>,5,1.0,0.956522,1.0,0.961538,1.0,0.983612
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x0000019E27671198>,5,0.941176,1.0,0.962963,0.923077,0.947368,0.954917


In [409]:
# Select the best matcher by F1 using 5-fold cross validation on H.
# The identifier columns and the label column are excluded from the features.
result_f1 = em.select_matcher(
    [dt, rf, svm, ln, lg, nb],
    table=H,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
    target_attr='gold_labels',
    metric='f1',
    k=5,
    random_state=0,
)
# Per-fold and mean F1 of every matcher.
result_f1['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x0000019E27671358>,5,0.857143,0.978723,0.961538,0.897959,0.833333,0.905739
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x0000019E27671240>,5,0.914286,0.958333,0.941176,0.92,0.972973,0.941354
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x0000019E27671160>,5,0.755556,0.901961,0.931034,0.892857,0.808511,0.857984
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x0000019E27671278>,5,0.918919,0.958333,0.981818,0.981132,0.974359,0.962912
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x0000019E27671080>,5,0.918919,0.93617,0.964286,0.980392,1.0,0.959953
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x0000019E27671198>,5,0.914286,0.958333,0.962963,0.923077,0.972973,0.946326


In [410]:
# Debug the random forest matcher:
# first split the feature vectors H into two halves, P and Q.
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P, Q = PQ['train'], PQ['test']

In [411]:
# Open the GUI debugger for the random forest matcher, passing P and Q.
em.vis_debug_rf(
    rf,
    P,
    Q,
    exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
    target_attr='gold_labels',
)

In [412]:
# Debugging iteration 1 - remove six of the song_title features.
# The features are selected by name, so the code states exactly what is
# removed and does not depend on row positions inside F.
iteration_1_removed = [
    'song_title_song_title_cos_dlm_dc0_dlm_dc0',
    'song_title_song_title_jac_dlm_dc0_dlm_dc0',
    'song_title_song_title_mel',
    'song_title_song_title_lev_dist',
    'song_title_song_title_nmw',
    'song_title_song_title_sw',
]
F_D = F[~F.feature_name.isin(iteration_1_removed)]
# Rebuild the feature vectors of I with the reduced feature table.
H = em.extract_feature_vecs(I,
                            feature_table=F_D,
                            attrs_after='gold_labels',
                            show_progress=False)
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [414]:
# Debugging iteration 2 - remove song_title_song_title_lev_sim.
# Filtering by name keeps this cell idempotent: running it again removes
# nothing further, whereas a positional drop would remove another feature.
F_D = F_D[F_D.feature_name != 'song_title_song_title_lev_sim']
# Rebuild the feature vectors of I with the reduced feature table.
H = em.extract_feature_vecs(I,
                            feature_table=F_D,
                            attrs_after='gold_labels',
                            show_progress=False)
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

7                         year_year_lev_sim
8     song_title_song_title_jac_qgm_3_qgm_3
16          artists_artists_jac_qgm_3_qgm_3
17      artists_artists_cos_dlm_dc0_dlm_dc0
18      artists_artists_jac_dlm_dc0_dlm_dc0
19                      artists_artists_mel
20                 artists_artists_lev_dist
21                  artists_artists_lev_sim
22                      artists_artists_nmw
23                       artists_artists_sw
Name: feature_name, dtype: object


In [416]:
# Debugging iteration 3 - remove six of the artists features:
# cos_dlm_dc0_dlm_dc0, jac_dlm_dc0_dlm_dc0, mel, lev_dist, nmw and sw.
# Filtering by name keeps this cell idempotent: running it again removes
# nothing further, whereas a positional drop would remove other features.
iteration_3_removed = [
    'artists_artists_cos_dlm_dc0_dlm_dc0',
    'artists_artists_jac_dlm_dc0_dlm_dc0',
    'artists_artists_mel',
    'artists_artists_lev_dist',
    'artists_artists_nmw',
    'artists_artists_sw',
]
F_D = F_D[~F_D.feature_name.isin(iteration_3_removed)]
# Rebuild the feature vectors of I with the reduced feature table.
H = em.extract_feature_vecs(I,
                            feature_table=F_D,
                            attrs_after='gold_labels',
                            show_progress=False)
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')


In [418]:
# Debugging iteration 4 - remove artists_artists_lev_sim.
# Filtering by name keeps this cell idempotent: running it again removes
# nothing further, whereas a positional drop would fail or remove another row.
F_D = F_D[F_D.feature_name != 'artists_artists_lev_sim']
# Rebuild the feature vectors of I with the reduced feature table.
H = em.extract_feature_vecs(I,
                            feature_table=F_D,
                            attrs_after='gold_labels',
                            show_progress=False)
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [420]:
# Debugging iteration 5 - add a feature: the product of the Jaccard (3-gram)
# scores on song_title and on artists.
H['song_title_artists_score'] = (
    H['song_title_song_title_jac_qgm_3_qgm_3'] * H['artists_artists_jac_qgm_3_qgm_3']
)
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug RF matcher using GUI
em.vis_debug_rf(rf, P, Q,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [421]:
# Evaluate the matching output:
# build the feature-vector table L from J with the features in F,
# carrying the 'gold_labels' column along (attrs_after).
L = em.extract_feature_vecs(
    J,
    feature_table=F,
    attrs_after='gold_labels',
    show_progress=False,
)

In [422]:
# Report whether the feature vectors contain missing values.
# True means that at least one cell of L is missing. The reduction runs over
# the cell values; the result is printed because this is not the last
# expression of the cell and would otherwise be discarded.
print(L.isnull().values.any())
# Impute feature vectors with 0.
L.fillna(value=0, inplace=True)

In [423]:
# Convert I into a set of feature vectors using the full feature table F
# (this replaces the H that was modified during the debugging iterations).
H = em.extract_feature_vecs(I,
                            feature_table=F,
                            attrs_after='gold_labels',
                            show_progress=False)
# Report whether the feature vectors contain missing values (True means at
# least one cell of H is missing). The reduction runs over the cell values;
# the result is printed because it would otherwise be discarded.
print(H.isnull().values.any())
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)

In [425]:
# Train the decision tree on the feature vectors of I (table H).
dt.fit(table=H,
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
       target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_dt = dt.predict(table=L, exclude_attrs=non_feature_cols,
                            append=True, target_attr='predicted_dt_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_dt, 'gold_labels', 'predicted_dt_labels')
em.print_eval_summary(eval_result)

Precision : 92.98% (106/114)
Recall : 92.17% (106/115)
F1 : 92.58%
False positives : 8 (out of 114 positive predictions)
False negatives : 9 (out of 86 negative predictions)


In [426]:
# Train the random forest on the feature vectors of I (table H).
rf.fit(table=H,
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
       target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_rf = rf.predict(table=L, exclude_attrs=non_feature_cols,
                            append=True, target_attr='predicted_rf_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_rf, 'gold_labels', 'predicted_rf_labels')
em.print_eval_summary(eval_result)

Precision : 94.55% (104/110)
Recall : 90.43% (104/115)
F1 : 92.44%
False positives : 6 (out of 110 positive predictions)
False negatives : 11 (out of 90 negative predictions)


In [427]:
# Train the SVM on the feature vectors of I (table H).
svm.fit(table=H,
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_svm = svm.predict(table=L, exclude_attrs=non_feature_cols,
                              append=True, target_attr='predicted_svm_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_svm, 'gold_labels', 'predicted_svm_labels')
em.print_eval_summary(eval_result)

Precision : 73.86% (113/153)
Recall : 98.26% (113/115)
F1 : 84.33%
False positives : 40 (out of 153 positive predictions)
False negatives : 2 (out of 47 negative predictions)


In [428]:
# Train the logistic regression on the feature vectors of I (table H).
lg.fit(table=H,
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
       target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_lg = lg.predict(table=L, exclude_attrs=non_feature_cols,
                            append=True, target_attr='predicted_lg_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_lg, 'gold_labels', 'predicted_lg_labels')
em.print_eval_summary(eval_result)

Precision : 95.54% (107/112)
Recall : 93.04% (107/115)
F1 : 94.27%
False positives : 5 (out of 112 positive predictions)
False negatives : 8 (out of 88 negative predictions)


In [429]:
# Train the linear regression on the feature vectors of I (table H).
ln.fit(table=H,
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
       target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_ln = ln.predict(table=L, exclude_attrs=non_feature_cols,
                            append=True, target_attr='predicted_ln_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_ln, 'gold_labels', 'predicted_ln_labels')
em.print_eval_summary(eval_result)

Precision : 94.96% (113/119)
Recall : 98.26% (113/115)
F1 : 96.58%
False positives : 6 (out of 119 positive predictions)
False negatives : 2 (out of 81 negative predictions)


In [430]:
# Train the naive Bayes matcher on the feature vectors of I (table H).
nb.fit(table=H,
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
       target_attr='gold_labels')
# Predict on L. Predictions are appended to L as 'predicted_*' columns, so
# every such column already present (from this cell or from another matcher's
# cell) is excluded to keep it from being treated as a feature. This makes the
# cell safe to re-run and independent of the order of the prediction cells.
non_feature_cols = ['id', 'ltable_id', 'rtable_id', 'gold_labels']
non_feature_cols += [col for col in L.columns if col.startswith('predicted_')]
predictions_nb = nb.predict(table=L, exclude_attrs=non_feature_cols,
                            append=True, target_attr='predicted_nb_labels')
# Evaluate the predictions against the gold labels
eval_result = em.eval_matches(predictions_nb, 'gold_labels', 'predicted_nb_labels')
em.print_eval_summary(eval_result)

Precision : 95.61% (109/114)
Recall : 94.78% (109/115)
F1 : 95.2%
False positives : 5 (out of 114 positive predictions)
False negatives : 6 (out of 86 negative predictions)
