In [134]:
import py_entitymatching as em
import pandas as pd
import numpy as np
import os

In [135]:
datasets_dir = em.get_install_path() + os.sep + 'datasets'
path_A = datasets_dir + os.sep + 'tracks_sample.csv'
path_B = datasets_dir + os.sep + 'songs_sample.csv'
path_G = datasets_dir + os.sep + 'labeled_data.csv'

In [136]:
# Read the CSV files
A = em.read_csv_metadata(path_A,key='id', low_memory=False) # setting the parameter low_memory to False  to speed up loading.
B = em.read_csv_metadata(path_B,key='id',low_memory=False)
G = em.read_csv_metadata(path_G,key='id',low_memory=False,ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


In [137]:
# Split G into I an J
train_test = em.split_train_test(G, train_proportion=0.5,random_state=0)
I = train_test['train']
J = train_test['test']
I.to_csv('train.csv')
J.to_csv('test.csv')

In [138]:
# Generate a set of features
F = em.get_features_for_matching(A, B)
print(F.feature_name)
# Remove all features on id parameters
F = F[4:]
# Remove some features on year parameter
F = F.drop(F.index[[0,1,2]])

0                                     id_id_exm
1                                     id_id_anm
2                                id_id_lev_dist
3                                 id_id_lev_sim
4                                 year_year_exm
5                                 year_year_anm
6                            year_year_lev_dist
7                             year_year_lev_sim
8         song_title_song_title_jac_qgm_3_qgm_3
9     song_title_song_title_cos_dlm_dc0_dlm_dc0
10    song_title_song_title_jac_dlm_dc0_dlm_dc0
11                    song_title_song_title_mel
12               song_title_song_title_lev_dist
13                song_title_song_title_lev_sim
14                    song_title_song_title_nmw
15                     song_title_song_title_sw
16              artists_artists_jac_qgm_3_qgm_3
17          artists_artists_cos_dlm_dc0_dlm_dc0
18          artists_artists_jac_dlm_dc0_dlm_dc0
19                          artists_artists_mel
20                     artists_artists_l

In [140]:
# Convert I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_labels',
                            show_progress=False) 

In [141]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(H))

True

In [142]:
# Impute feature vectors with the mean of the column values.
H.fillna(value=0, inplace=True)

In [143]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0,max_depth=5)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [153]:
# Compute accuracy and select the best ML matcher using CV
result_precision = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        k=5,
        target_attr='gold_labels', metric='precision', random_state=0)
result_recall = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        k=5,
        target_attr='gold_labels', metric='recall', random_state=0)
result_f1 = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        k=5,
        target_attr='gold_labels', metric='f1', random_state=0)
result_precision['cv_stats']
result_recall['cv_stats']
result_f1['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x000002DD5ED90208>,5,0.810811,0.88,0.981132,0.92,0.888889,0.896166
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x000002DD5ED90CF8>,5,0.864865,0.84,0.964286,0.92,0.888889,0.895608
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x000002DD5ED90E10>,5,0.882353,0.893617,0.961538,0.92,0.918919,0.915285
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x000002DD5ED90780>,5,0.848485,0.897959,0.981132,0.941176,0.918919,0.917534
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x000002DD5ED900F0>,5,0.833333,0.916667,0.961538,0.92,0.918919,0.910091
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x000002DD5ED905F8>,5,0.857143,0.875,0.981132,0.941176,0.918919,0.914674


In [145]:
#Debug Random Forest Matcher X
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']

In [146]:
# Debug X using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [147]:
#Debugging iteration 1 - remove song_title_song_title_lev_dist, song_title_song_title_nmw and song_title_song_title_sw,
#song_title_song_title_cos_dlm_dc0_dlm_dc0, song_title_song_title_mel,song_title_song_title_jac_dlm_dc0_dlm_dc0
F = F.drop(F.index[[2,3,4,5,7,8]])
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_labels',
                            show_progress=False) 
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [148]:
#Debugging iteration 2 - remove song_title_song_title_lev_sim
F = F.drop(F.index[[2]])
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_labels',
                            show_progress=False) 
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [149]:
#Debugging iteration 3 - remove artists_artists_lev_dist, artists_artists_nmw,artists_artists_sw
F = F.drop(F.index[[3,4,5,6,8,9]])
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_labels',
                            show_progress=False) 
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')


In [150]:
#Debugging iteration 4 - remove artists_artists_lev_sim
F = F.drop(F.index[[3]])
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold_labels',
                            show_progress=False) 
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug the matcher using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [151]:
#Debugging iteration 5 - add feature product of jaccard measure on song_title and artists
H['song_title_song_title_jac_qgm_3_qgm_3']
H['artists_artists_jac_qgm_3_qgm_3']
H['song_title_artists_score']= H.song_title_song_title_jac_qgm_3_qgm_3*H.artists_artists_jac_qgm_3_qgm_3
# Impute feature vectors with 0.
H.fillna(value=0, inplace=True)
# Split H into P and Q
PQ = em.split_train_test(H, train_proportion=0.5, random_state=0)
P = PQ['train']
Q = PQ['test']
# Debug RF matcher using GUI
em.vis_debug_rf(rf, P, Q, 
        exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'],
        target_attr='gold_labels')

In [94]:
# Evaluate matching output
# Convert J into a set of feature vectors using feature table
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='gold_labels', show_progress=False)

In [95]:
# Check if the feature vectors contain missing values
# A return value of True means that there are missing values
any(pd.notnull(L))
L.fillna(value=0, inplace=True)

In [98]:
# Train using feature vectors from I using decision tree
dt.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = dt.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)

In [None]:
# Train using feature vectors from I using random forest
rf.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = rf.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)

In [100]:
# Train using feature vectors from I using svm
svm.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = svm.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)

In [None]:
# Train using feature vectors from I using logistic regression
lg.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = lg.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)

In [101]:
# Train using feature vectors from I using linear regression
ln.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = ln.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)

Precision : 94.96% (113/119)
Recall : 98.26% (113/115)
F1 : 96.58%
False positives : 6 (out of 119 positive predictions)
False negatives : 2 (out of 81 negative predictions)


In [None]:
# Train using feature vectors from I using naive bayes
nb.fit(table=H, 
       exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
       target_attr='gold_labels')
# Predict on L 
predictions = nb.predict(table=L, exclude_attrs=['id', 'ltable_id', 'rtable_id', 'gold_labels'], 
                         append=True,target_attr='predicted_labels')
# Evaluate the predictions
eval_result = em.eval_matches(predictions, 'gold_labels', 'predicted_labels')
em.print_eval_summary(eval_result)