Stage 4, Report
https://github.com/anhaidgroup/py_entitymatching/blob/master/notebooks/vldb_demo/Demo_notebook_v6.ipynb

In [1]:
import py_entitymatching as em
import os
import pandas as pd

# File paths of the two input tables, A and B.
path_A, path_B = 'tableA.csv', 'tableB.csv'

# Both tables are loaded with the same key metadata: the 'id' column.
key_metadata = {'key': 'id'}
# Table A
A = em.read_csv_metadata(path_A, **key_metadata)
# Table B
B = em.read_csv_metadata(path_B, **key_metadata)

No handlers could be found for logger "py_entitymatching.io.parsers"


In [2]:
# Impute missing values

def impute_most_frequent(table, impute_attrs, check_attr='min_num_players'):
    """Impute missing values in the given columns of a table.

    Calls em.impute_table with strategy='most_frequent' on `impute_attrs`
    only; every other column of `table` is passed as an excluded attribute.
    Afterwards prints the number of nulls in `check_attr` before and after
    imputation so the result can be checked by eye.

    Parameters:
        table: the table to impute. Its metadata is modified (see below).
        impute_attrs: list of column names whose missing values are imputed.
        check_attr: column whose null count is printed before/after.

    Returns:
        The table returned by em.impute_table.
    """
    # Manually set metadata properties, as current py_entitymatching.impute_table()
    # requires 'fk_ltable', 'fk_rtable', 'ltable', 'rtable' properties.
    # The table is simply registered as its own left and right table.
    em.set_property(table, 'fk_ltable', 'id')
    em.set_property(table, 'fk_rtable', 'id')
    em.set_property(table, 'ltable', table)
    em.set_property(table, 'rtable', table)

    # Everything that is not explicitly listed for imputation is excluded.
    exclude_attrs = [col for col in table.columns.values if col not in impute_attrs]
    imputed = em.impute_table(table, exclude_attrs=exclude_attrs, missing_val='NaN',
                              strategy='most_frequent', axis=0, val_all_nans=0,
                              verbose=True)

    # Compare number of missing values to check the results
    print(table[check_attr].isnull().sum())
    print(imputed[check_attr].isnull().sum())
    return imputed


A_impute_attrs = ['year','min_num_players','max_num_players','min_gameplay_time','max_gameplay_time','min_age']
A1 = impute_most_frequent(A, A_impute_attrs)

# Do the same thing for B
# TODO: add 'min_age'
B_impute_attrs = ['year','min_num_players','max_num_players','min_gameplay_time','max_gameplay_time']
B1 = impute_most_frequent(B, B_impute_attrs)


65
0
5244
0


In [3]:
# Read the pre-labeled candidate pairs and link them to the imputed tables:
# '_id' is the key, and 'ltable_id' / 'rtable_id' are the foreign keys
# pointing into A1 and B1 respectively.
labeled_metadata = dict(
    key='_id',
    ltable=A1,
    rtable=B1,
    fk_ltable='ltable_id',
    fk_rtable='rtable_id',
)
S = em.read_csv_metadata('sample_labeled.csv', **labeled_metadata)

In [4]:
# Split S into I (the 'train' part, 75%) and J (the 'test' part);
# the fixed random_state makes the split repeatable.
IJ = em.split_train_test(
    S,
    train_proportion=0.75,
    random_state=3,
)
I, J = IJ['train'], IJ['test']


In [5]:
# Generate a set of features
# TODO: change schema to be actually same for tables A and B!!!!
# Right now B has like 2ish more columns! Maybe once that is fixed
# it will notice that "name" is the same across the tables
F = em.get_features_for_matching(A1, B1)
# print(...) with a single argument behaves the same on Python 2 and 3,
# and matches the print calls used in the imputation cell.
print(F['feature_name'])

# Select a subset of these features to actually use
include_features = [
    'min_num_players_min_num_players_lev_dist',
    'max_num_players_max_num_players_lev_dist',
    'min_gameplay_time_min_gameplay_time_lev_dist',
    'max_gameplay_time_max_gameplay_time_lev_dist',
]
# Keep only the rows of the feature table whose name is in the list above.
F = F.loc[F['feature_name'].isin(include_features)]

0                                        id_id_exm
1                                        id_id_anm
2                                   id_id_lev_dist
3                                    id_id_lev_sim
4                                    year_year_exm
5                                    year_year_anm
6                               year_year_lev_dist
7                                year_year_lev_sim
8              min_num_players_min_num_players_exm
9              min_num_players_min_num_players_anm
10        min_num_players_min_num_players_lev_dist
11         min_num_players_min_num_players_lev_sim
12             max_num_players_max_num_players_exm
13             max_num_players_max_num_players_anm
14        max_num_players_max_num_players_lev_dist
15         max_num_players_max_num_players_lev_sim
16         min_gameplay_time_min_gameplay_time_exm
17         min_gameplay_time_min_gameplay_time_anm
18    min_gameplay_time_min_gameplay_time_lev_dist
19     min_gameplay_time_min_ga

In [6]:
# Turn the training pairs I into feature vectors using the selected
# features in F; the 'label' column is carried along after the features.
H = em.extract_feature_vecs(
    I,
    feature_table=F,
    attrs_after='label',
    show_progress=False,
)
# Show the first ten feature vectors.
H.head(10)

Unnamed: 0,_id,ltable_id,rtable_id,min_num_players_min_num_players_lev_dist,max_num_players_max_num_players_lev_dist,min_gameplay_time_min_gameplay_time_lev_dist,max_gameplay_time_max_gameplay_time_lev_dist,label
75,73393,6927,6683,1.0,1.0,2.0,2.0,0
14,57096,1875,10164,0.0,1.0,1.0,1.0,0
47,121277,3939,8743,0.0,1.0,1.0,1.0,0
46,17143,742,8262,0.0,0.0,2.0,2.0,0
379,216801,4510,7062,0.0,0.0,0.0,0.0,1
159,18588,924,6024,0.0,0.0,0.0,0.0,1
79,52831,6782,6948,0.0,0.0,0.0,0.0,0
265,31725,1776,3159,0.0,0.0,0.0,0.0,1
30,69887,2872,5839,0.0,0.0,2.0,0.0,1
209,37634,2229,6139,1.0,2.0,0.0,0.0,0


In [7]:
#B1['min_num_players']
# Extract only the features that we would like to use


# Matcher Selection

In [27]:
# Select the best ML matcher using CV
# TODO: use name!

# Define every matcher in this cell so it runs on a fresh kernel
# (Restart & Run All); previously the names were only available if
# later cells had already been executed, and `ln` was not defined in
# any visible cell.
# NOTE(review): the report text lists Naive Bayes among the five methods,
# but this comparison uses linear regression instead - confirm which is intended.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
lg = em.LogRegMatcher(name='LogReg', random_state=0)

# 5-fold cross validation on the feature vectors H, scored by F1.
# The id columns and the label itself are excluded from the features.
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric='f1', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x115b3d5d0>,5,0.918919,0.972973,0.926829,0.823529,0.916667,0.911783
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x115bbf7d0>,5,0.918919,0.972973,0.926829,0.823529,0.916667,0.911783
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x115a8df90>,5,0.888889,0.972973,0.926829,0.823529,0.916667,0.905777
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x115ba81d0>,5,0.918919,0.972973,0.926829,0.823529,0.916667,0.911783
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x115abc210>,5,0.888889,0.972973,0.9,0.823529,0.916667,0.900412


## Decision Tree

In [29]:
# Cross-validate the decision tree matcher on H and report the mean
# precision, recall and F1 over 5 folds.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
# One select_matcher run per metric; after the loop `result` holds the
# F1 run, exactly as when the three calls were written out by hand.
for metric_label, metric_name in [('Precision', 'precision'),
                                  ('Recall', 'recall'),
                                  ('F1', 'f1')]:
    result = em.select_matcher([dt], table=H,
            exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
            k=5,
            target_attr='label', metric=metric_name, random_state=0)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('{0}: {1:6f}'.format(metric_label, result['cv_stats']['Mean score'][0]))

Precision: 0.857025
Recall: 0.976667
F1: 0.911783


##  Random Forest

In [30]:
# Cross-validate the random forest matcher on H and report the mean
# precision, recall and F1 over 5 folds.
rf = em.RFMatcher(name='RF', random_state=0)
# One select_matcher run per metric; after the loop `result` holds the
# F1 run, exactly as when the three calls were written out by hand.
for metric_label, metric_name in [('Precision', 'precision'),
                                  ('Recall', 'recall'),
                                  ('F1', 'f1')]:
    result = em.select_matcher([rf], table=H,
            exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
            k=5,
            target_attr='label', metric=metric_name, random_state=0)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('{0}: {1:6f}'.format(metric_label, result['cv_stats']['Mean score'][0]))

Precision: 0.857025
Recall: 0.976667
F1: 0.911783


## SVM

In [31]:
# Cross-validate the SVM matcher on H and report the mean
# precision, recall and F1 over 5 folds.
svm = em.SVMMatcher(name='SVM', random_state=0)
# One select_matcher run per metric; after the loop `result` holds the
# F1 run, exactly as when the three calls were written out by hand.
for metric_label, metric_name in [('Precision', 'precision'),
                                  ('Recall', 'recall'),
                                  ('F1', 'f1')]:
    result = em.select_matcher([svm], table=H,
            exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
            k=5,
            target_attr='label', metric=metric_name, random_state=0)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('{0}: {1:6f}'.format(metric_label, result['cv_stats']['Mean score'][0]))

Precision: 0.855446
Recall: 0.964902
F1: 0.905777


## Naive Bayes

## Logistic Regression

In [28]:
# Cross-validate the logistic regression matcher on H and report the mean
# precision, recall and F1 over 5 folds.
lg = em.LogRegMatcher(name='LogReg', random_state=0)
# One select_matcher run per metric; after the loop `result` holds the
# F1 run, exactly as when the three calls were written out by hand.
for metric_label, metric_name in [('Precision', 'precision'),
                                  ('Recall', 'recall'),
                                  ('F1', 'f1')]:
    result = em.select_matcher([lg], table=H,
            exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
            k=5,
            target_attr='label', metric=metric_name, random_state=0)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('{0}: {1:6f}'.format(metric_label, result['cv_stats']['Mean score'][0]))

Precision: 0.854494
Recall: 0.954902
F1: 0.900412


For each of the five learning methods (Decision Tree, Random Forest, SVM, Naive Bayes, Logistic Regression),
report the precision, recall, and F-1
that you obtain when you perform cross validation for the first time for these methods on I.

Report which learning based matcher you selected after that cross validation.

Report all debugging iterations and cross validation iterations that you performed. For each debugging
iteration, report (a) which matcher you were trying to debug, and its precision/recall/F-1, (b)
what kinds of problems you found, and what you did to fix them, and (c) the final precision/recall/F-1 that
you reached.

For each cross validation iteration, report (a) which matchers you were trying to evaluate using
cross validation, and (b) the precision/recall/F-1 of those matchers.

• Report the final best learning-based matcher that you selected, and its precision/recall/F-1.

• Now report the following:
– For each of the five learning methods, train it on I, then report its precision/recall/F-1 on J.
– For the final best matcher Y∗, train it on I, then report its precision/recall/F-1 on J.
– List the final set of features that you are using in your feature vectors.

• Report an approximate time estimate: (a) how much did it take to label the data, and (b) to find the best
learning-based matcher.

• Discuss why you can't reach higher precision, recall, F-1. 