Stage 4, Report
https://github.com/anhaidgroup/py_entitymatching/blob/master/notebooks/vldb_demo/Demo_notebook_v6.ipynb

In [1]:
import py_entitymatching as em
import os
import pandas as pd

# specify filepaths for tables A and B. 
path_A = 'newTableA.csv'
path_B = 'tableB.csv'
# read table A; table A has 'ID' as the key attribute
A = em.read_csv_metadata(path_A, key='id')
# read table B; table B has 'ID' as the key attribute
B = em.read_csv_metadata(path_B, key='id')

No handlers could be found for logger "py_entitymatching.io.parsers"


# Filling in Missing Values

In [2]:
# Impute missing values

# Manually set metadata properties, as current py_entitymatching.impute_table()
# requires 'fk_ltable', 'fk_rtable', 'ltable', 'rtable' properties
em.set_property(A, 'fk_ltable', 'id')
em.set_property(A, 'fk_rtable', 'id')
em.set_property(A, 'ltable', A)
em.set_property(A, 'rtable', A)

A_all_attrs = list(A.columns.values)
A_impute_attrs = ['year','min_num_players','max_num_players','min_gameplay_time','max_gameplay_time','min_age']
A_exclude_attrs = list(set(A_all_attrs) - set(A_impute_attrs))
A1 = em.impute_table(A, exclude_attrs=A_exclude_attrs, missing_val='NaN', strategy='most_frequent', axis=0, val_all_nans=0, verbose=True)

# Compare number of missing values to check the results
print(sum(A['min_num_players'].isnull()))
print(sum(A1['min_num_players'].isnull()))

# Do the same thing for B
em.set_property(B, 'fk_ltable', 'id')
em.set_property(B, 'fk_rtable', 'id')
em.set_property(B, 'ltable', B)
em.set_property(B, 'rtable', B)

B_all_attrs = list(B.columns.values)
# TODO: add 'min_age'
B_impute_attrs = ['year','min_num_players','max_num_players','min_gameplay_time','max_gameplay_time']
B_exclude_attrs = list(set(B_all_attrs) - set(B_impute_attrs))
B1 = em.impute_table(B, exclude_attrs=B_exclude_attrs, missing_val='NaN', strategy='most_frequent', axis=0, val_all_nans=0, verbose=True)

# Compare number of missing values to check the results
print(sum(B['min_num_players'].isnull()))
print(sum(B1['min_num_players'].isnull()))


65
0
5244
0


In [3]:
# Load the pre-labeled data
S = em.read_csv_metadata('sample_labeled.csv', 
                         key='_id',
                         ltable=A1, rtable=B1, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')

In [4]:
# Split S into I an J
IJ = em.split_train_test(S, train_proportion=0.75, random_state=35)
I = IJ['train']
J = IJ['test']


In [5]:
corres = em.get_attr_corres(A1, B1)
print(corres)

{'rtable':           id  \
0          1   
1          2   
2          3   
3          4   
4          5   
5          6   
6          7   
7          8   
8          9   
9         10   
10        11   
11        12   
12        13   
13        14   
14        15   
15        16   
16        17   
17        18   
18        19   
19        20   
20        21   
21        22   
22        23   
23        24   
24        25   
25        26   
26        27   
27        28   
28        29   
29        30   
...      ...   
10266  10269   
10267  10270   
10268  10271   
10269  10272   
10270  10273   
10271  10274   
10272  10275   
10273  10276   
10274  10277   
10275  10278   
10276  10279   
10277  10280   
10278  10281   
10279  10282   
10280  10283   
10281  10284   
10282  10285   
10283  10286   
10284  10287   
10285  10288   
10286  10289   
10287  10290   
10288  10291   
10289  10292   
10290  10293   
10291  10294   
10292  10295   
10293  10296   
10294  10297   
10295  10298 

# Generating Features

Here, we generate all the features we decided upon after our final iteration of cross validation and debugging. We only use the relevant subset of all these features in the reported iterations below.

In [6]:
# Generate a set of features
#import pdb; pdb.set_trace();
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.feature.simfunctions as sim
import py_entitymatching.feature.tokenizers as tok

ltable = A1
rtable = B1

# Get similarity functions for generating the features for matching
sim_funcs = sim.get_sim_funs_for_matching()
# Get tokenizer functions for generating the features for matching
tok_funcs = tok.get_tokenizers_for_matching()

# Get the attribute types of the input tables
attr_types_ltable = au.get_attr_types(ltable)
attr_types_rtable = au.get_attr_types(rtable)

# Get the attribute correspondence between the input tables
attr_corres = au.get_attr_corres(ltable, rtable)
print(attr_types_ltable['name'])
print(attr_types_rtable['name'])
attr_types_ltable['name'] = 'str_bt_5w_10w'
attr_types_rtable['name'] = 'str_bt_5w_10w'



# Get the features
F = em.get_features(ltable, rtable, attr_types_ltable,
                                 attr_types_rtable, attr_corres,
                                 tok_funcs, sim_funcs)

#F = em.get_features_for_matching(A1, B1)
print(F['feature_name'])


#TODO get name feature!
#http://pradap-www.cs.wisc.edu/cs638/py_entitymatching/user-manual/_modules/py_entitymatching/feature/simfunctions.html#get_sim_funs_for_matching
#name_feature = em.get_feature_fn('name', em.get_tokenizers_for_matching(), em.get_sim_funs_for_matching())
#print(name_feature)
#em.add_feature(F, 'name_dist', name_feature)
#print(F['feature_name'])

str_bt_1w_5w
str_bt_5w_10w
0                                           id_id_exm
1                                           id_id_anm
2                                      id_id_lev_dist
3                                       id_id_lev_sim
4                           name_name_jac_qgm_3_qgm_3
5                       name_name_cos_dlm_dc0_dlm_dc0
6                                       name_name_mel
7                                  name_name_lev_dist
8                                   name_name_lev_sim
9                                       year_year_exm
10                                      year_year_anm
11                                 year_year_lev_dist
12                                  year_year_lev_sim
13                   num_players_num_players_lev_dist
14                    num_players_num_players_lev_sim
15                        num_players_num_players_jar
16                        num_players_num_players_jwn
17                        num_players_num_players_exm
1

In [7]:
#B1['min_num_players']
# Extract features which we would like to use only


## Cross Validation Method

In [8]:
def cross_validation_eval(H):
    cv_iter = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])

    # Matchers
    matchers = [em.DTMatcher(name='DecisionTree', random_state=0),
     em.RFMatcher(name='RandomForest', random_state=0),
     em.SVMMatcher(name='SVM', random_state=0),
     em.NBMatcher(name='NaiveBayes'),
     em.LogRegMatcher(name='LogReg', random_state=0),
    ]
    
    for m in matchers:
        prec_result = em.select_matcher([m], table=H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
                k=5,
                target_attr='label', metric='precision', random_state=0)
        recall_result = em.select_matcher([m], table=H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
                k=5,
                target_attr='label', metric='recall', random_state=0)
        f1_result = em.select_matcher([m], table=H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'],
                k=5,
                target_attr='label', metric='f1', random_state=0)
        cv_iter = cv_iter.append(
            pd.DataFrame([
                [prec_result['cv_stats']['Mean score'][0],
                recall_result['cv_stats']['Mean score'][0],
                f1_result['cv_stats']['Mean score'][0],
                ]],
                index=[m.name],
                columns=['Precision', 'Recall', 'F1']))
    return cv_iter

# Matcher Selection: Iteration 1

In [9]:
# Subset of features we used on our first iteration
include_features = [
    'min_num_players_min_num_players_lev_dist',
    'max_num_players_max_num_players_lev_dist',
    'min_gameplay_time_min_gameplay_time_lev_dist',
    'max_gameplay_time_max_gameplay_time_lev_dist',
]
F_1 = F.loc[F['feature_name'].isin(include_features)]

In [10]:
# Convert the I into a set of feature vectors using F
H_1 = em.extract_feature_vecs(I, feature_table=F_1, attrs_after='label', show_progress=False)
H_1.head(10)

Unnamed: 0,_id,ltable_id,rtable_id,min_num_players_min_num_players_lev_dist,max_num_players_max_num_players_lev_dist,min_gameplay_time_min_gameplay_time_lev_dist,max_gameplay_time_max_gameplay_time_lev_dist,label
153,65346,4081,4137,0.0,0.0,1.0,1.0,0
114,71437,4465,1000,0.0,0.0,0.0,0.0,1
322,98156,4574,4319,0.0,0.0,0.0,0.0,1
128,75832,5717,7538,0.0,0.0,0.0,0.0,1
105,165132,466,4483,0.0,0.0,0.0,0.0,1
95,212333,655,3448,0.0,1.0,2.0,2.0,0
8,44210,3395,811,0.0,1.0,1.0,1.0,0
56,199746,5204,10242,0.0,0.0,0.0,0.0,1
260,120176,2998,4173,0.0,0.0,2.0,2.0,0
333,46418,5900,811,0.0,1.0,2.0,2.0,0


In [11]:
cross_validation_eval(H_1)

Unnamed: 0,Precision,Recall,F1
DecisionTree,0.883431,0.965278,0.9204
RandomForest,0.883431,0.965278,0.9204
SVM,0.866166,0.965278,0.910821
NaiveBayes,0.702381,0.976389,0.814279
LogReg,0.882955,0.955278,0.915278


# Matcher Selection: Iteration 2

In [12]:
# Convert the I into a set of feature vectors using F
# Here, we add name edit distance as a feature
include_features_2 = [
    'min_num_players_min_num_players_lev_dist',
    'max_num_players_max_num_players_lev_dist',
    'min_gameplay_time_min_gameplay_time_lev_dist',
    'max_gameplay_time_max_gameplay_time_lev_dist',
    'name_name_lev_dist'
]
F_2 = F.loc[F['feature_name'].isin(include_features)]
H_2 = em.extract_feature_vecs(I, feature_table=F_2, attrs_after='label', show_progress=False)
H_2.head(10)

Unnamed: 0,_id,ltable_id,rtable_id,min_num_players_min_num_players_lev_dist,max_num_players_max_num_players_lev_dist,min_gameplay_time_min_gameplay_time_lev_dist,max_gameplay_time_max_gameplay_time_lev_dist,label
153,65346,4081,4137,0.0,0.0,1.0,1.0,0
114,71437,4465,1000,0.0,0.0,0.0,0.0,1
322,98156,4574,4319,0.0,0.0,0.0,0.0,1
128,75832,5717,7538,0.0,0.0,0.0,0.0,1
105,165132,466,4483,0.0,0.0,0.0,0.0,1
95,212333,655,3448,0.0,1.0,2.0,2.0,0
8,44210,3395,811,0.0,1.0,1.0,1.0,0
56,199746,5204,10242,0.0,0.0,0.0,0.0,1
260,120176,2998,4173,0.0,0.0,2.0,2.0,0
333,46418,5900,811,0.0,1.0,2.0,2.0,0


In [13]:
cross_validation_eval(H_2)

Unnamed: 0,Precision,Recall,F1
DecisionTree,0.883431,0.965278,0.9204
RandomForest,0.883431,0.965278,0.9204
SVM,0.866166,0.965278,0.910821
NaiveBayes,0.702381,0.976389,0.814279
LogReg,0.882955,0.955278,0.915278


Comparing the results, we see that the decision tree in fact incrementally comparable, if not better, metrics compared to all other matchers

Report which learning based matcher you selected after that cross validation.

Report all debugging iterations and cross validation iterations that you performed. For each debugging
iteration, report (a) what is the matcher that you are trying to debug, and its precision/recall/F-1, (b)
what kind of problems you found, and what you did to fix them, (c) the final precision/recall/F-1 that
you reached

For each cross validation iteration, report (a) what matchers were you trying to evaluate using the
cross validation, and (b) precision/recall/F-1 of those.

• Report the final best learning-based matcher that you selected, and its precision/recall/F-1.

# Train-Test Set Accuracy

In [14]:
# Apply train, test set evaluation
I_table = em.extract_feature_vecs(I, feature_table=F, attrs_after='label', show_progress=False)
J_table = em.extract_feature_vecs(J, feature_table=F, attrs_after='label', show_progress=False)

matchers = [em.DTMatcher(name='DecisionTree', random_state=0),
 em.RFMatcher(name='RF', random_state=0),
 em.NBMatcher(name='NaiveBayes'),
 em.LogRegMatcher(name='LogReg', random_state=0),
 em.SVMMatcher(name='SVM', random_state=0)
]

for m in matchers:
    m.fit(table=I_table, exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'], target_attr='label')
    J_table['prediction'] = m.predict(
        table=J_table, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], 
        target_attr='label',
    )
    print(m.name)
    em.print_eval_summary(em.eval_matches(J_table, 'label', 'prediction'))
    J_table.drop('prediction', axis=1, inplace=True)
    print('')

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').