## Step 3: Analyze
In this demonstration, we analyze entity resolution as a downstream application. We show that the tables integrated using ALITE prepare a better ground for entity resolution than the tables integrated using the outer join operator. The results are reported in terms of the precision, recall and F-score of entity resolution.

In [None]:
import py_entitymatching as em
import pandas as pd
import os
import sys
import numpy as np

In [None]:
# Report the interpreter and library versions this notebook runs with.
versions = [
    ('python version: ', sys.version),
    ('pandas version: ', pd.__version__),
    ('magellan version: ', em.__version__),
]
for label, version in versions:
    print(label + version)

In [None]:
# Integrated table to evaluate. The same CSV is used for both the left and
# the right input, so the matching below compares the table against itself.
file = 'alite_fd_stadiums.csv'
# Alternative input (presumably the outer-join baseline; confirm against the data):
#file = 'stadiums_minimum_oj.csv'

# Inputs are read from the local 'em_inputs' directory, not from the
# py_entitymatching install path.
path_A = os.path.join('em_inputs', file)
path_B = os.path.join('em_inputs', file)

In [None]:
# Load the CSV once as the left table (A) and once as the right table (B),
# registering the "index" column as the key of each.
A, B = (em.read_csv_metadata(path, key="index") for path in (path_A, path_B))

In [None]:
# Show the table sizes and how many pairs an exhaustive comparison would cover.
size_a = len(A)
size_b = len(B)
print('Number of tuples in A: ' + str(size_a))
print('Number of tuples in B: ' + str(size_b))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(size_a * size_b))

In [None]:
# Turn off pandas' chained-assignment warning (default='warn').
pd.options.mode.chained_assignment = None

# Attributes copied from each input table into the candidate set.
# Other attribute lists that were tried:
#attrs = ['name','capacity','location','opened','team']
#attrs = ['player', 'position', 'team', 'facility', 'location', 'capacity', 'opened']
attrs = ['player', 'team', 'facility']

# Block on the 'facility' attribute of both tables with overlap_size=2
# (presumably: keep pairs sharing at least two tokens; confirm in the
# OverlapBlocker documentation).
ob = em.OverlapBlocker()
blocking_options = dict(
    l_output_attrs=attrs,
    r_output_attrs=attrs,
    show_progress=False,
    overlap_size=2,
)
C = ob.block_tables(A, B, 'facility', 'facility', **blocking_options)

# Number of candidate pairs produced by blocking.
len(C)

In [None]:
# Sample 5 tuple pairs from the candidate set C. S is only referenced by the
# commented-out manual labeling step (em.label_table) that follows.
S = em.sample_table(C, 5)


In [None]:
#G = em.label_table(S, label_column_name='gold_labels')

In [None]:
# Build the feature table used to compare tuples of A with tuples of B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the
# interactive confirmation of inferred attribute types; confirm in the docs.
match_f = em.get_features_for_matching(A, B, validate_inferred_attr_types = False)

In [None]:
# Display the generated feature table; the matching rule refers to features
# by name (e.g. 'player_player_lev_sim').
match_f

In [None]:
# Matching rule: three similarity predicates on the 'player' attribute
# (Jaccard, cosine and Levenshtein features), each with threshold 0.3.
# NOTE(review): predicates inside one rule are presumably AND-ed; confirm.
rule_1 = [
    'player_player_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) > 0.3',
    'player_player_cos_dlm_dc0_dlm_dc0(ltuple, rtuple) > 0.3',
    'player_player_lev_sim(ltuple, rtuple) > 0.3',
]
# Alternative rule on 'facility' with threshold 0.8 (currently unused):
#rule_2 = ['facility_facility_jac_dlm_dc0_dlm_dc0(ltuple, rtuple) > 0.8', 'facility_facility_cos_dlm_dc0_dlm_dc0(ltuple, rtuple)> 0.8', 'facility_facility_lev_sim(ltuple, rtuple) > 0.8']

brm = em.BooleanRuleMatcher()

In [None]:

# Register rule_1 with the matcher; match_f is passed as the feature table
# that the rule's feature names refer to.
rule_name = brm.add_rule(rule_1, match_f)
#rule_name = brm.add_rule(rule_2, match_f)

In [None]:
# Run the rule-based matcher over the candidate set C and store the returned
# labels in a 'predictions' column; later cells treat the value 1 as "match".
predictions = brm.predict(table=C, target_attr='predicted_labels', inplace=True)
C['predictions'] = predictions

In [None]:

# Drop self-pairs: A and B are the same table, so a tuple paired with itself
# carries no information.
is_self_pair = C['ltable_index'] == C['rtable_index']
CC = C[~is_self_pair]
CCC = CC[['ltable_index', 'rtable_index', 'predictions']]
final_cols = list(CCC.columns)

# The same pair may appear in both orientations, (a, b) and (b, a).
# Keep only the first orientation encountered, together with its prediction.
seen_pairs = set()
final_rows = []
for _, pair in CCC.iterrows():
    left_id = pair['ltable_index']
    right_id = pair['rtable_index']
    if (left_id, right_id) in seen_pairs or (right_id, left_id) in seen_pairs:
        continue
    seen_pairs.add((left_id, right_id))
    final_rows.append((left_id, right_id, pair['predictions']))

final_dataframe = pd.DataFrame(final_rows, columns=final_cols)

In [None]:
# Persist the de-duplicated pair predictions, one output file per input file.
# The output directory is created first because DataFrame.to_csv does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs("em_outputs", exist_ok=True)
final_dataframe.to_csv("em_outputs/em_result_"+file, index = False)

In [None]:
# Display the de-duplicated pair predictions.
final_dataframe

In [None]:
# For every pair predicted as a match (predictions == 1), mark the tuple with
# the smaller key for removal, so the other tuple of the pair is kept.
remove_tuples = {
    min(pair['ltable_index'], pair['rtable_index'])
    for _, pair in final_dataframe.iterrows()
    if pair['predictions'] == 1
}

In [None]:
# Keep only the tuples that were not marked for removal. remove_tuples holds
# values of the 'index' key column (they come from ltable_index/rtable_index),
# so the filter must use that column. The DataFrame's own row index only
# coincides with the key when the key happens to run 0..n-1 in file order.
ltable_cleaned = A[~A['index'].isin(remove_tuples)]

In [None]:
# Display the table after removing the tuples marked as duplicates.
ltable_cleaned

In [None]:
# Display the original (uncleaned) table for comparison.
A

Prepare the ground truth for comparison. Since the partitioned tables may not contain complete information, the evaluation only includes the columns whose information is available (the columns participating in the joins).

In [2]:
def _groundtruth_tuples(table):
    """Return the set of lower-cased (player, team, facility) tuples of a gold table.

    Only the 'Player', 'Team' and 'Facility' columns are read; each value is
    lower-cased with str.lower().
    """
    return {
        (player.lower(), team.lower(), facility.lower())
        for player, team, facility in zip(table['Player'], table['Team'], table['Facility'])
    }


# Ground truth for the dirty variant of the gold data.
gt_table = pd.read_csv(r"../data/em_gold/em_stadium_gold_complete_dirty.csv")
dirty_groundtruth = _groundtruth_tuples(gt_table)
print("the dirty groundtruth size is:", len(dirty_groundtruth))

# Ground truth for the clean variant of the gold data.
gt_table = pd.read_csv(r"../data/em_gold/em_stadium_gold_complete.csv")
clean_groundtruth = _groundtruth_tuples(gt_table)
print("the clean groundtruth size is:", len(clean_groundtruth))

NameError: name 'pd' is not defined

Prepare the integrated tables for comparison: both the dirty table (A) and the cleaned table (ltable_cleaned).

In [None]:
def _normalize_value(value):
    """Lower-case strings; return any non-string value (e.g. a missing cell) unchanged."""
    return value.lower() if isinstance(value, str) else value


def _result_tuples(table):
    """Return the set of (player, team, facility) tuples of an integrated table.

    String values are lower-cased because the ground-truth tuples they are
    intersected with are lower-cased as well; without this the comparison
    would be case-sensitive on one side only.
    """
    return {
        (_normalize_value(player), _normalize_value(team), _normalize_value(facility))
        for player, team, facility in zip(table['player'], table['team'], table['facility'])
    }


# Tuples of the integrated table before entity resolution.
integration_result_dirty = _result_tuples(A)
print("The dirty result size is:", len(integration_result_dirty))

# Tuples of the integrated table after removing predicted duplicates.
integration_result_clean = _result_tuples(ltable_cleaned)
print("The clean result size is:", len(integration_result_clean))

Print results out

In [None]:
# Report table sizes and the overlap of the cleaned result with the clean
# ground truth.
clean_overlap = clean_groundtruth & integration_result_clean
print("Dirty table size |T|: ", len(A.index))
print("Clean table size |T|: ", len(ltable_cleaned.index))
#print("Dirty table intersection with dirty ground truth |T int T*|: ", len(clean_groundtruth.intersection(integration_result_dirty)))
print("Clean table intersection with clean ground truth |T int T*|: ", len(clean_overlap))


In [None]:
# Number of tuples of the cleaned result that also appear in the clean
# ground truth.
true_positives = len(clean_groundtruth.intersection(integration_result_clean))

# Each metric is reported as 0.0 when its denominator is zero (empty result,
# empty ground truth, or no overlap at all) instead of raising
# ZeroDivisionError.
if integration_result_clean:
    precision = true_positives / len(integration_result_clean)
else:
    precision = 0.0

if clean_groundtruth:
    recall = true_positives / len(clean_groundtruth)
else:
    recall = 0.0

if precision + recall:
    f1_score = (2 * precision * recall) / (precision + recall)
else:
    f1_score = 0.0

print("Precision = ", precision)
print("Recall = ", recall)
print ("F1-score = ", f1_score)