# Introduction

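This IPython notebook illustrates how to block two movie tables (IMDb and Metacritic) using the overlap blocker followed by black-box blockers, and then how to select and evaluate a matcher on the labeled candidate pairs.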

First, we need to import *py_entitymatching* package and other libraries as follows:

In [2]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd


# Steps followed:
# 1. Overlap blocking on the director names (word-level tokens, at least 1 shared token)
# 2. Overlap blocking on the actor names (word-level tokens, at least 1 shared token)
# 3. Black-box blocking on release_year: block the pair if abs(r1 - r2) > 2
# 4. Black-box blocking on runtime: block the pair if abs(r1 - r2) > 10


#handy, check if a value is present in the dataframe column
##B['director'].str.contains('None').any()


In [3]:
import numpy as np

Then, read the (sample) input tables for blocking purposes.

In [4]:
# Get the datasets directory (os.path.join picks the right separator and
# avoids doubled separators that manual concatenation can produce)
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Get the paths of the input tables
path_A = os.path.join(datasets_dir, 'imdb_data.csv')
path_B = os.path.join(datasets_dir, 'metacritic_data.csv')

In [4]:
path_A

'C:\\Users\\Cooper\\Anaconda2\\lib\\site-packages\\py_entitymatching\\datasets\\imdb_data.csv'

In [5]:
# Read the CSV files and register the 'id' column as the key attribute of each table
A = em.read_csv_metadata(path_A,key = 'id')
B = em.read_csv_metadata(path_B, key = 'id')

No handlers could be found for logger "py_entitymatching.io.parsers"


In [6]:
B

Unnamed: 0,id,name,release_year,runtime,director,actors
0,b1,Citizen Kane,1941.0,119,Orson Welles,_Joseph Cotten_Orson Welles
1,b2,The Godfather,1972.0,175,Francis Ford Coppola,_Al Pacino_Marlon Brando
2,b3,Rear Window,1954.0,112,Alfred Hitchcock,_Frank Cady_Georgine Darcy_Grace Kelly_James Stewart_Judith Evelyn_Raymond Burr_Ross Bagdasarian...
3,b4,Casablanca,1943.0,102,Michael Curtiz,_Humphrey Bogart_Ingrid Bergman
4,b5,Boyhood,2014.0,165,Richard Linklater,_Bonnie Cross_Elijah Smith_Ellar Coltrane_Ethan Hawke_Libby Villari_Lorelei Linklater_Marco Pere...
5,b6,Three Colors: Red,1994.0,99,Krzysztof Kieslowski,_Irène Jacob_Jean-Louis Trintignant
6,b7,Singin' in the Rain,1952.0,103,_Gene Kelly_Stanley Donen,_Cyd Charisse_Dawn Addams_Debbie Reynolds_Donald O'Connor_Douglas Fowley_Gene Kelly_Jean Hagen_J...
7,b8,The Treasure of the Sierra Madre,1948.0,126,John Huston,_Alfonso Bedoya_Arturo Soto Rangel_Barton MacLane_Bruce Bennett_Humphrey Bogart_José Torvay_Manu...
8,b9,Moonlight,2016.0,111,Barry Jenkins,_Alex R. Hibbert_Ashton Sanders_Duan Sanderson_Herman 'Caheei McGloun_Jaden Piner_Janelle Monáe_...
9,b10,Pan's Labyrinth,2006.0,118,Guillermo del Toro,_Ariadna Gil_Ivana Baquero_Sergi López


In [8]:
# Re-read table A from disk so the cleaning steps in this cell always start from the raw data.
A = em.read_csv_metadata(path_A, key = 'id')


def remove_underscore(str):
    """Turn an underscore-delimited name list such as ``_A_B`` into ``A B``.

    A single leading underscore is dropped and every remaining underscore is
    replaced by a space. Missing values (``None`` / ``NaN``) are returned
    unchanged instead of raising ``AttributeError``, so the function can be
    applied to a column that contains nulls.

    The parameter keeps its original name ``str`` for backward compatibility
    even though it shadows the builtin.
    """
    text = str  # local alias so the builtin-shadowing parameter is not reassigned
    if pd.isnull(text):
        return text
    if text.startswith("_"):
        text = text[1:]
    return text.replace('_', ' ')


# Almost every `actors` value starts with "_", so the helper can be applied to the whole column.
# release_year and runtime need no conversion in table A (both are already int64),
# so the former no-op self-assignments of those columns were removed.
A['actors'] = A['actors'].apply(remove_underscore)
A['director'] = A['director'].apply(remove_underscore)

import re
def clean_string(my_str):
    """Normalize a title for comparison.

    Every character that is not an ASCII letter, digit, space, newline or dot
    is replaced by a space; dots are then replaced by spaces as well and the
    result is lower-cased. Replacements are one-for-one, so the length of the
    string is preserved (runs of spaces are not collapsed).
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence
    # (a warning on recent Python versions). Inside a character class a bare
    # '.' is already literal, and the regex engine resolves '\n' to a newline.
    cleaned = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', my_str)
    return cleaned.replace(".", " ").lower()

# Normalize the movie titles of table A with clean_string.
A['name'] = A['name'].apply(clean_string)


# Show the column dtypes of A after cleaning.
A.dtypes



id              object
name            object
release_year     int64
runtime          int64
director        object
actors          object
dtype: object

# Ways To Do Overlap Blocking

In [10]:
# Re-read table B from disk so the cleaning steps in this cell always start from the raw data.
B = em.read_csv_metadata(path_B, key = 'id')




def remove_underscore(str):
    """Turn an underscore-delimited name list such as ``_A_B`` into ``A B``.

    A single leading underscore is dropped and every remaining underscore is
    replaced by a space. Missing values (``None`` / ``NaN``) are returned
    unchanged instead of raising ``AttributeError``, so the function can be
    applied to a column that contains nulls.

    The parameter keeps its original name ``str`` for backward compatibility
    even though it shadows the builtin.
    """
    text = str  # local alias so the builtin-shadowing parameter is not reassigned
    if pd.isnull(text):
        return text
    if text.startswith("_"):
        text = text[1:]

    return text.replace('_', ' ')

# Note: release_year, runtime and genre contain blank/None entries in table B.
B['actors'] = B['actors'].apply(remove_underscore)
B['director'] = B['director'].apply(remove_underscore)
# Coerce both numeric columns to int64; blank or unparseable entries become 0.
B['runtime'] = pd.to_numeric(B['runtime'], errors='coerce').fillna(0).astype(np.int64)
# pd.to_numeric parses float-formatted years (e.g. 1941.0) directly. The earlier
# astype(str) + .str[:-2] step only worked for values ending in ".0" and would
# have truncated a year stored as "1941" to 19.
B['release_year'] = pd.to_numeric(B['release_year'], errors='coerce').fillna(0).astype(np.int64)

import re
def clean_string(my_str):
    """Normalize a title for comparison.

    Every character that is not an ASCII letter, digit, space, newline or dot
    is replaced by a space; dots are then replaced by spaces as well and the
    result is lower-cased. Replacements are one-for-one, so the length of the
    string is preserved (runs of spaces are not collapsed).
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence
    # (a warning on recent Python versions). Inside a character class a bare
    # '.' is already literal, and the regex engine resolves '\n' to a newline.
    cleaned = re.sub(r'[^a-zA-Z0-9 \n.]', ' ', my_str)
    return cleaned.replace(".", " ").lower()
# Normalize the movie titles of table B with clean_string.
B['name'] = B['name'].apply(clean_string)


# Turn the literal string "None" into a real missing value (NaN) in both columns.
B['actors'] = B['actors'].apply(lambda x: np.nan if x == "None" else x)
B['director'] = B['director'].apply(lambda x: np.nan if x == "None" else x)

# Optional sanity checks (left disabled):
#B['director'].str.contains('None').any()
#B['actors'].isnull().values.any()
# Show the column dtypes of B after cleaning.
B.dtypes

id              object
name            object
release_year     int64
runtime          int64
director        object
actors          object
dtype: object

In [9]:
A['actors'].isnull().values.any()

False

There are three different ways to do overlap blocking:

1. Block two tables to produce a `candidate set` of tuple pairs.
2. Block a `candidate set` of tuple pairs to typically produce a reduced candidate set of tuple pairs.
3. Block two tuples to check if a tuple pair would get blocked.

## Block Tables to Produce a Candidate Set of Tuple Pairs

In [11]:
# Instantiate the overlap blocker used for the director/actor blocking steps.
ob = em.OverlapBlocker()
# Attribute-equivalence blocker; not used by any of the cells shown in this notebook.
ab = em.AttrEquivalenceBlocker()


For the given two tables, we assume that two movies whose director names share no word do not refer to the same real-world movie. So, we apply overlap blocking on `director`. Specifically, we tokenize the director names by word and keep a tuple pair only if the two names share at least 1 token (`overlap_size=1`). Tuple pairs in which a director value is missing are also kept (`allow_missing=True`). The resulting candidate set is then reduced further by the same overlap rule on `actors`.

In [12]:
# Overlap blocking on the director name: build the initial candidate set C1 from A and B.
# word_level=True tokenizes by word; overlap_size=1 keeps pairs that share at least one token.
# allow_missing=True also keeps pairs in which a director value is missing.
# The l_/r_output_attrs columns are carried into C1 with ltable_/rtable_ prefixes.
C1 = ob.block_tables(A,B, 'director', 'director', word_level=True,overlap_size=1,
                     l_output_attrs=['name', 'release_year', 'director','actors','runtime'], 
                     r_output_attrs=['name', 'release_year', 'director','actors','runtime'],
                     show_progress=False,allow_missing=True)





In [9]:
C1

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_release_year,ltable_director,ltable_actors,ltable_runtime,rtable_name,rtable_release_year,rtable_director,rtable_actors,rtable_runtime
0,0,a1787,b1,touch of evil,1958,Orson Welles,Charlton Heston Orson Welles Janet Leigh Joseph Calleia,95,citizen kane,1941,Orson Welles,Joseph Cotten Orson Welles,119
1,1,a317,b1,citizen kane,1941,Orson Welles,Orson Welles Joseph Cotten Dorothy Comingore Agnes Moorehead,119,citizen kane,1941,Orson Welles,Joseph Cotten Orson Welles,119
2,2,a2693,b2,jack,1996,Francis Ford Coppola,Robin Williams Diane Lane Brian Kerwin Jennifer Lopez,113,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
3,3,a1175,b2,the virgin suicides,1999,Sofia Coppola,Kirsten Dunst Josh Hartnett James Woods Kathleen Turner,97,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
4,4,a10,b2,the godfather,1972,Francis Ford Coppola,Marlon Brando Al Pacino James Caan Diane Keaton,175,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
5,5,a916,b2,dracula,1992,Francis Ford Coppola,Gary Oldman Winona Ryder Anthony Hopkins Keanu Reeves,128,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
6,6,a151,b2,apocalypse now,1979,Francis Ford Coppola,Martin Sheen Marlon Brando Robert Duvall Frederic Forrest,147,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
7,7,a1696,b2,marie antoinette,2006,Sofia Coppola,Kirsten Dunst Jason Schwartzman Rip Torn Judy Davis,123,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
8,8,a33,b2,the godfather part ii,1974,Francis Ford Coppola,Al Pacino Robert De Niro Robert Duvall Diane Keaton,202,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
9,9,a294,b2,lost in translation,2003,Sofia Coppola,Bill Murray Scarlett Johansson Giovanni Ribisi Anna Faris,102,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175


In [13]:
# Reduce C1: keep only pairs whose actor lists share at least one word (or have a missing value).
C2 = ob.block_candset(C1, 'actors', 'actors', word_level=True,overlap_size=1, show_progress=False,allow_missing=True)

In [16]:
C2

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_release_year,ltable_director,ltable_actors,ltable_runtime,rtable_name,rtable_release_year,rtable_director,rtable_actors,rtable_runtime
1,1,a317,b1,Citizen Kane,1941,Orson Welles,_Orson Welles_Joseph Cotten_Dorothy Comingore_Agnes Moorehead,119,citizen kane,1941,Orson Welles,Joseph Cotten Orson Welles,119
4,4,a10,b2,The Godfather,1972,Francis Ford Coppola,_Marlon Brando_Al Pacino_James Caan_Diane Keaton,175,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
8,8,a33,b2,The Godfather: Part II,1974,Francis Ford Coppola,_Al Pacino_Robert De Niro_Robert Duvall_Diane Keaton,202,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
25,25,a381,b2,The Godfather: Part III,1990,Francis Ford Coppola,_Al Pacino_Diane Keaton_Andy Garcia_Talia Shire,162,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
32,32,a280,b3,Rear Window,1954,Alfred Hitchcock,_James Stewart_Grace Kelly_Wendell Corey_Thelma Ritter,112,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
36,36,a2774,b3,The Man Who Knew Too Much,1956,Alfred Hitchcock,_James Stewart_Doris Day_Brenda de Banzie_Bernard Miles,120,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
37,37,a408,b3,Vertigo,1958,Alfred Hitchcock,_James Stewart_Kim Novak_Barbara Bel Geddes_Tom Helmore,128,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
39,39,a1405,b3,Rope,1948,Alfred Hitchcock,_James Stewart_John Dall_Farley Granger_Dick Hogan,80,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
74,74,a199,b4,Casablanca,1942,Michael Curtiz,_Humphrey Bogart_Ingrid Bergman_Paul Henreid_Claude Rains,102,casablanca,1943,Michael Curtiz,Humphrey Bogart Ingrid Bergman,102
101,101,a388,b5,Boyhood,2014,Richard Linklater,_Ellar Coltrane_Patricia Arquette_Ethan Hawke_Elijah Smith,165,boyhood,2014,Richard Linklater,Bonnie Cross Elijah Smith Ellar Coltrane Ethan Hawke Libby Villari Lorelei Linklater Marco Perel...,165


In [14]:
# Instantiate one black-box blocker per rule: bb3 for release year, bb4 for runtime.
bb3 = em.BlackBoxBlocker()
bb4 = em.BlackBoxBlocker()
# Set the black box function
def year_year_function(x, y):
    """Black-box blocking rule on ``release_year``.

    x and y are the two tuples of a candidate pair (pandas Series). Returns
    True (block the pair) when both years are known and differ by more than
    2 years, otherwise False. A year of 0 is treated as unknown and never
    causes a pair to be blocked.
    """
    years = (x['release_year'], y['release_year'])
    both_known = all(year != 0 for year in years)
    # A gap of more than 2 years is taken as evidence of different movies.
    return bool(both_known and abs(years[0] - years[1]) > 2)
    
def runtime_runtime_function(x, y):
    """Black-box blocking rule on ``runtime``.

    x and y are the two tuples of a candidate pair (pandas Series). Returns
    True (block the pair) when both runtimes are known and differ by more
    than 10 minutes, otherwise False. A runtime of 0 is treated as unknown
    and never causes a pair to be blocked.
    """
    left_minutes = x['runtime']
    right_minutes = y['runtime']
    # Guard clause: an unknown runtime on either side gives no evidence to block.
    if left_minutes == 0 or right_minutes == 0:
        return False
    # Threshold was 5 minutes at first and was relaxed to 10 after a second analysis.
    gap = abs(left_minutes - right_minutes)
    return True if gap > 10 else False
            
# Attach each rule to its blocker: bb3 blocks on release year, bb4 on runtime.
bb3.set_black_box_function(year_year_function)
bb4.set_black_box_function(runtime_runtime_function)

In [15]:
# Apply the release-year rule (bb3) to C2; the surviving pairs form C3.
C3 = bb3.block_candset(C2,show_progress=False)


In [19]:
C3

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_release_year,ltable_director,ltable_actors,ltable_runtime,rtable_name,rtable_release_year,rtable_director,rtable_actors,rtable_runtime
1,1,a317,b1,Citizen Kane,1941,Orson Welles,_Orson Welles_Joseph Cotten_Dorothy Comingore_Agnes Moorehead,119,citizen kane,1941,Orson Welles,Joseph Cotten Orson Welles,119
4,4,a10,b2,The Godfather,1972,Francis Ford Coppola,_Marlon Brando_Al Pacino_James Caan_Diane Keaton,175,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
8,8,a33,b2,The Godfather: Part II,1974,Francis Ford Coppola,_Al Pacino_Robert De Niro_Robert Duvall_Diane Keaton,202,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
32,32,a280,b3,Rear Window,1954,Alfred Hitchcock,_James Stewart_Grace Kelly_Wendell Corey_Thelma Ritter,112,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
36,36,a2774,b3,The Man Who Knew Too Much,1956,Alfred Hitchcock,_James Stewart_Doris Day_Brenda de Banzie_Bernard Miles,120,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
74,74,a199,b4,Casablanca,1942,Michael Curtiz,_Humphrey Bogart_Ingrid Bergman_Paul Henreid_Claude Rains,102,casablanca,1943,Michael Curtiz,Humphrey Bogart Ingrid Bergman,102
101,101,a388,b5,Boyhood,2014,Richard Linklater,_Ellar Coltrane_Patricia Arquette_Ethan Hawke_Elijah Smith,165,boyhood,2014,Richard Linklater,Bonnie Cross Elijah Smith Ellar Coltrane Ethan Hawke Libby Villari Lorelei Linklater Marco Perel...,165
119,119,a1336,b5,Before Midnight,2013,Richard Linklater,_Ethan Hawke_Julie Delpy_Seamus Davey-Fitzpatrick_Ariane Labed,109,boyhood,2014,Richard Linklater,Bonnie Cross Elijah Smith Ellar Coltrane Ethan Hawke Libby Villari Lorelei Linklater Marco Perel...,165
152,152,a810,b7,Singin' in the Rain,1952,_Stanley Donen_Gene Kelly,_Gene Kelly_Donald O'Connor_Debbie Reynolds_Jean Hagen,103,singin in the rain,1952,Gene Kelly Stanley Donen,Cyd Charisse Dawn Addams Debbie Reynolds Donald O'Connor Douglas Fowley Gene Kelly Jean Hagen Jo...,103
181,181,a1624,b8,The Treasure of the Sierra Madre,1948,John Huston,_Humphrey Bogart_Walter Huston_Tim Holt_Bruce Bennett,126,the treasure of the sierra madre,1948,John Huston,Alfonso Bedoya Arturo Soto Rangel Barton MacLane Bruce Bennett Humphrey Bogart José Torvay Manue...,126


In [16]:
# Apply the runtime rule (bb4) to C3; the surviving pairs form C4.
C4 = bb4.block_candset(C3,show_progress=False)

In [22]:
C4

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_release_year,ltable_director,ltable_actors,ltable_runtime,rtable_name,rtable_release_year,rtable_director,rtable_actors,rtable_runtime
1,1,a317,b1,Citizen Kane,1941,Orson Welles,_Orson Welles_Joseph Cotten_Dorothy Comingore_Agnes Moorehead,119,citizen kane,1941,Orson Welles,Joseph Cotten Orson Welles,119
4,4,a10,b2,The Godfather,1972,Francis Ford Coppola,_Marlon Brando_Al Pacino_James Caan_Diane Keaton,175,the godfather,1972,Francis Ford Coppola,Al Pacino Marlon Brando,175
32,32,a280,b3,Rear Window,1954,Alfred Hitchcock,_James Stewart_Grace Kelly_Wendell Corey_Thelma Ritter,112,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
36,36,a2774,b3,The Man Who Knew Too Much,1956,Alfred Hitchcock,_James Stewart_Doris Day_Brenda de Banzie_Bernard Miles,120,rear window,1954,Alfred Hitchcock,Frank Cady Georgine Darcy Grace Kelly James Stewart Judith Evelyn Raymond Burr Ross Bagdasarian ...,112
74,74,a199,b4,Casablanca,1942,Michael Curtiz,_Humphrey Bogart_Ingrid Bergman_Paul Henreid_Claude Rains,102,casablanca,1943,Michael Curtiz,Humphrey Bogart Ingrid Bergman,102
101,101,a388,b5,Boyhood,2014,Richard Linklater,_Ellar Coltrane_Patricia Arquette_Ethan Hawke_Elijah Smith,165,boyhood,2014,Richard Linklater,Bonnie Cross Elijah Smith Ellar Coltrane Ethan Hawke Libby Villari Lorelei Linklater Marco Perel...,165
152,152,a810,b7,Singin' in the Rain,1952,_Stanley Donen_Gene Kelly,_Gene Kelly_Donald O'Connor_Debbie Reynolds_Jean Hagen,103,singin in the rain,1952,Gene Kelly Stanley Donen,Cyd Charisse Dawn Addams Debbie Reynolds Donald O'Connor Douglas Fowley Gene Kelly Jean Hagen Jo...,103
181,181,a1624,b8,The Treasure of the Sierra Madre,1948,John Huston,_Humphrey Bogart_Walter Huston_Tim Holt_Bruce Bennett,126,the treasure of the sierra madre,1948,John Huston,Alfonso Bedoya Arturo Soto Rangel Barton MacLane Bruce Bennett Humphrey Bogart José Torvay Manue...,126
304,304,a760,b9,Moonlight,2016,Barry Jenkins,_Mahershala Ali_Naomie Harris_Trevante Rhodes_Alex R. Hibbert,111,moonlight,2016,Barry Jenkins,Alex R. Hibbert Ashton Sanders Duan Sanderson Herman 'Caheei McGloun Jaden Piner Janelle Monáe M...,111
326,326,a408,b11,Vertigo,1958,Alfred Hitchcock,_James Stewart_Kim Novak_Barbara Bel Geddes_Tom Helmore,128,north by northwest,1959,Alfred Hitchcock,Adam Williams Cary Grant Edward Platt Eva Marie Saint James Mason Jessie Royce Landis Josephine ...,136


In [17]:
# Pairs with a missing actors/director value were kept by the overlap blocker
# (allow_missing=True), so they need a dedicated rule.


bb5 = em.BlackBoxBlocker()

def actor_director_function(x, y):
    """Black-box blocking rule for pairs whose right tuple lacks actors or director.

    x and y are the two tuples of a candidate pair (pandas Series). When the
    right tuple ``y`` has a null ``actors`` or ``director`` value, the pair is
    blocked (True) unless the two titles are equal ignoring case. Pairs whose
    right tuple has both values are never blocked by this rule (False).
    """
    right_has_gap = pd.isnull(y['actors']) or pd.isnull(y['director'])
    if not right_has_gap:
        return False
    # With actors/director unavailable, fall back to comparing the titles.
    return x['name'].lower() != y['name'].lower()
            
# Attach the missing-actors/director rule to bb5.
bb5.set_black_box_function(actor_director_function)




In [18]:
# Apply the missing-actors/director rule (bb5) to C4; C5 is the final candidate set.
C5 = bb5.block_candset(C4,show_progress=False)

In [23]:
# Save the final candidate set to the current working directory (without the index column).
C5.to_csv('tuple_pairs_after_blocking.csv',index=False)

In [19]:
# Sample 500 candidate pairs from C5.
# NOTE(review): presumably this is the sample that was labeled by hand and saved as
# 'sampled_set_labelled.csv', which a later cell reads back - confirm.
sampled_set = em.sample_table(C5, sample_size=500)
#sampled_set.to_csv('sampled_set.csv',index=False)

In [20]:
# Debug the blocking pipeline: ask for up to 100 pairs from A x B that are absent from C5.
# If the release years and runtimes of a listed pair do not match, the pair
# would not be a true match anyway.
# (The variable name keeps its original spelling because the next cell displays it.)

degugger_output = em.debug_blocker(C5, A, B,output_size=100)

In [21]:
degugger_output

Unnamed: 0,_id,ltable_id,rtable_id,ltable_name,ltable_certificate,ltable_genre,ltable_director,ltable_actors,rtable_name,rtable_certificate,rtable_genre,rtable_director,rtable_actors
0,0,a46,b2666,guardians of the galaxy,PG-13,Action Adventure Sci-Fi,James Gunn,Chris Pratt Vin Diesel Bradley Cooper Zoe Saldana,guardians of the galaxy vol 2,PG-13,Comedy,James Gunn,Bradley Cooper Chris Pratt Dave Bautista Karen Gillan Michael Rooker Pom Klementieff Sylvester S...
1,1,a309,b905,star wars episode viii the last jedi,PG-13,Action Adventure Fantasy,Rian Johnson,Daisy Ridley John Boyega Mark Hamill Carrie Fisher,star wars episode vii the force awakens,PG-13,Fantasy,J.J. Abrams,Adam Driver Carrie Fisher Daisy Ridley Domhnall Gleeson Harrison Ford John Boyega Mark Hamill Os...
2,2,a84,b888,harry potter and the deathly hallows part 2,PG-13,Adventure Drama Fantasy,David Yates,Daniel Radcliffe Emma Watson Rupert Grint Michael Gambon,harry potter and the goblet of fire,PG-13,Family,Mike Newell,Daniel Radcliffe Emma Watson Rupert Grint
3,3,a248,b888,harry potter and the deathly hallows part 1,PG-13,Adventure Family Fantasy,David Yates,Daniel Radcliffe Emma Watson Rupert Grint Bill Nighy,harry potter and the goblet of fire,PG-13,Family,Mike Newell,Daniel Radcliffe Emma Watson Rupert Grint
4,4,a571,b2941,fast furious,PG-13,Action Crime Thriller,Justin Lin,Vin Diesel Paul Walker Michelle Rodriguez Jordana Brewster,fast five,PG-13,Crime,Justin Lin,Dwayne Johnson Paul Walker Vin Diesel
5,5,a60,b235,toy story,G,Animation Adventure Comedy,John Lasseter,Tom Hanks Tim Allen Don Rickles Jim Varney,toy story 2,G,Family,Ash Brannon John Lasseter Lee Unkrich,Tim Allen Tom Hanks
6,6,a18,b2787,the avengers,PG-13,Action Adventure Sci-Fi,Joss Whedon,Robert Downey Jr. Chris Evans Scarlett Johansson Jeremy Renner,avengers age of ultron,PG-13,Fantasy,Joss Whedon,Chris Evans Chris Hemsworth James Spader Jeremy Renner Mark Ruffalo Robert Downey Jr. Samuel L. ...
7,7,a193,b286,harry potter and the goblet of fire,PG-13,Adventure Family Fantasy,Mike Newell,Daniel Radcliffe Emma Watson Rupert Grint Eric Sykes,harry potter and the deathly hallows part 2,PG-13,Fantasy,David Yates,Daniel Radcliffe Emma Watson Michael Gambon Rupert Grint
8,8,a243,b397,the bourne supremacy,PG-13,Action Mystery Thriller,Paul Greengrass,Matt Damon Franka Potente Joan Allen Brian Cox,the bourne ultimatum,PG-13,Thriller,Paul Greengrass,Édgar Ramírez Joan Allen Matt Damon
9,9,a206,b706,star trek into darkness,PG-13,Action Adventure Sci-Fi,J.J. Abrams,Chris Pine Zachary Quinto Zoe Saldana Benedict Cumberbatch,star trek,PG-13,Sci-Fi,J.J. Abrams,Chris Pine Simon Pegg Zachary Quinto


In [24]:
import os
import pandas as pd
import py_entitymatching as em

#path_labeled_data = datasets_dir + os.sep + 'labeled_data_demo.csv'
# The labeled sample is expected in the current working directory.
# os.path.join replaces the hard-coded '\s...' suffix, which was an invalid
# escape sequence and only formed a valid path on Windows.
cwd = os.getcwd()
path_labeled_data = os.path.join(cwd, 'sampled_set_labelled.csv')

# Load the labeled pairs together with their metadata: '_id' is the key and
# ltable_id / rtable_id are foreign keys into A and B.
S = em.read_csv_metadata(path_labeled_data, key='_id', 
                         fk_ltable='ltable_id', fk_rtable='rtable_id',
                         ltable=A, rtable=B)

In [26]:
# Split S into I (train) and J (test), half each, with a fixed seed for reproducibility.
IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)
I = IJ['train']
J = IJ['test']
# Persist both halves to the current working directory.
I.to_csv('I_tuples.csv',index=False)
J.to_csv('J_tuples.csv',index=False)

In [18]:
# Create a set of ML-matchers to compare; random_state is fixed wherever it is passed.
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
# The linear-regression matcher is created without a random_state argument.
ln = em.LinRegMatcher(name='LinReg')

In [75]:
# Generate the feature table F for matching tuples of A against tuples of B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the interactive
# confirmation of the inferred attribute types - confirm against the library docs.
F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)

In [86]:
# Convert the training pairs I into feature vectors using F; the 'label' column is kept
# (attrs_after) so the vectors can be used for training.
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

In [87]:
# Fill missing feature values with the 'mean' strategy; the id columns and the label are excluded.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')

In [22]:
# Select the best ML matcher using 5-fold cross-validation on H, ranked by F1.
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)
# Show the per-matcher cross-validation statistics.
result['cv_stats']

Unnamed: 0,Matcher,Average precision,Average recall,Average f1
0,DecisionTree,0.984981,0.979737,0.982201
1,RF,0.989872,0.985122,0.987402
2,SVM,0.855566,1.0,0.92182
3,LinReg,0.980103,0.984859,0.982451
4,LogReg,0.985366,0.985122,0.985183


In [23]:
# Reduce over the cell values: Python's any() applied to a DataFrame iterates over
# the column labels, so it is True for every non-empty table regardless of content.
# NOTE(review): if the intent was to verify that imputation left no NaN behind,
# use pd.isnull(H).values.any() and expect False - confirm.
pd.notnull(H).values.any()

True

In [78]:
def release_year(ltuple, rtuple):
    """Black-box feature: absolute difference between the two release years.

    Assumes both tuples carry a numeric ``release_year`` value.
    """
    year_gap = ltuple['release_year'] - rtuple['release_year']
    return abs(year_gap)

In [79]:
# Register the release_year function as a black-box feature named 'release_year' in F.
em.add_blackbox_feature(F, 'release_year', release_year)

True

In [80]:
# Drop feature rows 14-17 in one atomic call, so a failure (for example when this
# cell is re-run after the rows are already gone) cannot leave F partially modified.
# NOTE(review): the indices are positional magic numbers; presumably these are the
# auto-generated features superseded by the black-box release_year feature -
# confirm against F before relying on them.
F = F.drop([14, 15, 16, 17])

In [81]:
# Re-extract the feature vectors for the training pairs I using the modified feature table F.
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='label',
                            show_progress=False)

In [82]:
# Fill missing feature values with the 'mean' strategy; the id columns and the label are excluded.
H = em.impute_table(H, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')

In [84]:
# Repeat the 5-fold cross-validated matcher selection (ranked by F1) on the new feature vectors.
result = em.select_matcher([dt, rf, svm, ln, lg], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
        k=5,
        target_attr='label', metric_to_select_matcher='f1', random_state=0)


In [None]:
# Choosing the final matcher

In [113]:
# Instantiate the matcher to evaluate.
# NOTE: despite the name `dt`, this is a random forest; the name is kept because the
# prediction cell calls `dt.predict`. random_state is fixed, as for the matchers
# compared earlier, so that the evaluation is reproducible across runs.
dt = em.RFMatcher(name='RF', random_state=0)

# Train using feature vectors from I 
dt.fit(table=H, 
       exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], 
       target_attr='label')

# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F,
                            attrs_after='label', show_progress=False)

# Fill missing feature values in L with the 'mean' strategy, as was done for H.
L= em.impute_table(L, 
                exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'],
                strategy='mean')




In [114]:
# Predict on L; the predicted label is appended as a 'predicted' column of a new table
# (append=True, inplace=False).
# NOTE(review): return_probs=False, so probs_attr='proba' presumably has no effect - confirm.
predictions = dt.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'label'], 
              append=True, target_attr='predicted', inplace=False, return_probs=False,
                        probs_attr='proba')



In [115]:
# Evaluate the predictions: compare the gold 'label' column with the 'predicted' column
# and print the summary (precision, recall, F1, false positives/negatives).
eval_result = em.eval_matches(predictions, 'label', 'predicted')
em.print_eval_summary(eval_result)

Precision : 100.0% (192/192)
Recall : 99.48% (192/193)
F1 : 99.74%
False positives : 0 (out of 192 positive predictions)
False negatives : 1 (out of 58 negative predictions)
