## PBS figures


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Sweep of F_PQ_PC values: 500 evenly spaced samples covering [0, 1] inclusive
F_PQ_PC_range = np.linspace(start=0, stop=1, num=500)

def PBS(F_PQ_PC, RR, RR_opt):
    """
    Compute PBS = sqrt(F_PQ_PC * (1 - sqrt(|RR - RR_opt| / max(1 - RR_opt, RR_opt)))).

    The inner factor is floored at 0 before the outer square root is taken.

    Args:
        F_PQ_PC (float or array): F_PQ_PC value(s).
        RR (float or array): RR value(s).
        RR_opt (float): Optimal RR value.

    Returns:
        array: PBS value(s), broadcast over F_PQ_PC and RR.
    """
    # Normalise the distance to RR_opt by the larger of the two sides of RR_opt.
    normaliser = max(1 - RR_opt, RR_opt)
    relative_gap = np.abs(RR - RR_opt) / normaliser

    # Floor at zero so the outer square root never sees a negative value.
    closeness = np.clip(1 - np.sqrt(relative_gap), 0, None)

    return np.sqrt(F_PQ_PC * closeness)

import os

# Parameters for the plots
RR_opt = 0.9  # Fixed RR_opt value
RR_values = np.linspace(0, 1, 500)  # Range of RR values
F_PQ_PC_values = [0.1, 0.5, 0.98]  # Fixed F_PQ_PC values

# Color-blind-friendly colors
colors = ['#4daf4a', '#377eb8', '#ff7f00', '#f781bf']

# savefig() does not create missing directories, so make sure the target exists.
os.makedirs('FIGS', exist_ok=True)


def _format_unit_axes(ax):
    """
    Clamp both axes of ``ax`` to [0, 1] and label the origin only once.

    The first tick label on each axis is blanked and a single '0.0'
    annotation is placed next to the origin instead.
    """
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([0] + list(ax.get_xticks())[1:])
    ax.set_xticklabels([''] + [f'{x:.1f}' for x in ax.get_xticks()[1:]])
    ax.set_yticks([0] + list(ax.get_yticks())[1:])
    ax.set_yticklabels([''] + [f'{y:.1f}' for y in ax.get_yticks()[1:]])
    ax.annotate('0.0', xy=(0, 0), xytext=(-0.1, -0.05), textcoords='axes fraction', fontsize=45, ha='left')


# Plot 1: PBS vs. RR for different F_PQ_PC values
plt.figure(figsize=(12, 10))
for idx, F_PQ_PC in enumerate(F_PQ_PC_values):
    PBS_values = PBS(F_PQ_PC, RR_values, RR_opt)
    plt.plot(RR_values, PBS_values, label=f'$F_{{PQ,PC}} = {F_PQ_PC}$', color=colors[idx], linewidth=6)

plt.xlabel('$RR$', fontsize=50)
plt.ylabel('$PBS$', fontsize=50)
# Dashed vertical marker at the optimal reduction ratio
plt.axvline(RR_opt, color='black', linestyle='--', label=f'$RR_{{opt}} = {RR_opt}$', linewidth=6)
plt.xticks(fontsize=45)
plt.yticks(fontsize=45)
plt.legend(fontsize=40, loc='upper left', bbox_to_anchor=(0.01, 0.99), borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
_format_unit_axes(plt.gca())
plt.tight_layout()
plt.savefig('FIGS/PBS-RR.pdf')
plt.close()

# Plot 2: PBS vs. F_PQ_PC for different RR values
plt.figure(figsize=(12, 10))
RR_test_values = [0.6, 0.8, RR_opt, 0.95]

for idx, RR in enumerate(RR_test_values):
    PBS_values = PBS(F_PQ_PC_range, RR, RR_opt)
    # The curve at RR_opt is drawn dashed and labelled as the optimum.
    is_opt = RR == RR_opt
    linestyle = '--' if is_opt else '-'
    label = f'$RR_{{opt}} = {RR_opt}$' if is_opt else f'$RR = {RR}$'
    plt.plot(F_PQ_PC_range, PBS_values, label=label, color=colors[idx], linewidth=6, linestyle=linestyle)

plt.xlabel('$F_{PQ,PC}$', fontsize=50)
plt.ylabel('$PBS$', fontsize=50)
plt.xticks(fontsize=45)
plt.yticks(fontsize=45)
plt.legend(fontsize=40, loc='lower right', bbox_to_anchor=(0.99, 0.01), borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
_format_unit_axes(plt.gca())
plt.tight_layout()
plt.savefig('FIGS/PBS-F.pdf')

# Release the figure; both plots are only written to disk, not displayed.
plt.close()


In [None]:
import os
import time
import copy
from func import *
import numpy as np
import pandas as pd

from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging, BlockFiltering
from pyjedai.block_building import (
    StandardBlocking,
    QGramsBlocking,
    ExtendedQGramsBlocking,
    SuffixArraysBlocking,
    ExtendedSuffixArraysBlocking,
)

from DeepBlocksrc.deep_blocker import canopy_deep_blocker

from pyjedai.comparison_cleaning import CardinalityEdgePruning
from pyjedai.comparison_cleaning import BLAST
import os

# Short code -> pyjedai block-building instance (one shared instance per code)
classic_method_dict = {
    code: builder_cls()
    for code, builder_cls in (
        ('SB', StandardBlocking),
        ('QG', QGramsBlocking),
        ('EQG', ExtendedQGramsBlocking),
        ('SA', SuffixArraysBlocking),
        ('ESA', ExtendedSuffixArraysBlocking),
    )
}
from pyjedai.comparison_cleaning import BLAST, CardinalityEdgePruning


#

# Display name written to the report for each deep-blocking method code
classic_method_name = dict(CTT='CTT', AE='AUTO')

output_file = "deepBlcok.txt"

# Start from an empty report: results are appended below, so drop any old file.
try:
    os.remove(output_file)
except FileNotFoundError:
    pass

# Datasets (sub-directories of data/) and method codes to evaluate
tasks = [
    'Beer',
    'Walmart-Amazon',
    'Amazon-Google',
    'DBLP-ACM',
    'Fodors-Zagat',
    'iTunes-Amazon',
    'DBLP-GoogleScholar',
    'Febrl',
]
methods = ['AE', 'CTT']


# Process each task with each method: run deepBlock, label the candidate pairs
# against the ground truth, then print and append RR / PC / PQ / F / runtime.
for task in tasks:
    for method in methods:

        # Load datasets
        left_df = pd.read_csv(f"data/{task}/tableA.csv")
        right_df = pd.read_csv(f"data/{task}/tableB.csv")
        match_df = pd.read_csv(f"data/{task}/matches.csv")

        # Clean datasets for specific tasks.
        # applymap()/replace() return new frames, so the result must be bound
        # back to the name -- otherwise the cleaning is silently discarded.
        if task == 'Fodors-Zagat':
            left_df = left_df.applymap(lambda x: x.strip('`').strip() if isinstance(x, str) else x)
            left_df = left_df.applymap(lambda x: x.strip("'").strip() if isinstance(x, str) else x)
            right_df = right_df.applymap(lambda x: x.strip('`').strip() if isinstance(x, str) else x)
            right_df = right_df.applymap(lambda x: x.strip("'").strip() if isinstance(x, str) else x)

        left_df = left_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)
        right_df = right_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)

        # Process using classic method if applicable
        # if method in classic_method_dict:
            # bb = classic_method_dict[method]
            # attr = [col for col in left_df.columns if col != 'id']
            # data = Data(
            #     dataset_1=left_df.copy(), id_column_name_1='id',
            #     dataset_2=right_df.copy(), id_column_name_2='id',
            #     ground_truth=match_df.rename(columns={list(match_df.columns)[0]: 'D1', list(match_df.columns)[1]: 'D2'})
            # )
            # data.clean_dataset(remove_stopwords=False, remove_punctuation=False, remove_numbers=False, remove_unicodes=True)

        # Only the blocking call itself is timed.
        start_time = time.time()
        candidates = deepBlock(left_df.copy(), right_df.copy(), K = 50, method = method)
        end_time = time.time()

        candidates = candidates.astype(int)
        # Same column normalisation as the other evaluation cells; a no-op when
        # the columns are already called id1/id2.
        candidates = candidates.rename(columns={'ltable_id': 'id1', 'rtable_id': 'id2'})

        # Compile result rows: attach the left and right record attributes
        left_merged = candidates.merge(left_df, left_on='id1', right_on='id', suffixes=('', '_left'))
        right_merged = left_merged.merge(right_df, left_on='id2', right_on='id', suffixes=('_left', '_right'))
        result_df = right_merged.copy()

        # Merge with the ground truth to determine labels. The match pairs are
        # de-duplicated first so the left merge yields exactly one row per
        # candidate and the labels stay aligned with result_df.
        match_pairs = match_df[['ltable_id', 'rtable_id']].drop_duplicates()
        merged_df = result_df.merge(match_pairs, left_on=['id_left', 'id_right'], right_on=['ltable_id', 'rtable_id'], how='left', indicator=True)
        result_df['label'] = (merged_df['_merge'] == 'both').astype(int).to_numpy()

        # Calculate metrics
        RR = 1 - result_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        RR_opt = 1 - match_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        PC = np.sum(result_df['label'] == 1) / match_df.shape[0]
        PQ = np.sum(result_df['label'] == 1) / result_df.shape[0]
        F = 2 * PC * PQ / (PC + PQ)

        # Print and save results
        print(task, classic_method_name[method])
        print(round(100 * RR, 4), round(100 * PC, 4), round(100 * PQ, 4), round(100 * F, 4))
        print()

        runtime = end_time - start_time

        with open(output_file, "a") as file:
            file.write(f"{task} {classic_method_name[method]}\n")
            file.write(f"{round(100 * RR, 4)} {round(100 * PC, 4)} {round(100 * PQ, 4)} {round(100 * F, 4)} {round(runtime, 2)}\n\n")
    # NOTE(review): this break stops after the first task, so only tasks[0] is
    # evaluated. It looks like a debugging leftover -- remove it to run every task.
    break

In [None]:
# Share of ground-truth rows that appear among the labelled candidates (PC)
(result_df['label'] == 1).sum() / match_df.shape[0]

In [None]:

# Evaluate a pre-computed candidate file (tmp.csv) against one dataset.
task = 'DBLP-ACM'
# task  ='Fodors-Zagat'

# Load datasets
left_df = pd.read_csv(f"data/{task}/tableA.csv")
right_df = pd.read_csv(f"data/{task}/tableB.csv")
match_df = pd.read_csv(f"data/{task}/matches.csv")


# import pickle
# with open('blocking_result.pkl', 'rb') as file:
#     pairs = pickle.load(file)


# tmp = np.array(pairs)
# tmp =tmp[:,0:2].astype(int)

# candidates = pd.DataFrame({'id1':tmp[:,0],'id2':tmp[:,1]})


candidates = pd.read_csv("tmp.csv")
# candidates.to_csv(hp['task']+'_pairs.csv', index=False)

# Swap the id1 / id2 columns. rename() maps every label independently, so a
# single call performs the swap (previously done via temporary names '1'/'2').
candidates = candidates.rename(columns={'id1': 'id2', 'id2': 'id1'})

# Merge left_df and right_df with candidates based on ids using vectorized operations
left_merged = candidates.merge(left_df, left_on='id1', right_on='id', suffixes=('', '_left'))
right_merged = left_merged.merge(right_df, left_on='id2', right_on='id', suffixes=('_left', '_right'))
result_df = right_merged.copy()

# Merge with the ground truth to determine labels. The match pairs are
# de-duplicated first so the left merge yields exactly one row per candidate
# and the labels stay aligned with result_df.
match_pairs = match_df[['ltable_id', 'rtable_id']].drop_duplicates()
merged_df = result_df.merge(match_pairs, left_on=['id_left', 'id_right'], right_on=['ltable_id', 'rtable_id'], how='left', indicator=True)
result_df['label'] = (merged_df['_merge'] == 'both').astype(int).to_numpy()


# Metrics: reduction ratio, its optimum, pair completeness, pair quality, F
RR = 1 - result_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
RR_opt = 1 - match_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
PC = np.sum(result_df['label'] == 1) / match_df.shape[0]
PQ = np.sum(result_df['label'] == 1) / result_df.shape[0]
F = 2 * PC * PQ / (PC + PQ)

# Print results (percentages)
print(task)
print(round(100 * RR, 4), round(100 * PC, 4), round(100 * PQ, 4))
print()


In [None]:
import os
import time
import copy
import warnings
import numpy as np
import pandas as pd
from func import *
warnings.filterwarnings("ignore")



# Display name written to the report for each deep-blocking method code
classic_method_name = {
    'CTT': 'CTT',
    'AE': 'AUTO',
}

output_file = "deepBlcok.txt"

# Remove output file if it exists. Only a missing file is expected here; any
# other failure (e.g. a permission error) should surface instead of being hidden.
try:
    os.remove(output_file)
except FileNotFoundError:
    pass

# Define tasks and methods to iterate over
tasks = ['Beer', 'Walmart-Amazon', 'Amazon-Google', 'DBLP-ACM', 'Fodors-Zagat', 'iTunes-Amazon', 'DBLP-GoogleScholar', 'Febrl']
methods = ['AE', 'CTT']

# Process each task with each method: run deepBlock, label the candidate pairs
# against the ground truth, then print and append RR / PC / PQ / F / runtime.
for task in tasks:
    for method in methods:

        # Load datasets
        left_df = pd.read_csv(f"data/{task}/tableA.csv")
        right_df = pd.read_csv(f"data/{task}/tableB.csv")
        match_df = pd.read_csv(f"data/{task}/matches.csv")

        # Clean datasets for specific tasks.
        # applymap()/replace() return new frames, so the result must be bound
        # back to the name -- otherwise the cleaning is silently discarded.
        if task == 'Fodors-Zagat':
            left_df = left_df.applymap(lambda x: x.strip('`').strip() if isinstance(x, str) else x)
            left_df = left_df.applymap(lambda x: x.strip("'").strip() if isinstance(x, str) else x)
            right_df = right_df.applymap(lambda x: x.strip('`').strip() if isinstance(x, str) else x)
            right_df = right_df.applymap(lambda x: x.strip("'").strip() if isinstance(x, str) else x)

        left_df = left_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)
        right_df = right_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)

        # Only the blocking call itself is timed.
        start_time = time.time()
        candidates = deepBlock(left_df.copy(), right_df.copy(), K = 50, method = method)
        end_time = time.time()

        # Rename columns in candidates DataFrame
        candidates.rename(columns={'ltable_id': 'id1', 'rtable_id': 'id2'}, inplace=True)

        # Merge left_df and right_df with candidates based on ids using vectorized operations
        left_merged = candidates.merge(left_df, left_on='id1', right_on='id', suffixes=('', '_left'))
        right_merged = left_merged.merge(right_df, left_on='id2', right_on='id', suffixes=('_left', '_right'))
        result_df = right_merged.copy()

        # Merge with the ground truth to determine labels. The match pairs are
        # de-duplicated first so the left merge yields exactly one row per
        # candidate and the labels stay aligned with result_df.
        match_pairs = match_df[['ltable_id', 'rtable_id']].drop_duplicates()
        merged_df = result_df.merge(match_pairs, left_on=['id_left', 'id_right'], right_on=['ltable_id', 'rtable_id'], how='left', indicator=True)
        result_df['label'] = (merged_df['_merge'] == 'both').astype(int).to_numpy()

        # Calculate metrics
        RR = 1 - result_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        RR_opt = 1 - match_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        PC = np.sum(result_df['label'] == 1) / match_df.shape[0]
        PQ = np.sum(result_df['label'] == 1) / result_df.shape[0]
        F = 2 * PC * PQ / (PC + PQ)

        # Print and save results
        print(task, classic_method_name[method])
        print(round(100 * RR, 4), round(100 * PC, 4), round(100 * PQ, 4), round(100 * F, 4))
        print()

        runtime = end_time - start_time

        with open(output_file, "a") as file:
            file.write(f"{task} {classic_method_name[method]}\n")
            file.write(f"{round(100 * RR, 4)} {round(100 * PC, 4)} {round(100 * PQ, 4)} {round(100 * F, 4)} {round(runtime, 2)}\n\n")


In [None]:
import sys
sys.path.append('.')
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparkly.index import IndexConfig, LuceneIndex
from sparkly.search import Searcher
from pathlib import Path

# Sparkly example: index table A with Lucene, search it with table B,
# and report how many gold pairs appear among the returned candidates.
#
# NOTE(review): `F` in this cell is pyspark.sql.functions; other cells of this
# notebook rebind `F` (F-measure value, loop variable), so cell order matters.

# the number of candidates returned per record
limit = 50
# path to the test data
data_path = Path('./examples/data/abt_buy/').absolute()
# table to be indexed
table_a_path = data_path / 'table_a.parquet'
# table for searching
table_b_path = data_path / 'table_b.parquet'
# the ground truth
gold_path = data_path / 'gold.parquet'
# the analyzers used to convert the text into tokens for indexing
analyzers = ['3gram']

# initialize a local spark context
spark = SparkSession.builder\
                    .master('local[*]')\
                    .appName('Sparkly Example')\
                    .getOrCreate()
# read all the data as spark dataframes
table_a = spark.read.parquet(f'file://{str(table_a_path)}')
table_b = spark.read.parquet(f'file://{str(table_b_path)}')
gold = spark.read.parquet(f'file://{str(gold_path)}')
# the index config, '_id' column will be used as the unique
# id column in the index. Note id_col must be an integer (32 or 64 bit)
config = IndexConfig(id_col='_id')
# add the 'name' column to be indexed with analyzer above
config.add_field('name', analyzers)
# create a new index stored at /tmp/example_index/
index = LuceneIndex('/tmp/example_index/', config)
# index the records from table A according to the config we created above
index.upsert_docs(table_a)

# get a query spec (template) which searches on
# all indexed fields
query_spec = index.get_full_query_spec()
# create a searcher for doing bulk search using our index
searcher = Searcher(index)
# search the index with table b; the result is cached because it is used by
# both show() and the recall computation below
candidates = searcher.search(table_b, query_spec, id_col='_id', limit=limit).cache()

candidates.show()
# output is rolled up
# search record id -> (indexed ids + scores + search time)
#
# explode the results to compute recall: one (a_id, b_id) row per candidate
pairs = candidates.select(
                    F.explode('ids').alias('a_id'),
                    F.col('_id').alias('b_id')
                )
# number of matches found
# NOTE(review): intersect() compares whole rows, so this presumably relies on
# `gold` having the same two columns in the same order as `pairs` -- confirm.
true_positives = gold.intersect(pairs).count()
# percentage of matches found
recall = true_positives / gold.count()

print(f'true_positives : {true_positives}')
print(f'recall : {recall}')

# release the cached search result
candidates.unpersist()

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


def _multiple_correlation(frame, predictors, target):
    """
    Correlate one column with the best linear combination of two others.

    Fits an ordinary least-squares model of ``frame[target]`` on
    ``frame[predictors]`` and returns the Pearson correlation between the
    observed target and the fitted values.
    """
    model = LinearRegression()
    model.fit(frame[predictors], frame[target])
    fitted = model.predict(frame[predictors])
    return np.corrcoef(frame[target], fitted)[0, 1]


# Per-task correlations, one entry per task processed below
correlation_combined_pq_all = []
correlation_combined_pc_all = []
correlation_combined_rr_all = []

# NOTE(review): the task count is hard-coded to 7; TASKS, METHODS and res are
# defined outside this cell -- confirm 7 matches the number of tasks in TASKS.
for i in range(7):
    task = TASKS[i]

    # One (RR, PC, PQ) observation per method for this task
    RR = []
    PC = []
    PQ = []
    for method in list(METHODS.keys()):
        row = res[task][method]
        RR.append(row['RR'])
        PC.append(row['PC'])
        PQ.append(row['PQ'])

    # Create a DataFrame for easier handling
    data = pd.DataFrame({'RR': RR, 'PC': PC, 'PQ': PQ})

    # 1. Correlation of PQ with combined RR and PC
    correlation_combined_pq = _multiple_correlation(data, ['RR', 'PC'], 'PQ')
    print(f"Correlation of PQ with combined RR and PC: {correlation_combined_pq:.2f}")

    # 2. Correlation of RR with combined PC and PQ
    correlation_combined_rr = _multiple_correlation(data, ['PC', 'PQ'], 'RR')
    print(f"Correlation of RR with combined PC and PQ: {correlation_combined_rr:.2f}")

    # 3. Correlation of PC with combined RR and PQ
    correlation_combined_pc = _multiple_correlation(data, ['RR', 'PQ'], 'PC')
    print(f"Correlation of PC with combined RR and PQ: {correlation_combined_pc:.4f}")
    print()

    correlation_combined_pq_all.append(correlation_combined_pq)
    correlation_combined_rr_all.append(correlation_combined_rr)
    correlation_combined_pc_all.append(correlation_combined_pc)

# Mean correlation across tasks, in the order (PQ, RR, PC)
np.mean(correlation_combined_pq_all) , np.mean(correlation_combined_rr_all) , np.mean(correlation_combined_pc_all)

In [None]:
import os
import time
import copy
from func import *
import numpy as np
import pandas as pd

from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging, BlockFiltering
from pyjedai.block_building import (
    StandardBlocking,
    ExtendedQGramsBlocking,
    ExtendedSuffixArraysBlocking,
    QGramsBlocking,
    SuffixArraysBlocking
)
from pyjedai.comparison_cleaning import BLAST, CardinalityEdgePruning


# Method lookup tables
# Short code -> pyjedai block-building instance (one shared instance per code)
classic_method_dict = {
    code: builder_cls()
    for code, builder_cls in (
        ('SB', StandardBlocking),
        ('QG', QGramsBlocking),
        ('EQG', ExtendedQGramsBlocking),
        ('SA', SuffixArraysBlocking),
        ('ESA', ExtendedSuffixArraysBlocking),
    )
}
# Short code -> name used when printing results (classic and deep methods)
classic_method_name = dict(
    SB='StandardBlocking',
    EQG='ExtendedQGramsBlocking',
    ESA='ExtendedSuffixArraysBlocking',
    QG='QGramsBlocking',
    SA='SuffixArraysBlocking',
    CTT='CTT',
    AE='AUTO',
)

import random 

def shuffle_string(s):
    """Return the characters of ``s`` in a random order (uses the ``random`` module state)."""
    chars = [*s]
    random.shuffle(chars)
    return ''.join(chars)

def shuffle_df(df_in,match_list_in, frac  =10):
    """
    Return a copy of ``df_in`` with some records' attribute values scrambled.

    A random ``frac`` percent of the ids in ``match_list_in`` is selected; for
    each selected record every column except 'id' is replaced by a
    character-shuffled version of its string representation.

    Args:
        df_in (pd.DataFrame): Table with an 'id' column. Not modified.
        match_list_in (list or np.ndarray): Candidate ids to perturb. Not modified.
        frac (float): Percentage (0-100) of ``match_list_in`` to perturb.

    Returns:
        pd.DataFrame: The perturbed copy. Perturbed columns that did not hold
        strings are converted to object dtype.
    """
    df = df_in.copy()
    match_list = match_list_in.copy()
    np.random.shuffle(match_list)
    n_perturbed = int(len(match_list) * (frac /100))
    value_cols = [col for col in df.columns if col != 'id']

    for record_id in match_list[0:n_perturbed]:
        # Locate the record once instead of once per column.
        row_mask = df['id'] == record_id
        if not row_mask.any():
            # id not present in this table: nothing to perturb
            continue

        for col in value_cols:
            shuffled = shuffle_string(str(df.loc[row_mask, col].to_list()[0]))
            # Writing a str into a numeric column relies on an implicit upcast
            # that pandas has deprecated; convert the column explicitly first.
            if not (pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])):
                df[col] = df[col].astype(object)
            df.loc[row_mask, col] = shuffled
    return df



output_file = "tmp2.txt"
# Drop a report left over from a previous run; a missing file is fine.
try:
    os.remove(output_file)
except FileNotFoundError:
    pass

# Runtime (seconds) of every individual run
Time_all = []

tasks = [
    'Beer',
    'Walmart-Amazon',
    'Amazon-Google',
    'DBLP-ACM',
    'Fodors-Zagat',
    'iTunes-Amazon',
    'DBLP-GoogleScholar',
    'Febrl',
]

# Dataset used by the timing experiment
task = 'Amazon-Google'
# Full method list, immediately narrowed to the single method under test
methods = ['SB', 'EQG', 'ESA', 'QG', 'SA', 'CTT', 'AE']
methods = ['SB']
# Collected rows of [F, mean runtime, runtime std]
EXP2 = []

# Timing experiment: for each method and each disturbance level F (percentage
# of matched left records whose attributes are character-shuffled), time the
# blocking step over 4 repetitions and record mean / std in EXP2.
for method in methods:

    # Load datasets
    left_df_in = pd.read_csv(f"data/{task}/tableA.csv")
    right_df = pd.read_csv(f"data/{task}/tableB.csv")
    match_df = pd.read_csv(f"data/{task}/matches.csv")

    # replace() returns a new frame, so bind the result. The right table is
    # not perturbed, so it is cleaned once here rather than on every repetition.
    right_df = right_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)

    for F in [0,10,20,30,40,50,60,70,80,90,100]:
    # for F in [0,25,50,75,100]:
    # for F in [0,50,100]:

        # Timings of this disturbance level only. Statistics used to be taken
        # over Time_all, which is never reset, so every level's mean/std also
        # included the runs of all previous levels.
        level_times = []

        for _ in range(4):
            match_left = np.unique(list(match_df['ltable_id']))
            np.random.shuffle(match_left)
            left_df = shuffle_df(left_df_in.copy(),match_left, frac  =F)

            left_df = left_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)

            if method in ('CTT', 'AE'):
                start_time = time.time()
                candidates = deepBlock(left_df.copy(), right_df.copy(), K = 50, method = method)
            else:
                # Classic pyjedai pipeline: block building + block cleaning
                bb = classic_method_dict[method]
                attr = [col for col in left_df.columns if col != 'id']
                data = Data(
                    dataset_1=left_df.copy(), id_column_name_1='id',
                    dataset_2=right_df.copy(), id_column_name_2='id',
                    ground_truth=match_df.rename(columns={list(match_df.columns)[0]: 'D1', list(match_df.columns)[1]: 'D2'})
                )
                data.clean_dataset(remove_stopwords=False, remove_punctuation=False, remove_numbers=False, remove_unicodes=True)
                # Dataset preparation above is deliberately left out of the timing.
                start_time = time.time()
                blocks = bb.build_blocks(copy.deepcopy(data), attributes_1=attr, attributes_2=attr, tqdm_disable=True)
                bp = BlockPurging()
                bf = BlockFiltering()
                mb = BLAST('EJS')
                cleaned_blocks = bf.process(blocks, data, tqdm_disable=True)
                filtered_blocks = bp.process(cleaned_blocks, data, tqdm_disable=True)
                # candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=False)

                # candidates = mb.export_to_df(candidate_pairs_blocks)
            end_time = time.time()

            runtime = end_time - start_time
            level_times.append(runtime)
            # Time_all keeps the complete history of individual runs.
            Time_all.append(runtime)

        # Print and save results

        runtime = np.mean(level_times)
        runtime_std = np.std(level_times)
        EXP2.append([F, runtime, runtime_std])
        print(task, classic_method_name[method],round(runtime, 3), round(runtime_std,3))
        print()



In [None]:
import matplotlib.pyplot as plt

# Each EXP2 row is [F, runtime, runtime_std]; pull the three columns apart.
x_values, averages, std_devs = (np.array(EXP2)[:, column] for column in range(3))
color = '#0072B2'  # A blue color that is colorblind-friendly

# Mean runtime per disturbance level, with the standard deviation as error bars
plt.errorbar(
    x_values,
    averages,
    yerr=std_devs,
    fmt='o',
    capsize=5,
    color=color,
    label='Data with error bars',
)

# Same points again, joined by a line in the same color
plt.plot(
    x_values,
    averages,
    color=color,
    marker='o',
    linestyle='-',
    label='Connected Line',
)

plt.xlabel('fraction of disturbance in true matches')
plt.ylabel('time')


In [None]:
import os
import time
import copy
from func import *
import numpy as np
import pandas as pd

from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging, BlockFiltering
from pyjedai.block_building import (
    StandardBlocking,
    ExtendedQGramsBlocking,
    ExtendedSuffixArraysBlocking,
    QGramsBlocking,
    SuffixArraysBlocking
)
from pyjedai.comparison_cleaning import (
    WeightedEdgePruning, WeightedNodePruning, 
    CardinalityEdgePruning, CardinalityNodePruning,
    BLAST, ReciprocalCardinalityNodePruning,
    ReciprocalWeightedNodePruning, ComparisonPropagation)


# Method lookup tables
# Short code -> pyjedai block-building instance (one shared instance per code)
classic_method_dict = {
    code: builder_cls()
    for code, builder_cls in (
        ('SB', StandardBlocking),
        ('QG', QGramsBlocking),
        ('EQG', ExtendedQGramsBlocking),
        ('SA', SuffixArraysBlocking),
        ('ESA', ExtendedSuffixArraysBlocking),
    )
}
# Short code -> name used in printed and saved results (classic and deep methods)
classic_method_name = dict(
    SB='StandardBlocking',
    EQG='ExtendedQGramsBlocking',
    ESA='ExtendedSuffixArraysBlocking',
    QG='QGramsBlocking',
    SA='SuffixArraysBlocking',
    CTT='CTT',
    AE='AUTO',
)

output_file = "block_stat.txt"
# output_file = "tmp.txt"

# Results are appended below, so start from a clean file if one is present.
try:
    os.remove(output_file)
except FileNotFoundError:
    pass

# Full task list, immediately narrowed to the single task under test
tasks = [
    'Beer',
    'Walmart-Amazon',
    'Amazon-Google',
    'DBLP-ACM',
    'Fodors-Zagat',
    'DBLP-GoogleScholar',
]  # 'iTunes-Amazon'
tasks = ['Beer']
# Classic pyjedai methods followed by the two deep-blocking variants
methods = ['SB', 'EQG', 'ESA', 'QG', 'SA', 'CTT', 'AE']



def _strip_quotes(value):
    """Strip surrounding backticks, then single quotes, then whitespace from strings."""
    if isinstance(value, str):
        return value.strip('`').strip("'").strip()
    return value


# Process each task with each method: run the blocking pipeline, compute the
# overall RR / PC / PQ / F, then the per-group figures from calc_bias_block,
# and append everything to output_file.
for task in tasks:
    for method in methods:

        # Load datasets
        left_df = pd.read_csv(f"data/{task}/tableA.csv")
        right_df = pd.read_csv(f"data/{task}/tableB.csv")
        match_df = pd.read_csv(f"data/{task}/matches.csv")

        # Clean datasets (applied to every task; the former Fodors-Zagat-only
        # applymap loop discarded its result and is covered by _strip_quotes).
        left_df = left_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)
        right_df = right_df.replace(r"\\ '", "'", regex=True).replace(r" '", "'", regex=True).replace(r"\\ `", "\\ ", regex=True)

        left_df = left_df.applymap(_strip_quotes)
        right_df = right_df.applymap(_strip_quotes)

        # Process using classic method if applicable
        if method in classic_method_dict:
            bb = classic_method_dict[method]
            attr = [col for col in left_df.columns if col != 'id']
            data = Data(
                dataset_1=left_df.copy(), id_column_name_1='id',
                dataset_2=right_df.copy(), id_column_name_2='id',
                ground_truth=match_df.rename(columns={list(match_df.columns)[0]: 'D1', list(match_df.columns)[1]: 'D2'})
            )
            data.clean_dataset(remove_stopwords=False, remove_punctuation=False, remove_numbers=False, remove_unicodes=True)

            # Dataset preparation above is left out of the timing.
            start_time = time.time()
            blocks = bb.build_blocks(copy.deepcopy(data), attributes_1=attr, attributes_2=attr, tqdm_disable=True)

            bp = BlockPurging()
            bf = BlockFiltering()
            mb = BLAST('EJS')
            # if task == 'iTunes-Amazon':
            #     mb = CardinalityEdgePruning()
            # for meta in META:
            cleaned_blocks = bf.process(copy.deepcopy(blocks), data, tqdm_disable=True)
            filtered_blocks = bp.process(cleaned_blocks, data, tqdm_disable=True)
            candidate_pairs_blocks = mb.process(filtered_blocks, data, tqdm_disable=False)

            candidates = mb.export_to_df(candidate_pairs_blocks)

        elif method in ('CTT', 'AE'):
            start_time = time.time()
            candidates = deepBlock(left_df.copy(), right_df.copy(), K = 50, method = method)

        else:
            # Without this `continue` the code below would run with stale or
            # undefined `candidates` / `start_time`.
            print('method not found!')
            continue

        end_time = time.time()

        # Compile result rows
        candidates= candidates.astype(int)

        # Rename columns in candidates DataFrame
        candidates.rename(columns={'ltable_id': 'id1', 'rtable_id': 'id2'}, inplace=True)

        # Merge left_df and right_df with candidates based on ids using vectorized operations
        left_merged = candidates.merge(left_df, left_on='id1', right_on='id', suffixes=('', '_left'))
        right_merged = left_merged.merge(right_df, left_on='id2', right_on='id', suffixes=('_left', '_right'))
        result_df = right_merged.copy()

        # Merge with the ground truth to determine labels. The match pairs are
        # de-duplicated first so the left merge yields exactly one row per
        # candidate and the labels stay aligned with result_df.
        match_pairs = match_df[['ltable_id', 'rtable_id']].drop_duplicates()
        merged_df = result_df.merge(match_pairs, left_on=['id_left', 'id_right'], right_on=['ltable_id', 'rtable_id'], how='left', indicator=True)
        result_df['label'] = (merged_df['_merge'] == 'both').astype(int).to_numpy()
        # result_df.to_csv(task +'_'+method+ '.csv',index= False)

        # Calculate metrics
        RR = 1 - result_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        RR_opt = 1 - match_df.shape[0] / (left_df.shape[0] * right_df.shape[0])
        PC = np.sum(result_df['label'] == 1) / match_df.shape[0]
        PQ = np.sum(result_df['label'] == 1) / result_df.shape[0]
        F = 2 * PC * PQ / (PC + PQ)

        # Print and save results
        print(task, classic_method_name[method])
        print(round(100 * RR, 4), round(100 * PC, 4), round(100 * PQ, 4), round(100 * F, 4))
        print()

        runtime = end_time - start_time

        # Per-group metrics. calc_bias_block and sens_dict are defined outside
        # this cell; the unpacking below fixes the shape of the three results.
        MINOR, MAJOR, data_STAT = calc_bias_block(result_df,match_df,left_df,right_df, task , sens_dict)
        [RR_minor,PC_minor,PQ_minor,Fb_minor ] = MINOR
        [RR_major,PC_major,PQ_major,Fb_major ] = MAJOR
        [P_major, P_minor, M_major, M_minor] = data_STAT

        NUM = 2
        print('Minor: ',end='')
        print(round(RR_minor,NUM),round(PC_minor,NUM),round(PQ_minor,NUM), round(Fb_minor,NUM))
        print('major: ',end='')
        print(round(RR_major,NUM),round(PC_major,NUM),round(PQ_major,NUM), round(Fb_major,NUM))
        print('diff : ',end='')
        print(round(RR_major-RR_minor ,NUM),round(PC_major- PC_minor,NUM),round(PQ_major - PQ_minor,NUM),round(Fb_major - Fb_minor,NUM))


        print()
        # Record layout per (task, method): header line, overall metrics, then
        # four "bias" lines in the order minor, major, major - minor, data stats.
        with open(output_file, "a") as file:
            file.write(f"{task} {classic_method_name[method]}\n")
            file.write(f"{round(100 * RR, 4)} {round(100 * PC, 4)} {round(100 * PQ, 4)} {round(100 * F, 4)} {round(runtime, 2)}\n\n")
            file.write(f"bias {round( RR_minor, 4)} {round( PC_minor, 4)} {round( PQ_minor, 4)} {round( Fb_minor, 4)}\n\n")
            file.write(f"bias {round( RR_major, 4)} {round( PC_major, 4)} {round( PQ_major, 4)} {round( Fb_major, 4)}\n\n")
            file.write(f"bias {round( RR_major - RR_minor, 4)} {round( PC_major - PC_minor, 4)} {round( PQ_major - PQ_minor, 4)} {round( Fb_major - Fb_minor, 4)}\n\n")
            file.write(f"bias {P_major} {P_minor} {M_major} {M_minor}\n\n")






In [None]:
import pandas as pd

# Define the path to your text file
file_path = 'block_stat.txt'

# Read the file and process the data.
#
# Non-empty lines are classified by token count and first token:
#   * exactly two tokens    -> "<dataset> <method>" header; starts a new entry
#   * first token != 'bias' -> overall metrics "RR PC PQ <unused> time"
#                              (the fourth token is not read)
#   * first token == 'bias' -> four values; the 1st, 2nd and 3rd such lines
#                              after a header are stored under the minor,
#                              major and diff keys, any further ones are
#                              ignored
#
# Result: res[dataset][method] -> {metric name: value rounded to 5 decimals}
data = []  # not used in this cell; kept in case another cell reads it
res = {}

# Dictionary keys for the successive 'bias' lines of one entry.
# The mixed 'PC_' / 'Pc_' spelling is intentional: the LaTeX writers further
# down index the entries with exactly these keys.
_BIAS_KEYS = (
    ('RR_minor', 'PC_minor', 'PQ_minor', 'Fb_minor'),
    ('RR_major', 'Pc_major', 'PQ_major', 'Fb_major'),
    ('RR_diff', 'Pc_diff', 'PQ_diff', 'Fb_diff'),
)

with open(file_path, 'r') as file:
    for line in file:
        # Split the line into parts based on whitespace
        parts = line.split()
        if not parts:
            continue

        if len(parts) == 2:
            # Header line: open a fresh entry and restart the 'bias' counter.
            dataset, method = parts
            res.setdefault(dataset, {})[method] = {}
            cnt = 0
        elif parts[0] != 'bias':
            RR = float(parts[0])
            PC = float(parts[1])
            PQ = float(parts[2])
            time_ = float(parts[4])
            # Harmonic mean of PC and RR. When both are 0 the denominator
            # vanishes; report 0 instead of raising ZeroDivisionError.
            denom = PC + RR
            Fb = 2 * PC * RR / denom if denom else 0.0
            res[dataset][method] = {
                'RR': round(RR, 5),
                'PC': round(PC, 5),
                'PQ': round(PQ, 5),
                'Fb': round(Fb, 5),
                'time': round(time_, 5),
            }
        elif cnt < len(_BIAS_KEYS):
            # Explicit indexing (not slicing) so a short line still fails loudly.
            values = [round(float(parts[i]), 5) for i in range(1, 5)]
            res[dataset][method].update(zip(_BIAS_KEYS[cnt], values))
            cnt += 1
        # Any further 'bias' line of the same entry is deliberately skipped.


                

# LaTeX macro used as the row label for each blocking method.
# Insertion order is the order in which the table rows are written.
METHODS = dict(
    StandardBlocking=r'\stdBlock',
    QGramsBlocking=r'\qgram',
    ExtendedQGramsBlocking=r'\exQgram',
    SuffixArraysBlocking=r'\suffix',
    ExtendedSuffixArraysBlocking=r'\exSuffix',
    AUTO=r'\AutoBlock',
    CTT=r'\CTT',
)


# Datasets to report, in column order.
# The first assignment lists every dataset; the second one overrides it and
# is the list that is actually in effect.
TASKS = [
    'Amazon-Google',
    'Walmart-Amazon',
    'DBLP-GoogleScholar',
    'DBLP-ACM',
    'Beer',
    'Fodors-Zagat',
    'iTunes-Amazon',
]
TASKS = ['DBLP-GoogleScholar', 'DBLP-ACM', 'Beer']
# TASKS = ['Amazon-Google', 'Walmart-Amazon','Beer', 'Fodors-Zagat']


# Write the overall-metrics table body to exp1__latex.txt.
#
# One table row per method, one group of four cells ("RR & PC & PQ & Fb")
# per task.  The method's LaTeX label is written before the first task and
# the row is closed with "\\" after the last task.  First/last are decided
# by position in TASKS rather than by the names 'Amazon-Google' /
# 'iTunes-Amazon', so a shortened TASKS list still yields complete rows;
# for the full seven-task list the output is unchanged.
with open('exp1__latex.txt', 'w') as file:
    last_idx = len(TASKS) - 1
    for method in METHODS:
        for idx, task in enumerate(TASKS):
            row = res[task][method]

            # Harmonic mean of PC and RR; 0 when both are 0 so the division
            # cannot raise ZeroDivisionError.
            denom = row['PC'] + row['RR']
            Fb = 2 * row['PC'] * row['RR'] / denom if denom else 0.0

            cells = f"{row['RR']:.2f} & {row['PC']:.2f} & {row['PQ']:.2f} & {Fb:.2f}"
            if idx == 0:
                file.write(f"{METHODS[method]} & \n")
            file.write(cells + (" \\\\\n" if idx == last_idx else " & \n"))

        # Blank line between methods keeps the generated rows easy to read.
        file.write('\n')



# Write the major-minus-minor difference table body to exp2__latex.txt.
#
# One table row per method, one group of three cells
# ("RR_diff & Pc_diff & Fb diff") per task.  The Fb difference is recomputed
# here from the per-group RR and PC values rather than read from the stored
# 'Fb_diff' entry.  The method label is written before the first task and
# the row is closed with "\\" after the last task, decided by position in
# TASKS instead of the names 'Amazon-Google' / 'iTunes-Amazon'; for the full
# seven-task list the output is unchanged.
with open('exp2__latex.txt', 'w') as file:
    last_idx = len(TASKS) - 1
    for method in METHODS:
        for idx, task in enumerate(TASKS):
            row = res[task][method]

            # Harmonic means of PC and RR per group; 0 when both inputs are 0
            # so the division cannot raise ZeroDivisionError.
            denom_minor = row['PC_minor'] + row['RR_minor']
            Fb_minor = 2 * row['PC_minor'] * row['RR_minor'] / denom_minor if denom_minor else 0.0
            denom_major = row['Pc_major'] + row['RR_major']
            Fb_major = 2 * row['Pc_major'] * row['RR_major'] / denom_major if denom_major else 0.0

            fb_diff = Fb_major - Fb_minor

            cells = f"{row['RR_diff']:.2f} & {row['Pc_diff']:.2f} & {fb_diff:.2f}"
            if idx == 0:
                file.write(f"{METHODS[method]} & \n")
            # Two spaces before the terminator match the original row ending.
            file.write(cells + ("  \\\\\n" if idx == last_idx else " & \n"))

        file.write('\n')



# Write the per-group (minor vs. major) table body to exp3__latex.txt.
#
# Only the four datasets this table was written for are emitted; any other
# entry of TASKS is skipped, as before.  One table row per method, six cells
# per task: RR minor/major, PC minor/major, Fb minor/major.  The method label
# is written before the first emitted task and the row is closed with "\\"
# after the last one, decided by position rather than by the names
# 'Amazon-Google' / 'Fodors-Zagat', so a TASKS list holding only some of the
# four datasets still yields complete rows.
_EXP3_TASKS = ('Amazon-Google', 'Walmart-Amazon', 'Beer', 'Fodors-Zagat')

with open('exp3__latex.txt', 'w') as file:
    exp3_tasks = [task for task in TASKS if task in _EXP3_TASKS]
    last_idx = len(exp3_tasks) - 1
    for method in METHODS:
        for idx, task in enumerate(exp3_tasks):
            row = res[task][method]

            # Harmonic means of PC and RR per group; 0 when both inputs are 0
            # so the division cannot raise ZeroDivisionError.
            denom_minor = row['PC_minor'] + row['RR_minor']
            Fb_minor = 2 * row['PC_minor'] * row['RR_minor'] / denom_minor if denom_minor else 0.0
            denom_major = row['Pc_major'] + row['RR_major']
            Fb_major = 2 * row['Pc_major'] * row['RR_major'] / denom_major if denom_major else 0.0

            cells = (
                f"{row['RR_minor']:.2f} & {row['RR_major']:.2f} & "
                f"{row['PC_minor']:.2f} & {row['Pc_major']:.2f} & "
                f"{Fb_minor:.2f} &  {Fb_major:.2f}"
            )
            if idx == 0:
                file.write(f"{METHODS[method]} & \n")
            file.write(cells + (" \\\\\n" if idx == last_idx else " & \n"))

        file.write('\n')

