## Correlation of RR, PC, and PQ


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Running sum of the per-task |correlation| matrices over (RR, PC, PQ).
correlation_matrix_all = np.zeros((3, 3))

# Blocking method name as written in the stats file -> display name
# (the values look like LaTeX macro names).
METHODS = {
    'StandardBlocking':'\\stdBlock',
    'QGramsBlocking':'\\qgram',
    'ExtendedQGramsBlocking':'\\exQgram',
    'SuffixArraysBlocking':'\\suffix',
    'ExtendedSuffixArraysBlocking':'\\exSuffix',
    'AUTO':'\\AutoBlock',
    'CTT':'\\CTT'}

TASKS = ['Amazon-Google', 'Walmart-Amazon', 'DBLP-GoogleScholar', 'DBLP-ACM', 'Beer', 'Fodors-Zagat', 'iTunes-Amazon']
file_path = 'Block_stat_final.txt'

# Keys stored for the 1st, 2nd and 3rd 'bias' row of a (dataset, method) entry.
# NOTE: the mixed 'PC_' / 'Pc_' spelling is reproduced exactly as it was, so
# that anything already reading these keys keeps working.
_BIAS_ROW_KEYS = (
    ('RR_minor', 'PC_minor', 'PQ_minor', 'Fb_minor'),
    ('RR_major', 'Pc_major', 'PQ_major', 'Fb_major'),
    ('RR_diff', 'Pc_diff', 'PQ_diff', 'Fb_diff'),
)


def _harmonic_mean(a, b):
    """Return 2*a*b / (a + b), or 0.0 when a + b is zero."""
    total = a + b
    return 2 * a * b / total if total else 0.0


def _parse_block_stats(path):
    """Parse the blocking statistics file into ``{dataset: {method: metrics}}``.

    Three kinds of non-blank, whitespace-separated rows are recognised:

    * exactly two tokens: a header ``<dataset> <method>`` that opens an entry;
    * first token ``bias``: four floats in columns 1-4; only the first three
      such rows after a header are kept (see ``_BIAS_ROW_KEYS``);
    * anything else: a metrics row with RR, PC, PQ in columns 0-2 and the
      time in column 4 (column 3 is not read).

    All stored values are rounded to 5 decimals.

    Raises:
        ValueError: if a metrics or bias row appears before any header row.
    """
    parsed = {}
    dataset = method = None
    bias_rows_seen = 0
    with open(path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            parts = line.split()
            if not parts:
                continue

            if len(parts) == 2:
                dataset, method = parts
                parsed.setdefault(dataset, {})[method] = {}
                bias_rows_seen = 0
                continue

            if dataset is None:
                raise ValueError(
                    f"{path}:{line_no}: data row found before any "
                    f"'<dataset> <method>' header"
                )

            if parts[0] != 'bias':
                rr = float(parts[0])
                pc = float(parts[1])
                pq = float(parts[2])
                elapsed = float(parts[4])
                # A metrics row replaces whatever the entry held so far.
                parsed[dataset][method] = {
                    'RR': round(rr, 5),
                    'PC': round(pc, 5),
                    'PQ': round(pq, 5),
                    # Fb is the harmonic mean of PC and RR.
                    'Fb': round(_harmonic_mean(pc, rr), 5),
                    'time': round(elapsed, 5),
                }
            elif bias_rows_seen < len(_BIAS_ROW_KEYS):
                entry = parsed[dataset][method]
                for key, token in zip(_BIAS_ROW_KEYS[bias_rows_seen], parts[1:5]):
                    entry[key] = round(float(token), 5)
                bias_rows_seen += 1
            # Any further 'bias' rows for the same entry are ignored.
    return parsed


# Read the file and process the data
res = _parse_block_stats(file_path)



# Metric columns, in the order they appear on both heatmap axes.
METRIC_NAMES = ['RR', 'PC', 'PQ']

# For every task, correlate the metrics across the blocking methods and
# accumulate the absolute correlation matrices.
for task in TASKS:
    data = pd.DataFrame(
        [[res[task][method][name] for name in METRIC_NAMES] for method in METHODS],
        columns=METRIC_NAMES,
    )
    # .to_numpy() makes it explicit that the accumulator stays a plain array.
    correlation_matrix_all += np.abs(data.corr()).to_numpy()

# Mean over tasks. Wrapping the array in a labelled DataFrame makes seaborn
# print RR/PC/PQ on the axes instead of the positional indices 0/1/2.
mean_correlation = pd.DataFrame(
    correlation_matrix_all / len(TASKS),
    index=METRIC_NAMES,
    columns=METRIC_NAMES,
)

# Plot the overall correlation heatmap
plt.figure(figsize=(10, 10))
colorblind_palette = sns.color_palette("colorblind", as_cmap=True)

sns.heatmap(
    mean_correlation,
    annot=True,
    cmap=colorblind_palette,
    center=0,
    annot_kws={"size": 40, "weight": "bold"},
    cbar=False
)

plt.xticks(fontsize=60)
plt.yticks(fontsize=60)
plt.tick_params(axis='both', which='both', length=10, width=3)
plt.tight_layout()
plt.savefig('FIGS/correlation_heatmap_all.pdf')
plt.close()


## Time

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


TASKS = ['Amazon-Google', 'Walmart-Amazon', 'DBLP-GoogleScholar', 'DBLP-ACM', 'Beer', 'Fodors-Zagat', 'iTunes-Amazon']
file_path = 'block_stat_time.txt'


def _parse_timing_stats(path):
    """Parse the timing file into ``{dataset: {method: {'time_avg', 'time_std'}}}``.

    A non-blank row whose first token is a number is a timing row
    (``<avg> <std>``, both rounded to 5 decimals) for the most recent header;
    any other non-blank row is a header ``<dataset> <method>``.

    Raises:
        ValueError: on a header with fewer than two tokens, a timing row
            before any header, or a timing row whose second column is
            missing or not a number.
    """
    parsed = {}
    dataset = method = None
    with open(path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            parts = line.split()
            if not parts:
                continue

            # Only the "is the first token numeric?" probe is guarded, so
            # genuine errors are no longer silently turned into header rows.
            try:
                time_avg = float(parts[0])
            except ValueError:
                if len(parts) < 2:
                    raise ValueError(
                        f"{path}:{line_no}: expected '<dataset> <method>', got {line!r}"
                    ) from None
                dataset, method = parts[0], parts[1]
                parsed.setdefault(dataset, {})[method] = {}
                continue

            if dataset is None:
                raise ValueError(
                    f"{path}:{line_no}: timing row found before any "
                    f"'<dataset> <method>' header"
                )
            if len(parts) < 2:
                raise ValueError(f"{path}:{line_no}: timing row has no std column")
            parsed[dataset][method] = {
                'time_avg': round(time_avg, 5),
                'time_std': round(float(parts[1]), 5),
            }
    return parsed


# Read the file and process the data
res = _parse_timing_stats(file_path)

# Hard-coded (time_avg, time_std) per task for the two methods that are not
# read from the timing file.
_EXTRA_TIMINGS = {
    'AUTO': {
        'Beer': (9.16185941696167, 0.4758632559586421),
        'iTunes-Amazon': (112.45368194580078, 3.491730883903146),
        'Fodors-Zagat': (0.7223541736602783, 0.04316247998284229),
        'Walmart-Amazon': (18.85678825378418, 0.5236079390049121),
        'Amazon-Google': (5.152085638046264, 0.16152604483293306),
        'DBLP-ACM': (7.0379444599151615, 0.2323224380155196),
        'DBLP-GoogleScholar': (58.06285433769226, 2.8567257992252384),
    },
    'CTT': {
        'Beer': (10.189891958236695, 0.4049498209339713),
        'iTunes-Amazon': (156.71345224380494, 5.504195632915928),
        'Fodors-Zagat': (0.642644739151001, 0.015861690254546323),
        'Walmart-Amazon': (27.573931312561037, 5.267834604690219),
        'Amazon-Google': (5.319742012023926, 0.3558836507519921),
        'DBLP-ACM': (7.460848760604859, 0.18120981692447158),
        'DBLP-GoogleScholar': (89.94730200767518, 9.554257975921635),
    },
}
for _method, _per_task in _EXTRA_TIMINGS.items():
    for _task, (_avg, _std) in _per_task.items():
        # setdefault: do not fail if a task is absent from the timing file.
        res.setdefault(_task, {})[_method] = {'time_avg': _avg, 'time_std': _std}

# Method key in `res` -> legend label; iteration order is the plotting order.
METHODS = {
    'StandardBlocking':'stdBlck',
    'QGramsBlocking':'QGram',
    'ExtendedQGramsBlocking':'XQGram',
    'SuffixArraysBlocking':'Suffix',
    'ExtendedSuffixArraysBlocking':'XSuffix',
    'AUTO':'AUTO',
    'CTT':'CTT'
}




# One colour / marker per entry of METHODS, in METHODS order.
colors = ["#E69F00", "#33BFA1", "#007D59", "#56B4E9","#0072B2", "#E15759", "#8B2B2E"]
markers = ['o', 's', 'D', '^', 'v', '>', '<']

# Initial pair pool size per task: rows(tableA) * rows(tableB), expressed in
# millions so that the values match the "x 10^6" unit stated in the x label.
# Computed once here because it depends on the task only, not on the method.
pool_size = {}
for task in TASKS:
    left_df = pd.read_csv(f"data/{task}/tableA.csv")
    right_df = pd.read_csv(f"data/{task}/tableB.csv")
    pool_size[task] = left_df.shape[0] * right_df.shape[0] / 1000000


plt.figure(figsize=(14, 12))

for idx, M in enumerate(METHODS.keys()):
    # [size, mean time, std] per task, ordered by size so the line is monotone in x.
    res_plot = sorted(
        (
            [pool_size[task], res[task][M]['time_avg'], res[task][M]['time_std']]
            for task in TASKS
        ),
        key=lambda x: x[0],
    )

    plt.loglog(
        [x[0] for x in res_plot],
        [x[1] for x in res_plot],
        marker=markers[idx % len(markers)],
        color=colors[idx % len(colors)],
        label=METHODS[M].replace('\\',''),
        linestyle='-',  # Solid line
        linewidth=4,
        markersize=23
    )


plt.xlabel("$|P_{init}|$: Initial pair pool size ($\\times 10^6$)", fontsize=50)

plt.ylabel("Time (sec)",fontsize=50)
# Keep only a subset of the automatically chosen x ticks (drop the first
# and the last two).
current_ticks = plt.xticks()[0]
new_ticks = current_ticks[1:-2:]
plt.xticks(new_ticks,fontsize=48)
plt.tick_params(axis='both', which='both', length=12, width=1.5)
plt.yticks(fontsize=48)
plt.legend(fontsize=48, loc='upper left', bbox_to_anchor=(0.005, 0.995), borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
plt.tight_layout()
plt.savefig('FIGS/time_log.pdf')
# The figure is intentionally not closed here (plt.close() was disabled).




In [None]:

# Linear-scale version of the runtime plot: pool size in millions of pairs
# on x, time in minutes on y.

# rows(tableA) * rows(tableB) per task, in millions. Computed once because it
# depends on the task only, not on the method being plotted.
pool_size_millions = {}
for task in TASKS:
    left_df = pd.read_csv(f"data/{task}/tableA.csv")
    right_df = pd.read_csv(f"data/{task}/tableB.csv")
    pool_size_millions[task] = left_df.shape[0] * right_df.shape[0] / 1000000

plt.figure(figsize=(12, 10))

for idx, M in enumerate(METHODS.keys()):
    # [size, mean time (min), std (min)] per task, ordered by size.
    # The std column is not drawn; it is kept so error bars can be added
    # with plt.errorbar(..., yerr=[x[2] for x in res_plot]) if wanted.
    res_plot = sorted(
        (
            [
                pool_size_millions[task],
                res[task][M]['time_avg'] / 60,
                res[task][M]['time_std'] / 60,
            ]
            for task in TASKS
        ),
        key=lambda x: x[0],
    )

    plt.plot(
        [x[0] for x in res_plot],
        [x[1] for x in res_plot],
        marker=markers[idx % len(markers)],
        color=colors[idx % len(colors)],
        label=METHODS[M].replace('\\',''),
        linestyle='-',  # Solid line
        linewidth=3.8,
        markersize=16
    )


plt.xlabel("Initial pair pool size $|P_{init}|$ ($\\times 10^6$)", fontsize=42)
plt.ylabel("Time (min)",fontsize=42)
# Show every second automatically chosen x tick, starting from the second one.
current_ticks = plt.xticks()[0]
new_ticks = current_ticks[1::2]
plt.xticks(new_ticks,fontsize=45)
plt.tick_params(axis='both', which='both', length=12, width=1.5)
plt.yticks(fontsize=45)
plt.legend(fontsize=40, loc='upper left', bbox_to_anchor=(0.005, 0.995), borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
plt.tight_layout()
plt.savefig('FIGS/time.pdf')
plt.close()


## Time vs. pair-pool fraction (from the same dataset)

In [165]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np




# Method key -> legend label; iteration order is the plotting order.
METHODS = {
    'StandardBlocking':'stdBlck',
    'QGramsBlocking':'QGram',
    'ExtendedQGramsBlocking':'XQGram',
    'SuffixArraysBlocking':'Suffix',
    'ExtendedSuffixArraysBlocking':'XSuffix',
    'AUTO':'AUTO',
    'CTT':'CTT'
}

TASKS = ['Amazon-Google', 'Walmart-Amazon', 'DBLP-GoogleScholar', 'DBLP-ACM', 'Beer', 'Fodors-Zagat', 'iTunes-Amazon']

# One colour / marker per entry of METHODS, in METHODS order.
colors = ["#E69F00", "#33BFA1", "#007D59", "#56B4E9","#0072B2", "#E15759", "#8B2B2E"]
markers = ['o', 's', 'D', '^', 'v', '>', '<']

# Hard-coded curves for the two methods that are not read from the stats
# file: time per pool fraction, plotted on the same axes as the parsed ones.
_FIXED_CURVES = {
    'AUTO': {
        'frac': [0.01,0.2,0.4,0.6,0.8,1],
        'time': [
            2.361281,
            8.007564,
            10.565273,
            13.298214,
            15.794987,
            19.11888,
        ],
    },
    'CTT': {
        'frac': [0.01,0.2,0.4,0.6,0.8,1],
        'time': [
            2.569706153869629,
            10.010574245452881,
            14.817002058029175,
            20.32390217781067,
            21.906998586654662,
            25.958156061172485,
        ],
    },
}


def _parse_fraction_timings(path, known_tasks):
    """Parse a fraction/timing file into ``{dataset: {method: {'frac', 'time'}}}``.

    A non-blank row whose first token is one of ``known_tasks`` is a header
    ``<dataset> <method>``. Every other non-blank row is a data row whose
    column 0 is the fraction and column 2 the time; it is appended to the
    record opened by the most recent header.

    Each record is registered as soon as its header is read, so it is kept
    even when the file does not end with a blank line or when two records
    are not separated by one.

    Raises:
        ValueError: if a data row appears before any header row.
    """
    parsed = {}
    current = None  # record currently being filled
    with open(path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            parts = line.split()
            if not parts:
                continue
            if parts[0] in known_tasks:
                current = {'frac': [], 'time': []}
                parsed.setdefault(parts[0], {})[parts[1]] = current
                continue
            if current is None:
                raise ValueError(
                    f"{path}:{line_no}: data row found before any "
                    f"'<dataset> <method>' header"
                )
            current['frac'].append(float(parts[0]))
            current['time'].append(float(parts[2]))
    return parsed


# for i in [2,3,4,5]:
for i in [5]:

    file_path = f'block_stat_time_same{i}.txt'

    # Read the file and process the data
    res = _parse_fraction_timings(file_path, TASKS)
    if not res:
        raise ValueError(f"{file_path}: no '<dataset> <method>' header found")

    plt.figure(figsize=(14, 12))

    # Only the first dataset that appears in the file is plotted.
    TT = list(res.keys())[0]
    for idx, M in enumerate(METHODS.keys()):
        curve = _FIXED_CURVES[M] if M in _FIXED_CURVES else res[TT][M]
        frac = curve['frac']
        time_ = curve['time']

        plt.plot(
            100*np.array(frac),  # fraction -> percent, to match the x label
            time_,
            marker=markers[idx % len(markers)],
            color=colors[idx % len(colors)],
            label=METHODS[M].replace('\\',''),
            linestyle='-',  # Solid line
            linewidth=4,
            markersize=23
        )

    plt.xlabel("$\\frac{|P_{new}|}{|P_{init}|}$ (%)",fontsize=50)
    plt.ylabel("Time (sec)",fontsize=50)
    plt.xticks(fontsize=48)
    plt.tick_params(axis='both', which='both', length=12, width=1.5)
    plt.yticks(fontsize=48)
    plt.legend(fontsize=48, loc='upper left', bbox_to_anchor=(0.005, 0.995), borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
    plt.tight_layout()

    plt.savefig('FIGS/time_'+TT+'.pdf')
    print('time_'+TT+'.pdf')
    plt.close()










time_Walmart-Amazon.pdf


## Scatter plot of PC vs. PQ per method

In [None]:
import pandas as pd
# Define the path to your text file
file_path = 'Block_stat_final.txt'

# Keys stored for the 1st, 2nd and 3rd 'bias' row of a (dataset, method) entry.
# NOTE: the mixed 'PC_' / 'Pc_' spelling is reproduced exactly as it was, so
# that anything already reading these keys keeps working.
_BIAS_ROW_KEYS = (
    ('RR_minor', 'PC_minor', 'PQ_minor', 'Fb_minor'),
    ('RR_major', 'Pc_major', 'PQ_major', 'Fb_major'),
    ('RR_diff', 'Pc_diff', 'PQ_diff', 'Fb_diff'),
)


def _harmonic_mean(a, b):
    """Return 2*a*b / (a + b), or 0.0 when a + b is zero."""
    total = a + b
    return 2 * a * b / total if total else 0.0


def _parse_block_stats(path):
    """Parse the blocking statistics file into ``{dataset: {method: metrics}}``.

    Three kinds of non-blank, whitespace-separated rows are recognised:

    * exactly two tokens: a header ``<dataset> <method>`` that opens an entry;
    * first token ``bias``: four floats in columns 1-4; only the first three
      such rows after a header are kept (see ``_BIAS_ROW_KEYS``);
    * anything else: a metrics row with RR, PC, PQ in columns 0-2 and the
      time in column 4 (column 3 is not read).

    All stored values are rounded to 5 decimals.

    Raises:
        ValueError: if a metrics or bias row appears before any header row.
    """
    parsed = {}
    dataset = method = None
    bias_rows_seen = 0
    with open(path, 'r') as file:
        for line_no, line in enumerate(file, start=1):
            parts = line.split()
            if not parts:
                continue

            if len(parts) == 2:
                dataset, method = parts
                parsed.setdefault(dataset, {})[method] = {}
                bias_rows_seen = 0
                continue

            if dataset is None:
                raise ValueError(
                    f"{path}:{line_no}: data row found before any "
                    f"'<dataset> <method>' header"
                )

            if parts[0] != 'bias':
                rr = float(parts[0])
                pc = float(parts[1])
                pq = float(parts[2])
                elapsed = float(parts[4])
                # A metrics row replaces whatever the entry held so far.
                parsed[dataset][method] = {
                    'RR': round(rr, 5),
                    'PC': round(pc, 5),
                    'PQ': round(pq, 5),
                    # Fb is the harmonic mean of PC and RR.
                    'Fb': round(_harmonic_mean(pc, rr), 5),
                    'time': round(elapsed, 5),
                }
            elif bias_rows_seen < len(_BIAS_ROW_KEYS):
                entry = parsed[dataset][method]
                for key, token in zip(_BIAS_ROW_KEYS[bias_rows_seen], parts[1:5]):
                    entry[key] = round(float(token), 5)
                bias_rows_seen += 1
            # Any further 'bias' rows for the same entry are ignored.
    return parsed


# Read the file and process the data
res = _parse_block_stats(file_path)


# Methods to collect points for: those recorded for the 'Beer' dataset.
M_all = list(res['Beer'].keys())

# method -> list of [PC, PQ] pairs, one pair per dataset in `res`.
scatter_pnt = {M: [] for M in M_all}
for k in res:
    for M in M_all:
        scatter_pnt[M].append([res[k][M]['PC'], res[k][M]['PQ']])


# Method key -> legend label; iteration order is the plotting order.
METHODS = {
    'StandardBlocking':'stdBlck',
    'QGramsBlocking':'QGram',
    'ExtendedQGramsBlocking':'XQGram',
    'SuffixArraysBlocking':'Suffix',
    'ExtendedSuffixArraysBlocking':'XSuffix',
    'AUTO':'AUTO',
    'CTT':'CTT'
}


# One colour / marker per entry of METHODS, in METHODS order.
colors = ["#E69F00", "#33BFA1", "#007D59", "#56B4E9","#0072B2", "#E15759", "#F28E2B"]
markers = ['o', 's', 'D', '^', 'v', '>', '<']


plt.figure(figsize=(12, 10))

for idx, M in enumerate(METHODS):
    row = np.array(scatter_pnt[M])
    PC = row[:, 0]  # first column of each stored pair
    PQ = row[:, 1]  # second column of each stored pair

    # PQ on the x axis, PC on the y axis (same placement as before).
    plt.scatter(
        PQ,
        PC,
        marker=markers[idx],
        color=colors[idx],
        label=METHODS[M].replace('\\',''),
        edgecolors='none',
        linewidth=0,
        s= 1000
    )


# The axis labels now name the quantity actually drawn on each axis;
# previously they were swapped relative to the plotted data.
plt.xlabel("PQ (%)",fontsize=50)
plt.ylabel("PC (%)",fontsize=50)
plt.xticks(fontsize=45)
plt.tick_params(axis='both', which='both', length=12, width=1.5)
plt.yticks(fontsize=45)
plt.legend(fontsize=40, borderaxespad=0, handletextpad=0.2, borderpad=0.2, labelspacing=0.2)
plt.tight_layout()

# Saving is currently disabled:
# plt.savefig('PQ-PC.pdf')




In [None]:

# Hard-coded (time_avg, time_std) pairs, grouped by method and then by task.
# The report below treats the values as seconds.
_TIMINGS = {
    'AUTO': {
        'Beer': (9.16185941696167, 0.4758632559586421),
        'iTunes-Amazon': (112.45368194580078, 3.491730883903146),
        'Fodors-Zagat': (0.7223541736602783, 0.04316247998284229),
        'Walmart-Amazon': (18.85678825378418, 0.5236079390049121),
        'Amazon-Google': (5.152085638046264, 0.16152604483293306),
        'DBLP-ACM': (7.0379444599151615, 0.2323224380155196),
        'DBLP-GoogleScholar': (58.06285433769226, 2.8567257992252384),
    },
    'CTT': {
        'Beer': (10.189891958236695, 0.4049498209339713),
        'iTunes-Amazon': (156.71345224380494, 5.504195632915928),
        'Fodors-Zagat': (0.642644739151001, 0.015861690254546323),
        'Walmart-Amazon': (27.573931312561037, 5.267834604690219),
        'Amazon-Google': (5.319742012023926, 0.3558836507519921),
        'DBLP-ACM': (7.460848760604859, 0.18120981692447158),
        'DBLP-GoogleScholar': (89.94730200767518, 9.554257975921635),
    },
}

# res[task][method] -> {'time_avg': ..., 'time_std': ...}
res = {}
for _method, _per_task in _TIMINGS.items():
    for _task, (_avg, _std) in _per_task.items():
        res.setdefault(_task, {})[_method] = {'time_avg': _avg, 'time_std': _std}


# Method key -> display name (the values look like LaTeX macro names).
METHODS = {
    'StandardBlocking':'\\stdBlock',
    'QGramsBlocking':'\\qgram',
    'ExtendedQGramsBlocking':'\\exQgram',
    'SuffixArraysBlocking':'\\suffix',
    'ExtendedSuffixArraysBlocking':'\\exSuffix',
    'AUTO':'\\AutoBlock',
    'CTT':'\\CTT'}


TASKS = ['Amazon-Google', 'Walmart-Amazon', 'DBLP-GoogleScholar', 'DBLP-ACM', 'Beer', 'Fodors-Zagat', 'iTunes-Amazon']


# Print the AUTO timing of every task: values up to 60 are shown as-is with
# an 's' suffix, larger ones are divided by 60 and shown with an 'm' suffix.
for task in TASKS:
    auto_timing = res[task]['AUTO']
    mean_time = auto_timing['time_avg']
    spread = auto_timing['time_std']

    if mean_time <= 60:
        print(task, round(mean_time, 1), round(spread, 2), 's')
        continue

    print(task, round(mean_time / 60, 1), round(spread / 60, 1), 'm')





