## Blocking statistics and bias



In [None]:
import os
import time
import copy
from func import *
import numpy as np
import pandas as pd
from pyjedai.datamodel import Data
from pyjedai.block_cleaning import BlockPurging, BlockFiltering
from pyjedai.comparison_cleaning import CardinalityEdgePruning,BLAST
from pyjedai.block_building import (
    StandardBlocking,
    ExtendedQGramsBlocking,
    ExtendedSuffixArraysBlocking,
    QGramsBlocking,
    SuffixArraysBlocking
)



# Results are appended to this file during the run, so remove any copy left
# over from a previous run; a missing file simply means there is nothing to do.
output_file = "RES.txt"
try:
    os.remove(output_file)
except FileNotFoundError:
    pass


# Task names iterated by the evaluation loop; each one is passed to
# load_blk_data to obtain the left/right tables and the ground-truth matches.
tasks = [
    "Beer",
    "Fodors-Zagat",
    "Walmart-Amazon",
    "Amazon-Google",
    "DBLP-ACM",
    "DBLP-GoogleScholar",
    "iTunes-Amazon",
]

# Blocking method codes evaluated for every task.
methods = [
    "SB",
    "EQG",
    "ESA",
    "QG",
    "SA",
    "AE",
    "CTT",
]



# Evaluate every (task, method) pair: build the candidate pairs, label them
# against the ground-truth matches, compute the blocking metrics and the
# minor/major group statistics, then print them and append them to `output_file`.
for task in tasks:
    for method in methods:
        # Tables are reloaded for every method rather than once per task;
        # presumably so each method starts from untouched frames -- TODO confirm.
        left_df, right_df, match_df = load_blk_data(task)

        if method in classic_method_dict:
            candidates, runtime, df = trad_blk(
                task, method, left_df, right_df, match_df, attr_type='all'
            )
        elif method in ('CTT', 'AE'):
            # deepBlock is given copies of the tables; only this call is timed.
            start = time.time()
            candidates = deepBlock(left_df.copy(), right_df.copy(), K=50, method=method)
            runtime = time.time() - start
        else:
            # Skip the pair: falling through would reuse the previous
            # iteration's `candidates`/`runtime` (or raise NameError on the
            # very first iteration).
            print('method not found!')
            continue

        # Cast the candidate ids to int and use the id1/id2 column names.
        candidates = candidates.astype(int)
        candidates = candidates.rename(columns={'ltable_id': 'id1', 'rtable_id': 'id2'})

        # Attach the left and right records to every candidate pair. The
        # columns id1/id2 and id_left/id_right are all kept because the
        # labelling and the sensitive-attribute lookup below read them.
        left_merged = candidates.merge(
            left_df, left_on='id1', right_on='id', suffixes=('', '_left')
        )
        right_merged = left_merged.merge(
            right_df, left_on='id2', right_on='id', suffixes=('_left', '_right')
        )
        result_df = right_merged.copy()

        # Label each candidate as a true match (1) or not (0). The match keys
        # are de-duplicated so the left merge returns exactly one row per
        # candidate, in the same order; otherwise a repeated ground-truth pair
        # would add rows and shift the index-aligned label assignment.
        match_pairs = match_df[['ltable_id', 'rtable_id']].drop_duplicates()
        merged_df = result_df.merge(
            match_pairs,
            left_on=['id_left', 'id_right'],
            right_on=['ltable_id', 'rtable_id'],
            how='left',
            indicator=True,
        )
        result_df['label'] = (merged_df['_merge'] == 'both').astype(int)

        # result_df.to_csv(task+'_'+method+'_blk.csv',index=False)
        # continue

        # Blocking metrics:
        #   RR = 1 - (#candidates / size of the full cross product)
        #   PC = true matches among the candidates / all ground-truth matches
        #   PQ = true matches among the candidates / #candidates
        #   F  = harmonic mean of PC and RR
        n_candidates = result_df.shape[0]
        n_true_candidates = np.sum(result_df['label'] == 1)
        RR = 1 - n_candidates / (left_df.shape[0] * right_df.shape[0])
        PC = n_true_candidates / match_df.shape[0]
        PQ = n_true_candidates / n_candidates
        F = 2 * PC * RR / (PC + RR)

        # Print the overall metrics as percentages.
        print(task, classic_method_name[method])
        print(round(100 * RR, 4), round(100 * PC, 4), round(100 * PQ, 4), round(100 * F, 4))
        print()

        # For DBLP-ACM the per-record sensitive flags are read from separate
        # CSV files; a candidate pair is flagged when either record is flagged.
        # Every other task passes an empty list to calc_bias_block.
        res_sens = []
        if task == 'DBLP-ACM':
            LEFT = pd.DataFrame({
                'sens': pd.read_csv(f'data/{task}/left_sens.csv')['0'],
                'id': pd.read_csv(f'data/{task}/tableA.csv')['id'],
            })
            RIGHT = pd.DataFrame({
                'sens': pd.read_csv(f'data/{task}/right_sens.csv')['0'],
                'id': pd.read_csv(f'data/{task}/tableB.csv')['id'],
            })

            # Build id -> flag lookups once instead of filtering the whole
            # table for every candidate pair. keep='first' (the default of
            # drop_duplicates) mirrors taking the first matching row.
            left_unique = LEFT.drop_duplicates('id')
            right_unique = RIGHT.drop_duplicates('id')
            left_sens = dict(zip(left_unique['id'], left_unique['sens']))
            right_sens = dict(zip(right_unique['id'], right_unique['sens']))

            # An id that is missing from the tables raises KeyError here.
            res_sens = [
                np.logical_or(right_sens[right_id], left_sens[left_id])
                for left_id, right_id in zip(result_df['id1'], result_df['id2'])
            ]

        # calc_bias_block returns three 4-element sequences: metrics for the
        # minor group, metrics for the major group, and dataset statistics
        # (semantics are defined in calc_bias_block, not visible here).
        MINOR, MAJOR, data_STAT = calc_bias_block(
            result_df, match_df, left_df, right_df, task, sens_dict, res_sens
        )
        RR_minor, PC_minor, PQ_minor, Fb_minor = MINOR
        RR_major, PC_major, PQ_major, Fb_major = MAJOR
        P_major, P_minor, M_major, M_minor = data_STAT

        # Number of decimals used for the console summary only.
        NUM = 2
        print('Minor: ', end='')
        print(round(RR_minor, NUM), round(PC_minor, NUM), round(PQ_minor, NUM), round(Fb_minor, NUM))
        print('major: ', end='')
        print(round(RR_major, NUM), round(PC_major, NUM), round(PQ_major, NUM), round(Fb_major, NUM))
        print('diff : ', end='')
        print(
            round(RR_major - RR_minor, NUM),
            round(PC_major - PC_minor, NUM),
            round(PQ_major - PQ_minor, NUM),
            round(Fb_major - Fb_minor, NUM),
        )

        print()
        # output_file = 'tmp.txt'
        # Append one record per (task, method): header line, overall metrics
        # with runtime, then minor / major / difference / dataset-stat lines.
        with open(output_file, "a") as file:
            file.write(f"{task} {classic_method_name[method]}\n")
            file.write(f"{round(100 * RR,8)} {round(100 * PC,8)} {round(100 * PQ,8)} {round(100 * F,8)} {round(runtime, 2)}\n")
            file.write(f"bias {round( RR_minor,8)} {round( PC_minor,8)} {round( PQ_minor,8)} {round( Fb_minor,8)}\n")
            file.write(f"bias {round( RR_major,8)} {round( PC_major,8)} {round( PQ_major,8)} {round( Fb_major,8)}\n")
            file.write(f"bias {round( RR_major - RR_minor,8)} {round( PC_major - PC_minor,8)} {round( PQ_major - PQ_minor,8)} {round( Fb_major - Fb_minor,8)}\n")
            file.write(f"bias {P_major} {P_minor} {M_major} {M_minor}\n\n")
            



        

