##IMPORT

In [1]:
!pip install py_stringmatching
!pip install py_entitymatching
!pip install py_stringsimjoin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py_stringmatching
  Downloading py_stringmatching-0.4.2.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.8/661.8 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: py_stringmatching
  Building wheel for py_stringmatching (setup.py) ... [?25l[?25hdone
  Created wheel for py_stringmatching: filename=py_stringmatching-0.4.2-cp38-cp38-linux_x86_64.whl size=2692337 sha256=ceeac5f8b011f18c5ec836c2e92a3ad43d4ce56770ea7a484c72c692f14b2db2
  Stored in directory: /root/.cache/pip/wheels/6d/01/57/0131e0a87fa6471ec265bd9777de2e789a4bd1248c279fa875
Successfully built py_stringmatching
Installing collected packages: py_stringmatching
Successfully installed py_stringmatching-0.4.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/p

In [2]:
import pandas as pd
import ssl
import numpy as np
import set_like
from copy import deepcopy
import py_stringmatching as sm
import py_entitymatching as em
import py_stringsimjoin as ssj
ssl._create_default_https_context = ssl._create_unverified_context

from six import ensure_binary
from hashlib import md5

from __future__ import division

from math import log

from bitstring import hamdist, digits, popcount
from tabhash import SimpleTabulation
from set_like import BloomFilter

##FUNCTIONS AND CLASSES DEFINITION

In [50]:
class Minwise(object):
    def __init__(self, m, q=64, seed=None, klass=SimpleTabulation):
        # m is the length of the list of hashes returned by the hash method
        # if seed is not None, then its value will
        # be used to seed the PRNG
        # numbits denotes length of hash returned by the hash method
        if not m > 0:
            raise ValueError('Minwise hash must have length > 0')
        if not seed is None:
            np.random.seed(seed)
        self._hashers = [klass(q=q) for _ in range(m)]
        self._cache = {}

    @property
    def hashers(self):
        return self._hashers

    def hash(self, tokens):
        # returns a list of minwise hashes
        # ensure tokens is a frozenset
        if not isinstance(tokens, frozenset):
            tokens = frozenset(tokens)
        if not tokens in self._cache:
            self._cache[tokens] = [min(h.hash(s) for s in tokens) for h in self.hashers]
        return self._cache[tokens]


class B_bit(Minwise):
    def __init__(self, b, m, q=64, seed=None):
        super(B_bit, self).__init__(m, q, seed)
        self._mod = 2**b

    def hash(self, tokens):
        # returns a list of b-bit minwise hashes
        if not isinstance(tokens, frozenset):
            tokens = frozenset(tokens)
        if not tokens in self._cache:
            mod = self._mod
            self._cache[tokens] = [x % mod for x in super(B_bit, self).hash(tokens)]
        return self._cache[tokens]


class Concatenated(B_bit):
    def __init__(self, m, q=64, seed=None):
        super(Concatenated, self).__init__(1, m, q, seed)

    def hash(self, tokens):
        # returns a concatenated 1-bit hash
        if not isinstance(tokens, frozenset):
            tokens = frozenset(tokens)
        if not tokens in self._cache:
            h = 0
            for i, bit in enumerate(reversed(super(Concatenated, self).hash(tokens))):
                if bit:
                    h += (1 << i)
            self._cache[tokens] = C_hash(h, len(self.hashers))
        return self._cache[tokens]


class C_hash(int):
    # a 1-bit concatenated hash instance
    # i.e. a Python long with a couple of
    # of additional attributes
    def __new__(cls, bits, m, N=1):
        obj = int.__new__(cls, bits)
        obj._m = m # number of significant bits
        obj._N = N # compression factor for XOR compression
        return obj

    def hex(self):
        # return a hex representation
        return hex(self)

    def compressed(self, m):
        # returns a compressed hash of length m
        # by simply reducing number of significant bits
        if not m <= self._m:
            raise ValueError('Cannot compress to larger size')
        return self.__class__.__new__(self.__class__, self % 2**m, m)

    def XOR(self, N):
        # returns a compressed hash using XOR
        # N must be a positive power of 2
        # (and for most applications probably not more
        # than around 8)
        if not popcount(N) == 1:
            raise ValueError(' N must be a positive power of 2')
        chunksize = self.m // N
        if not N * chunksize == self.m:
            raise ValueError('Hash size %s not divisible by %s' % (self._m, N))
        h = 0
        x = self
        mod = 2**chunksize
        for _ in range(N):
            h ^= (x % mod)
            x = x >> chunksize
        return self.__class__.__new__(self.__class__, h, chunksize, N=self._N*N)

    @ property
    def digits(self):
        # returns the bitstring representation
        return digits(self, pad=self._m)

    @property
    def m(self):
        return self._m

    @property
    def N(self):
        return self._N


############## Functions based on token sets ##############


def Jaccard(tokens1, tokens2):
    # returns exact Jaccard
    # similarity measure for
    # two token sets
    tokens1 = set(tokens1)
    tokens2 = set(tokens2)
    return len(tokens1&tokens2) / len(tokens1|tokens2)

def dice_coefficient(tokens1, tokens2):
    # returns exact dice coefficient
    # similarity measure for
    # two token sets
    tokens1 = set(tokens1)
    tokens2 = set(tokens2)
    return 2 * len(tokens1&tokens2) / (len(tokens1) + len(tokens2))


############## Functions for b-bit hashes ##############


def J_hat(hashes1, hashes2, b):
    # returns estimated Jaccard score from lists of
    # minwise hashes 
    # hashes1 and hashes2 are lists for different
    # token sets using the same (minwise) hash functions
    # only the b most significant bits are used
    # in the evaluation of the measure
    # ensuring that b is 'much' less than the
    # number of bits in the full hash values
    # assumes the probability of collisions on the
    # full hash is negligible
    if not len(hashes1) == len(hashes2):
        raise ValueError('Hash lists must have equal length')
    mod = 2**b
    frac = sum([(a-b) % mod == 0 for (a,b) in zip(hashes1, hashes2)]) / len(hashes1)
    c = (1/2)**b
    return (frac-c)/(1-c)


############## Functions for 1-bit concatenated hashes ##############


def J_hat_from_conc(a, b, m, N=1, truncate=True):
    # returns estimated Jaccard
    # similarity measure for
    # a pair of comparable concatenated hashes
    # if truncate is True, then 0 is
    # returned rather than a negative value
    # N is the compression factor
    try:
        res = (1-2*hamdist(a,b)/m)**(1/N)
    except ValueError:
        # enforce truncation for N > 1
        return 0
    if truncate and res < 0:
        return 0
    return res

def var_J_hat(J, m, N=1):
    # returns variance of
    # above estimator (without truncation)
    # for true Jaccard score J, compressed hash length m
    # and XOR compression factor N
    # result is approximate for N > 1
    return (1/J**(2*N-2)-J**2)/m/N/N

def mean_J_hat(J, m, N=1):
    # returns expected value of
    # above estimator (without truncation)
    # for true Jaccard score J, compressed hash length m
    # and XOR compression factor N
    # result is approximate for N > 1
    return J - (N-1)*(1/J**(2*N-1)-J)/m/N/N/2

def MSE_J_hat(J, m, N=1):
    # returns MSE of
    # above estimator (without truncation)
    # for true Jaccard score J, compressed hash length m
    # and XOR compression factor N
    # result is approximate for N > 1
    if N == 1:
        return var_J_hat(J, m, N)
    return var_J_hat(J, m, N) + (J - mean_J_hat(J, m, N))**2


############## Bloom filter functions ##############


def J_hat_from_bf(a, b):
    # returns estimated Jaccard
    # similarity measure for
    # a pair of comparable Bloom filters
    return popcount(a&b) / popcount(a|b)

def D_hat_from_bf(a, b):
    # returns estimated Dice coefficient
    # similarity measure for
    # a pair of comparable Bloom filters
    return 2*popcount(a&b)/(popcount(a) + popcount(b))

def J_hat_from_bf_corrected(a, b, m):
    # bias corrected estimator due to Swamidass and Baldi (2007)
    A = -m*log(1-popcount(a)/m)
    B = -m*log(1-popcount(b)/m)
    AB = -m*log(1-popcount(a|b)/m)
    num = max(A+B-AB, 0)
    denom = min(AB, A+B)
    return num / denom

def n_grams(s, n, pad=False):
    # n >= 1
    # returns a list of n-grams
    # or an empty list if n > len(s)
    if pad:
        s = '_' * (n-1) + s + '_' * (n-1)
    return [s[i:i+n] for i in range(len(s)-n+1)]

def ApplyBloomFilter(x):
    tokens=n_grams(x['mix'], 2, pad=True)
    funcs= set_like.k_hashes(4,1024,seed=6)
    bf=set_like.BloomFilter(1024,funcs)
    for token in tokens:
        bf.add(token)
    
    return bf.bits

def Funzione_J_hat_from_bf(x):
    return J_hat_from_bf(x['bf_l'],x['bf_r'])

def StableMarriage(ST):
  ST.columns= ['rec_id_l', 'rec_id_r','sim'] 
  MATCH = pd.DataFrame(columns=['rec_id_l', 'rec_id_r','sim'])
  MT=deepcopy(ST)
  MT=MT.sort_values(['sim'], ascending=[False])
  while True:
    R=MT.loc[(~MT['rec_id_l'].isin(MATCH['rec_id_l'])) & (~MT['rec_id_r'].isin(MATCH['rec_id_r']))]
    if len(R)==0:
      break
    x=R.iloc[0,:]
    MATCH=MATCH.append(x, ignore_index=True)
  return MATCH

def Evaluation(Gold:pd.DataFrame, Match:pd.DataFrame):
  Gold = Gold[['l_id','r_id']]
  Match = Match[['l_id','r_id']]
  FOJ = Gold.merge(Match, how='outer', indicator=True)

  TP = FOJ[FOJ['_merge']=='both']
  FP = FOJ[FOJ['_merge']=='right_only']
  FN = FOJ[FOJ['_merge']=='left_only']

  if len(TP) == 0:
      return pd.DataFrame({
              'MT':[len(Match)],
              'TP':[len(TP)],
              'FP':[len(FP)],
              'FN':[len(FN)],
              'P':[round(0,4)],
              'R':[round(0,4)],
              'F':[round(0,4)]
          })
  else:
      P = len(TP)/(len(TP)+len(FP))
      R = len(TP)/(len(TP)+len(FN))
      F = 2 * P * R / ( P + R )
      return pd.DataFrame({
              'MT':[len(Match)],
              'TP':[len(TP)],
              'FP':[len(FP)],
              'FN':[len(FN)],
              'P':[round(P,4)],
              'R':[round(R,4)], 
              'F':[round(F,4)]
          })
      
def preprocess_title(title):
  title = title.lower()
  title = title.replace('-org', ' ')
  title = title.replace('-dup-0', ' ')
  title = title.replace('-dup-1', ' ')
  title = title.replace('-dup-2', ' ')
  title = title.replace('-dup-3', ' ')
  title = title.replace('-dup-4', ' ')
  title = title.replace('-dup-5', ' ')
  title = title.replace('.0', ' ')
  title = title.replace('rec-', ' ')
  return title.strip()

##TEST

In [5]:
attributeString1 = "Giovanni"
attributeString2 = "Giovanno"

attributeTokens1 = n_grams(attributeString1, 2, pad=True) # bi-gram
attributeTokens2 = n_grams(attributeString2, 2, pad=True) # bi-gram

In [6]:
h = Concatenated(1024, q=64, seed=6)

attributeHash1 = h.hash(attributeTokens1)
attributeHash2 = h.hash(attributeTokens2)

print("J_hat_from_conc:", J_hat_from_conc(attributeHash1, attributeHash2, 1024))

J_hat_from_conc: 0.603515625


In [7]:
funcs = set_like.k_hashes(4,1024,seed=6)

bf1 = set_like.BloomFilter(1024,funcs)
bf2 = set_like.BloomFilter(1024,funcs)

for token in attributeTokens1:
    bf1.add(token)
for token in attributeTokens2:
    bf2.add(token)

print("J_hat_from_bf:", J_hat_from_bf(bf1.bits,bf2.bits))

J_hat_from_bf: 0.6511627906976745


##LOAD DATASETS: Esempio 1

In [160]:
datasetName_A = "ds1_a.csv"
datasetName_B = "ds1_b.csv"

TableA = pd.read_csv(datasetName_A, encoding = 'unicode_escape').astype('string')
TableB = pd.read_csv(datasetName_B, encoding = 'unicode_escape').astype('string')

In [None]:
TableA

In [None]:
TableB

##CUSTOM PPRL: Esempio 1

In [172]:
### ENCODE TABLE A

#df_a=TableA.reset_index()
TableA.drop_duplicates(TableA.columns[2:4], keep='first', inplace=True) #Index([' fname', ' gname'], dtype='object')

mixColumnsA = TableA.columns[2:6] #Index([' fname', ' gname', ' address', ' dob'], dtype='object')
TableA['mix'] = ' '

for x in list(mixColumnsA) :
  TableA['mix'] += TableA[x] + ' '

TableA['bf'] = TableA.apply(lambda row: ApplyBloomFilter(row), axis=1)

TableA_ENC = TableA[[TableA.columns[0],TableA.columns[len(TableA.columns)-1]]]  # TableA encoded : ('id', 'bf')


### ENCODE TABLE B

#df_b=TableB.reset_index()
TableB.drop_duplicates(TableB.columns[2:4], keep='first', inplace=True) #Index([' fname', ' gname'], dtype='object')

mixColumnsB = TableB.columns[2:6] #Index([' fname', ' gname', ' address', ' dob'], dtype='object')
TableB['mix'] = ' '

for x in list(mixColumnsB) :
  TableB['mix'] += TableB[x] + ' '

TableB['bf'] = TableB.apply(lambda row: ApplyBloomFilter(row), axis=1)

TableB_ENC = TableB[[TableB.columns[0],TableB.columns[len(TableB.columns)-1]]]  # TableB encoded : ('id', 'bf')

In [None]:
TableA_ENC

In [None]:
TableB_ENC

In [None]:
PCC_ENC = TableA_ENC.assign(key=1).merge(TableB_ENC.assign(key=1), on='key', suffixes=('_l', '_r')).drop('key', 1)
PCC_ENC['bf_sim'] = PCC_ENC.apply (lambda row: Funzione_J_hat_from_bf(row), axis=1)

In [None]:
PCC_ENC

In [181]:
SM_J_sim = StableMarriage(PCC_ENC[['id_l', 'id_r','bf_sim']])

In [None]:
SM_J_sim

In [234]:
GoldStandard = pd.DataFrame({'l_id': TableA['id'], 'r_id': TableB['id']})
SM_J_sim_Evaluation = SM_J_sim[['rec_id_l','rec_id_r']].rename(columns={'rec_id_l':'l_id','rec_id_r':'r_id'})

In [239]:
Evaluation(GoldStandard, SM_J_sim_Evaluation)

Unnamed: 0,MT,TP,FP,FN,P,R,F
0,20,20,0,0,1.0,1.0,1.0


##LOAD DATASETS: Esempio 2

In [15]:
datasetName_A = "dfA.csv"
datasetName_B = "dfB.csv"

TableA = pd.read_csv(datasetName_A, encoding = 'unicode_escape').astype('string').head(1000)
TableA['id']=TableA['rec_id']
TableB = pd.read_csv(datasetName_B, encoding = 'unicode_escape').astype('string').head(1000)
TableB['id']=TableB['rec_id']

In [None]:
TableA

In [None]:
TableB

##CUSTOM PPRL: Esempio 2

In [5]:
### ENCODE TABLE A

#df_a=TableA.reset_index()
TableA.drop_duplicates(TableA.columns[1:3], keep='first', inplace=True) #Index(['given_name', 'surname'], dtype='object')

for col in TableA.columns[1:11]:
  TableA = TableA[TableA[col].notna()]

mixColumnsA = TableA.columns[1:11] #Index(['given_name', 'surname', 'street_number', 'address_1', 'address_2','suburb', 'postcode', 'state', 'date_of_birth', 'soc_sec_id'],dtype='object')
TableA['mix'] = ' '

for x in list(mixColumnsA) :
  TableA['mix'] += TableA[x] + ' '

TableA['bf'] = TableA.apply(lambda row: ApplyBloomFilter(row), axis=1)

TableA_ENC = TableA[[TableA.columns[0],TableA.columns[len(TableA.columns)-1]]]  # TableA encoded : ('id', 'bf')


### ENCODE TABLE B

#df_b=TableB.reset_index()
TableB.drop_duplicates(TableB.columns[1:3], keep='first', inplace=True) #Index(['given_name', 'surname'], dtype='object')

for col in TableB.columns[1:11]:
  TableB = TableB[TableB[col].notna()]

mixColumnsB = TableB.columns[1:11] #Index(['given_name', 'surname', 'street_number', 'address_1', 'address_2','suburb', 'postcode', 'state', 'date_of_birth', 'soc_sec_id'],dtype='object')
TableB['mix'] = ' '

for x in list(mixColumnsB) :
  TableB['mix'] += TableB[x] + ' '

TableB['bf'] = TableB.apply(lambda row: ApplyBloomFilter(row), axis=1)

TableB_ENC = TableB[[TableB.columns[0],TableB.columns[len(TableB.columns)-1]]]  # TableB encoded : ('id', 'bf')

In [None]:
TableA_ENC

In [None]:
TableB_ENC

In [6]:
PCC_ENC = TableA_ENC.assign(key=1).merge(TableB_ENC.assign(key=1), on='key', suffixes=('_l', '_r')).drop('key', 1)
PCC_ENC['bf_sim'] = PCC_ENC.apply (lambda row: Funzione_J_hat_from_bf(row), axis=1)

  PCC_ENC = TableA_ENC.assign(key=1).merge(TableB_ENC.assign(key=1), on='key', suffixes=('_l', '_r')).drop('key', 1)


In [None]:
PCC_ENC

In [10]:
SM_J_sim = StableMarriage(PCC_ENC[['rec_id_l', 'rec_id_r','bf_sim']])

In [None]:
SM_J_sim

In [11]:
GoldStandard = pd.DataFrame({'l_id': TableA['id'], 'r_id': TableB['id']})
SM_J_sim_Evaluation = SM_J_sim[['rec_id_l','rec_id_r']].rename(columns={'rec_id_l':'l_id','rec_id_r':'r_id'})

In [12]:
Evaluation(GoldStandard, SM_J_sim_Evaluation)

Unnamed: 0,MT,TP,FP,FN,P,R,F
0,803,803,0,0,1.0,1.0,1.0


##LOAD DATASETS: Esempio 3

In [72]:
datasetName = "dataset_febrl3.csv"

Table = pd.read_csv(datasetName, encoding = 'unicode_escape').astype('string')
Table['id']=Table['rec_id']

In [90]:
Table

Unnamed: 0,rec_id,given_name,surname,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,id
0,rec-1496-org,mitchell,green,wallaby place,delmar,cleveland,2119,sa,19560409.0,1804974,rec-1496-org
1,rec-552-dup-3,harley,mccarthy,pridhamstreet,milton,marsden,3165,nsw,19080419.0,6089216,rec-552-dup-3
2,rec-988-dup-1,madeline,mason,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,19081128.0,2185997,rec-988-dup-1
3,rec-1716-dup-1,isabelle,,gundulu place,currin ga,utakarra,2193,wa,19921119.0,4314184,rec-1716-dup-1
4,rec-1213-org,taylor,hathaway,yuranigh court,brentwood vlge,,4220,nsw,19991207.0,9144092,rec-1213-org
...,...,...,...,...,...,...,...,...,...,...,...
4995,rec-937-org,jack,campbell,marr street,rhosewyn,oakleigh,3356,vic,19770109.0,1485686,rec-937-org
4996,rec-1200-dup-0,william,lazaroff,leah ylose,milwlood,forbes,7256,qld,,8072193,rec-1200-dup-0
4997,rec-1756-org,destynii,bowerman,halford crescent,sutton,nollamara,2431,qld,19880821.0,6089424,rec-1756-org
4998,rec-1444-org,gianni,dooley,ashburton circuit,brentwood vlge,ryde,6025,qld,19371212.0,5854405,rec-1444-org


##CUSTOM PPRL: Esempio 3

In [None]:
# costruisco DA e DB simulando dirty-dirty
DA=Table[(Table['rec_id'].str.endswith('org'))]
DB=Table[(Table['rec_id'].str.endswith('0'))]

DA['id']= DA['id'].map(preprocess_title)
DA.id = DA.id.astype(int)
DA = DA.sort_values('id')
DA.id = DA.id.astype('string')

DB['id']= DB['id'].map(preprocess_title)
DB.id = DB.id.astype(int)
DB = DB.sort_values('id')
DB.id = DB.id.astype('string')

DA.shape[0],DB.shape[0],DA.shape[0]+DB.shape[0]

In [91]:
DA

Unnamed: 0,rec_id,given_name,surname,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,id,mix,bf
3419,rec-0-org,jinni,dreyer,were street,marriott downs,south melbourne,3172,nsw,19420127.0,3787407,0,jinni dreyer were street marriott downs south melbourne 3172 nsw 19420127.0 3787407,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...
2440,rec-2-org,mia,thredgold,summerland circuit,cornvale,highett,5110,vic,19980406.0,7484089,2,mia thredgold summerland circuit cornvale highett 5110 vic 19980406.0 7484089,143736719526049598875550040685046846060410828649454505883816101531766881991785204814262285791983...
4804,rec-3-org,naomi,millar,southern cross drive,glengar,st agnes,5172,qld,19750818.0,7751504,3,naomi millar southern cross drive glengar st agnes 5172 qld 19750818.0 7751504,294935825683194405579415722808547437322455756041110171495444291077748801679710359549573436747340...
213,rec-4-org,lachlan,goldsworthy,ashkanasy crescent,mcivor ridge,rowville,3158,vic,19450818.0,8577437,4,lachlan goldsworthy ashkanasy crescent mcivor ridge rowville 3158 vic 19450818.0 8577437,182912520715152883230887209182041464653592245991682425619368197402940102622251004244511994973773...
312,rec-5-org,jackson,prestia,priddle street,mirilla,graceville,3012,qld,19260601.0,5613219,5,jackson prestia priddle street mirilla graceville 3012 qld 19260601.0 5613219,143257757406442456129658248986491260990211573409582754168679113213897632259264643292900858992898...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1519,rec-1993-org,isabella,pulford,monaro highway,jarara,beaumaris,2615,nsw,19521225.0,6048503,1993,isabella pulford monaro highway jarara beaumaris 2615 nsw 19521225.0 6048503,704761499523833597884996112636375218068307482630617921232208819930503543840738149918289480765392...
675,rec-1994-org,aidan,belperio,charlton crescent,openshaw,st clair,4508,nsw,19720503.0,1268692,1994,aidan belperio charlton crescent openshaw st clair 4508 nsw 19720503.0 1268692,295155189484924336627828138051249782284470939113708024474994404764554993900735284523658467629724...
580,rec-1995-org,george,ferrar,ballumbir street,rich meadows,highgate,4558,nsw,19510224.0,8454781,1995,george ferrar ballumbir street rich meadows highgate 4558 nsw 19510224.0 8454781,126706456493251541380222712452694380321215728945786798461761974117749515044995333374520933218284...
126,rec-1996-org,grace,timmermans,cussen street,hallsville,preston,2251,wa,19501207.0,6702521,1996,grace timmermans cussen street hallsville preston 2251 wa 19501207.0 6702521,706134622618818340029452476496617453395804495620372078633547725551792285162589761424124533505937...


In [None]:
DB

In [74]:
### ENCODE TABLE A

DA.drop_duplicates(DA.columns[1:3], keep='first', inplace=True) #Index(['given_name', 'surname'], dtype='object')

for col in DA.columns[1:10]:
  DA = DA[DA[col].notna()]

mixColumnsA = DA.columns[1:10] #Index(['given_name', 'surname', 'address_1', 'address_2', 'suburb', 'postcode','state', 'date_of_birth', 'soc_sec_id'],dtype='object')
DA['mix'] = ' '

for x in list(mixColumnsA) :
  DA['mix'] += DA[x] + ' '

DA['bf'] = DA.apply(lambda row: ApplyBloomFilter(row), axis=1)

DA_ENC = DA[[DA.columns[len(DA.columns)-3],DA.columns[len(DA.columns)-1]]]  # DA encoded : ('id', 'bf')


### ENCODE TABLE B

DB.drop_duplicates(DB.columns[1:3], keep='first', inplace=True) #Index(['given_name', 'surname'], dtype='object')

for col in DB.columns[1:10]:
  DB = DB[DB[col].notna()]

mixColumnsB = DB.columns[1:10] #Index(['given_name', 'surname', 'address_1', 'address_2', 'suburb', 'postcode','state', 'date_of_birth', 'soc_sec_id'],dtype='object')
DB['mix'] = ' '

for x in list(mixColumnsB) :
  DB['mix'] += DB[x] + ' '

DB['bf'] = DB.apply(lambda row: ApplyBloomFilter(row), axis=1)

DB_ENC = DB[[DB.columns[len(DB.columns)-3],DB.columns[len(DB.columns)-1]]]  # DB encoded : ('id', 'bf')

In [92]:
DA_ENC

Unnamed: 0,id,bf
3419,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...
2440,2,143736719526049598875550040685046846060410828649454505883816101531766881991785204814262285791983...
4804,3,294935825683194405579415722808547437322455756041110171495444291077748801679710359549573436747340...
213,4,182912520715152883230887209182041464653592245991682425619368197402940102622251004244511994973773...
312,5,143257757406442456129658248986491260990211573409582754168679113213897632259264643292900858992898...
...,...,...
1519,1993,704761499523833597884996112636375218068307482630617921232208819930503543840738149918289480765392...
675,1994,295155189484924336627828138051249782284470939113708024474994404764554993900735284523658467629724...
580,1995,126706456493251541380222712452694380321215728945786798461761974117749515044995333374520933218284...
126,1996,706134622618818340029452476496617453395804495620372078633547725551792285162589761424124533505937...


In [None]:
DB_ENC

In [76]:
PCC_ENC = DA_ENC.assign(key=1).merge(DB_ENC.assign(key=1), on='key', suffixes=('_l', '_r')).drop('key', 1)
PCC_ENC['bf_sim'] = PCC_ENC.apply (lambda row: Funzione_J_hat_from_bf(row), axis=1)

  PCC_ENC = DA_ENC.assign(key=1).merge(DB_ENC.assign(key=1), on='key', suffixes=('_l', '_r')).drop('key', 1)


In [93]:
PCC_ENC

Unnamed: 0,id_l,bf_l,id_r,bf_r,bf_sim
0,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...,3,294935827357426604307955437756644620912582988918308889984220290187672023564835377046641108118929...,0.354922
1,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...,5,143258301531907042906032168108697278165637021387235528593426217102492845798566694086459790536928...,0.296104
2,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...,10,713264707080281446334544210670106686275140460735365067256287291478822353660505679184661703358070...,0.230233
3,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...,12,143294724387832565591211498768638866388841110736925678370153322967832163645833272518258956089473...,0.276961
4,0,351167847528913316233668275860303369455417171499818813840934454120926237466954393643630472315210...,14,143856996498572219024918666988497206159670138036760011611296243123323623879875637408380886726686...,0.316062
...,...,...,...,...,...
1325185,1997,142982377745613742500543268447804165567839117965877035412246888521729070987906841592299578481074...,1988,142727358435669283243166776122558919754692593893151894018230847906704844976514492658163606434986...,0.333333
1325186,1997,142982377745613742500543268447804165567839117965877035412246888521729070987906841592299578481074...,1989,182797735278854840358516775416795805889086996497245251039812221392795574292356342670398910367835...,0.311170
1325187,1997,142982377745613742500543268447804165567839117965877035412246888521729070987906841592299578481074...,1993,704796323553599085018355352239283386472013580889071655658952900914075870023129785519530360272495...,0.353425
1325188,1997,142982377745613742500543268447804165567839117965877035412246888521729070987906841592299578481074...,1995,126679079448542311852697823451568935326873453617706481893768430695078329508529024189371756770150...,0.335859


In [78]:
SM_J_sim = StableMarriage(PCC_ENC[['id_l', 'id_r','bf_sim']])

In [80]:
SM_J_sim

Unnamed: 0,rec_id_l,rec_id_r,sim
0,1484,1484,1.000000
1,1316,1316,1.000000
2,1479,1479,1.000000
3,1009,1009,1.000000
4,960,960,1.000000
...,...,...,...
808,942,666,0.412742
809,1115,534,0.405556
810,664,1910,0.398892
811,776,324,0.383954


In [87]:
GoldStandard = pd.DataFrame({'l_id': DA['id'], 'r_id': DA['id']})
SM_J_sim_Evaluation = SM_J_sim[['rec_id_l','rec_id_r']].rename(columns={'rec_id_l':'l_id','rec_id_r':'r_id'})

In [None]:
GoldStandard

In [None]:
SM_J_sim_Evaluation

In [89]:
Evaluation(GoldStandard, SM_J_sim_Evaluation)

Unnamed: 0,MT,TP,FP,FN,P,R,F
0,813,787,26,843,0.968,0.4828,0.6443
