In [2]:
pip install recordlinkage

Collecting recordlinkage
  Downloading recordlinkage-0.16-py3-none-any.whl.metadata (8.1 kB)
Collecting jellyfish>=1 (from recordlinkage)
  Downloading jellyfish-1.2.1-cp310-cp310-win_amd64.whl.metadata (642 bytes)
Downloading recordlinkage-0.16-py3-none-any.whl (926 kB)
   ---------------------------------------- 0.0/926.9 kB ? eta -:--:--
   ---------------------------------------- 926.9/926.9 kB 21.4 MB/s  0:00:00
Downloading jellyfish-1.2.1-cp310-cp310-win_amd64.whl (213 kB)
Installing collected packages: jellyfish, recordlinkage

   -------------------- ------------------- 1/2 [recordlinkage]
   ---------------------------------------- 2/2 [recordlinkage]

Successfully installed jellyfish-1.2.1 recordlinkage-0.16
Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean
from recordlinkage.index import Block

# Tunable matching parameters, kept together so the decision rule is easy to audit.
NAME_SIM_THRESHOLD = 0.85  # Jaro-Winkler cut-off passed to the name comparison
ADDR_SIM_THRESHOLD = 0.70  # Levenshtein cut-off passed to the address comparison
MIN_MATCH_SCORE = 2        # minimum number of matching features to link two records

# 1. SETUP: Raw fragmented banking data
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Johnathan Smith', 'Jon Smith', 'J. Smith', 'Alice Brown', 'John Smith'],
    'address': ['123 Maple St', '123 Maple Street', '123 Maple St. Apt 2', '456 Oak Rd', '123 Maple Rd'],
    'phone': ['555-0101', '5550101', '555-0101', '555-9999', '555-0101']
}
df = pd.DataFrame(data)

# 2. CLEANING: The "Canonicalization" Step
# Build the canonical columns in one step; the raw columns are left untouched.
df = df.assign(
    c_name=clean(df['name']),
    c_address=clean(df['address']),
    # Strip every non-digit so '555-0101' and '5550101' compare equal.
    c_phone=df['phone'].str.replace(r'\D', '', regex=True),
)

# 3. INDEXING: Use "Blocking" to avoid comparing everyone to everyone
# We block by phone number, so only records sharing the same canonical
# phone become candidate pairs.
indexer = Block('c_phone')
candidate_links = indexer.index(df)

# 4. COMPARISON: Fuzzy Matching logic
compare = recordlinkage.Compare()
compare.string('c_name', 'c_name', method='jarowinkler',
               threshold=NAME_SIM_THRESHOLD, label='name_match')
compare.string('c_address', 'c_address', method='levenshtein',
               threshold=ADDR_SIM_THRESHOLD, label='addr_match')
# NOTE: because the candidates were blocked on c_phone, this exact comparison
# is 1 for every candidate pair here; it contributes a constant 1 to the score.
compare.exact('c_phone', 'c_phone', label='phone_match')

# 5. RESOLUTION: Identifying the "Fraud Cluster"
features = compare.compute(candidate_links, df)
# Sum the matches: if score >= MIN_MATCH_SCORE, we consider them the same entity.
# With phone_match always 1 (see note above), a score of 2 means "same phone
# plus at least one of name/address matching".
matches = features[features.sum(axis=1) >= MIN_MATCH_SCORE]

# The pair index printed below holds df's row labels (0-based), not the
# 1-based customer_id values.
print("--- Identified Fraudulent Clusters (Synthetic Identities) ---")
print(matches)

--- Identified Fraudulent Clusters (Synthetic Identities) ---
     name_match  addr_match  phone_match
1 0         0.0         1.0            1
2 1         1.0         1.0            1
4 0         1.0         1.0            1
  1         1.0         0.0            1
