In [2]:
pip install recordlinkage

Collecting recordlinkage
  Downloading recordlinkage-0.16-py3-none-any.whl.metadata (8.1 kB)
Collecting jellyfish>=1 (from recordlinkage)
  Downloading jellyfish-1.2.1-cp310-cp310-win_amd64.whl.metadata (642 bytes)
Downloading recordlinkage-0.16-py3-none-any.whl (926 kB)
   ---------------------------------------- 0.0/926.9 kB ? eta -:--:--
   ---------------------------------------- 926.9/926.9 kB 21.4 MB/s  0:00:00
Downloading jellyfish-1.2.1-cp310-cp310-win_amd64.whl (213 kB)
Installing collected packages: jellyfish, recordlinkage

   -------------------- ------------------- 1/2 [recordlinkage]
   ---------------------------------------- 2/2 [recordlinkage]

Successfully installed jellyfish-1.2.1 recordlinkage-0.16
Note: you may need to restart the kernel to use updated packages.




In [1]:
import pandas as pd
import numpy as np
import recordlinkage
from recordlinkage.preprocessing import clean
from scipy.stats import mstats

# --- 1. SETUP: Raw Data with "Bust-out" Outliers ---
# Note: Customer 5 has a 1 million dollar transaction (Outlier)
data = {
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['Johnathan Smith', 'Jon Smith', 'J. Smith', 'Alice Brown', 'John Smith'],
    'address': ['123 Maple St', '123 Maple Street', '123 Maple St. Apt 2', '456 Oak Rd', '123 Maple Rd'],
    'phone': ['555-0101', '5550101', '555-0101', '555-9999', '555-0101'],
    'txn_amt': [1200, 1500, 1100, 500, 1000000] 
}
df = pd.DataFrame(data)

# --- 2. CLEANING: Canonicalization ---
df['c_name'] = clean(df['name'])
df['c_address'] = clean(df['address'])
df['c_phone'] = df['phone'].str.replace(r'\D', '', regex=True)

# --- 3. WINSORIZATION: Handling Outliers ---
# We cap the extreme 'txn_amt' at the 90th percentile.
# This prevents the $1M transaction from distorting the 'Average Spend' metric.
df['txn_winsorized'] = mstats.winsorize(df['txn_amt'], limits=[0, 0.10])

# --- 4. INDEXING & COMPARISON: Entity Resolution ---
# Block by phone to find identity collisions
indexer = recordlinkage.Index()
indexer.block('c_phone')
candidate_links = indexer.index(df)

compare = recordlinkage.Compare()
compare.string('c_name', 'c_name', method='jarowinkler', threshold=0.85, label='name_match')
compare.string('c_address', 'c_address', method='levenshtein', threshold=0.70, label='addr_match')
compare.exact('c_phone', 'c_phone', label='phone_match')

# --- 5. RESOLUTION: Identifying the Cluster ---
features = compare.compute(candidate_links, df)
matches = features[features.sum(axis=1) >= 2]

print("--- Data with Winsorized Amounts ---")
print(df[['name', 'txn_amt', 'txn_winsorized']])

print("\n--- Identified Fraudulent Clusters (Synthetic Identities) ---")
print(matches)

--- Data with Winsorized Amounts ---
              name  txn_amt  txn_winsorized
0  Johnathan Smith     1200            1200
1        Jon Smith     1500            1500
2         J. Smith     1100            1100
3      Alice Brown      500             500
4       John Smith  1000000         1000000

--- Identified Fraudulent Clusters (Synthetic Identities) ---
     name_match  addr_match  phone_match
1 0         0.0         1.0            1
2 1         1.0         1.0            1
4 0         1.0         1.0            1
  1         1.0         0.0            1
