# Customer Database Deduplication

# Clean a customer database that has duplicates caused by inconsistent data entry

# Step 1 — Import Libraries & Create Sample Customer Data

In [1]:
import pandas as pd

# Sample customer data with duplicates & inconsistencies.
# One tuple per customer record: (customer_name, email, phone).
field_names = ['customer_name', 'email', 'phone']
records = [
    ('John Doe', 'john@example.com', '123-456-7890'),
    ('john doe', 'john@example.com', '1234567890'),
    (' JOHN  DOE ', 'john@example.com', '(123) 456-7890'),
    ('Alice Smith', 'alice@example.com', '555-111-2222'),
    ('alice smith ', 'ALICE@EXAMPLE.COM', '5551112222'),
    ('Bob-Ray', 'bob@example.com', '777-888-9999'),
    ('Bob Ray', 'bob@example.com', '777 888 9999'),
    ('bob ray', 'bob@example.com', '7778889999'),
]

# Pivot the records into the column-oriented mapping handed to pandas.
data = {
    field: [record[position] for record in records]
    for position, field in enumerate(field_names)
}

df = pd.DataFrame(data)
df


Unnamed: 0,customer_name,email,phone
0,John Doe,john@example.com,123-456-7890
1,john doe,john@example.com,1234567890
2,JOHN DOE,john@example.com,(123) 456-7890
3,Alice Smith,alice@example.com,555-111-2222
4,alice smith,ALICE@EXAMPLE.COM,5551112222
5,Bob-Ray,bob@example.com,777-888-9999
6,Bob Ray,bob@example.com,777 888 9999
7,bob ray,bob@example.com,7778889999


# Step 2 — Identify Exact Duplicates

In [3]:
# Boolean mask of rows that repeat an earlier row in every column
# (pandas' default: all columns compared, first occurrence not flagged).
exact_dupes = df.duplicated()

print("Exact duplicates found:", exact_dupes.sum())
df[exact_dupes]


Exact duplicates found: 0


Unnamed: 0,customer_name,email,phone


# Step 3 — Clean & Standardize Data (Fix inconsistencies)

In [4]:
# Normalise every field used for duplicate matching so that purely cosmetic
# differences (letter case, spacing, phone punctuation) no longer hide repeats.
# `assign` returns a new frame with the replaced columns instead of writing
# into the columns of the existing one; column order and index are kept.
df = df.assign(
    # Names: trim both ends, lowercase, collapse internal whitespace runs.
    customer_name=(
        df['customer_name']
        .str.strip()
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
    ),
    # Emails: trim as well as lowercase. Names are trimmed above; without the
    # same treatment ' bob@example.com' would never match 'bob@example.com'.
    email=df['email'].str.strip().str.lower(),
    # Phones: keep digits only (drops spaces, dashes, parentheses, ...).
    phone=df['phone'].str.replace(r'\D', '', regex=True),
)

df


Unnamed: 0,customer_name,email,phone
0,john doe,john@example.com,1234567890
1,john doe,john@example.com,1234567890
2,john doe,john@example.com,1234567890
3,alice smith,alice@example.com,5551112222
4,alice smith,alice@example.com,5551112222
5,bob-ray,bob@example.com,7778889999
6,bob ray,bob@example.com,7778889999
7,bob ray,bob@example.com,7778889999


# Step 4 — Check Potential Fuzzy Duplicates After Cleaning

In [5]:
# Tally how many rows carry each cleaned name, most frequent first.
name_counts = df.groupby(['customer_name']).size()
name_counts.sort_values(ascending=False)


customer_name
john doe       3
alice smith    2
bob ray        2
bob-ray        1
dtype: int64

# Step 5 — Remove Duplicates (After Standardization)

In [6]:
# Rows are treated as the same customer only when all three standardized
# fields match; the first occurrence of each combination is kept.
dedupe_keys = ['customer_name', 'email', 'phone']
df_cleaned = df.drop_duplicates(subset=dedupe_keys)

df_cleaned


Unnamed: 0,customer_name,email,phone
0,john doe,john@example.com,1234567890
3,alice smith,alice@example.com,5551112222
5,bob-ray,bob@example.com,7778889999
6,bob ray,bob@example.com,7778889999


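# Step 6 — Final Duplicate Check

Note: this check only counts rows that are identical in every column. The rows `bob-ray` and `bob ray` share the same email and phone but differ in name punctuation, so both survive Step 5 and are not counted here.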

In [7]:
# Count rows in the deduplicated frame that still repeat an earlier row
# across all columns.
remaining_duplicates = df_cleaned.duplicated().sum()
print("Remaining duplicates:", remaining_duplicates)


Remaining duplicates: 0
