# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Deduplication Using K-means Clustering

**Steps**:
1. Data Set: Download a dataset containing duplicate customer records.
2. Preprocess: Standardize the data to ensure better clustering.
3. Apply K-means: Use K-means clustering to find and group similar customer records.
4. Identify Duplicates: Identify and remove duplicates within clusters.

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Step 1: Generate a synthetic customer dataset with duplicates
np.random.seed(42)
customer_data = {
    'CustomerID': np.arange(1, 21),
    'Age': [25, 30, 25, 30, 35, 40, 35, 40, 25, 30, 50, 45, 50, 55, 50, 45, 60, 55, 60, 55],
    'AnnualIncome': [50000, 60000, 50000, 60000, 70000, 80000, 70000, 80000, 50000, 60000, 100000, 95000, 100000, 105000, 100000, 95000, 120000, 115000, 120000, 115000],
    'SpendingScore': [40, 45, 40, 45, 60, 65, 60, 65, 40, 45, 80, 75, 80, 85, 80, 75, 90, 85, 90, 85]
}

df = pd.DataFrame(customer_data)

# Step 2: Data Preprocessing
features = ['Age', 'AnnualIncome', 'SpendingScore']
scaler = StandardScaler()

# Validate data integrity
try:
    assert df[features].notnull().all(), 'Data contains NaN values'
    assert all(df[features].apply(pd.to_numeric, errors='coerce').notnull()), 'Non-numeric values found in data'
except AssertionError as error:
    print('Error:', error)
    df = df.dropna(subset=features)  # Example: remove rows with NaN values

# Standardize the data
df_scaled = scaler.fit_transform(df[features])

# Step 3: Determine optimal number of clusters using the elbow method
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=42)
    kmeans.fit(df_scaled)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS (Within-cluster Sum of Squares)')
plt.show()

# From the plot, assume we select the optimal number of clusters (let's say it's 4)
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Step 4: Deduplicate by keeping only one record per cluster
deduplicated_df = df.groupby('Cluster').first().reset_index(drop=True)

# Step 5: Display the original and deduplicated data
print("Original DataFrame:")
print(df[['CustomerID', 'Age', 'AnnualIncome', 'SpendingScore', 'Cluster']])

print("\nDeduplicated DataFrame:")
print(deduplicated_df[['CustomerID', 'Age', 'AnnualIncome', 'SpendingScore']])

# Optional: Visualize the clusters
plt.figure(figsize=(8, 6))
plt.scatter(df['Age'], df['AnnualIncome'], c=df['Cluster'], cmap='viridis', label='Clusters')
plt.title('K-means Clustering of Customer Data')
plt.xlabel('Age')
plt.ylabel('Annual Income')
plt.colorbar(label='Cluster')
plt.show()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().