In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.


import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Sample DataFrame with potential duplicates
data = {
    'ProductID': [1, 2, 3, 4, 5, 6],
    'Price': [10.5, 20.0, 10.5, 35.0, 10.5, 20.0],
    'Category': ['A', 'B', 'A', 'C', 'A', 'B'],
    'Description': ['Red product', 'Blue product', 'Red product', 'Green product', 'Red product', 'Blue product']
}

df = pd.DataFrame(data)

# Step 1: Preprocessing
# Convert categorical data to numerical using Label Encoding or One-Hot Encoding
df['Category'] = df['Category'].astype('category').cat.codes
df['Description'] = df['Description'].astype('category').cat.codes

# Normalize the data (important for distance-based models like KMeans)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[['Price', 'Category', 'Description']])

# Step 2: Compute similarity (Cosine Similarity)
similarity_matrix = cosine_similarity(df_scaled)

# Step 3: Apply Clustering Algorithm (e.g., KMeans)
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(df_scaled)

# Step 4: Deduplicate based on the clustering
# Group by the cluster and remove duplicates in each cluster based on feature similarity
df_dedup = df.drop_duplicates(subset=['Cluster'], keep='first')

# Step 5: Show the deduplicated DataFrame
print("Original DataFrame with possible duplicates:")
print(df)
print("\nDeduplicated DataFrame:")
print(df_dedup)

Original DataFrame with possible duplicates:
   ProductID  Price  Category  Description  Cluster
0          1   10.5         0            2        0
1          2   20.0         1            0        1
2          3   10.5         0            2        0
3          4   35.0         2            1        1
4          5   10.5         0            2        0
5          6   20.0         1            0        1

Deduplicated DataFrame:
   ProductID  Price  Category  Description  Cluster
0          1   10.5         0            2        0
1          2   20.0         1            0        1
