In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the input records from the CSV file into a DataFrame
file_path = r'C:\Users\ssp1_\Downloads\Input.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Report the (rows, columns) size of the data exactly as loaded
original_shape = data.shape
print(f"Original dataset shape: {original_shape}")

# Preprocessing (combine columns and normalize text)
def preprocess(row):
    """Build the normalized comparison text for one record.

    Joins the ``ln``, ``dob``, ``gn`` and ``fn`` fields (in that order),
    lower-cases the result and collapses every run of whitespace to a
    single space.

    Missing fields (``None`` / ``NaN`` / ``NaT``) are skipped. Formatting
    them directly would insert the literal word ``"nan"`` or ``"none"``
    into the text, which any later text comparison would count as a word
    shared by every record that has a gap.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            the keys ``'ln'``, ``'dob'``, ``'gn'`` and ``'fn'``.

    Returns:
        The normalized, space-separated text; an empty string when every
        field is missing.

    Raises:
        KeyError: If one of the four fields is absent from ``row``.
    """
    values = (row[key] for key in ('ln', 'dob', 'gn', 'fn'))
    present = (str(value) for value in values if not pd.isna(value))
    # split() with no argument also strips the ends, so no separate strip().
    return " ".join(" ".join(present).lower().split())

# Similarity cut-off used to decide which rows count as duplicates (e.g., 0.9)
threshold = 0.9

# Build one normalized text per row and keep it next to the source columns
processed_text = data.apply(preprocess, axis="columns")
data['Processed_Text'] = processed_text

# Turn the per-row texts into TF-IDF feature vectors
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['Processed_Text'])

# Cosine similarity of every row against every other row
similarity_matrix = cosine_similarity(tfidf_matrix)

def find_duplicates(similarity_matrix, threshold):
    """Return the positions of rows that duplicate an earlier row.

    Position ``j`` is reported when there is some ``i < j`` with
    ``similarity_matrix[i, j] > threshold`` (strictly greater). Only the
    strict upper triangle is read, so the diagonal is ignored and each
    unordered pair is checked once; position 0 can therefore never be
    reported.

    Args:
        similarity_matrix: Dense 2-D array-like of pairwise similarity
            scores (anything ``numpy.asarray`` turns into a 2-D array).
        threshold: Score a pair must exceed to count as a duplicate.

    Returns:
        Sorted list of unique integer positions ``j`` flagged as
        duplicates; empty when nothing exceeds the threshold or the
        input is empty.
    """
    import numpy as np

    scores = np.asarray(similarity_matrix)
    if scores.size == 0:
        return []

    # Keep only pairs (i, j) with i < j, then flag every column j that has
    # at least one such pair above the threshold.
    above = np.triu(scores > threshold, k=1)
    return np.flatnonzero(above.any(axis=0)).tolist()

# Find duplicate indices (row positions within the similarity matrix)
duplicate_indices = find_duplicates(similarity_matrix, threshold)

# Remove duplicates. The indices are row positions, not index labels, so the
# surviving rows are selected by position; dropping by label would only give
# the same result while `data` keeps its default 0..n-1 index.
duplicate_positions = set(duplicate_indices)
keep_positions = [
    position
    for position in range(len(data))
    if position not in duplicate_positions
]
data_deduplicated = data.iloc[keep_positions]

# Display results
print(f"Deduplicated dataset shape: {data_deduplicated.shape}")

# Save deduplicated dataset to a new file
# NOTE(review): the frame still carries the helper column 'Processed_Text'
# added earlier, so it is written to the CSV as well; confirm this is wanted.
data_deduplicated.to_csv(r'C:\Users\ssp1_\Downloads\deduplicated_dataset.csv', index=False)
print("Deduplicated dataset saved to 'deduplicated_dataset.csv'.")


Original dataset shape: (149, 5)
Deduplicated dataset shape: (91, 6)
Deduplicated dataset saved to 'deduplicated_dataset.csv'.
