# Identifying and Removing Duplicates

# Step 1 — Import Libraries & Generate Sample Data

In [5]:
import pandas as pd

# Generate sample data with exact and fuzzy duplicates
data = {
    'name': ['Monisha', 'monisha', 'Prathisha', 'prathiSha', 'Tamilarasan', 'TamilArasan'],
    'age': [25, 25, 30, 30, 40, 40],
    'city': ['TRICHY', 'trichy', 'COIMBATORE', 'coimbatore', 'CHENNAI', 'chennai']
}

df = pd.DataFrame(data)
df


Unnamed: 0,name,age,city
0,Monisha,25,TRICHY
1,monisha,25,trichy
2,Prathisha,30,COIMBATORE
3,prathiSha,30,coimbatore
4,Tamilarasan,40,CHENNAI
5,TamilArasan,40,chennai


# Step 2 — Identify Exact Duplicates

In [6]:
# Identify exact duplicates across all columns
duplicate_flags = df.duplicated()

print("Duplicate Flags (True indicates duplicate rows):")
print(duplicate_flags)

print("\nNumber of exact duplicates:", duplicate_flags.sum())


Duplicate Flags (True indicates duplicate rows):
0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

Number of exact duplicates: 0


# Step 3 — Remove Exact Duplicates

In [7]:
df.drop_duplicates(inplace=True)
df


Unnamed: 0,name,age,city
0,Monisha,25,TRICHY
1,monisha,25,trichy
2,Prathisha,30,COIMBATORE
3,prathiSha,30,coimbatore
4,Tamilarasan,40,CHENNAI
5,TamilArasan,40,chennai


# Step 4 — Handle Fuzzy Duplicates (Case-insensitive duplicates)

In [8]:
# Convert names to lowercase for consistency
df['name'] = df['name'].str.lower()
df


Unnamed: 0,name,age,city
0,monisha,25,TRICHY
1,monisha,25,trichy
2,prathisha,30,COIMBATORE
3,prathisha,30,coimbatore
4,tamilarasan,40,CHENNAI
5,tamilarasan,40,chennai


# Step 5 — Detect potential fuzzy duplicates using groupby

In [9]:
# Group by standardized names to identify repeated entries
name_counts = df.groupby('name').size().sort_values(ascending=False)

print("Counts of each name after standardization:")
print(name_counts)


Counts of each name after standardization:
name
monisha        2
prathisha      2
tamilarasan    2
dtype: int64


# Step 6 — Remove duplicates again after standardization

In [10]:
df.drop_duplicates(subset=['name', 'age', 'city'], inplace=True)
df


Unnamed: 0,name,age,city
0,monisha,25,TRICHY
1,monisha,25,trichy
2,prathisha,30,COIMBATORE
3,prathisha,30,coimbatore
4,tamilarasan,40,CHENNAI
5,tamilarasan,40,chennai


# Step 7 — Final Duplicate Check

In [11]:
print("Final number of duplicates:", df.duplicated().sum())


Final number of duplicates: 0
