# **Duplicate Detection and Removal in Dataset**

**Loading Dataset**

In [None]:
import pandas as pd

df = pd.read_csv("data.csv")  


In [2]:
df.head()

Unnamed: 0,Name,Address,ID
0,David Green,"456 Oak St, NY",1
1,Charlie White,"123 Elm Street, NY",2
2,David Green,"404 Ash Rd, TX",3
3,Eva Black,"123 Elm St, NY",4
4,John Smith,"505 Walnut Dr, FL",5


**Finding Duplicate Entries Using Levenshtein Distance**

In [None]:
import Levenshtein

def calculate_levenshtein_distance(str1, str2):
    return Levenshtein.distance(str1, str2)

def find_duplicates(df):
    duplicates = []
    for i, row1 in df.iterrows():
        for j, row2 in df.iterrows():
            if i < j:  
                name_distance = calculate_levenshtein_distance(row1['Name'], row2['Name'])
                address_distance = calculate_levenshtein_distance(row1['Address'], row2['Address'])
                
                if name_distance < 3 and address_distance < 3:
                    duplicates.append((row1['ID'], row2['ID'], name_distance, address_distance))
    
    return duplicates

duplicates = find_duplicates(df)
print(duplicates)


[(1, 63, 0, 0), (1, 70, 0, 0), (1, 174, 0, 0), (2, 24, 0, 0), (2, 26, 0, 0), (2, 103, 0, 0), (2, 111, 0, 0), (2, 190, 0, 0), (3, 81, 0, 0), (4, 58, 0, 0), (4, 97, 0, 0), (4, 134, 0, 0), (4, 144, 0, 0), (4, 173, 0, 0), (4, 254, 0, 0), (5, 22, 1, 0), (5, 30, 0, 0), (5, 53, 1, 0), (5, 72, 1, 0), (5, 82, 0, 0), (5, 180, 1, 0), (5, 185, 0, 0), (5, 186, 1, 0), (5, 257, 0, 0), (6, 131, 0, 0), (6, 161, 0, 0), (6, 167, 0, 0), (6, 195, 0, 0), (7, 43, 0, 0), (7, 240, 0, 0), (7, 295, 0, 0), (8, 158, 0, 0), (9, 117, 0, 0), (9, 150, 0, 0), (10, 164, 0, 0), (11, 55, 0, 0), (12, 37, 0, 0), (13, 73, 0, 0), (13, 166, 0, 0), (13, 265, 0, 0), (14, 168, 0, 0), (14, 272, 0, 0), (14, 292, 0, 0), (15, 90, 0, 0), (15, 132, 0, 0), (15, 154, 0, 0), (16, 69, 0, 0), (16, 184, 0, 0), (16, 215, 0, 0), (16, 290, 0, 0), (17, 46, 0, 0), (17, 74, 0, 0), (17, 178, 0, 0), (18, 223, 0, 0), (19, 50, 0, 0), (20, 52, 0, 0), (20, 67, 0, 0), (20, 86, 1, 0), (20, 105, 1, 0), (20, 194, 0, 0), (20, 211, 0, 0), (20, 229, 1, 0), (20

**Removing Duplicate Entries from Dataset**

In [None]:
def remove_duplicates(df, duplicates):

    duplicate_ids = set()
    for dup in duplicates:
        duplicate_ids.add(dup[1])  

    df_clean = df[~df['ID'].isin(duplicate_ids)]
    
    return df_clean


df_clean = remove_duplicates(df, duplicates)
print(df_clean)


              Name             Address   ID
0      David Green      456 Oak St, NY    1
1    Charlie White  123 Elm Street, NY    2
2      David Green      404 Ash Rd, TX    3
3        Eva Black      123 Elm St, NY    4
4       John Smith   505 Walnut Dr, FL    5
..             ...                 ...  ...
211      Bob Brown     789 Pine St, NY  212
226    David Green   505 Walnut Dr, FL  227
235      Grace Lee      123 Elm St, NY  236
250      Bob Brown   606 Cherry Ln, FL  251
251  Alice Johnson   505 Walnut Dr, FL  252

[86 rows x 3 columns]


**Saving Cleaned Dataset to CSV File**

In [5]:
df_clean.to_csv("cleaned_data.csv", index=False)

In [6]:
df2 = pd.read_csv("cleaned_data.csv")

In [7]:
df2.head()

Unnamed: 0,Name,Address,ID
0,David Green,"456 Oak St, NY",1
1,Charlie White,"123 Elm Street, NY",2
2,David Green,"404 Ash Rd, TX",3
3,Eva Black,"123 Elm St, NY",4
4,John Smith,"505 Walnut Dr, FL",5
