In [1]:
import pandas as pd
from pathlib import Path

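# file_withDupes - filename of original file
# file_withoutDupes - filename of file with (blanks and) duplicates removed
# file_blankRecords - filename of file with records where dedupeField is blank
# file_dupeRecords - filename of file with records where dedupeField is not blank and is a duplicate of an existing record
# df_withDupes - data frame of original file
# df_withoutBlanks - data frame of original file minus the records where dedupeField is blank
# df_blankDedupeFieldRecordsMask - boolean mask (Series) indicating which rows have a blank dedupeField
# df_blankDedupeFieldRecords - data frame of records where dedupeField is blank
# df_withoutDupes - data frame with (blanks and) duplicates removed
# df_dupeRecordsMask - boolean mask (Series) indicating which non-blank rows are duplicates
# df_dupeRecords - data frame of records identified as the duplicates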


In [2]:
# set values here
# Prompts for the three run parameters and derives the three output filenames.
# Output files are named from the input file's stem only, so they are written
# relative to the current working directory.

# get the name of the original file
file_withDupes = Path(input("\nEnter filename\n"))

# get the name of the column to check for duplicates
dedupeField = input("\nEnter name of column to check for duplicates\n")

# get whether to keep first or last record as the master
# Normalise case and surrounding whitespace so "First" or " last " is accepted,
# and reject anything else here instead of letting pandas raise the same
# ValueError later, after the blank-records file has already been written.
keepRecord = input("\nEnter record to keep. Valid values are 'first' or 'last'\n").strip().lower()
if keepRecord not in ('first', 'last'):
    raise ValueError(f"Record to keep must be 'first' or 'last', got {keepRecord!r}")

# every output file shares the prefix [file_withDupes]_deduped_on[DupeField]_keep[keepRecord]
outputPrefix = (
    file_withDupes.stem
    + '_deduped_on' + dedupeField.title().replace(" ", "")
    + '_keep' + keepRecord.capitalize()
)

# file without dupes will be named [file_withDupes]_deduped_on[DupeField]_keep[keepRecord].csv
file_withoutDupes = outputPrefix + '.csv'

# file of records with blank dedupeField will be named [file_withDupes]_deduped_on[DupeField]_keep[keepRecord]_blankRecords.csv
file_blankRecords = outputPrefix + '_blankRecords' + '.csv'

# file of duplicate records will be named [file_withDupes]_deduped_on[DupeField]_keep[keepRecord]_dupeRecords.csv
file_dupeRecords = outputPrefix + '_dupeRecords' + '.csv'

print(file_withoutDupes)
print(file_blankRecords)
print(file_dupeRecords)



Enter filename
ME_Contacts_NameAndEmail (1).csv

Enter name of column to check for duplicates
Primary Email

Enter record to keep. Valid values are 'first' or 'last'
first
ME_Contacts_NameAndEmail (1)_deduped_onPrimaryEmail_keepFirst.csv
ME_Contacts_NameAndEmail (1)_deduped_onPrimaryEmail_keepFirst_blankRecords.csv
ME_Contacts_NameAndEmail (1)_deduped_onPrimaryEmail_keepFirst_dupeRecords.csv


In [3]:
# load the original csv (blanks and duplicates included) into df_withDupes
df_withDupes = pd.read_csv(filepath_or_buffer=file_withDupes)
print(df_withDupes)

       Contact ID     Display Name First Name Middle Name  Last Name  \
0           15001    Sally Sparrow      Sally         NaN    Sparrow   
1           15002  Christina Caleb  Christina         NaN      Caleb   
2           15003     Duane Currie      Duane         NaN     Currie   
3           15004   Andrzej Inglot    Andrzej         NaN     Inglot   
4           15005   Rita Wuebbeler       Rita         NaN  Wuebbeler   
...           ...              ...        ...         ...        ...   
11370       29996       Victoria G   Victoria         NaN          G   
11371       29997         Noriko S     Noriko         NaN          S   
11372       29998    Chrissy Brown    Chrissy         NaN      Brown   
11373       29999    Chrissy Brown    Chrissy         NaN      Brown   
11374       30000      Justin Rose     Justin         NaN       Rose   

       Legal Name          Primary Email          Billing Email  
0             NaN  sallysparrow@yahoo.ca  sallysparrow@yahoo.ca  
1  

In [4]:
# keep only the records whose dedupeField is populated (not blank/NaN);
# left in, the blank values would all be considered "duplicates" of one another
hasDedupeValue = df_withDupes[dedupeField].notna()
df_withoutBlanks = df_withDupes.loc[hasDedupeValue]
print(df_withoutBlanks)

       Contact ID     Display Name First Name Middle Name  Last Name  \
0           15001    Sally Sparrow      Sally         NaN    Sparrow   
1           15002  Christina Caleb  Christina         NaN      Caleb   
2           15003     Duane Currie      Duane         NaN     Currie   
3           15004   Andrzej Inglot    Andrzej         NaN     Inglot   
4           15005   Rita Wuebbeler       Rita         NaN  Wuebbeler   
...           ...              ...        ...         ...        ...   
11332       29957   Ronny Restrepo      Ronny         NaN   Restrepo   
11333       29958  Richard Michael    Richard         NaN    Michael   
11334       29960       Luke Evans       Luke         NaN      Evans   
11338       29964    Jesse Coleman      Jesse         NaN    Coleman   
11362       29988    Anne  Sheperd      Anne          NaN    Sheperd   

       Legal Name               Primary Email          Billing Email  
0             NaN       sallysparrow@yahoo.ca  sallysparrow@yaho

In [5]:
# boolean mask over the original rows: True where dedupeField is blank/NaN
df_blankDedupeFieldRecordsMask = df_withDupes.loc[:, dedupeField].isna()
print(df_blankDedupeFieldRecordsMask)

0        False
1        False
2        False
3        False
4        False
         ...  
11370     True
11371     True
11372     True
11373     True
11374     True
Name: Primary Email, Length: 11375, dtype: bool


In [6]:
# select from the original data only the rows flagged by the blank-dedupeField mask
df_blankDedupeFieldRecords = df_withDupes.loc[df_blankDedupeFieldRecordsMask]
print(df_blankDedupeFieldRecords)

       Contact ID        Display Name First Name Middle Name   Last Name  \
35          15043       Ingrid Hansen     Ingrid         NaN      Hansen   
630         15750       Beverley Drew   Beverley         NaN        Drew   
795         16098        Liz MacInnis        Liz         NaN   MacInnis    
920         16287  Barbara Lindenberg    Barbara         NaN  Lindenberg   
927         16297        George Fells     George         NaN       Fells   
...           ...                 ...        ...         ...         ...   
11370       29996          Victoria G   Victoria         NaN           G   
11371       29997            Noriko S     Noriko         NaN           S   
11372       29998       Chrissy Brown    Chrissy         NaN       Brown   
11373       29999       Chrissy Brown    Chrissy         NaN       Brown   
11374       30000         Justin Rose     Justin         NaN        Rose   

       Legal Name Primary Email Billing Email  
35            NaN           NaN        

In [7]:
# save the blank-dedupeField records to their own csv (row index not written)
df_blankDedupeFieldRecords.to_csv(path_or_buf=file_blankRecords, index=False)

In [8]:
# from the non-blank records, keep one row per dedupeField value
# (the first or last occurrence, as chosen in keepRecord)
df_withoutDupes = df_withoutBlanks.drop_duplicates(
    subset=dedupeField,
    keep=keepRecord,
)
print(df_withoutDupes)

       Contact ID     Display Name First Name Middle Name  Last Name  \
0           15001    Sally Sparrow      Sally         NaN    Sparrow   
1           15002  Christina Caleb  Christina         NaN      Caleb   
2           15003     Duane Currie      Duane         NaN     Currie   
3           15004   Andrzej Inglot    Andrzej         NaN     Inglot   
4           15005   Rita Wuebbeler       Rita         NaN  Wuebbeler   
...           ...              ...        ...         ...        ...   
11332       29957   Ronny Restrepo      Ronny         NaN   Restrepo   
11333       29958  Richard Michael    Richard         NaN    Michael   
11334       29960       Luke Evans       Luke         NaN      Evans   
11338       29964    Jesse Coleman      Jesse         NaN    Coleman   
11362       29988    Anne  Sheperd      Anne          NaN    Sheperd   

       Legal Name               Primary Email          Billing Email  
0             NaN       sallysparrow@yahoo.ca  sallysparrow@yaho

In [9]:
# save the deduplicated records to a csv (row index not written)
df_withoutDupes.to_csv(path_or_buf=file_withoutDupes, index=False)

In [10]:
# boolean mask over the non-blank records: True for every row that repeats a
# dedupeField value, except the occurrence selected by keepRecord
df_dupeRecordsMask = df_withoutBlanks.duplicated(
    subset=dedupeField,
    keep=keepRecord,
)
print(df_dupeRecordsMask)

0        False
1        False
2        False
3        False
4        False
         ...  
11332    False
11333    False
11334    False
11338    False
11362    False
Length: 10531, dtype: bool


In [11]:
# select from the non-blank records only the rows flagged as duplicates
# (the "kept" record for each value is not included)
df_dupeRecords = df_withoutBlanks.loc[df_dupeRecordsMask]
print(df_dupeRecords)

       Contact ID     Display Name First Name Middle Name  Last Name  \
66          15084     June Komisar       June       Diana    Komisar   
68          15086        Joel Ross       Joel         NaN       Ross   
312         15351       Jason Shim      Jason         NaN       Shim   
318         15356    Richard Allen    Richard         NaN      Allen   
332         15373      Luke Albert       Luke         NaN     Albert   
...           ...              ...        ...         ...        ...   
9671        28195   Mike Micalizzi       Mike         NaN  Micalizzi   
10088       28647    Stephen Moore    Stephen         NaN      Moore   
10090       28647    Stephen Moore    Stephen         NaN      Moore   
10126       28685  Kieran Heilbron     Kieran         NaN   Heilbron   
10219       28784    David McAleer      David         NaN    McAleer   

       Legal Name               Primary Email               Billing Email  
66            NaN         jkomisar@ryerson.ca              

In [12]:
# save the duplicate records to their own csv (row index not written)
df_dupeRecords.to_csv(path_or_buf=file_dupeRecords, index=False)