This notebook follows a tutorial from **[Data School](https://www.youtube.com/channel/UCnVzApLJE2ljPZSeQylSEyg)** on finding and removing duplicate rows in a pandas DataFrame. <br>
The tutorial video can be found [here](https://youtu.be/ht5buXUMqkQ).


In [1]:
import pandas as pd

In [2]:
# Column labels for the pipe-delimited users file; header=None below tells
# pandas the file has no header row, so these names are supplied explicitly.
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
# Read the file and use user_id as the row index.
users = pd.read_table(
    'http://bit.ly/movieusers',
    sep='|',
    header=None,
    names=user_cols,
    index_col='user_id',
)

In [3]:
# Preview the first five rows to check that the columns were parsed as expected.
users.head(n=5)

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [4]:
# (rows, columns) of the loaded frame; user_id is the index, so it is not counted as a column.
users.shape

(943, 4)

In [11]:
# Flag each row whose zip code already appeared in an earlier row;
# the first occurrence of any zip code is marked False.
users['zip_code'].duplicated()

user_id
1      False
2      False
3      False
4      False
5      False
       ...  
939    False
940     True
941    False
942    False
943    False
Name: zip_code, Length: 943, dtype: bool

In [12]:
# True counts as 1 when summed, so this is the number of rows
# whose zip code repeats one seen in an earlier row.
users['zip_code'].duplicated().sum()

148

In [13]:
# Called on the whole DataFrame: a row is flagged only when every column
# value matches a row that appeared earlier.
users.duplicated()

user_id
1      False
2      False
3      False
4      False
5      False
       ...  
939    False
940    False
941    False
942    False
943    False
Length: 943, dtype: bool

In [16]:
# Number of rows that are exact copies of an earlier row.
users.duplicated().sum()

7

In [17]:
# Use the boolean mask to display only the rows flagged as duplicates.
users[users.duplicated()]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [19]:
# keep='first' is the default value of the keep parameter: the first occurrence
# of each repeated row is treated as the original (not flagged), and only the
# later copies are marked as duplicates.
users[users.duplicated(keep='first')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [21]:
# keep='last' treats the final occurrence of each repeated row as the original,
# so every earlier copy (including the very first one) is flagged as a duplicate.
users[users.duplicated(keep='last')]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [23]:
# keep=False treats no occurrence as the original: every row that has at least
# one exact copy is flagged, so all members of each duplicate group are shown.
users[users.duplicated(keep=False)]

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


In [30]:
# Remove repeated rows, retaining the first occurrence of each.
# A new DataFrame is returned and users itself is left unchanged,
# because inplace=False is the default.
users.drop_duplicates(keep='first').shape

(936, 4)

In [31]:
# Remove repeated rows, retaining only the last occurrence of each.
users.drop_duplicates(keep='last').shape

(936, 4)

In [32]:
# Remove every row that has an exact copy elsewhere: no occurrence is retained,
# so only rows that were unique to begin with remain.
users.drop_duplicates(keep=False).shape

(929, 4)

In [34]:
# Restrict the comparison to a subset of columns instead of the whole row:
# here two rows count as duplicates when both age and zip_code match.
# The sum counts rows that repeat an earlier (age, zip_code) pair; the first
# row of each such pair is not flagged and therefore not included in the count.
users.duplicated(subset=['age', 'zip_code']).sum()

16

In [35]:
# Keep only the first row for each distinct (age, zip_code) combination
# and report the shape of the result.
users.drop_duplicates(subset=['age', 'zip_code']).shape

(927, 4)