In [1]:
import pandas as pd

In [2]:
# load the laptop price dataset from a CSV file given by a relative path
df_laptops = pd.read_csv(filepath_or_buffer='laptop_price.csv')

In [3]:
# preview the first three rows of the dataframe
df_laptops.head(n=3)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0


## 1. duplicated()

### 1.1 Example 1

In [4]:
# flag every row whose 'laptop_ID' value already appeared in an earlier row
df_laptops.duplicated(subset='laptop_ID')

0       False
1       False
2       False
3       False
4       False
        ...  
1298    False
1299    False
1300    False
1301    False
1302    False
Length: 1303, dtype: bool

In [None]:
# The result is a boolean Series (True marks a row whose value was already seen),
# so we can use it as a condition to filter the dataframe.

In [5]:
# select only the rows flagged as repeating an earlier 'laptop_ID'
df_laptops.loc[df_laptops.duplicated(subset='laptop_ID')]

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros


In [6]:
# this means that the laptop_ID column doesn't have any duplicated elements.

In [7]:
# Finding duplicates across two or more columns:
# a row is flagged only when its 'Product', 'TypeName' and 'Inches' values,
# taken together, already appeared in an earlier row.
df_laptops.duplicated(subset=['Product', 'TypeName', 'Inches'])

0       False
1       False
2       False
3       False
4        True
        ...  
1298     True
1299     True
1300     True
1301     True
1302     True
Length: 1303, dtype: bool

In [10]:
# boolean mask (condition / filter): True for rows repeating an earlier
# ('Product', 'TypeName', 'Inches') combination
duplicated = df_laptops.duplicated(subset=['Product', 'TypeName', 'Inches'], keep='first')

In [11]:
# Show the rows flagged by the mask, ordered by the Product column and then
# by the TypeName column. The first occurrence of each combination is not
# flagged, so it does not appear in this listing.
df_laptops.loc[duplicated].sort_values(by=['Product', 'TypeName'])

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
1287,1305,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.0
1301,1319,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6GB,1TB HDD,AMD Radeon R5 M330,Windows 10,2.19kg,764.0
1098,1113,HP,250 G5,Notebook,15.6,1366x768,Intel Pentium Quad Core N3710 1.6GHz,4GB,1TB HDD,Intel HD Graphics 405,Windows 10,1.96kg,500.0
1170,1188,HP,250 G5,Notebook,15.6,Full HD 1920x1080,Intel Core i7 6500U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.96kg,679.0
10,11,HP,250 G6,Notebook,15.6,1366x768,Intel Core i5 7200U 2.5GHz,4GB,500GB HDD,Intel HD Graphics 620,No OS,1.86kg,393.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...
414,421,Asus,ZenBook Flip,2 in 1 Convertible,13.3,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.27kg,928.0
826,835,Asus,ZenBook Flip,2 in 1 Convertible,13.3,Touchscreen / Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.1kg,1358.0
1275,1293,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8GB,512GB SSD,Intel HD Graphics 515,Windows 10,1.2kg,729.0
1289,1307,Asus,ZenBook UX305CA-UBM1,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core M 6Y30 0.9GHz,8GB,512GB SSD,Intel HD Graphics 515,Windows 10,1.2kg,729.0


In [12]:
# For example the 4th and 7th laptops have the same Product, TypeName, and Inches, 
# so they are duplicated. The 10th and 11th observations are duplicates too, and so on.  

# 1287 and 1301 are duplicates, 1098 and 1170 too, etcetera.

# And that's how duplicate values can be found across two or more columns

### 1.2 Example 2

I'm going to find the cheapest and most expensive laptop of each company using the sort_values()\
and duplicated() methods.

In [17]:
# Order the rows by company and, within each company, by price (both ascending):
# each company's cheapest laptop comes first and its most expensive one last.
df_laptops = df_laptops.sort_values(by=['Company', 'Price_euros'], ascending=True)
df_laptops

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
1215,1233,Acer,C740-C9QX (3205U/2GB/32GB/Chrome,Netbook,11.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,32GB SSD,Intel HD Graphics,Chrome OS,1.3kg,174.00
290,295,Acer,Chromebook C910-C2ST,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,16GB SSD,Intel HD Graphics,Chrome OS,2.19kg,199.00
1102,1117,Acer,Chromebook 15,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,4GB,16GB SSD,Intel HD Graphics,Chrome OS,2.20kg,209.00
695,703,Acer,TravelMate B117-M,Netbook,11.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,32GB Flash Storage,Intel HD Graphics,Windows 10,1.4kg,269.00
1198,1216,Acer,Aspire 3,Notebook,15.6,1366x768,Intel Celeron Dual Core N3350 2GHz,4GB,1TB HDD,Intel HD Graphics 500,Linux,2.1kg,272.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,195,Vero,K147 (N3350/4GB/32GB/FHD/W10),Notebook,14.0,IPS Panel Full HD 1920x1080,Intel Celeron Dual Core N3350 1.1GHz,4GB,32GB Flash Storage,Intel HD Graphics 500,Windows 10,1.3kg,260.00
877,888,Xiaomi,Mi Notebook,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,8GB,256GB SSD,Nvidia GeForce 940MX,Windows 10,1.28kg,935.00
192,196,Xiaomi,Mi Notebook,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Nvidia GeForce MX150,No OS,1.3kg,999.90
184,188,Xiaomi,Mi Notebook,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Nvidia GeForce MX150,No OS,1.95kg,1199.00


In [18]:
# Now the dataframe is sorted by Company and, within each company, by price.
# So each company's cheapest laptop comes first and its most expensive one comes last.

In [19]:
# count how many rows each distinct value of the "Company" column has
df_laptops.value_counts(subset='Company')

Company
Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Vero           4
Xiaomi         4
Google         3
Fujitsu        3
Chuwi          3
LG             3
Huawei         2
Name: count, dtype: int64

In [22]:
# Flag repeated values in the "Company" column.
#
# keep='first' is also the default of .duplicated() (shift+tab on the method
# shows the signature): the first occurrence of each company is marked False
# and every later occurrence is marked True. Nothing is removed from the
# dataframe here; only a boolean mask is built.
#
# The rows are sorted by ascending price within each company, so the single
# unflagged row per company is that company's cheapest laptop.
duplicated_first = df_laptops.duplicated(subset='Company', keep='first')

In [23]:
# rows flagged as repeats in the "Company" column
# (every row except the first one of each company)
df_laptops.loc[duplicated_first]

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
290,295,Acer,Chromebook C910-C2ST,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,16GB SSD,Intel HD Graphics,Chrome OS,2.19kg,199.00
1102,1117,Acer,Chromebook 15,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,4GB,16GB SSD,Intel HD Graphics,Chrome OS,2.20kg,209.00
695,703,Acer,TravelMate B117-M,Netbook,11.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,32GB Flash Storage,Intel HD Graphics,Windows 10,1.4kg,269.00
1198,1216,Acer,Aspire 3,Notebook,15.6,1366x768,Intel Celeron Dual Core N3350 2GHz,4GB,1TB HDD,Intel HD Graphics 500,Linux,2.1kg,272.00
1263,1281,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4GB,500GB HDD,Intel HD Graphics 400,Linux,2.4kg,289.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,1055,Vero,V142 (X5-Z8350/2GB/32GB/W10),Notebook,14.0,1366x768,Intel Atom X5-Z8350 1.44GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.45kg,210.80
191,195,Vero,K147 (N3350/4GB/32GB/FHD/W10),Notebook,14.0,IPS Panel Full HD 1920x1080,Intel Celeron Dual Core N3350 1.1GHz,4GB,32GB Flash Storage,Intel HD Graphics 500,Windows 10,1.3kg,260.00
192,196,Xiaomi,Mi Notebook,Ultrabook,13.3,IPS Panel Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Nvidia GeForce MX150,No OS,1.3kg,999.90
184,188,Xiaomi,Mi Notebook,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Nvidia GeForce MX150,No OS,1.95kg,1199.00


In [24]:
# rows NOT flagged as repeats in the "Company" column
# (the first row of each company)
df_laptops.loc[~duplicated_first]

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
1215,1233,Acer,C740-C9QX (3205U/2GB/32GB/Chrome,Netbook,11.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,32GB SSD,Intel HD Graphics,Chrome OS,1.3kg,174.0
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
20,21,Asus,Vivobook E200HA,Netbook,11.6,1366x768,Intel Atom x5-Z8350 1.44GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,0.98kg,191.9
30,31,Chuwi,"LapBook 15.6""",Notebook,15.6,Full HD 1920x1080,Intel Atom x5-Z8300 1.44GHz,4GB,64GB Flash Storage,Intel HD Graphics,Windows 10,1.89kg,244.99
340,346,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4GB,500GB HDD,Intel HD Graphics,Linux,2.2kg,274.9
983,997,Fujitsu,LifeBook A556,Notebook,15.6,1366x768,Intel Core i5 6200U 2.3GHz,4GB,256GB SSD,Intel HD Graphics 520,Windows 10,2.3kg,649.0
472,479,Google,Pixelbook (Core,Ultrabook,12.3,Touchscreen 2400x1600,Intel Core i5 7Y57 1.2GHz,8GB,128GB SSD,Intel HD Graphics 615,Chrome OS,1.1kg,1275.0
1268,1286,HP,Stream 11-Y000na,Netbook,11.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.17kg,209.0
170,174,Huawei,MateBook X,Ultrabook,13.0,IPS Panel Full HD 2160x1440,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows 10,1.05kg,1349.0
909,922,LG,Gram 14Z970,Ultrabook,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,512GB SSD,Intel HD Graphics 620,Windows 10,0.98kg,1899.0


In [26]:
# every value in the Company column is now distinct;
# narrow the view to the company and its price
df_laptops.loc[~duplicated_first, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
1215,Acer,174.0
1,Apple,898.94
20,Asus,191.9
30,Chuwi,244.99
340,Dell,274.9
983,Fujitsu,649.0
472,Google,1275.0
1268,HP,209.0
170,Huawei,1349.0
909,LG,1899.0


In [None]:
# Since keep='first' leaves only the first row of each company unflagged, and the df is sorted by
# ascending price, filtering with ~duplicated_first keeps the cheapest laptop of each company.
# For example, the cheapest Apple laptop costs 898.94 euros.

In [28]:
# Count the rows per company among the rows not flagged by duplicated_first.
# The statement was commented out, leaving the cell with nothing to display;
# it is restored so the per-company counts are actually shown.
df_laptops[~duplicated_first].value_counts('Company')

In [None]:
# So as we can see, every company in the Company column appears only once:
# Acer has 1, Lenovo has 1, and so on.

In [29]:
# keep='last': the last occurrence of each company is left unflagged (False),
# every earlier occurrence is flagged (True)
duplicated_last = df_laptops.duplicated(subset='Company', keep='last')

In [32]:
# Show the rows left unflagged by keep='last', i.e. one row per company.
#
# The df was sorted ascending by price within each company, so the cheapest
# laptops sit in the first rows of a company and the most expensive in the
# last ones. With keep='last' the unflagged row is therefore the most
# expensive laptop of each company.
df_laptops.loc[~duplicated_last, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
1189,Acer,2599.0
17,Apple,2858.0
1066,Asus,3975.0
421,Chuwi,449.0
723,Dell,3659.4
623,Fujitsu,799.0
437,Google,2199.0
749,HP,4389.0
214,Huawei,1499.0
678,LG,2299.0


In [33]:
# Now we have a dataframe in which every Company value is unique, so there
# aren't any duplicated companies. Each row is the most expensive laptop of its
# company. For example, the most expensive Apple laptop costs 2858 euros.

In [34]:
# And that's how duplicated rows can be found using the duplicated() method.