In [21]:
import pandas as pd

In [22]:
# Load the laptop price dataset from a CSV file located in the notebook's working directory
df_laptops = pd.read_csv('laptop_price.csv')

In [23]:
# Preview the first three rows to check which columns the dataset has
df_laptops.head(n=3)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,1339.69
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,898.94
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,575.0


## 1. drop_duplicates()

In [6]:
# Dropping rows that are duplicates with respect to a single column ('Company').
# General form: .drop_duplicates(subset=[<column(s) to evaluate>])
# Only the first row seen for each company is kept (keep='first' is the default),
# so no company name appears more than once in the result.
df_laptops.drop_duplicates(subset=['Company'])[['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
0,Apple,1339.69
2,HP,575.0
5,Acer,400.0
8,Asus,1495.0
13,Dell,498.9
18,Lenovo,499.0
30,Chuwi,244.99
58,MSI,2449.0
70,Microsoft,1089.0
143,Toshiba,602.0


In [None]:
# And then we shouldn't get any duplicate element in the company column
# the [['Company', 'Price_euros']] part in the instruction is to select only those columns

# And now we can easily see all the elements: in the Company column
# we don't get any repeated values (Apple, HP, Acer, and so on).

# we can verify this with the value_counts() method

In [7]:
# Count the rows left for each company after de-duplicating; every count should be 1
df_laptops.drop_duplicates(subset=['Company'])[['Company', 'Price_euros']].value_counts(subset='Company')

Company
Acer         1
Lenovo       1
Vero         1
Toshiba      1
Samsung      1
Razer        1
Microsoft    1
Mediacom     1
MSI          1
LG           1
Apple        1
Huawei       1
HP           1
Google       1
Fujitsu      1
Dell         1
Chuwi        1
Asus         1
Xiaomi       1
Name: count, dtype: int64

In [8]:
# And we can see that all of them only have one element. So there are no duplicated elements

In [11]:
# Order the rows by company, then by price within each company (ascending),
# so each company's cheapest laptop comes first and its most expensive comes last
df_laptops = df_laptops.sort_values(by=['Company', 'Price_euros'], ascending=True)

In [None]:
# And now the dataframe is sorted

In [13]:
# keep='first' retains the first row of each company; because the frame is sorted
# ascending by company and price, that row is the company's cheapest laptop
df_laptops.drop_duplicates(subset='Company', keep='first').loc[:, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
1215,Acer,174.0
1,Apple,898.94
20,Asus,191.9
30,Chuwi,244.99
340,Dell,274.9
983,Fujitsu,649.0
472,Google,1275.0
1268,HP,209.0
170,Huawei,1349.0
909,LG,1899.0


In [None]:
# And here we can see all the laptops with the cheapest values per company

In [14]:
# keep='last' retains the last row of each company; because the frame is sorted
# ascending by company and price, that row is the company's most expensive laptop
df_laptops.drop_duplicates(subset='Company', keep='last').loc[:, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
1189,Acer,2599.0
17,Apple,2858.0
1066,Asus,3975.0
421,Chuwi,449.0
723,Dell,3659.4
623,Fujitsu,799.0
437,Google,2199.0
749,HP,4389.0
214,Huawei,1499.0
678,LG,2299.0


In [None]:
# And we see here all the most expensive laptops per company

In [None]:
# Arguments

# inplace=True: drop duplicates in place (the changes are saved directly to the original df).
# With inplace=True the method returns None, so the call cannot be chained with [['Company','Price_euros']]:
# df_laptops.drop_duplicates('Company', keep='last', inplace=True)

# ignore_index=True: it ignores all the original indexes that the dataframe had 
# So for example, it's going to replace the 1189 index with 0, the 17 index with 1, the 1066 index with 2, and so on

In [15]:
# Keep only the last row of each company, modifying df_laptops itself (inplace=True)
# and renumbering the surviving rows 0, 1, 2, ... (ignore_index=True)
df_laptops.drop_duplicates(subset='Company', keep='last', inplace=True, ignore_index=True)

In [16]:
# The previous in-place call already changed df_laptops, so this shows the updated frame
df_laptops.head(n=5)

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
0,1207,Acer,Predator G9-793,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1070,Windows 10,4.2kg,2599.0
1,18,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.9GHz,16GB,512GB SSD,AMD Radeon Pro 560,macOS,1.83kg,2858.0
2,1081,Asus,ROG G701VO,Gaming,17.3,IPS Panel Full HD 1920x1080,Intel Core i7 6820HK 2.7GHz,64GB,1TB SSD,Nvidia GeForce GTX 980,Windows 10,3.58kg,3975.0
3,428,Chuwi,LapBook 12.3,Notebook,12.3,IPS Panel Retina Display 2736x1824,Intel Celeron Quad Core N3450 1.1GHz,6GB,64GB Flash Storage,Intel HD Graphics 500,Windows 10,1.4kg,449.0
4,731,Dell,Alienware 17,Gaming,17.3,4K Ultra HD 3840x2160,Intel Core i7 7700HQ 2.8GHz,32GB,1TB SSD + 1TB HDD,Nvidia GeForce GTX 1070,Windows 10,4.36kg,3659.4


In [None]:
# And now we have this dataframe. And it starts with index 0, 1, 2, 3, and so on.

In [17]:
# Show only the company and price columns of the current frame
df_laptops.loc[:, ['Company', 'Price_euros']]

Unnamed: 0,Company,Price_euros
0,Acer,2599.0
1,Apple,2858.0
2,Asus,3975.0
3,Chuwi,449.0
4,Dell,3659.4
5,Fujitsu,799.0
6,Google,2199.0
7,HP,4389.0
8,Huawei,1499.0
9,LG,2299.0


### Example 1

I'm going to find the biggest and smallest screen size among the laptops of each company using the\
sort_values() and drop_duplicates() methods (with the keep parameter set to "first", "last", and False).

In [28]:
# Reload the full dataset from the CSV file: the earlier in-place drop_duplicates call
# modified df_laptops, so this example starts again from the file contents
df_laptops = pd.read_csv('laptop_price.csv')

In [30]:
# Order the rows by company, then by screen size within each company (ascending),
# so each company's smallest screen comes first and its biggest comes last
df_laptops = df_laptops.sort_values(by=['Company', 'Inches'], ascending=True)

In [31]:
# keep='first' retains the first row of each company; because the frame is sorted
# ascending by company and inches, that row has the company's smallest screen
df_laptops.drop_duplicates(subset='Company', keep='first').loc[:, ['Company', 'Inches']]

Unnamed: 0,Company,Inches
319,Acer,11.6
1234,Apple,11.6
20,Asus,11.6
421,Chuwi,12.3
455,Dell,11.6
567,Fujitsu,15.6
437,Google,12.3
677,HP,11.6
170,Huawei,13.0
909,LG,14.0


In [33]:
# And there you have laptops with the smallest screens. So for example, the Apple laptop
# with the smallest screen is 11.6"

In [32]:
# keep='last' retains the last row of each company; because the frame is sorted
# ascending by company and inches, that row has the company's biggest screen
df_laptops.drop_duplicates(subset='Company', keep='last').loc[:, ['Company', 'Inches']]

Unnamed: 0,Company,Inches
1216,Acer,17.3
17,Apple,15.4
1256,Asus,17.3
483,Chuwi,15.6
1243,Dell,17.3
983,Fujitsu,15.6
762,Google,12.3
1136,HP,17.3
214,Huawei,13.0
905,LG,15.6


In [None]:
# And here we have the laptops with the biggest screens. So for example, the Apple laptop
# with the biggest screen is 15.4"