## Exploração inicial e processamento de dados

Verificação e pré-processamento dos dados para utilização nos experimentos de modelos

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Read the raw laptop CSV from the data/raw folder and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/laptop-price-brl.csv")
df.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2321,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2613,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2680,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,4689,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,1808,3 stars,0,0


## Pré-processamento

In [18]:
# Work on an independent deep copy so the cleaning steps never modify the raw frame `df`.
df_transformed = df.copy(deep=True)

In [19]:
# Cast every column to text and lowercase it, normalizing the casing of all labels.
# NOTE(review): this also turns the numeric columns (e.g. Price) into strings —
# confirm that later steps convert them back before modeling.
df_transformed = df_transformed.astype(str).apply(
    lambda column: column.str.lower()
)

In [20]:
# Preview the first five rows of the lowercased frame.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,asus,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2321,2 stars,3,0
1,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2613,3 stars,65,5
2,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2680,3 stars,8,1
3,asus,intel,core i5,10th,8 gb,ddr4,512 gb,0 gb,windows,32-bit,2 gb,casual,no warranty,no,no,4689,3 stars,0,0
4,asus,intel,celeron dual,not available,4 gb,ddr4,0 gb,512 gb,windows,64-bit,0 gb,casual,no warranty,no,no,1808,3 stars,0,0


## Remoção de colunas

Colunas que não serão utilizadas no treinamento nem na predição, por não terem poder preditivo generalizável.

In [21]:
# Remove the columns that will not be used for training/prediction.
# A single non-in-place `drop` either removes all five columns or raises KeyError
# without touching the frame, whereas five separate in-place drops could leave it
# half-modified if one of the labels were missing.
df_transformed = df_transformed.drop(
    columns=[
        "rating",
        "Number of Ratings",
        "Number of Reviews",
        "msoffice",
        "processor_gnrtn",
    ]
)

In [22]:
# Preview the first five rows after the unused columns were removed.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2321
1,lenovo,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2613
2,lenovo,intel,core i3,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,2680
3,asus,intel,core i5,8 gb,ddr4,512 gb,0 gb,windows,32-bit,2 gb,casual,no warranty,no,4689
4,asus,intel,celeron dual,4 gb,ddr4,0 gb,512 gb,windows,64-bit,0 gb,casual,no warranty,no,1808


In [23]:
# Strip the " gb" unit suffix from the capacity columns (e.g. "4 gb" -> "4").
# The values remain text; the same regex replacement is applied to each listed column.
df_transformed = df_transformed.assign(**{
    column: df_transformed[column].replace({" gb": ""}, regex = True)
    for column in ("ram_gb", "ssd", "hdd", "graphic_card_gb")
})

In [26]:
# Inspect the first 30 rows to check that the " gb" suffix was removed.
df_transformed.head(n=30)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,no warranty,no,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,no warranty,no,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,no warranty,no,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,no warranty,no,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,no warranty,no,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,no warranty,no,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,no warranty,no,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,no warranty,no,4019


In [28]:
# Reduce warranty to a bare number kept as text:
# "no warranty" becomes "0", then the " year"/" years" suffix is removed.
df_transformed["warranty"] = (
    df_transformed["warranty"]
    .replace({"no warranty": "0"}, regex = True)
    .replace({" (years|year)": ""}, regex = True)
)

In [29]:
# Inspect the first 30 rows to check the normalized warranty values.
df_transformed.head(n=30)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,no,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,no,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,no,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,no,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,0,no,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,0,no,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,0,no,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,0,no,4019


In [31]:
# Encode the touchscreen flag as text digits: "yes" -> "1", "no" -> "0".
# The keys are applied as regex patterns, so they match anywhere inside a value.
df_transformed = df_transformed.assign(
    Touchscreen=df_transformed["Touchscreen"].replace({"yes": "1", "no": "0"}, regex = True)
)

In [32]:
# Inspect the first 20 rows to check the encoded touchscreen flag.
df_transformed.head(n=20)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808
5,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1540
6,asus,intel,celeron dual,4,ddr4,0,512,windows,32-bit,0,casual,0,0,1473
7,asus,intel,core i5,8,ddr4,0,1024,windows,32-bit,2,casual,0,0,3940
8,lenovo,intel,core i5,4,ddr4,0,1024,windows,32-bit,0,casual,0,0,3350
9,acer,amd,ryzen 5,4,ddr4,0,512,windows,32-bit,4,casual,0,0,4019


In [33]:
# Rename the two capitalized columns to lowercase snake_case names.
df_transformed = df_transformed.rename(
    mapper={
        "Touchscreen": "touchscreen",
        "Price": "price_brl",
    },
    axis="columns",
)

In [34]:
# Preview the first five rows with the renamed columns.
df_transformed.head(n=5)

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,touchscreen,price_brl
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,0,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,2,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,0,casual,0,0,1808


In [None]:
# Convert the RAM size from text to int64.
# Values that cannot be parsed become NaN (errors='coerce') and are then replaced by 0.
df_transformed["ram_gb"] = (
    pd.to_numeric(df_transformed["ram_gb"], errors = 'coerce')
    .fillna(0)
    .astype(np.int64)
)
