# Exploração Inicial e Processamento de Dados
Verificação e pré-processamento de dados para utilização de modelos.

In [38]:
import pandas as pd
import numpy as np
import dagshub
from dagshub import upload_files

In [2]:
df = pd.read_csv("../data/raw/laptop-price-brl.csv")
df.head()

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2321,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2613,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,2680,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,4689,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,1808,3 stars,0,0


## Pré-processamento

Padronização dos dados, consistindo nas etapas de:

* Padronização de caixa baixa
* Transformação de valores _string_ para _long_
* Transformação e criação de novos features

In [3]:
df_transformed = df.copy()

df_transformed = df_transformed.astype(str).apply(lambda x: x.str.lower())

In [4]:
df_transformed.head()

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,asus,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2321,2 stars,3,0
1,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2613,3 stars,65,5
2,lenovo,intel,core i3,10th,4 gb,ddr4,0 gb,1024 gb,windows,64-bit,0 gb,casual,no warranty,no,no,2680,3 stars,8,1
3,asus,intel,core i5,10th,8 gb,ddr4,512 gb,0 gb,windows,32-bit,2 gb,casual,no warranty,no,no,4689,3 stars,0,0
4,asus,intel,celeron dual,not available,4 gb,ddr4,0 gb,512 gb,windows,64-bit,0 gb,casual,no warranty,no,no,1808,3 stars,0,0


As colunas a seguir não deverão ser utilizadas pois elas não conseguem ser simuladas por um serviço projetado da qual o usuário irá fornecer as características para a simulação do preço do laptop.

In [5]:
df_transformed.drop("rating", axis=1, inplace=True)
df_transformed.drop("Number of Ratings", axis=1, inplace=True)
df_transformed.drop("Number of Reviews", axis=1, inplace=True)
df_transformed.drop("msoffice", axis=1, inplace=True)
df_transformed.drop("processor_gnrtn", axis=1, inplace=True)

Ajustando os dados para remover e adaptar conteúdos string para numeric.

In [6]:
df_transformed['ram_gb'] = df_transformed['ram_gb'].replace({' gb' : ''}, regex=True)
df_transformed['ssd'] = df_transformed['ssd'].replace({' gb' : ''}, regex=True)
df_transformed['hdd'] = df_transformed['hdd'].replace({' gb' : ''}, regex=True)
df_transformed['graphic_card_gb'] = df_transformed['ram_gb'].replace({' gb' : ''}, regex=True)
df_transformed['warranty'] = df_transformed['warranty'].replace({'no warranty' : '0'}, regex=True)
df_transformed['warranty'] = df_transformed['warranty'].replace({' (years|year)' : ''}, regex=True)
df_transformed['Touchscreen'] = df_transformed['Touchscreen'].replace({'no' : '0'}, regex=True)
df_transformed['Touchscreen'] = df_transformed['Touchscreen'].replace({'yes' : '1'}, regex=True)

In [7]:
df_transformed.head()

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,Price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,8,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,4,casual,0,0,1808


Renomeando as colunas para manter a padronização por nomes em caixa baixa.

In [8]:
df_transformed = df_transformed.rename(columns={"Touchscreen": "touchscreen","Price": "price"})

In [9]:
df_transformed.head()

Unnamed: 0,brand,processor_brand,processor_name,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,touchscreen,price
0,asus,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2321
1,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2613
2,lenovo,intel,core i3,4,ddr4,0,1024,windows,64-bit,4,casual,0,0,2680
3,asus,intel,core i5,8,ddr4,512,0,windows,32-bit,8,casual,0,0,4689
4,asus,intel,celeron dual,4,ddr4,0,512,windows,64-bit,4,casual,0,0,1808


Convertendo os tipos de dados para cada coluna, convertendo os tipos adequados, ou seja, valores numéricos deverão ser convertidos para inteiro.

In [10]:
df_transformed['ram_gb'] = pd.to_numeric(df_transformed['ram_gb'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['hdd'] = pd.to_numeric(df_transformed['hdd'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['ssd'] = pd.to_numeric(df_transformed['ssd'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['graphic_card_gb'] = pd.to_numeric(df_transformed['graphic_card_gb'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['warranty'] = pd.to_numeric(df_transformed['warranty'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['price'] = pd.to_numeric(df_transformed['price'], errors='coerce').fillna(0).astype(np.float64)
df_transformed['touchscreen'] = pd.to_numeric(df_transformed['touchscreen'], errors='coerce').fillna(0).astype(np.int64)
df_transformed['price'] = pd.to_numeric(df_transformed['price'], errors='coerce').fillna(0).astype(np.int64)

Diminuir a quantidade de itens para tornar o modelo mais simples e balancear os dados.

Primeiro vamos verificar o quão diverso os dados categóricos estão.

In [11]:
df_transformed["processor_name"].unique()

array(['core i3', 'core i5', 'celeron dual', 'ryzen 5', 'core i7',
       'core i9', 'm1', 'pentium quad', 'ryzen 3', 'ryzen 7', 'ryzen 9'],
      dtype=object)

In [12]:
df_transformed["os"].unique()

array(['windows', 'dos', 'mac'], dtype=object)

In [13]:
df_transformed["processor_name"].unique()

array(['core i3', 'core i5', 'celeron dual', 'ryzen 5', 'core i7',
       'core i9', 'm1', 'pentium quad', 'ryzen 3', 'ryzen 7', 'ryzen 9'],
      dtype=object)

In [14]:
df_transformed["ram_type"].unique()

array(['ddr4', 'lpddr4', 'lpddr4x', 'ddr5', 'ddr3', 'lpddr3'],
      dtype=object)

In [15]:
df_transformed["weight"].unique()

array(['casual', 'thinnlight', 'gaming'], dtype=object)

Agora vamos fazer algumas alterações, prezando menos granularidade, pois isso também pode ajudar na interface com o usuário, nem todos sabem no detalhe suas configurações.

In [16]:
df_transformed['os'] = df_transformed['os'].replace({'mac': 'other', 'dos': 'other'})

df_transformed['ram_type'] = df_transformed['ram_type'].replace({
    'lpddr4x': 'other', 'lpddr4': 'other', 'lpddr3': 'other',
    'ddr5': 'other', 'ddr3': 'other'
})

df_transformed['processor_name'] = df_transformed['processor_name'].replace({
    'core i9': 'other', 'pentium quad': 'other', 'm1': 'other',
    'celeron dual': 'other', 'ryzen 9': 'other', 'ryzen 3': 'ryzen 7'
})

df_transformed['brand'] = df_transformed['brand'].replace({
    'acer': 'other', 'msi': 'other', 'apple': 'other', 'avita': 'other'
})

Remover itens duplicados.

In [17]:
df_transformed.drop_duplicates(inplace=True)

Verificando se há dados nulos e confirmando os tipos de dados.

In [18]:
df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 779 entries, 0 to 822
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   brand            779 non-null    object
 1   processor_brand  779 non-null    object
 2   processor_name   779 non-null    object
 3   ram_gb           779 non-null    int64 
 4   ram_type         779 non-null    object
 5   ssd              779 non-null    int64 
 6   hdd              779 non-null    int64 
 7   os               779 non-null    object
 8   os_bit           779 non-null    object
 9   graphic_card_gb  779 non-null    int64 
 10  weight           779 non-null    object
 11  warranty         779 non-null    int64 
 12  touchscreen      779 non-null    int64 
 13  price            779 non-null    int64 
dtypes: int64(7), object(7)
memory usage: 91.3+ KB


## Análise Exploratória
Vamos utilizar o Yala Profiling para explorar brevemente os dados em busca de conhecer as principais features e averigar se os dados estão prontos para serem utilizados como treinamento.

Criando atributos one-hot encoded.

In [19]:
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["brand"], prefix="brand")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["processor_brand"], prefix="processor_brand")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["processor_name"], prefix="processor_name")

df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["os"], prefix="os")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["weight"], prefix="weight")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["warranty"], prefix="warranty")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["touchscreen"], prefix="touchscreen")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["ram_gb"], prefix="ram_gb")

df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["hdd"], prefix="hdd")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["ssd"], prefix="ssd")

df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["graphic_card_gb"], prefix="graphic_card_gb")
df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["ram_type"], prefix="ram_type")

df_transformed = pd.get_dummies(df_transformed, dtype=int, columns=["os_bit"], prefix="os_bit")

In [33]:
df_transformed

Unnamed: 0,price,brand_asus,brand_dell,brand_hp,brand_lenovo,brand_other,processor_brand_amd,processor_brand_intel,processor_brand_m1,processor_name_core i3,...,ssd_2048,ssd_3072,graphic_card_gb_4,graphic_card_gb_8,graphic_card_gb_16,graphic_card_gb_32,ram_type_ddr4,ram_type_other,os_bit_32-bit,os_bit_64-bit
0,2321,1,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1
1,2613,0,0,0,1,0,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1
2,2680,0,0,0,1,0,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1
3,4689,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,1,0
4,1808,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,9111,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
819,9714,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
820,10049,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1
821,9580,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,1


Exportando os dados processados.

In [34]:
df_transformed.to_csv("../data/processed/laptop-price-brl-processed.csv", index=False)

## Conectando com DagsHub

Você precisa criar uma conta e conectar o repositório antes.

In [37]:
dagshub.init(repo_owner='michelpf', repo_name='fiap-ds-mlops-laptop-pricing-brl')

Importação da pasta de dados. Neste parte o caminho precisa ser relativo a pasta em que estamos.

In [None]:
upload_files('michelpf/fiap-ds-mlops-laptop-pricing-brl', '../data/raw/laptop-price-brl.csv', remote_path='data/raw/laptop-price-brl.csv' , commit_message='update data')
upload_files('michelpf/fiap-ds-mlops-laptop-pricing-brl', '../data/processed/laptop-price-brl-processed.csv', remote_path='data/processed/laptop-price-brl-processed.csv', commit_message='update data')

[Errno 2] No such file or directory: '/Users/michelpf/Documents/Projects/fiap-ds-mlops-laptop-pricing-brl/data/raw/*.csv'


FileNotFoundError: [Errno 2] No such file or directory: '/Users/michelpf/Documents/Projects/fiap-ds-mlops-laptop-pricing-brl/data/raw/*.csv'

Agora vamos criar o dataset a partir dos dados enviados. Neste caso precisamos informar o caminho referente ao repositório.

In [32]:
from dagshub.data_engine import datasources

# Create a datasource from the 'dataset/' folder in a repo it
ds = datasources.create_from_repo(repo="michelpf/fiap-ds-mlops-laptop-pricing-brl", name="laptop-pricing", path="data")

In [51]:
import dagshub
dagshub.__version__

'0.2.7'