## Exploratory Data Analysis on the Laptop Prices Dataset

### 1.0 Importando as libs

In [115]:
import sys
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### 2.0 Carregar o dataset

In [136]:
prices_df = pd.read_csv("../data/raw/laptop_price.csv", sep=',', encoding='latin1')
print(prices_df.head())

   laptop_ID Company      Product   TypeName  Inches   
0          1   Apple  MacBook Pro  Ultrabook    13.3  \
1          2   Apple  Macbook Air  Ultrabook    13.3   
2          3      HP       250 G6   Notebook    15.6   
3          4   Apple  MacBook Pro  Ultrabook    15.4   
4          5   Apple  MacBook Pro  Ultrabook    13.3   

                     ScreenResolution                         Cpu   Ram   
0  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB  \
1                            1440x900        Intel Core i5 1.8GHz   8GB   
2                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB   
3  IPS Panel Retina Display 2880x1800        Intel Core i7 2.7GHz  16GB   
4  IPS Panel Retina Display 2560x1600        Intel Core i5 3.1GHz   8GB   

                Memory                           Gpu  OpSys  Weight   
0            128GB SSD  Intel Iris Plus Graphics 640  macOS  1.37kg  \
1  128GB Flash Storage        Intel HD Graphics 6000  macOS  1.34kg   

In [117]:
prices_df.shape

(1303, 13)

In [118]:
prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Memory            1303 non-null   object 
 9   Gpu               1303 non-null   object 
 10  OpSys             1303 non-null   object 
 11  Weight            1303 non-null   object 
 12  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(10)
memory usage: 132.5+ KB


### 3.0 Explorando os dados

#### 3.1 Verificando os valores únicos de cada coluna
Vamos começar entendendo como estão distribuídas nossas variáveis (numeros vs texto) para descobrir o Feature Engineering necessário

In [119]:
for column in prices_df.columns:
    print(f"Coluna: {column} | Valores únicos: {prices_df[column].nunique()}")
    print(prices_df[column].unique())
    print("-"*100)

Coluna: laptop_ID | Valores únicos: 1303
[   1    2    3 ... 1318 1319 1320]
----------------------------------------------------------------------------------------------------
Coluna: Company | Valores únicos: 19
['Apple' 'HP' 'Acer' 'Asus' 'Dell' 'Lenovo' 'Chuwi' 'MSI' 'Microsoft'
 'Toshiba' 'Huawei' 'Xiaomi' 'Vero' 'Razer' 'Mediacom' 'Samsung' 'Google'
 'Fujitsu' 'LG']
----------------------------------------------------------------------------------------------------
Coluna: Product | Valores únicos: 618
['MacBook Pro' 'Macbook Air' '250 G6' 'Aspire 3' 'ZenBook UX430UN'
 'Swift 3' 'Inspiron 3567' 'MacBook 12"' 'IdeaPad 320-15IKB' 'XPS 13'
 'Vivobook E200HA' 'Legion Y520-15IKBN' '255 G6' 'Inspiron 5379'
 '15-BS101nv (i7-8550U/8GB/256GB/FHD/W10)' 'MacBook Air' 'Inspiron 5570'
 'Latitude 5590' 'ProBook 470' 'LapBook 15.6"'
 'E402WA-GA010T (E2-6110/2GB/32GB/W10)'
 '17-ak001nv (A6-9220/4GB/500GB/Radeon' 'IdeaPad 120S-14IAP'
 'Inspiron 5770' 'ProBook 450' 'X540UA-DM186 (i3-6006U/4GB/1TB

#### 3.2 Começamos dropand uma coluna que com certeza não vamos usar devido a alta cardinalidade: Nome do produto

In [120]:
prices_df = prices_df.drop("Product", axis=1)

#### 3.3 Fazendo One Hot Encoding na coluna Company Name

In [121]:
prices_df["Company"].value_counts()

Company
Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: count, dtype: int64

In [122]:
companies_to_agg = ["Razer",
"Mediacom",
"Microsoft",
"Xiaomi",
"Vero",
"Chuwi",
"Google",
'Fujitsu',
'LG',
'Huawei']

In [123]:
prices_df["Company"] = prices_df["Company"].apply(lambda x: "Other" if x in companies_to_agg else x)

In [124]:
# Screen Resolution - A resolução é sempre no final da string, então é fácil
prices_df["ScreenResolution"] = prices_df["ScreenResolution"].str.split(" ").apply(lambda x: x[-1])
prices_df["ScreenWidth"] =  prices_df["ScreenResolution"].str.split("x").apply(lambda x: x[0]).astype("int")
prices_df["ScreenHeight"] =  prices_df["ScreenResolution"].str.split("x").apply(lambda x: x[1]).astype("int")

In [125]:
# CPU é mesma coisa, a frequencia do CPU é sempre a ultima e a marca é a primeira
prices_df["CPU_BRAND"] =  prices_df["Cpu"].str.split(" ").apply(lambda x: x[0])
prices_df["CPU_FREQUENCY"] =  prices_df["Cpu"].str.split(" ").apply(lambda x: x[-1])
prices_df["CPU_FREQUENCY"] =  prices_df["CPU_FREQUENCY"].apply(lambda x: x[:-3]).astype("float")

In [126]:
# RAM
prices_df["Ram"] =  prices_df["Ram"].apply(lambda x: x[:-2]).astype("int")

In [127]:
def convert_to_gb(string):
    number = float(string[:-2])

    if "TB" in string:
        number = number*1024
    
    return number

In [128]:
# Memory
prices_df["Memory_Size"] = prices_df["Memory"].str.split(" ").apply(lambda x: x[0])
prices_df["Memory_Size"] = prices_df["Memory_Size"].apply(lambda x: convert_to_gb(x))
prices_df["Memory_Type"] = prices_df["Memory"].str.split(" ").apply(lambda x: x[1])

In [129]:
# Weight
prices_df["Weight"] = prices_df["Weight"].apply(lambda x: x[:-2]).astype("float")

In [130]:
# GPU
prices_df["GPU_BRAND"] = prices_df["Gpu"].str.split(" ").apply(lambda x: x[0])

In [131]:
# Get Dummies das variaveis categóricas finais
prices_df = prices_df.join(pd.get_dummies(prices_df["Company"], prefix="company", dtype="int"))
prices_df = prices_df.join(pd.get_dummies(prices_df["TypeName"], prefix="typeName", dtype="int"))
prices_df = prices_df.join(pd.get_dummies(prices_df["CPU_BRAND"], prefix="CPU_BRAND", dtype="int"))
prices_df = prices_df.join(pd.get_dummies(prices_df["GPU_BRAND"], prefix="GPU_BRAND", dtype="int"))
prices_df = prices_df.join(pd.get_dummies(prices_df["OpSys"], prefix="OpSys", dtype="int"))
prices_df = prices_df.join(pd.get_dummies(prices_df["Memory_Type"], prefix="Memory_Type", dtype="int"))

In [132]:
prices_df = prices_df.drop(["Company", "TypeName", "ScreenResolution", "Cpu", "Memory", "Gpu", "OpSys", "GPU_BRAND", "Memory_Type", "CPU_BRAND"], axis=1)

In [113]:
prices_df

Unnamed: 0,laptop_ID,Inches,Ram,Weight,Price_euros,company_Acer,company_Apple,company_Asus,company_Chuwi,company_Dell,...,GPU_BRAND_Nvidia,OpSys_Android,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
0,1,13.3,8,1.37,1339.69,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,13.3,8,1.34,898.94,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,15.6,8,1.86,575.00,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,15.4,16,1.83,2537.45,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,13.3,8,1.37,1803.60,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,1316,14.0,4,1.80,638.00,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1299,1317,13.3,16,1.30,1499.00,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1300,1318,14.0,2,1.50,229.00,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1301,1319,15.6,6,2.19,764.00,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
