## 1. Load Required Libraries

In [14]:
import pandas as pd
import src.util as utils
from sklearn.model_selection import train_test_split

## 2. Load Configuration File

In [15]:
# Load the project settings through the local helper and echo them, so the
# paths, split sizes and column lists used below are visible in the notebook.
config = utils.load_config()
config

{'dataset_raw_path': 'data/raw/laptop_price_data.csv',
 'dataset_path': 'data/processed/raw_dataset.pkl',
 'test_size': 0.3,
 'valid_size': 0.5,
 'train_set_path': ['data/processed/x_train.pkl',
  'data/processed/y_train.pkl'],
 'valid_set_path': ['data/processed/x_valid.pkl',
  'data/processed/y_valid.pkl'],
 'test_set_path': ['data/processed/x_test.pkl', 'data/processed/y_test.pkl'],
 'train_clean_path': ['data/processed/x_train_clean.pkl',
  'data/processed/y_train_clean.pkl'],
 'valid_clean_path': ['data/processed/x_valid_clean.pkl',
  'data/processed/y_valid_clean.pkl'],
 'test_clean_path': ['data/processed/x_test_clean.pkl',
  'data/processed/y_test_clean.pkl'],
 'production_model_path': 'models/production_model.pkl',
 'training_log_path': 'log/training_log.json',
 'standardizer_path': 'data/output/standardizer.pkl',
 'ohe_path': 'data/output/one_hot_encoder.pkl',
 'preprocessor_path': 'data/output/preprocessor.pkl',
 'label': 'Price',
 'predictors': ['Company',
  'TypeName',
  '

## 3. Load Dataset

In [16]:
# Read the raw CSV from the location stored under "dataset_raw_path" in the
# config, then preview the first five rows as a quick sanity check.
dataset = pd.read_csv(filepath_or_buffer=config["dataset_raw_path"])
dataset.head(n=5)

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,HDD,SSD,Hybrid,Flash_Storage
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128 SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832,0,128,0,0
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128 Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232,0,0,0,128
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256 SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0,0,256,0,0
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512 SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336,0,512,0,0
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256 SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808,0,256,0,0


In [29]:
# Number of rows per distinct Company value, most frequent first.
dataset["Company"].value_counts()

Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: Company, dtype: int64

In [17]:
# Number of rows per distinct Flash_Storage value, most frequent first.
dataset["Flash_Storage"].value_counts()

0      1228
32       38
64       16
256       8
16        7
128       4
512       2
Name: Flash_Storage, dtype: int64

## 4. Data Validation

In [18]:
# Count missing values in every column.
dataset.isna().sum()

Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
HDD                 0
SSD                 0
Hybrid              0
Flash_Storage       0
dtype: int64

In [19]:
# Show the dtype pandas inferred for each column when reading the CSV.
dataset.dtypes

Company              object
TypeName             object
Inches              float64
ScreenResolution     object
Cpu                  object
Ram                  object
Memory               object
Gpu                  object
OpSys                object
Weight               object
Price               float64
HDD                   int64
SSD                   int64
Hybrid                int64
Flash_Storage         int64
dtype: object

In [20]:
def check_data(input_data, params):
    """Check that every ``Company`` value is one of the allowed companies.

    Parameters
    ----------
    input_data
        Object exposing a ``Company`` attribute that can be iterated
        (in this notebook, the loaded pandas DataFrame).
    params
        Mapping whose ``"range_company"`` entry lists the accepted
        company names.

    Raises
    ------
    AssertionError
        If ``Company`` contains a value that is not listed in
        ``params["range_company"]``. The message names the offending values.
    """
    # check range of data
    unexpected = set(input_data.Company) - set(params["range_company"])
    if unexpected:
        # Raised explicitly instead of via ``assert`` so the validation is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError(
            "an error occurs in Company range. "
            f"Unexpected values: {sorted(unexpected, key=str)}"
        )

In [21]:
# Validate the loaded data against the allowed ranges from the config;
# raises if the check fails, so later cells only run on validated data.
check_data(input_data=dataset, params=config)

## 5. Data Splitting

In [22]:
# Separate the predictor columns from the label column, both named in the
# config; .copy() keeps x and y independent of `dataset`.
x = dataset.loc[:, config["predictors"]].copy()
y = dataset.loc[:, config["label"]].copy()

In [23]:
# Display the predictor frame to confirm the selected columns.
x

Unnamed: 0,Company,TypeName,OpSys,Cpu,Inches,Ram,Memory,Gpu,Weight,ScreenResolution,HDD,SSD,Hybrid,Flash_Storage
0,Apple,Ultrabook,macOS,Intel Core i5 2.3GHz,13.3,8GB,128 SSD,Intel Iris Plus Graphics 640,1.37kg,IPS Panel Retina Display 2560x1600,0,128,0,0
1,Apple,Ultrabook,macOS,Intel Core i5 1.8GHz,13.3,8GB,128 Flash Storage,Intel HD Graphics 6000,1.34kg,1440x900,0,0,0,128
2,HP,Notebook,No OS,Intel Core i5 7200U 2.5GHz,15.6,8GB,256 SSD,Intel HD Graphics 620,1.86kg,Full HD 1920x1080,0,256,0,0
3,Apple,Ultrabook,macOS,Intel Core i7 2.7GHz,15.4,16GB,512 SSD,AMD Radeon Pro 455,1.83kg,IPS Panel Retina Display 2880x1800,0,512,0,0
4,Apple,Ultrabook,macOS,Intel Core i5 3.1GHz,13.3,8GB,256 SSD,Intel Iris Plus Graphics 650,1.37kg,IPS Panel Retina Display 2560x1600,0,256,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,2 in 1 Convertible,Windows 10,Intel Core i7 6500U 2.5GHz,14.0,4GB,128 SSD,Intel HD Graphics 520,1.8kg,IPS Panel Full HD / Touchscreen 1920x1080,0,128,0,0
1299,Lenovo,2 in 1 Convertible,Windows 10,Intel Core i7 6500U 2.5GHz,13.3,16GB,512 SSD,Intel HD Graphics 520,1.3kg,IPS Panel Quad HD+ / Touchscreen 3200x1800,0,512,0,0
1300,Lenovo,Notebook,Windows 10,Intel Celeron Dual Core N3050 1.6GHz,14.0,2GB,64 Flash Storage,Intel HD Graphics,1.5kg,1366x768,0,0,0,64
1301,HP,Notebook,Windows 10,Intel Core i7 6500U 2.5GHz,15.6,6GB,1000 HDD,AMD Radeon R5 M330,2.19kg,1366x768,1000,0,0,0


In [24]:
# Display the label series to confirm it holds the target values.
y

0        71378.6832
1        47895.5232
2        30636.0000
3       135195.3360
4        96095.8080
           ...     
1298     33992.6400
1299     79866.7200
1300     12201.1200
1301     40705.9200
1302     19660.3200
Name: Price, Length: 1303, dtype: float64

In [25]:
# First split: keep config["test_size"] of the rows aside as a hold-out set.
# The fraction is read from the config rather than repeated as a literal, so
# the notebook cannot drift from the project settings. The fixed random_state
# makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=config["test_size"], random_state=42
)

In [26]:
# Second split: divide the hold-out from the previous cell into validation and
# test parts. config["valid_size"] is passed as train_size because x_valid is
# the first returned part; with the configured 0.5 this yields the same split
# as the former literal test_size = 0.5.
# NOTE(review): assumes valid_size means the share of the hold-out kept for
# validation - confirm against the rest of the project.
# This cell rebinds x_test / y_test, which are also its inputs, so re-running
# it on its own splits the already reduced test set again; re-run the previous
# split cell first.
x_valid, x_test, y_valid, y_test = train_test_split(
    x_test, y_test, train_size=config["valid_size"], random_state=42
)

In [27]:
# Persist the full loaded frame first, then every split.
utils.pickle_dump(dataset, config["dataset_path"])

for split_key, features, target in (
    ("train_set_path", x_train, y_train),
    ("valid_set_path", x_valid, y_valid),
    ("test_set_path", x_test, y_test),
):
    # Each *_set_path config entry is a pair: [features path, target path].
    utils.pickle_dump(features, config[split_key][0])
    utils.pickle_dump(target, config[split_key][1])