# Data Ingestion: Laptop Price Dataset

In [1]:
# Import libraries for system configuration
import os 
import sys 

# Add the parent directory to the module search path.
# NOTE(review): presumably the parent directory is the project root that holds
# the `scripts` package imported in the next cell — confirm if the notebook is moved.
sys.path.append(os.path.abspath(".."))

# Import libraries for data collection
import numpy as np 
import pandas as pd 

## Data Loading

In [2]:
# Import dataloader from script collector 
from scripts.collection.collector import DataLoader
from scripts.collection.ingestor import Ingestor

In [3]:
# Location of the raw dataset handed to the DataLoader in the next cell:
# file name first, then the folder that contains it.
dataset_filename = "ebay_laptop_ds.csv"
data_folder = "data"

In [4]:
# Create a data loader for the raw laptop dataset.
# NOTE(review): DataLoader is project code; how `folder` is resolved is not
# visible here — presumably relative to the project root, TODO confirm.
data_loader = DataLoader(filename=dataset_filename, folder=data_folder)
# Load the dataset; the following cells treat the result as a pandas DataFrame.
laptop_ds = data_loader.load()
laptop_ds

File accepted


Unnamed: 0,Brand,Price,Currency,Color,Features,Condition,Condition Description,Seller Note,GPU,Processor,...,Height of the Display,OS,Storage Type,Hard Drive Capacity,Hard Drive Capacity Unit,SSD Capacity,SSD Capacity Unit,Screen Size (inch),Ram Size,Ram Size Unit
0,other,303.80,$,gray,"Backlit Keyboard, Built-in Microphone, Built...",New,"A brand-new, unused, unopened, undamaged item ...",undefined,intel,quad core,...,1440.0,windows,ssd,512.0,gb,1.0,tb,14,8.0,gb
1,dell,400.00,$,black,"Backlit Keyboard, Bluetooth, Built-in Micropho...",Very Good - Refurbished,The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...,intel,intel core i7 8th generation,...,1080.0,windows,ssd,2.0,tb,,unknown,14,,unknown
2,dell,175.00,$,black,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",Used,An item that has been used previously. The ite...,"well kept, fully functional, includes battery,...",intel,intel core i5-6300u,...,1080.0,windows,ssd,500.0,gb,500.0,gb,14,16.0,gb
3,hp,85.00,$,black,"Bluetooth, Built-in Microphone, Built-in Webca...",Good - Refurbished,The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...,intel,intel celeron n,...,768.0,chrome,emmc,16.0,gb,,unknown,11.6,4.0,gb
4,dell,101.25,$,other,"10/100 LAN Card, Built-in Microphone, Built-in...",Good - Refurbished,The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...,other,intel core i5 6th generation,...,768.0,windows,ssd,,unknown,256.0,gb,12.5,8.0,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,162.20,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,undefined,...,,unknown,unknown,,unknown,,unknown,,,unknown
4179,acer,93.25,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,...,,unknown,unknown,,unknown,,unknown,,,unknown
4180,dell,424.80,$,black,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",Used,An item that has been used previously. The ite...,undefined,intel,intel core i5 6th generation,...,1080.0,windows,unknown,,unknown,120.0,gb,12.5,,unknown
4181,hp,90.94,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,...,,unknown,unknown,,unknown,,unknown,,,unknown


## ETL-Process

+ **Extraction**: extract features from dataset
+ **Transformation**: create three new datasets (numerical, categorical and complex) and apply transformation operations
+ **Loading**: load the datasets into a MySQL database

**Extraction**

Features can be split into 3 main groups:
+ **Numerical Data**
+ **Categorical Data**
+ **Complex Data**: Features, Seller Note and Condition Description


In [5]:
# Build a per-column summary of the raw dataset: dtype, number of nulls,
# number of distinct values, a sample value and a coarse feature type.
desc_data = pd.DataFrame(
    {
        "DataTypes": laptop_ds.dtypes,
        "Nullvalues": laptop_ds.isnull().sum(),
        "Uniques": laptop_ds.nunique(),
    }
)

# Sample value per column, taken from the first row only (positional, one
# entry per column). Reading the row with iloc avoids materialising the whole
# frame as an array just to look at one row.
desc_data["Values"] = laptop_ds.iloc[0].to_numpy()

# Classify by column dtype rather than by the Python type of the first-row
# sample: a text column whose first row happens to be NaN (a float) must not
# be labelled "Numerical".
numeric_mask = desc_data["DataTypes"].map(pd.api.types.is_numeric_dtype)
desc_data["FeatureType"] = np.where(numeric_mask, "Numerical", "Categorical")

# Reshape dataset: turn the column names (index) into a "Properties" column
desc_data = desc_data.reset_index().rename(columns={"index": "Properties"})

# Free-text columns get their own "Complex" group, whatever their dtype
complex_properties = ["Features", "Condition Description", "Seller Note"]
desc_data.loc[desc_data["Properties"].isin(complex_properties), "FeatureType"] = "Complex"
desc_data

Unnamed: 0,Properties,DataTypes,Nullvalues,Uniques,Values,FeatureType
0,Brand,object,0,10,other,Categorical
1,Price,float64,0,2014,303.8,Numerical
2,Currency,object,0,1,$,Categorical
3,Color,object,0,20,gray,Categorical
4,Features,object,2190,709,"Backlit Keyboard, Built-in Microphone, Built...",Complex
5,Condition,object,0,10,New,Categorical
6,Condition Description,object,0,10,"A brand-new, unused, unopened, undamaged item ...",Complex
7,Seller Note,object,0,795,undefined,Complex
8,GPU,object,0,5,intel,Categorical
9,Processor,object,0,413,quad core,Categorical


## Data Transformation: Numerical Data

In [6]:
# Rows of the summary table that describe numerical columns
numerical_data = desc_data.loc[desc_data["FeatureType"].eq("Numerical")]

# Column names of those features, used to slice the raw dataset
numerical_vars = numerical_data["Properties"].values
laptop_ds_num = laptop_ds[numerical_vars]
laptop_ds_num

Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size
0,303.80,2160.0,1440.0,512.0,1.0,8.0
1,400.00,1920.0,1080.0,2.0,,
2,175.00,1920.0,1080.0,500.0,500.0,16.0
3,85.00,1366.0,768.0,16.0,,4.0
4,101.25,1366.0,768.0,,256.0,8.0
...,...,...,...,...,...,...
4178,162.20,,,,,
4179,93.25,,,,,
4180,424.80,1920.0,1080.0,,120.0,
4181,90.94,,,,,


## Data Transformation: Categorical Data

In [7]:
# Rows of the summary table that describe categorical columns
categorical_data = desc_data.loc[desc_data["FeatureType"].eq("Categorical")]

# Column names of those features, used to slice the raw dataset
categorical_vars = categorical_data["Properties"].values
laptop_ds_cat = laptop_ds[categorical_vars]
laptop_ds_cat

Unnamed: 0,Brand,Currency,Color,Condition,GPU,Processor,Processor Speed,Processor Speed Unit,Type,OS,Storage Type,Hard Drive Capacity Unit,SSD Capacity Unit,Screen Size (inch),Ram Size Unit
0,other,$,gray,New,intel,quad core,3.80,GHz,notebook/laptop,windows,ssd,gb,tb,14,gb
1,dell,$,black,Very Good - Refurbished,intel,intel core i7 8th generation,4.20,GHz,notebook/laptop,windows,ssd,tb,unknown,14,unknown
2,dell,$,black,Used,intel,intel core i5-6300u,2.40,GHz,notebook/laptop,windows,ssd,gb,gb,14,gb
3,hp,$,black,Good - Refurbished,intel,intel celeron n,2.40,GHz,notebook/laptop,chrome,emmc,gb,unknown,11.6,gb
4,dell,$,other,Good - Refurbished,other,intel core i5 6th generation,1.40,GHz,notebook/laptop,windows,ssd,unknown,gb,12.5,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,$,other,New,other,undefined,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4179,acer,$,other,New,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4180,dell,$,black,Used,intel,intel core i5 6th generation,2.80,GHz,notebook/laptop,windows,unknown,unknown,gb,12.5,unknown
4181,hp,$,other,New,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown


In [8]:
# "Processor Speed" lives in the categorical subset but is wanted alongside
# the numerical features, so copy it across.
processor_speed = laptop_ds_cat["Processor Speed"]

# Drop any copy left behind by a previous run before concatenating, so that
# re-executing this cell does not append a duplicate "Processor Speed" column.
laptop_ds_num = pd.concat(
    [laptop_ds_num.drop(columns="Processor Speed", errors="ignore"), processor_speed],
    axis=1,
)
laptop_ds_num

Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size,Processor Speed
0,303.80,2160.0,1440.0,512.0,1.0,8.0,3.80
1,400.00,1920.0,1080.0,2.0,,,4.20
2,175.00,1920.0,1080.0,500.0,500.0,16.0,2.40
3,85.00,1366.0,768.0,16.0,,4.0,2.40
4,101.25,1366.0,768.0,,256.0,8.0,1.40
...,...,...,...,...,...,...,...
4178,162.20,,,,,,
4179,93.25,,,,,,
4180,424.80,1920.0,1080.0,,120.0,,2.80
4181,90.94,,,,,,


## Data Transformation: Complex Data

In [9]:
# Rows of the summary table that describe complex (free-text) columns
complex_data = desc_data.loc[desc_data["FeatureType"].eq("Complex")]

# Column names of those features, used to build the complex dataset
complex_features = complex_data["Properties"].values
laptop_ds_complex = laptop_ds[complex_features]
laptop_ds_complex

Unnamed: 0,Features,Condition Description,Seller Note
0,"Backlit Keyboard, Built-in Microphone, Built...","A brand-new, unused, unopened, undamaged item ...",undefined
1,"Backlit Keyboard, Bluetooth, Built-in Micropho...",The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...
2,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",An item that has been used previously. The ite...,"well kept, fully functional, includes battery,..."
3,"Bluetooth, Built-in Microphone, Built-in Webca...",The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...
4,"10/100 LAN Card, Built-in Microphone, Built-in...",The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...
...,...,...,...
4178,,"A brand-new, unused, unopened, undamaged item ...",undefined
4179,,"A brand-new, unused, unopened, undamaged item ...",undefined
4180,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",An item that has been used previously. The ite...,undefined
4181,,"A brand-new, unused, unopened, undamaged item ...",undefined


## Data Storage

In [10]:
# Import dependencies for storing datasets
from scripts.collection.collector import DataSaver

After the transformation is completed, the resulting datasets are stored in the processed folder. These datasets are named as follows:

+ **Laptop_numerical_data**: dataset contains only numerical features 
+ **Laptop_categorical_data**: dataset contains only categorical features and some irrelevant data
+ **Laptop_complex_data**: dataset contains only complex textual data useful for feature engineering

In [11]:
# Map each output name to the dataset that is passed to the saver under that name
data_dict = {
    "Laptop_numerical_data": laptop_ds_num,
    "Laptop_categorical_data": laptop_ds_cat,
    "Laptop_complex_data": laptop_ds_complex,
}


In [12]:
# Target folder for the processed datasets. Built with os.path.join so the
# literal contains no invalid "\p" escape sequence and the separator matches
# the host OS (on Windows the value is unchanged: data\processed).
data_folder_path = os.path.join("data", "processed")

# Instantiate DataSaver-object and hand it every dataset in data_dict
datasaver = DataSaver(folder=data_folder_path)
datasaver.save(data_dict=data_dict)

File 1: dataset as Laptop_numerical_data.csv --> C:\Development\Projects\MachineLearning\Laptop-Price-Predictor-System\data\processed (Saving is successful


File 2: dataset as Laptop_categorical_data.csv --> C:\Development\Projects\MachineLearning\Laptop-Price-Predictor-System\data\processed (Saving is successful
File 3: dataset as Laptop_complex_data.csv --> C:\Development\Projects\MachineLearning\Laptop-Price-Predictor-System\data\processed (Saving is successful
