# Data Transformation

In [1]:
# Import functionalities for transforming data 
import numpy as np 
import pandas as pd 
import os 
import sys 

In [2]:
# System configuration: put the parent directory on the import path
# (presumably where the project-local `scripts` package lives - confirm).
project_root = os.path.abspath("..")
# Guard the append so that re-running this cell does not add duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

## Data Loading

In [3]:
# Import functionalities for data loading
from scripts.collection.collector import DataLoader

In [4]:
# Initialisation: folders that the input files are loaded from
processed_folder = "data/processed"  # hardware and visual tables
feature_folder = "data/featured"     # feature table

# Implement function: build a DataLoader for one file and return what it loads
def data_loading(filename: str, folder: str) -> pd.DataFrame:
    """Load a single file through the project's ``DataLoader``.

    Args:
        filename: Name of the file to load, e.g. ``"hardware_table.csv"``.
        folder: Folder passed to ``DataLoader`` together with ``filename``.

    Returns:
        The object returned by ``DataLoader(filename, folder).load()``
        (annotated as a ``pandas.DataFrame``).
    """
    return DataLoader(filename, folder).load()

In [5]:
# Load the three input tables: the feature table, the hardware table and the visual table
dataset1 = data_loading(filename="new_feature_data1.csv", folder=feature_folder)
dataset2 = data_loading(filename="hardware_table.csv", folder=processed_folder)
dataset3 = data_loading(filename="visual_table.csv", folder=processed_folder)

File accepted
File accepted
File accepted


In [6]:
# Show the feature table loaded from "new_feature_data1.csv"
dataset1

Unnamed: 0,Touchscreen,Bluetooth,Webcam(Built-In),Microphone(Built-In),Display,WiFi
0,NO,NO,YES,YES,NO,NO
1,NO,YES,YES,YES,YES,YES
2,YES,YES,YES,YES,YES,YES
3,NO,YES,YES,YES,YES,YES
4,NO,NO,YES,YES,NO,YES
...,...,...,...,...,...,...
4178,NO,NO,NO,NO,NO,NO
4179,NO,NO,NO,NO,NO,NO
4180,YES,YES,YES,NO,NO,YES
4181,NO,NO,NO,NO,NO,NO


In [7]:
# Show the hardware table loaded from "hardware_table.csv"
dataset2

Unnamed: 0,HARDWARE_ID,GPU,RAM_SIZE_UNIT,STORAGE_TYPE,OPERATING_SYSTEM
0,H0,intel,gb,ssd,windows
1,H1,intel,gb,ssd,windows
2,H2,intel,gb,emmc,chrome
3,H3,other,gb,ssd,windows
4,H4,other,gb,ssd,chrome
...,...,...,...,...,...
4177,H4177,other,unknown,unknown,unknown
4178,H4178,other,unknown,unknown,unknown
4179,H4179,intel,unknown,unknown,windows
4180,H4180,other,unknown,unknown,unknown


In [8]:
# Show the visual table loaded from "visual_table.csv"
dataset3

Unnamed: 0,SCREEN_ID,SCREEN_SIZE_INCH,COLOR,WIDTH_DISPLAY,HEIGHT_DISPLAY
0,S0,14.00,gray,2160,1440
1,S1,14.00,black,1920,1080
2,S2,11.60,black,1366,768
3,S3,12.50,other,1366,768
4,S4,11.60,black,1366,768
...,...,...,...,...,...
4177,S4177,8.43,other,1920,1080
4178,S4178,8.43,other,1920,1080
4179,S4179,12.50,black,1920,1080
4180,S4180,8.43,other,1920,1080


In [9]:
## Data Transformation

In [10]:
# Import functionalities to transform data 
from scripts.processing.transformer import ColumnTransformer

In [11]:
# Take the hardware-related flags from the feature table and upper-case their
# labels so they match the upper-case column names of the hardware table
selected_features = ["Touchscreen", "Bluetooth", "Microphone(Built-In)"]
sel_data = dataset1[selected_features].rename(columns=str.upper)

# Wrap the hardware table (dataset2) and combine it with the selected columns
column_transformer = ColumnTransformer(dataset2)
dataset2 = column_transformer.combine(sel_data)

In [12]:
# Show the hardware table after the selected feature columns were combined into it
dataset2

Unnamed: 0,HARDWARE_ID,GPU,RAM_SIZE_UNIT,STORAGE_TYPE,OPERATING_SYSTEM,TOUCHSCREEN,BLUETOOTH,MICROPHONE(BUILT-IN)
0,H0,intel,gb,ssd,windows,NO,NO,YES
1,H1,intel,gb,ssd,windows,NO,YES,YES
2,H2,intel,gb,emmc,chrome,YES,YES,YES
3,H3,other,gb,ssd,windows,NO,YES,YES
4,H4,other,gb,ssd,chrome,NO,NO,YES
...,...,...,...,...,...,...,...,...
4178,H4178,other,unknown,unknown,unknown,NO,NO,NO
4179,H4179,intel,unknown,unknown,windows,NO,NO,NO
4180,H4180,other,unknown,unknown,unknown,YES,YES,NO
4181,H4181,other,unknown,unknown,unknown,NO,NO,NO


In [13]:
# Take the display-related flags from the feature table and upper-case their
# labels so they match the upper-case column names of the visual table
selected_features = ["Display", "Webcam(Built-In)"]
sel_data = dataset1[selected_features].rename(columns=str.upper)

# Wrap the visual table (dataset3) and combine it with the selected columns
column_transformer2 = ColumnTransformer(dataset3)
dataset3 = column_transformer2.combine(sel_data)

In [14]:
# Show the visual table after the selected feature columns were combined into it
dataset3

Unnamed: 0,SCREEN_ID,SCREEN_SIZE_INCH,COLOR,WIDTH_DISPLAY,HEIGHT_DISPLAY,DISPLAY,WEBCAM(BUILT-IN)
0,S0,14.00,gray,2160.0,1440.0,NO,YES
1,S1,14.00,black,1920.0,1080.0,YES,YES
2,S2,11.60,black,1366.0,768.0,YES,YES
3,S3,12.50,other,1366.0,768.0,YES,YES
4,S4,11.60,black,1366.0,768.0,NO,YES
...,...,...,...,...,...,...,...
4178,S4178,8.43,other,1920.0,1080.0,NO,NO
4179,S4179,12.50,black,1920.0,1080.0,NO,NO
4180,S4180,8.43,other,1920.0,1080.0,NO,YES
4181,S4181,8.43,black,1920.0,1080.0,NO,NO


## Data Storage

In [15]:
# Import functionalities to store dataset
from scripts.collection.saver import OneFileSaver

In [19]:
# Initialise folder: build the path with os.path.join instead of the literal
# "data\processed". Its "\p" is an invalid escape sequence (SyntaxWarning on
# Python 3.12+) and the backslash is only a path separator on Windows.
sel_folder = os.path.join("data", "processed")

# Instantiate object: store one file
# NOTE(review): these file names match the tables loaded earlier from
# "data/processed". If OneFileSaver writes to that same location, the inputs are
# overwritten and a re-run would combine the feature columns again - confirm.
saver = OneFileSaver(sel_folder)
saver.save(dataset2, filename="hardware_table")
saver.save(dataset3, filename="visual_table")

File hardware_table.csv has been stored successfully
File visual_table.csv has been stored successfully
