# Feature Selection 1

In [1]:
# Import libraries for machine learning 
import numpy as np 
import pandas as pd 
import os
import sys 

In [2]:
# System Configuration
# Make the parent directory importable so the `scripts.*` imports used by this
# notebook can resolve (presumably the parent directory is the project root).
# The membership check keeps sys.path free of duplicates when the cell is re-run.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

### Feature Selection: Approach

+ Load all possible datasets
+ Make a selection of features from each dataset
+ Save the processed features for further use

## Data Loading

In [3]:
# Import functionalities for loading data 
from scripts.collection.ingestor import FileIngestor
from scripts.collection.ingestor import Ingestor

In [4]:
# Ingest data
sel_folder = "processed"
# Raw string: the Windows separators must stay literal backslashes. In a normal
# string "\D", "\P", "\M", "\L" and "\d" are invalid escape sequences, which
# Python warns about and will eventually treat as a syntax error.
# NOTE(review): this absolute path is machine-specific; consider deriving it
# from the project root so the notebook runs elsewhere.
project_path = r"C:\Development\Projects\MachineLearning\Laptop-Price-Predictor-System\data"

# List the files to ingest; sorted because os.listdir returns them in arbitrary order
filenames = sorted(os.listdir(os.path.join(project_path, sel_folder)))
data_library = {}

# Build a data storage library
for filename in filenames:
    # Ingest the dataset from the same folder that was listed above
    ingestor = FileIngestor(sel_folder, filename)
    dataset = ingestor.ingest()

    # Derive the library key from the file name,
    # e.g. "hardware_table.csv" -> "HARDWARE" -> "HARDWARE_DATA"
    name = filename.split(".")[0].upper()
    key_name = name.split("_")[0] + "_DATA"

    # Store the dataset in the dictionary under its key
    data_library[key_name] = dataset
    print(f"File {filename} has successfully processed and stored")

File hardware_table.csv has successfully processed and stored
File information_table.csv has successfully processed and stored
File memory_table.csv has successfully processed and stored
File processor_table.csv has successfully processed and stored
File visual_table.csv has successfully processed and stored


In [5]:
# Pull each table out of the library and bind it to its own descriptive name
(
    hardware_data,
    information_data,
    memory_data,
    processor_data,
    visual_data,
) = (
    data_library[table_key]
    for table_key in (
        "HARDWARE_DATA",
        "INFORMATION_DATA",
        "MEMORY_DATA",
        "PROCESSOR_DATA",
        "VISUAL_DATA",
    )
)

## Feature Selection

In [6]:
# Import functionalities for feature selection
from scripts.features.feature_selector import FeatureSelector
from scripts.processing.transformer import ColumnTransformer

From each dataset, we select at most 4 features:

+ HARDWARE_DATA: GPU, OPERATING_SYSTEM, TOUCHSCREEN, BLUETOOTH
+ MEMORY_DATA: HARD_DRIVE, SSD_CAPACITY
+ VISUAL_DATA: DISPLAY, WEBCAM(BUILT-IN), COLOR
+ INFORMATION_DATA: BRAND, PRICE

In [7]:
# Inspect the tables to compare their row counts (recorded in the trailing comments).
# Only the last bare expression of a cell is rendered, so this cell shows visual_data;
# the commented-out names are the other tables that were inspected the same way.
hardware_data # 4183
# memory_data # 4182
# processor_data # 4182
visual_data # 4183
# information_data

Unnamed: 0,SCREEN_ID,SCREEN_SIZE_INCH,COLOR,WIDTH_DISPLAY,HEIGHT_DISPLAY,DISPLAY,WEBCAM(BUILT-IN)
0,S0,14.00,gray,2160.0,1440.0,NO,YES
1,S1,14.00,black,1920.0,1080.0,YES,YES
2,S2,11.60,black,1366.0,768.0,YES,YES
3,S3,12.50,other,1366.0,768.0,YES,YES
4,S4,11.60,black,1366.0,768.0,NO,YES
...,...,...,...,...,...,...,...
4178,S4178,8.43,other,1920.0,1080.0,NO,NO
4179,S4179,12.50,black,1920.0,1080.0,NO,NO
4180,S4180,8.43,other,1920.0,1080.0,NO,YES
4181,S4181,8.43,black,1920.0,1080.0,NO,NO


The number of rows is inconsistent across the datasets:
+ hardware_data and visual_data: 4183
+ processor_data, memory_data and information_data: 4182

The datasets with 4183 rows contain one row of null values. This row is removed so that every feature dataset has the same number of samples.

In [8]:
# Identify number of null values 
# Remove row with null values from data with 4183 samples
def remove_row(dataset: pd.DataFrame, feature: str) -> pd.DataFrame:
    """Return a copy of ``dataset`` without the rows where ``feature`` is null.

    Rows are filtered with ``dropna(subset=...)`` rather than dropped by index
    label, so a valid row that merely shares an index label with a null row
    is kept.

    Args:
        dataset: Frame to clean; it is not modified.
        feature: Column whose null entries mark the rows to remove.

    Returns:
        A new DataFrame holding only the rows with a non-null ``feature``.

    Raises:
        KeyError: If ``feature`` is not a column of ``dataset``.
    """
    cleaned = dataset.dropna(subset=[feature])

    # Report the remaining per-column null counts so the cleanup can be verified
    null_vals = cleaned.isnull().sum()
    print(null_vals)

    return cleaned

# Drop the null row from the two tables that carry one extra (4183rd) sample
hardware_data = remove_row(dataset=hardware_data, feature="HARDWARE_ID")
visual_data = remove_row(dataset=visual_data, feature="SCREEN_ID")

HARDWARE_ID             0
GPU                     0
RAM_SIZE_UNIT           0
STORAGE_TYPE            0
OPERATING_SYSTEM        0
TOUCHSCREEN             0
BLUETOOTH               0
MICROPHONE(BUILT-IN)    0
dtype: int64
SCREEN_ID           0
SCREEN_SIZE_INCH    0
COLOR               0
WIDTH_DISPLAY       0
HEIGHT_DISPLAY      0
DISPLAY             0
WEBCAM(BUILT-IN)    0
dtype: int64


In [9]:
# Columns to keep from each source table
selection1 = ["GPU", "OPERATING_SYSTEM", "TOUCHSCREEN", "BLUETOOTH"]  # hardware
selection2 = ["HARD_DRIVE", "SSD_CAPACITY"]  # memory
selection3 = ["DISPLAY", "WEBCAM(BUILT-IN)", "COLOR"]  # visual
selection4 = ["BRAND", "PRICE"]  # information

# One selector per source table, built in the same order as the selections
selector1, selector2, selector3, selector4 = [
    FeatureSelector(source_table)
    for source_table in (hardware_data, memory_data, visual_data, information_data)
]

# Apply each selection to its matching selector, in order 1 -> 4
feature_data1, feature_data2, feature_data3, feature_data4 = [
    picker.select(wanted_columns)
    for picker, wanted_columns in (
        (selector1, selection1),
        (selector2, selection2),
        (selector3, selection3),
        (selector4, selection4),
    )
]

Number of features selected: 4
Number of features selected: 2
Number of features selected: 3
Number of features selected: 2


In [10]:
# Preview the hardware feature selection (the result of applying selection1)
feature_data1
# feature_data4

Unnamed: 0,GPU,OPERATING_SYSTEM,TOUCHSCREEN,BLUETOOTH
0,intel,windows,NO,NO
1,intel,windows,NO,YES
2,intel,chrome,YES,YES
3,other,windows,NO,YES
4,other,chrome,NO,NO
...,...,...,...,...
4177,other,unknown,NO,NO
4178,other,unknown,NO,NO
4179,intel,windows,NO,NO
4180,other,unknown,YES,YES


In [12]:
# Combine the 4 selected feature sets into one single full dataset.
# The list order sets the column order: information (BRAND, PRICE) first,
# then the hardware, memory and visual features.
features = [feature_data4, feature_data1, feature_data2, feature_data3]

# Guard: every feature set must have the same number of rows before they are
# placed side by side; fail loudly instead of producing a misaligned dataset.
# NOTE(review): hardware/visual data had a row dropped without an index reset;
# confirm that ColumnTransformer.combine aligns rows the way you expect.
row_counts = [len(feature_data) for feature_data in features]
if len(set(row_counts)) != 1:
    raise ValueError(f"Feature sets have inconsistent row counts: {row_counts}")

# Instantiate the combiner object from the ColumnTransformer class
column_combiner = ColumnTransformer()
mobile_df = column_combiner.combine(features)
mobile_df

Number of datasets for Column Combination: 4


Unnamed: 0,BRAND,PRICE,GPU,OPERATING_SYSTEM,TOUCHSCREEN,BLUETOOTH,HARD_DRIVE,SSD_CAPACITY,DISPLAY,WEBCAM(BUILT-IN),COLOR
0,other,303.80,intel,windows,NO,NO,512,1,NO,YES,gray
1,dell,175.00,intel,windows,NO,YES,500,500,YES,YES,black
2,hp,85.00,intel,chrome,YES,YES,16,240,YES,YES,black
3,dell,101.25,other,windows,NO,YES,256,256,YES,YES,other
4,acer,50.00,other,chrome,NO,NO,256,16,NO,YES,black
...,...,...,...,...,...,...,...,...,...,...,...
4177,other,162.20,other,unknown,NO,NO,256,240,NO,NO,other
4178,acer,93.25,other,unknown,NO,NO,256,240,NO,NO,other
4179,dell,424.80,intel,windows,NO,NO,256,120,NO,NO,black
4180,hp,90.94,other,unknown,YES,YES,256,240,NO,YES,other


## Feature Analysis

Performing feature analysis helps us find the distribution of each feature, as well as possible null values and inconsistencies.

In [13]:
# Summarise the combined frame: column dtypes and non-null counts per column
mobile_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4182 entries, 0 to 4181
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   BRAND             4182 non-null   object 
 1   PRICE             4182 non-null   float64
 2   GPU               4182 non-null   object 
 3   OPERATING_SYSTEM  4182 non-null   object 
 4   TOUCHSCREEN       4182 non-null   object 
 5   BLUETOOTH         4182 non-null   object 
 6   HARD_DRIVE        4182 non-null   int64  
 7   SSD_CAPACITY      4182 non-null   int64  
 8   DISPLAY           4182 non-null   object 
 9   WEBCAM(BUILT-IN)  4182 non-null   object 
 10  COLOR             4182 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 392.1+ KB


## Data Storage

In [14]:
# Implement functionalities to store data
from scripts.collection.saver import OneFileSaver

In [16]:
# Instantiate object from OneFileSaver class
# NOTE(review): "data/training" is presumably resolved by OneFileSaver relative
# to the project root — confirm against the saver implementation.
data_saver = OneFileSaver("data/training")
# Persist the combined dataset; the recorded output reports the file name
# laptop_price_dataset.csv
data_saver.save(mobile_df, "laptop_price_dataset")

File laptop_price_dataset.csv has been stored successfully
