# Feature Encoding: Test

In [1]:
# Import libraries for feature encoding
import os, sys
import numpy as np 
import pandas as pd 

In [2]:
## System configuration
# Put the notebook's parent directory on the import path so that the
# `scripts.*` imports used in later cells can be resolved.
PROJECT_ROOT = os.path.abspath("..")

# Guard the append so re-running this cell does not add duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

## Data Loading

In [3]:
# Import functionalities to load dataset 
from scripts.collection.ingestor import FileIngestor

In [4]:
# Read laptop_price_dataset.csv from the "training" folder via the project ingestor
ingestor = FileIngestor(
    folder="training",
    filename="laptop_price_dataset.csv",
)

# Keep the loaded frame under `dataset`; the bare name renders it as the cell output
dataset = ingestor.ingest()
dataset

Unnamed: 0,BRAND,PRICE,GPU,OPERATING_SYSTEM,TOUCHSCREEN,BLUETOOTH,HARD_DRIVE,SSD_CAPACITY,DISPLAY,WEBCAM(BUILT-IN),COLOR
0,other,303.80,intel,windows,NO,NO,512,1,NO,YES,gray
1,dell,175.00,intel,windows,NO,YES,500,500,YES,YES,black
2,hp,85.00,intel,chrome,YES,YES,16,240,YES,YES,black
3,dell,101.25,other,windows,NO,YES,256,256,YES,YES,other
4,acer,50.00,other,chrome,NO,NO,256,16,NO,YES,black
...,...,...,...,...,...,...,...,...,...,...,...
4177,other,162.20,other,unknown,NO,NO,256,240,NO,NO,other
4178,acer,93.25,other,unknown,NO,NO,256,240,NO,NO,other
4179,dell,424.80,intel,windows,NO,NO,256,120,NO,NO,black
4180,hp,90.94,other,unknown,YES,YES,256,240,NO,YES,other


In [5]:
# Display Laptop Price Features
# info() already lists every column with its non-null count and dtype, so a
# separate bare `dataset.columns` expression (never rendered mid-cell) is not needed.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4182 entries, 0 to 4181
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   BRAND             4182 non-null   object 
 1   PRICE             4182 non-null   float64
 2   GPU               4182 non-null   object 
 3   OPERATING_SYSTEM  4182 non-null   object 
 4   TOUCHSCREEN       4182 non-null   object 
 5   BLUETOOTH         4182 non-null   object 
 6   HARD_DRIVE        4182 non-null   int64  
 7   SSD_CAPACITY      4182 non-null   int64  
 8   DISPLAY           4182 non-null   object 
 9   WEBCAM(BUILT-IN)  4182 non-null   object 
 10  COLOR             4182 non-null   object 
dtypes: float64(1), int64(2), object(8)
memory usage: 392.1+ KB


## Feature Encoding

In [6]:
# Import the helper used to select the features to encode
from scripts.features.feature_selector import FeatureSelector

In [7]:
# Categorical columns to encode. The numeric columns (PRICE, HARD_DRIVE,
# SSD_CAPACITY) are intentionally left out of the selection.
sel_features = [
    "BRAND",
    "GPU",
    "OPERATING_SYSTEM",
    "TOUCHSCREEN",
    "BLUETOOTH",
    "DISPLAY",
    "WEBCAM(BUILT-IN)",
    "COLOR",
]

# Restrict the dataset to the selected columns
selector = FeatureSelector(dataset)
cat_dataset = selector.select(sel_features)
cat_dataset

Number of features selected: 8


Unnamed: 0,BRAND,GPU,OPERATING_SYSTEM,TOUCHSCREEN,BLUETOOTH,DISPLAY,WEBCAM(BUILT-IN),COLOR
0,other,intel,windows,NO,NO,NO,YES,gray
1,dell,intel,windows,NO,YES,YES,YES,black
2,hp,intel,chrome,YES,YES,YES,YES,black
3,dell,other,windows,NO,YES,YES,YES,other
4,acer,other,chrome,NO,NO,NO,YES,black
...,...,...,...,...,...,...,...,...
4177,other,other,unknown,NO,NO,NO,NO,other
4178,acer,other,unknown,NO,NO,NO,NO,other
4179,dell,intel,windows,NO,NO,NO,NO,black
4180,hp,other,unknown,YES,YES,NO,YES,other


In this test phase, we are going to build some algorithms that make feature encoding more efficient.

Algorithm 1: Identify the number of discrete values per feature

In [8]:
# Algorithm one: count the distinct values held by each categorical feature
feature_names = cat_dataset.columns.values

# One row per feature: its name and how many distinct values it takes
dataframe = pd.DataFrame(
    {
        "Categorical Features": feature_names,
        "Number of Discrete values": [
            cat_dataset[name].nunique() for name in feature_names
        ],
    }
)
dataframe

Unnamed: 0,Categorical Features,Number of Discrete values
0,BRAND,10
1,GPU,5
2,OPERATING_SYSTEM,6
3,TOUCHSCREEN,2
4,BLUETOOTH,2
5,DISPLAY,2
6,WEBCAM(BUILT-IN),2
7,COLOR,20


Algorithm 2: Encode categorical features

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder 

**Binary Encoding**

In [10]:
# Binary encoding
# Inspect how the values of one candidate binary feature are distributed.
# Only a cell's final expression is rendered automatically, so the result is
# passed to display() (available in IPython/Jupyter sessions) to make it visible.
display(cat_dataset["TOUCHSCREEN"].value_counts())

# Classify a feature as binary (exactly two distinct values) or not binary
def det_feature_type(feature: str, data: "pd.DataFrame | None" = None) -> str:
    """Report whether a column holds exactly two distinct values.

    Parameters
    ----------
    feature : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame that contains ``feature``. When omitted, the notebook-level
        ``dataset`` frame is used, so existing one-argument calls keep working.

    Returns
    -------
    str
        ``"Binary"`` when the column has exactly two distinct non-missing
        values, otherwise ``"Not Binary"``.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of the frame being inspected.
    """
    # Fall back to the notebook-level frame only when no frame is supplied,
    # which lets the function be reused (and tested) on any DataFrame.
    source = dataset if data is None else data

    # nunique() matches the counting used for "Number of Discrete values" and
    # skips missing entries; np.unique would instead try to sort the values and
    # fail on object columns that mix strings with NaN.
    num_categories = source[feature].nunique()

    return "Binary" if num_categories == 2 else "Not Binary"

# Add a column that labels each categorical feature as "Binary" or "Not Binary"
feature_name_column = dataframe["Categorical Features"]
dataframe["Binary"] = feature_name_column.map(det_feature_type)
dataframe

Unnamed: 0,Categorical Features,Number of Discrete values,Binary
0,BRAND,10,Not Binary
1,GPU,5,Not Binary
2,OPERATING_SYSTEM,6,Not Binary
3,TOUCHSCREEN,2,Binary
4,BLUETOOTH,2,Binary
5,DISPLAY,2,Binary
6,WEBCAM(BUILT-IN),2,Binary
7,COLOR,20,Not Binary


In [11]:
# Manual binary encoding: "YES" becomes 1, every other value becomes 0
cat_dataset["BLUETOOTH"].eq("YES").astype("int64")

0       0
1       1
2       1
3       1
4       0
       ..
4177    0
4178    0
4179    0
4180    1
4181    0
Name: BLUETOOTH, Length: 4182, dtype: int64

**Ordinal Encoding**

In [12]:
# Learn the value -> integer code mapping from BLUETOOTH ...
encoder = LabelEncoder().fit(cat_dataset["BLUETOOTH"])

# ... then apply that mapping to the same column
encoder.transform(cat_dataset["BLUETOOTH"])

array([0, 1, 1, ..., 0, 1, 0], shape=(4182,))

**Nominal Encoding**

In [13]:
# Selecting with a list keeps the column two-dimensional, giving an (n_rows, 1) array
cat_dataset[["BLUETOOTH"]].to_numpy()

array([['NO'],
       ['YES'],
       ['YES'],
       ...,
       ['NO'],
       ['YES'],
       ['NO']], shape=(4182, 1), dtype=object)

In [None]:
# One-hot encode a nominal feature (COLOR)
encoder = OneHotEncoder()

# The encoder expects 2-D input, so turn the column into a single-column array
reshaped_feature = cat_dataset["COLOR"].values.reshape((-1, 1))

# fit_transform returns a sparse matrix by default; densify it for the DataFrame
encoded_feature = encoder.fit_transform(reshaped_feature).toarray()

# Column labels come from the fitted encoder so they follow its category order
# (Series.unique() returns values in order of appearance and would not line up).
# Reusing the source index keeps the encoded rows aligned with cat_dataset.
feature_data = pd.DataFrame(
    encoded_feature,
    columns=encoder.get_feature_names_out(["COLOR"]),
    index=cat_dataset.index,
)
feature_data

['COLOR']


Created by Adoan Mian 21/12/2025