# Data Preparation: Numerical Data

In [1]:
# Import dependencies for data processing
import numpy as np 
import pandas as pd 
import os
import sys 

In [2]:
# System configuration
sys.path.append(os.path.join(".."))

## Approach

This process is about the Processor Speed which has been processed and cleaned. Due to its numerical nature, this needed to be merged with the numerical feature. Therefore, this approach is focussed about applying small transformation operations in order to join the Processor Speed feature with the numerical data. The approach for this process are discussed here: 

- Data Loading: Loading 2 datasets (categorical data and numerical data)
- Feature Engineer(Processing Speed): analyse the datatype of the feature and start reconverting it 
- Data Transformation: merge the two dataframes into one single frame

# Data Loading

In [3]:
# Import functionalities to load datasets
from scripts.collection.collector import DataLoader
from scripts.collection.collector import DataSaver
from scripts.processing.cleaner import InconRemover

In [4]:
# Load 2 datasets: numerical data and categorical data 
data_loader1 = DataLoader("Laptop_numerical_data_cleaned.csv", "data\\cleaned")
data_loader2 = DataLoader("Laptop_categorical_data_cleaned.csv", "data\\cleaned")

# Load the datasets
dataset1 = data_loader1.load()
dataset2 = data_loader2.load()

# Create datasaver
data_saver = DataSaver("data\\cleaned")

File accepted
File accepted


In [5]:
# Dataset 2: Categorical Dataset
# dataset2 = dataset2.drop(columns=["Processor Speed"], axis=1)
# data_saver.save_one_ds(dataset2, "Laptop_categorical_data_cleaned")

In [7]:
# Dataset 1: Numerical Dataset
# dataset1.info()
processor_speed = dataset1["Processor Speed"]
processor_speed.value_counts()
print(f"Processor Speed(Type): {processor_speed.dtype}")

# Perform some cleaning operations on Processor Speed
incon_remover = InconRemover(dataset1)
num_incons = incon_remover.detect_incon("Processor Speed")
processor_speed = incon_remover.clean("Processor Speed")
processor_speed = processor_speed.replace("Unknown data", "0")

processor_speed = processor_speed.astype(float)
mean = round(processor_speed.mean(), 2)
processor_speed = processor_speed.replace("0", mean)

# Show processor speed unique values
print(num_incons)
processor_speed.unique()

Processor Speed(Type): object
Feature Processor Speed: 2107 inconsistent data has been detected.
Feature Processor Speed: 2107 inconsistent data has been detected.
['unknown data', 'unknown data', 'unknown data', 'o4.2', 'unknown data', 'o3.4', 'o2.8', 'o3.9', 'unknown data', 'o4.5', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown data', 'unknown da

array([ 3.80e+00,  4.20e+00,  2.40e+00,  1.40e+00,  1.60e+00,  1.00e+00,
        2.80e+00,  2.10e+00,  1.70e+00,  0.00e+00,  1.10e+00,  4.40e+00,
        2.30e+00,  2.60e+00,  1.90e+00,  3.10e+00,  2.50e+00,  3.50e+00,
        2.16e+00,  1.73e+00,  2.70e+00,  1.80e+00,  2.00e+00,  2.20e+00,
        2.48e+00,  1.20e+00,  3.00e+00,  4.00e+00,  3.40e+00,  1.50e+00,
        1.30e+00,  2.90e+00,  3.30e+00,  1.99e+00,  2.26e+00,  1.49e+00,
        2.66e+00,  2.67e+00,  4.10e+00,  1.83e+00,  3.90e+00,  4.70e+00,
        5.10e+00,  2.55e+00,  4.60e+00,  3.60e+00,  5.00e+00,  3.70e+00,
        4.90e+00,  4.80e+00,  1.07e+00,  1.46e+00,  1.66e+00,  8.00e+02,
       -3.10e+00,  3.20e+00,  1.44e+00,  4.30e+00,  1.86e+00,  4.50e+00,
        2.08e+00,  3.99e-01, -2.80e+00,  2.53e+00,  1.23e+00,  5.20e+00,
        1.87e+00])

In [11]:
# Adapt Processor Speed in the dataset & save the dataset
datasaver2 = DataSaver(folder="data\cleaned")
dataset1["Processor Speed"] = processor_speed
datasaver2.save_one_ds(dataset1, filename="Laptop_numerical_data_cleaned")

File Laptop_numerical_data_cleaned.csv has been stored successfully


**COOPERATION 2025 CREATED BY ADOAN MIAN**