# Test 1: Data Preparation

In [1]:
import numpy as np 
import pandas as pd 
import os 
import sys 

sys.path.append(os.path.abspath(".."))

## Data Loading

In [2]:
# Load the dataset
from scripts.collection.collector import DataLoader

In [3]:
# Load the dataset
data_loader = DataLoader("ebay_laptop_ds.csv", "data")
laptop_ds = data_loader.load()
laptop_ds

File accepted


Unnamed: 0,Brand,Price,Currency,Color,Features,Condition,Condition Description,Seller Note,GPU,Processor,...,Height of the Display,OS,Storage Type,Hard Drive Capacity,Hard Drive Capacity Unit,SSD Capacity,SSD Capacity Unit,Screen Size (inch),Ram Size,Ram Size Unit
0,other,303.80,$,gray,"Backlit Keyboard, Built-in Microphone, Built...",New,"A brand-new, unused, unopened, undamaged item ...",undefined,intel,quad core,...,1440.0,windows,ssd,512.0,gb,1.0,tb,14,8.0,gb
1,dell,400.00,$,black,"Backlit Keyboard, Bluetooth, Built-in Micropho...",Very Good - Refurbished,The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...,intel,intel core i7 8th generation,...,1080.0,windows,ssd,2.0,tb,,unknown,14,,unknown
2,dell,175.00,$,black,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",Used,An item that has been used previously. The ite...,"well kept, fully functional, includes battery,...",intel,intel core i5-6300u,...,1080.0,windows,ssd,500.0,gb,500.0,gb,14,16.0,gb
3,hp,85.00,$,black,"Bluetooth, Built-in Microphone, Built-in Webca...",Good - Refurbished,The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...,intel,intel celeron n,...,768.0,chrome,emmc,16.0,gb,,unknown,11.6,4.0,gb
4,dell,101.25,$,other,"10/100 LAN Card, Built-in Microphone, Built-in...",Good - Refurbished,The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...,other,intel core i5 6th generation,...,768.0,windows,ssd,,unknown,256.0,gb,12.5,8.0,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,162.20,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,undefined,...,,unknown,unknown,,unknown,,unknown,,,unknown
4179,acer,93.25,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,...,,unknown,unknown,,unknown,,unknown,,,unknown
4180,dell,424.80,$,black,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",Used,An item that has been used previously. The ite...,undefined,intel,intel core i5 6th generation,...,1080.0,windows,unknown,,unknown,120.0,gb,12.5,,unknown
4181,hp,90.94,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,...,,unknown,unknown,,unknown,,unknown,,,unknown


## Data Processing

In [4]:
# Import libraries for processing data 
from scripts.processing.cleaner import NullRemover

In [5]:
descr_data = pd.DataFrame()
descr_data["Nullvalues"] = laptop_ds.isnull().sum()
descr_data["DataTypes"] = laptop_ds.dtypes
descr_data.head()

Unnamed: 0,Nullvalues,DataTypes
Brand,0,object
Price,0,float64
Currency,0,object
Color,0,object
Features,2190,object


In [6]:
# Import dependencies from script folder to transform data
from scripts.processing.transformer import StatTransformer

In [7]:
stat_transformer = StatTransformer(dataset1=laptop_ds, dataset2=None)
descr_data = stat_transformer.transform()
descr_data

Unnamed: 0,Nullvalues,DataTypes,UniqueVals
Brand,0,object,10
Price,0,float64,2014
Currency,0,object,1
Color,0,object,20
Features,2190,object,709
Condition,0,object,10
Condition Description,0,object,10
Seller Note,0,object,795
GPU,0,object,5
Processor,0,object,413


## Data Processing Nullvalues: Numerical Data

In [8]:
from scripts.processing.cleaner import NullRemover

In [9]:
# Select numerical data
num_df = laptop_ds.select_dtypes([float, int])
num_df

Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size
0,303.80,2160.0,1440.0,512.0,1.0,8.0
1,400.00,1920.0,1080.0,2.0,,
2,175.00,1920.0,1080.0,500.0,500.0,16.0
3,85.00,1366.0,768.0,16.0,,4.0
4,101.25,1366.0,768.0,,256.0,8.0
...,...,...,...,...,...,...
4178,162.20,,,,,
4179,93.25,,,,,
4180,424.80,1920.0,1080.0,,120.0,
4181,90.94,,,,,


In [10]:
# Clean the entire dataset
features = num_df.columns

for feature in features:
    null_remover = NullRemover(num_df)
    null_remover.clean(feature)
print("Data Cleaning is completed")

num_df

Data Cleaning is completed


Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size
0,303.80,2160.0,1440.0,512.0,1.0,8.0
1,400.00,1920.0,1080.0,2.0,240.0,8.0
2,175.00,1920.0,1080.0,500.0,500.0,16.0
3,85.00,1366.0,768.0,16.0,240.0,4.0
4,101.25,1366.0,768.0,256.0,256.0,8.0
...,...,...,...,...,...,...
4178,162.20,1920.0,1080.0,256.0,240.0,8.0
4179,93.25,1920.0,1080.0,256.0,240.0,8.0
4180,424.80,1920.0,1080.0,256.0,120.0,8.0
4181,90.94,1920.0,1080.0,256.0,240.0,8.0


## Data Processing on Nullvalues: Categorical Data

In [11]:
cat_df = laptop_ds.select_dtypes([object])
cat_df

Unnamed: 0,Brand,Currency,Color,Features,Condition,Condition Description,Seller Note,GPU,Processor,Processor Speed,Processor Speed Unit,Type,OS,Storage Type,Hard Drive Capacity Unit,SSD Capacity Unit,Screen Size (inch),Ram Size Unit
0,other,$,gray,"Backlit Keyboard, Built-in Microphone, Built...",New,"A brand-new, unused, unopened, undamaged item ...",undefined,intel,quad core,3.80,GHz,notebook/laptop,windows,ssd,gb,tb,14,gb
1,dell,$,black,"Backlit Keyboard, Bluetooth, Built-in Micropho...",Very Good - Refurbished,The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...,intel,intel core i7 8th generation,4.20,GHz,notebook/laptop,windows,ssd,tb,unknown,14,unknown
2,dell,$,black,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",Used,An item that has been used previously. The ite...,"well kept, fully functional, includes battery,...",intel,intel core i5-6300u,2.40,GHz,notebook/laptop,windows,ssd,gb,gb,14,gb
3,hp,$,black,"Bluetooth, Built-in Microphone, Built-in Webca...",Good - Refurbished,The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...,intel,intel celeron n,2.40,GHz,notebook/laptop,chrome,emmc,gb,unknown,11.6,gb
4,dell,$,other,"10/100 LAN Card, Built-in Microphone, Built-in...",Good - Refurbished,The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...,other,intel core i5 6th generation,1.40,GHz,notebook/laptop,windows,ssd,unknown,gb,12.5,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,undefined,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4179,acer,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4180,dell,$,black,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",Used,An item that has been used previously. The ite...,undefined,intel,intel core i5 6th generation,2.80,GHz,notebook/laptop,windows,unknown,unknown,gb,12.5,unknown
4181,hp,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown


## Data Processing on Inconsistencies

In [18]:
# num_df.info()
cat_df

Unnamed: 0,Brand,Currency,Color,Features,Condition,Condition Description,Seller Note,GPU,Processor,Processor Speed,Processor Speed Unit,Type,OS,Storage Type,Hard Drive Capacity Unit,SSD Capacity Unit,Screen Size (inch),Ram Size Unit
0,other,$,gray,"Backlit Keyboard, Built-in Microphone, Built...",New,"A brand-new, unused, unopened, undamaged item ...",undefined,intel,quad core,3.80,GHz,notebook/laptop,windows,ssd,gb,tb,14,gb
1,dell,$,black,"Backlit Keyboard, Bluetooth, Built-in Micropho...",Very Good - Refurbished,The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...,intel,intel core i7 8th generation,4.20,GHz,notebook/laptop,windows,ssd,tb,unknown,14,unknown
2,dell,$,black,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",Used,An item that has been used previously. The ite...,"well kept, fully functional, includes battery,...",intel,intel core i5-6300u,2.40,GHz,notebook/laptop,windows,ssd,gb,gb,14,gb
3,hp,$,black,"Bluetooth, Built-in Microphone, Built-in Webca...",Good - Refurbished,The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...,intel,intel celeron n,2.40,GHz,notebook/laptop,chrome,emmc,gb,unknown,11.6,gb
4,dell,$,other,"10/100 LAN Card, Built-in Microphone, Built-in...",Good - Refurbished,The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...,other,intel core i5 6th generation,1.40,GHz,notebook/laptop,windows,ssd,unknown,gb,12.5,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,undefined,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4179,acer,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown
4180,dell,$,black,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",Used,An item that has been used previously. The ite...,undefined,intel,intel core i5 6th generation,2.80,GHz,notebook/laptop,windows,unknown,unknown,gb,12.5,unknown
4181,hp,$,other,,New,"A brand-new, unused, unopened, undamaged item ...",undefined,other,not applicable,,unknown,other,unknown,unknown,unknown,unknown,,unknown


Features that should be numerical are:
+ Processor Speed
+ Screen Size (inch)

Creating algorithm for finding inconsistent data

In [32]:
cat_df["Processor Speed"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 4183 entries, 0 to 4182
Series name: Processor Speed
Non-Null Count  Dtype 
--------------  ----- 
2090 non-null   object
dtypes: object(1)
memory usage: 32.8+ KB


In [None]:
def detect() -> list: 
    # Convert feature to numpy array
    processor_speed_data = cat_df["Processor Speed"].to_numpy()

   # Detect inconsistent data
    incons_list = []
    real_list = []

    for speed_value in processor_speed_data:
        try:
            # Check if value is numeric
            checked_value = float()
            real_list.append(checked_value)
        except ValueError:
            # Store inconsistencies into list 
            incon_value = speed_value
            incons_list.append(incon_value)
    # Analyse number of inconsistencies
    num_incons = len(incons_list)
    print(f"Feature {feature}: {num_incons} inconsistent data has been detected.")
    
    return incons_list 



In [47]:
cat_df["Processor Speed"].isnull().sum()

np.int64(2093)

In [48]:
# Remove inconsistent data 
incons_data = detect()

for incons in incons_data:
    # Replace inconsistent by zero
    cat_df["Processor Speed"].str.replace(incons, "0")
print("Inconsistent removal is done successfully.")

Feature Ram Size: 0 inconsistent data has been detected.
Inconsistent removal is done successfully.
