## Data Cleaning: P5

In [1]:
# Import libraries for data processing
import numpy as np 
import pandas as pd 
import os
import sys

In [2]:
# System Configuration
sys.path.append(os.path.abspath(".."))

## Data Loading

In [3]:
# Import functionalities to load dataset
from scripts.collection.collector import DataLoader

In [4]:
# Load the categorical dataset
dataloader1 = DataLoader("Laptop_categorical_data_cleaned.csv", folder="data\cleaned")
dataloader2 = DataLoader("Laptop_numerical_data_cleaned.csv", folder="data\cleaned")

dataset1 = dataloader1.load()
dataset2 = dataloader2.load()

File accepted
File accepted


## Data Transformation

In [5]:
# Import functionalities to combine data
from scripts.processing.transformer import ColumnTransformer

In [6]:
# Instantiate transformer object 
column_transformer = ColumnTransformer(dataset2)

# Extract screensize & combine with numerical data
screen_size = dataset1[["Screen Size (inch)"]]
dataset2 = column_transformer.combine(screen_size)

In [7]:
# Check for nullvalues 
nulls = dataset2["Screen Size (inch)"].isnull().sum()
print(f"Feature: Screen Size (inch)")
print(f"Nullvalues (before removal of inconsistencies): {nulls}")

Feature: Screen Size (inch)
Nullvalues (before removal of inconsistencies): 1638


## Inconsistency Analysis

In [8]:
# Import functionalities to detect inconsistencies
from scripts.analysis.irrelevant_data_analyser import InconInspector
from scripts.processing.cleaner import InconRemover

In [9]:
# Instantiate inspector and remover objects 
incon_inspector = InconInspector(dataset2)
incon_remover = InconRemover(dataset2)

incon_data = incon_inspector.detect(dataset2["Screen Size (inch)"].values)
incon_data

array(['14.1.', '12.3.', '14.1.', '12()', "1314'", '12.5(', '11.6.',
       '15.6.', '12.5/', '15.4.', '15.6.', '11.6.', '14.1.', '13.3.',
       '14.1.', '14.1.', '15.6.', '14.1.', '.', '10.1(', '11.6.', '12.5.',
       '13.3+', '14-15'], dtype='<U5')

In [10]:
# Remove inconsistent data
incon_remover.clean("Screen Size (inch)")
dataset2["Screen Size (inch)"].unique()

Feature Screen Size (inch): 24 inconsistent data has been detected.


array(['14', '11.6', '12.5', '13.3', '14.1', '2', '15.6', '11', '16', nan,
       '13', '12', '15.61', '13.5', '17.3', '18.4', '12.3', '10.1', '7.2',
       '15.4', '15', '17', '8.1', '10.95', '7', '10.8', '18', '16.1',
       '10.5', '12.1', '12.2', '13.4', '10.4', '10', '15.5', '14.5',
       '11.5', '10.3', '10.2', '13.7', '13.9', '9.5', '6.1', '13.1',
       '14.2', '12.4', '4.5', '9.7'], dtype=object)

In [11]:
# Remove screen size (inch) feature from categorical data
dataset1 = dataset1.drop(columns=["Screen Size (inch)"], axis = 1)
dataset1

Unnamed: 0,Brand,Currency,Color,Condition,GPU,Processor,Processor Speed Unit,Type,OS,Storage Type,Hard Drive Capacity Unit,SSD Capacity Unit,Ram Size Unit
0,other,$,gray,New,intel,quad core,GHz,notebook/laptop,windows,ssd,gb,tb,gb
1,dell,$,black,Very Good - Refurbished,intel,intel core i7 8th generation,GHz,notebook/laptop,windows,ssd,tb,unknown,unknown
2,dell,$,black,Used,intel,intel core i5-6300u,GHz,notebook/laptop,windows,ssd,gb,gb,gb
3,hp,$,black,Good - Refurbished,intel,intel celeron n,GHz,notebook/laptop,chrome,emmc,gb,unknown,gb
4,dell,$,other,Good - Refurbished,other,intel core i5 6th generation,GHz,notebook/laptop,windows,ssd,unknown,gb,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,$,other,New,other,undefined,unknown,other,unknown,unknown,unknown,unknown,unknown
4179,acer,$,other,New,other,not applicable,unknown,other,unknown,unknown,unknown,unknown,unknown
4180,dell,$,black,Used,intel,intel core i5 6th generation,GHz,notebook/laptop,windows,unknown,unknown,gb,unknown
4181,hp,$,other,New,other,not applicable,unknown,other,unknown,unknown,unknown,unknown,unknown


## Irrelevant Analysis 

In [12]:
# Import functionalities for identifying and removing null values 
from scripts.processing.cleaner import NullRemover
from scripts.analysis.irrelevant_data_analyser import NullInspector

Finding and removing nullvalues in Screen Size (inch) 

In [13]:
# Identify nullvalues 
inspector = NullInspector(dataset2)
inspector.inspect()
print(f"Nullvalues of Screen Size (after removal inconsistencies): 1639")

Nullvalues of Screen Size (after removal inconsistencies): 1639


In [14]:
# Remove inconsistent data 
remover = NullRemover(dataset2)
remover.clean("Screen Size (inch)")

# Check null values after removal
nulls = dataset2["Screen Size (inch)"].isnull().sum()
print(f"Null values Screen Size (after removal): {nulls}")

Null values Screen Size (after removal): 0


In [16]:
# Remove irrelevant data 
dataset2["Screen Size (inch)"] = dataset2["Screen Size (inch)"].str.replace("unknown data", "0.0")
dataset2["Screen Size (inch)"] = dataset2["Screen Size (inch)"].astype(float)

# Replace 0.0 by its mean values 
screen_size_mean = dataset2["Screen Size (inch)"].mean()
print(f"Screen Size (mean value): {screen_size_mean:2f}")

dataset2["Screen Size (inch)"] = dataset2["Screen Size (inch)"].replace(0.0, round(screen_size_mean, 2))
dataset2["Screen Size (inch)"].unique()


Screen Size (mean value): 8.426550


array([14.  , 11.6 , 12.5 , 13.3 , 14.1 ,  2.  , 15.6 , 11.  , 16.  ,
        8.43, 13.  , 12.  , 15.61, 13.5 , 17.3 , 18.4 , 12.3 , 10.1 ,
        7.2 , 15.4 , 15.  , 17.  ,  8.1 , 10.95,  7.  , 10.8 , 18.  ,
       16.1 , 10.5 , 12.1 , 12.2 , 13.4 , 10.4 , 10.  , 15.5 , 14.5 ,
       11.5 , 10.3 , 10.2 , 13.7 , 13.9 ,  9.5 ,  6.1 , 13.1 , 14.2 ,
       12.4 ,  4.5 ,  9.7 ])

In [None]:
dataset2[dataset2["Screen Size (inch)"] == 8.43]

Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size,Processor Speed,Screen Size (inch)
23,227.40,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
28,206.60,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
79,271.80,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
90,59.80,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
96,59.40,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
...,...,...,...,...,...,...,...,...
4175,319.00,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
4178,162.20,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
4179,93.25,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
4181,90.94,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43


## Data Storage

In [20]:
# Import functionalities for data storage
from scripts.collection.collector import DataSaver

In [22]:
# Categorical dataset
dataset1

Unnamed: 0,Brand,Currency,Color,Condition,GPU,Processor,Processor Speed Unit,Type,OS,Storage Type,Hard Drive Capacity Unit,SSD Capacity Unit,Ram Size Unit
0,other,$,gray,New,intel,quad core,GHz,notebook/laptop,windows,ssd,gb,tb,gb
1,dell,$,black,Very Good - Refurbished,intel,intel core i7 8th generation,GHz,notebook/laptop,windows,ssd,tb,unknown,unknown
2,dell,$,black,Used,intel,intel core i5-6300u,GHz,notebook/laptop,windows,ssd,gb,gb,gb
3,hp,$,black,Good - Refurbished,intel,intel celeron n,GHz,notebook/laptop,chrome,emmc,gb,unknown,gb
4,dell,$,other,Good - Refurbished,other,intel core i5 6th generation,GHz,notebook/laptop,windows,ssd,unknown,gb,gb
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,other,$,other,New,other,undefined,unknown,other,unknown,unknown,unknown,unknown,unknown
4179,acer,$,other,New,other,not applicable,unknown,other,unknown,unknown,unknown,unknown,unknown
4180,dell,$,black,Used,intel,intel core i5 6th generation,GHz,notebook/laptop,windows,unknown,unknown,gb,unknown
4181,hp,$,other,New,other,not applicable,unknown,other,unknown,unknown,unknown,unknown,unknown


In [23]:
# Numerical dataset: dataset 2
dataset2

Unnamed: 0,Price,Width of the Display,Height of the Display,Hard Drive Capacity,SSD Capacity,Ram Size,Processor Speed,Screen Size (inch)
0,303.80,2160.0,1440.0,512.0,1.0,8.0,3.8,14.00
1,400.00,1920.0,1080.0,2.0,240.0,8.0,4.2,14.00
2,175.00,1920.0,1080.0,500.0,500.0,16.0,2.4,14.00
3,85.00,1366.0,768.0,16.0,240.0,4.0,2.4,11.60
4,101.25,1366.0,768.0,256.0,256.0,8.0,1.4,12.50
...,...,...,...,...,...,...,...,...
4178,162.20,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
4179,93.25,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43
4180,424.80,1920.0,1080.0,256.0,120.0,8.0,2.8,12.50
4181,90.94,1920.0,1080.0,256.0,240.0,8.0,0.0,8.43


In [21]:
# Save categorical and numerical datasets
datasaver = DataSaver(folder="data\cleaned")
datasaver.save_one_ds(dataset1,"Laptop_categorical_data_cleaned")
datasaver.save_one_ds(dataset2,"Laptop_numerical_data_cleaned")

File Laptop_categorical_data_cleaned.csv has been stored successfully
File Laptop_numerical_data_cleaned.csv has been stored successfully
