# Data Cleaning: Numerical Dataset

In [1]:
# Import libraries for data cleaning (standard library first, then third-party)
import os
import sys

import numpy as np
import pandas as pd

# Make the parent directory importable; presumably this is where the project's
# `scripts` package lives (later cells import from it) - TODO confirm.
# The membership check keeps a re-run of this cell from appending the same
# entry to sys.path a second time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

## Approach

+ Removing duplicate rows (if any)
+ Removing null values
+ Removing inconsistencies

## Data Loading

In [2]:
# Import functionality for loading data
from scripts.collection.collector import DataLoader

In [3]:
# Read the numerical laptop dataset from the processed-data folder
source_file = "Laptop_numerical_data.csv"
source_dir = "data/processed"

# The loader is built from the file name and folder; load() returns the dataset
dataloader = DataLoader(source_file, source_dir)
dataset = dataloader.load()

File accepted


## Cleaning Process

In [4]:
# Import functionalities for data cleaning
from scripts.processing.cleaner import NullRemover
from scripts.processing.cleaner import InconRemover
from scripts.analysis.descriptive_analyser import StatisticalAnalysis

## Removing Inconsistencies

In [5]:
# Detect inconsistent entries in one feature.
# Note: this cell only calls detect_incon(); no removal call is made here.
target_feature = "Processor Speed"
incon_remover = InconRemover(dataset)
incons_list = incon_remover.detect_incon(target_feature)

# Show whatever the detector returned for the feature
print(f"Inconsistencies: {incons_list}")

Feature Processor Speed: 0 inconsistent data has been detected.
Inconsistencies: []


## Removing Null Values

In [6]:
# Identify null values
# Build a per-feature summary of the loaded dataset. The rendered table has the
# columns Features, Nullvalues, DataTypes and UniqueValues.
# Both `stats_an` and `stats_data` are reused by the null-removal cell below.
stats_an = StatisticalAnalysis()
stats_data = stats_an.analyse(dataset)
# Bare last expression so the notebook renders the summary table
stats_data

Unnamed: 0,Features,Nullvalues,DataTypes,UniqueValues
0,Price,0,float64,2014
1,Width of the Display,2268,float64,27
2,Height of the Display,2268,float64,28
3,Hard Drive Capacity,3079,float64,34
4,SSD Capacity,2056,float64,23
5,Ram Size,2531,float64,18
6,Processor Speed,2093,object,77


In [7]:
# Cross-check the null count of a single feature directly on the dataset
dataset["Hard Drive Capacity"].isna().sum()

np.int64(3079)

In [8]:
# Clean null values feature by feature
# The remover is constructed around the dataset loaded earlier
null_remover = NullRemover(dataset)

# Feature names are taken from the summary table built in the previous step
features = stats_data["Features"].to_numpy()

# clean() is invoked for every listed feature, including those whose
# reported null count is already zero
for column_name in features:
    null_remover.clean(column_name)

# Re-run the summary so the null counts after cleaning are displayed
stats_an.analyse(dataset)

Unnamed: 0,Features,Nullvalues,DataTypes,UniqueValues
0,Price,0,float64,2014
1,Width of the Display,0,float64,27
2,Height of the Display,0,float64,28
3,Hard Drive Capacity,0,float64,34
4,SSD Capacity,0,float64,23
5,Ram Size,0,float64,18
6,Processor Speed,0,object,78


## Data Cleaning: Checkup 

In [9]:
# Final check: count the remaining null values in every column
dataset.isna().sum()


Price                    0
Width of the Display     0
Height of the Display    0
Hard Drive Capacity      0
SSD Capacity             0
Ram Size                 0
Processor Speed          0
dtype: int64

## Data Storage

In [None]:
# Import functionalities for storing data
from scripts.collection.collector import DataSaver

In [11]:
# Persist the cleaned dataset into the cleaned-data folder
output_dir = "data/cleaned"
output_name = "Laptop_numerical_data_cleaned"

# The saver is bound to the target folder; the dataset is stored under output_name
datasaver = DataSaver(output_dir)
datasaver.save_one_ds(dataset, output_name)