# Data Cleaning P4: Complex Dataset

In [1]:
# Import dependencies for data processing
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
import sys 

In [2]:
# System configuration: expose the parent directory so the project's
# `scripts` package can be imported from inside this notebook.
project_root = os.path.abspath("..")
sys.path.append(project_root)

## Data Loading

In [3]:
# Import functionalities for loading data
from scripts.collection.collector import DataLoader

In [4]:
# Initialise variables for loading the dataset.
# The folder is built with os.path.join rather than the literal "data\cleaned":
# "\c" is an invalid escape sequence in a normal string literal, and a
# backslash-separated path only resolves on Windows.
filename = "Laptop_complex_data_cleaned.csv"
cleaning_folder = os.path.join("data", "cleaned")

# Instantiate data loader and read the cleaned complex dataset
dataloader = DataLoader(filename, cleaning_folder)
dataset = dataloader.load()

dataset


File accepted


Unnamed: 0,Features,Condition Description,Seller Note
0,"Backlit Keyboard, Built-in Microphone, Built...","A brand-new, unused, unopened, undamaged item ...",undefined
1,"Backlit Keyboard, Bluetooth, Built-in Micropho...",The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...
2,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",An item that has been used previously. The ite...,"well kept, fully functional, includes battery,..."
3,"Bluetooth, Built-in Microphone, Built-in Webca...",The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...
4,"10/100 LAN Card, Built-in Microphone, Built-in...",The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...
...,...,...,...
4178,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined
4179,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined
4180,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",An item that has been used previously. The ite...,undefined
4181,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined


In [5]:
# Share (in %) of the records that MySQL Workbench actually registered.
imported_records = 2010  # rows registered by MySQL Workbench after the import
total_records = 4183     # rows expected from the complex dataset
(imported_records / total_records) * 100

48.05163758068372

**Problem Statement**

After importing the complex dataset, MySQL Workbench registered only 2010 of the 4183 records. This is a strong deficiency: only 48.05% of the data was imported, and the remaining records are missing. The reason is that some records contain values of more than 255 characters. Initially, the table columns for this dataset were set to the TEXT datatype. It is still not obvious which of the records that we attempted to import are missing. Hence, we will study in depth the reason why those records were not registered.

The complex dataset contains textual data of more than 100 characters. First, we will check whether there are irrelevant values that MySQL Workbench does not consider for import. Then we will find, for each feature, the text value with the largest number of characters. This way we will be able to adapt the maximum allowed length so that those records can be imported.



## Irrelevant value analysis

In [6]:
# Count missing (null/NaN) entries in every column
dataset.isna().sum()

Features                 0
Condition Description    0
Seller Note              0
dtype: int64

In [7]:
# Frequency of each distinct text value in one column.
# Swap the column name for "Condition Description" or "Seller Note"
# to inspect the other two text features the same way.
features_col = dataset["Features"]
features_col.value_counts()

Features
unknown data                                                                          2190
Touchscreen                                                                             79
10/100 LAN Card,  Wi-Fi,  SD Card Reader                                                70
10/100 LAN Card, Wi-Fi, SD Card Reader                                                  66
Wi-Fi                                                                                   55
                                                                                      ... 
Touchscreen, Built-in Webcam, Wi-Fi, Built-in Microphone                                 1
Backlit Keyboard, Wi-Fi                                                                  1
Wi-Fi,  Widescreen Display,  Multi-Touch Trackpad,  Touchscreen,  Backlit Keyboard       1
Bluetooth,  Built-in Microphone,  Built-in Webcam,  Wi-Fi,  Widescreen Display           1
Backlit Keyboard, Bluetooth, Built-in Microphone, Wi-Fi                          

In [8]:
# Length (number of characters) of the longest text value in a column
def find_max_txt_size(dataset: pd.DataFrame, feature: str) -> int:
    """Return the length of the longest text value in one column.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame holding the column to inspect.
    feature : str
        Name of the column whose values are measured.

    Returns
    -------
    int
        Character count of the longest value, or 0 when the column has no
        non-missing values.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``dataset``.
    """
    # Missing values have no length (len() on NaN raises TypeError), so skip them.
    texts = dataset[feature].dropna()
    if texts.empty:
        return 0

    # astype(str) lets non-string cells be measured by their text form;
    # .str.len() is vectorised and replaces the per-row Python loop.
    return int(texts.astype(str).str.len().max())

In [9]:
# Find max text size per feature
feature_max = find_max_txt_size(dataset, "Features")
selling_note_max = find_max_txt_size(dataset, "Seller Note")
con_desc_max = find_max_txt_size(dataset, "Condition Description")

# Pair every label with its own value explicitly. Taking the labels from
# dataset.columns (Features, Condition Description, Seller Note) while listing
# the values in a different order mislabels two of the rows, and fails with a
# length mismatch as soon as the dataset carries an extra column.
text_df = pd.DataFrame(
    {
        "Text Features": ["Features", "Condition Description", "Seller Note"],
        "Max length": [feature_max, con_desc_max, selling_note_max],
    }
)
text_df

## Inconsistency Analysis on Textual Data

In [10]:
# Text fragments associated with data that is inconsistent.
# NOTE(review): 1366 and 1062 look like MySQL error codes ("Incorrect string
# value" and "Duplicate entry") rather than row counts — confirm.
# NOTE: the first assignment is two adjacent string literals, which Python
# concatenates; the resulting value therefore reads `6 LED` with no double
# quote between "6" and " LED" — confirm this matches the text being searched.
txt_value1 = "\\xC3\\xA2\\xE2\\x82\\xAC\\xC2\\xA2\\x0911. 6"" LED HD 1366x768 Display\""  # 1366 (see note above)
txt_value2 = "duplicate entry" # 1062 (see note above)

In [11]:
# Boolean mask of the rows whose seller note is the literal text "undefined",
# followed by the matching "Seller Note" entries themselves
undefined = dataset["Seller Note"].eq("undefined")
sell_undef = dataset.loc[undefined, "Seller Note"]

The issue is not only the inconsistency of the data, but also how many duplicated samples are registered in the dataset. Removing those duplicates strongly affects the distribution of the other datasets, i.e. the numerical and categorical datasets. Therefore, all three datasets need to be rechecked for duplicated rows.

## Data Loading P2

In [12]:
# Import functionalities to transform data
from scripts.processing.transformer import ColumnTransformer

In [13]:
# Load the categorical, numerical and complex cleaned datasets (in that order)
folder = "data/cleaned"
filename = lambda x: f"Laptop_{x}_data_cleaned.csv"
dataset1, dataset2, dataset3 = (
    DataLoader(filename(kind), folder).load()
    for kind in ("categorical", "numerical", "complex")
)

File accepted
File accepted
File accepted


In [14]:
# transformer = ColumnTransformer(dataset1=dataset1, dataset2=dataset2)
# dataset4 = transformer.combine(sel_dataset=dataset3)
# dataset4

The total number of duplicated rows across all features is zero. We will still focus on the complex data by adding a new column that makes every row unique.

In [15]:
# Add a column called InfoID that gives every record a unique identifier
records = dataset.shape[0]
info_id = pd.DataFrame({"InfoID": [f"I{i}" for i in range(records)]})

# Only attach the column when it is missing. The save step further down
# overwrites the very file this notebook loads, so on a re-run the loaded
# dataset may already contain InfoID and must not receive a second copy.
if "InfoID" not in dataset.columns:
    transformer = ColumnTransformer(dataset)
    dataset = transformer.combine(info_id)

# Number of fully duplicated rows
dataset.duplicated().sum()

np.int64(0)

## Data Storage

In [16]:
# Import functionality to store data
from scripts.collection.collector import DataSaver

In [17]:
# Save the dataset.
# The folder is built with os.path.join rather than the literal "data\cleaned":
# "\c" is an invalid escape sequence in a normal string literal, and a
# backslash-separated path only resolves on Windows.
# dataset = dataset.drop(columns=["InfoID"], axis=1)
datasaver = DataSaver(folder=os.path.join("data", "cleaned"))
datasaver.save_one_ds(dataset, "Laptop_complex_data_cleaned")

File Laptop_complex_data_cleaned.csv has been stored successfully


In [18]:
# Display the dataset as it was passed to the saver, to confirm its columns
dataset

Unnamed: 0,Features,Condition Description,Seller Note,InfoID
0,"Backlit Keyboard, Built-in Microphone, Built...","A brand-new, unused, unopened, undamaged item ...",undefined,I0
1,"Backlit Keyboard, Bluetooth, Built-in Micropho...",The item shows minimal wear and is backed by a...,aaa pcs is a microsoft authorized refurbisher ...,I1
2,"10/100 LAN Card, Backlit Keyboard, Bluetooth, ...",An item that has been used previously. The ite...,"well kept, fully functional, includes battery,...",I2
3,"Bluetooth, Built-in Microphone, Built-in Webca...",The item shows moderate wear and is backed by ...,1-year allstate warranty. the original hp char...,I3
4,"10/100 LAN Card, Built-in Microphone, Built-in...",The item shows moderate wear and is backed by ...,laptops is tested & fully working with some si...,I4
...,...,...,...,...
4178,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined,I4178
4179,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined,I4179
4180,"Touchscreen, 10/100 LAN Card, Bluetooth, Ba...",An item that has been used previously. The ite...,undefined,I4180
4181,unknown data,"A brand-new, unused, unopened, undamaged item ...",undefined,I4181
