## 1. Load Required Libraries

In [1]:
import src.util as utils
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder



## 2. Load Configuration File

In [2]:
# Load the project configuration via the project's util helper; the resulting
# object is indexed by string keys (paths, column lists) further down.
config = utils.load_config()

## 3. Load Dataset

In [3]:
def load_dataset(config_data: dict):
    """Load the train, validation and test splits and join features with targets.

    For each of the keys ``train_set_path``, ``valid_set_path`` and
    ``test_set_path`` the entry at index 0 is loaded as the feature table and
    the entry at index 1 as the target, then both are concatenated column-wise.

    Parameters
    ----------
    config_data : dict
        Configuration holding the three ``*_set_path`` entries.

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``, each the column-wise
        concatenation of the split's features and target.
    """
    joined_splits = []
    for path_key in ("train_set_path", "valid_set_path", "test_set_path"):
        # Features are stored first, the target second.
        features = utils.pickle_load(config_data[path_key][0])
        target = utils.pickle_load(config_data[path_key][1])
        joined_splits.append(pd.concat([features, target], axis = 1))

    train_set, valid_set, test_set = joined_splits
    return train_set, valid_set, test_set

In [65]:
# Materialise the three splits (features joined with target) for inspection.
train_set, valid_set, test_set = load_dataset(config)

## 4. Preprocessing

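Summary of EDA findings (planned handling per column):

- ScreenResolution: derive 0/1 flags (touchscreen, IPS)
- Cpu: extract the brand
- Ram: strip the unit with a string replace
- Memory: not handled yet
- Gpu: split the string to get the brand
- OpSys: group into categories
- Weight: strip the unit with a string replace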

1. Create new columns, `Touchscreen` and `IPS`, representing whether the laptop has a touchscreen or an IPS panel.

In [5]:
def check_touchsreen(data):
    """Return 1 when ``"Touchscreen"`` occurs in ``data``, otherwise 0.

    The check is case-sensitive and uses Python's ``in`` operator, so for a
    string argument it is a substring test.
    """
    return int("Touchscreen" in data)

In [6]:
def check_ips(data):
    """Return 1 when ``"IPS"`` occurs in ``data``, otherwise 0.

    The check is case-sensitive and uses Python's ``in`` operator, so for a
    string argument it is a substring test.
    """
    return 1 if "IPS" in data else 0

2. Get CPU name from CPU column

In [7]:
def get_cpu_name(data):
    """Return the first three whitespace-separated words of ``data``.

    The words are re-joined with single spaces; inputs with fewer than three
    words are returned with whatever words they have.
    """
    return " ".join(data.split()[:3])

3. Get CPU brand from the CPU Name

In [8]:
def fetch_processor(text):
    """Bucket a CPU name into one of five processor categories.

    ``'Intel Core i7'``, ``'Intel Core i5'`` and ``'Intel Core i3'`` are
    returned unchanged. Any other text whose first word is ``'Intel'`` becomes
    ``'Other Intel Processor'``. Everything else is labelled
    ``'AMD Processor'``, including non-AMD vendors.
    """
    core_families = ('Intel Core i7', 'Intel Core i5', 'Intel Core i3')
    if text in core_families:
        return text

    vendor = text.split()[0]
    if vendor == 'Intel':
        return 'Other Intel Processor'
    return 'AMD Processor'

4. Get the RAM size as a number

In [9]:
def clean_ram(data):
    """Convert a RAM column such as ``"8GB"`` into integer values.

    Parameters
    ----------
    data : pandas.Series
        String-valued series; every entry must become a valid integer once
        the literal ``"GB"`` is removed.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``data``.
    """
    # Strip the unit literally (no regex interpretation of the pattern).
    without_unit = data.str.replace("GB", "", regex=False)

    # Return the converted values; returning the stripped strings instead
    # would leave RAM as text and defeat the purpose of this helper.
    numeric_form = without_unit.astype("int")

    return numeric_form

5. Get the GPU brand

In [10]:
def brand_gpu(data):
    """Return the first whitespace-separated word of a GPU description.

    In this notebook that first word is used as the GPU brand.
    """
    # Only the leading token is needed, so stop splitting after it.
    return data.split(maxsplit=1)[0]

6. Fix operating system name 

In [11]:
def cat_os(data):
    """Collapse an operating-system name into one of three groups.

    ``'Windows 10'``, ``'Windows 7'`` and ``'Windows 10 S'`` map to
    ``'Windows'``; ``'macOS'`` and ``'Mac OS X'`` map to ``'Mac'``; every
    other value maps to ``'Others/No OS/Linux'``. Matching is exact.
    """
    windows_names = ('Windows 10', 'Windows 7', 'Windows 10 S')
    mac_names = ('macOS', 'Mac OS X')

    if data in windows_names:
        return 'Windows'
    if data in mac_names:
        return 'Mac'
    return 'Others/No OS/Linux'

7. Get the weight in numerical form

In [12]:
def clean_weight(data):
    """Convert a weight column such as ``"1.42kg"`` into float values.

    ``data`` is a string-valued pandas Series; the ``"kg"`` suffix is removed
    and the remainder is cast to float. The index is preserved.
    """
    # Drop the unit, then cast what is left to a floating-point number.
    return data.str.replace("kg","").astype("float")

8. Get screen resolution

In [13]:
def get_xresolution(data):
    """Extract the horizontal resolution from a screen-resolution Series.

    Each string is split at its first ``"x"``; from the left-hand part the
    first run of at least two digits (optionally containing one ``.``) is
    returned. Values are returned as strings, not numbers.
    """
    halves = data.str.split("x", n = 1, expand = True)
    left_part = halves[0].str.replace(',','')
    # The pattern needs two or more characters, so a lone digit like the
    # "4" in "4K" is skipped.
    number_runs = left_part.str.findall(r'(\d+\.?\d+)')
    return number_runs.apply(lambda matches: matches[0])

def get_yresolution(data):
    """Extract the vertical resolution from a screen-resolution Series.

    Each string is split at its first ``"x"``; from the right-hand part the
    first run of at least two digits (optionally containing one ``.``) is
    returned. Values are returned as strings, not numbers.
    """
    halves = data.str.split("x", n = 1, expand = True)
    right_part = halves[1].str.replace(',','')
    number_runs = right_part.str.findall(r'(\d+\.?\d+)')
    return number_runs.apply(lambda matches: matches[0])

In [14]:
# Display the training frame returned by load_dataset.
# NOTE(review): this renders the whole frame; train_set.head() would keep the output short.
train_set

Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,HDD,SSD,Hybrid,Flash_Storage,Price
795,Lenovo,2 in 1 Convertible,14.0,IPS Panel Touchscreen 2560x1440,Intel Core i5 7200U 2.5GHz,8GB,256 SSD,Intel HD Graphics 620,Windows 10,1.42kg,0,256,0,0,124142.4000
166,Acer,Notebook,15.6,1366x768,Intel Pentium Quad Core N4200 1.1GHz,4GB,1000 HDD,Intel HD Graphics 505,Windows 10,2.1kg,1000,0,0,0,19367.8128
695,Acer,Netbook,11.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,4GB,32 Flash Storage,Intel HD Graphics,Windows 10,1.4kg,0,0,0,32,14332.3200
470,HP,2 in 1 Convertible,13.3,IPS Panel 4K Ultra HD / Touchscreen 3840x2160,Intel Core i7 8550U 1.8GHz,8GB,512 SSD,Intel UHD Graphics 620,Windows 10,1.29kg,0,512,0,0,106506.7200
763,Asus,Ultrabook,13.3,IPS Panel Quad HD+ 3200x1800,Intel Core i5 7200U 2.5GHz,8GB,256 SSD,Intel HD Graphics 620,Windows 10,1.2kg,0,256,0,0,60153.1200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
970,Dell,2 in 1 Convertible,13.3,Quad HD+ / Touchscreen 3200x1800,Intel Core i7 7Y75 1.3GHz,16GB,512 SSD,Intel HD Graphics 615,Windows 10,1.22kg,0,512,0,0,101178.7200
885,HP,2 in 1 Convertible,13.3,Full HD / Touchscreen 1920x1080,Intel Core i5 7200U 2.5GHz,4GB,256 SSD,Intel HD Graphics 620,Windows 10,1.28kg,0,256,0,0,95850.7200
91,Acer,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,1000 HDD,Nvidia GeForce MX150,Windows 10,2.2kg,1000,0,0,0,36709.9200
1032,MSI,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,128 SSD + 1000 HDD,Nvidia GeForce GTX 1050,Windows 10,2.7kg,1000,128,0,0,63499.1040


Standardization

In [15]:
def fit_standardize(data, config_data, return_file=True):
    """Fit a ``StandardScaler`` on ``data``.

    Parameters
    ----------
    data
        Table of columns the scaler is fitted on.
    config_data
        Configuration object; currently unused because persisting the scaler
        is commented out.
    return_file : bool, default True
        When true the fitted scaler is returned, otherwise ``None``.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    fitted_scaler = StandardScaler().fit(data)

    # Persisting the scaler is disabled for now:
    # utils.pickle_dump(standardizer, config_data['standardizer_path'])

    return fitted_scaler if return_file else None

In [16]:
def fit_ohe(data, config_data, return_file=True):
    """Fit a ``OneHotEncoder`` on the categorical columns in ``data``.

    Parameters
    ----------
    data
        Table of categorical columns the encoder is fitted on.
    config_data
        Configuration object; currently unused because persisting the encoder
        is commented out.
    return_file : bool, default True
        When true the fitted encoder is returned, otherwise ``None``.
    """
    # The encoder is fitted on the training split and then reused on the
    # validation and test splits. With the default handle_unknown='error' a
    # category missing from training makes transform() raise; 'ignore'
    # encodes such rows as all zeros for that feature instead.
    ohe = OneHotEncoder(handle_unknown="ignore")

    # Fit encoder
    ohe.fit(data)

    # Dump encoder (disabled for now)
    # utils.pickle_dump(ohe, config_data['ohe_path'])

    if return_file:
        return ohe

In [17]:
def transform_standardize(data, standardizer):
    """Apply a fitted scaler to ``data`` and return a labelled DataFrame.

    The result of ``standardizer.transform(data)`` is wrapped in a DataFrame
    that carries the column labels and index of ``data``.
    """
    scaled_values = standardizer.transform(data)
    return (
        pd.DataFrame(scaled_values)
        .set_axis(data.columns, axis=1)
        .set_axis(data.index, axis=0)
    )

In [18]:
def transform_ohe(data, ohe):
    """Apply a fitted one-hot encoder to ``data`` and return a labelled DataFrame.

    The encoder output is densified with ``.toarray()``, the columns are
    named with ``ohe.get_feature_names_out(data.columns)`` and the index of
    ``data`` is kept.
    """
    dense_values = ohe.transform(data).toarray()
    encoded = pd.DataFrame(dense_values)
    feature_names = ohe.get_feature_names_out(data.columns)
    return encoded.set_axis(feature_names, axis=1).set_axis(data.index, axis=0)

## 5. Preprocessing All

In [19]:
def feature_engineering(data):
    """Derive the model features from the raw laptop table.

    Adds ``Touchscreen``, ``IPS``, ``X_res`` and ``Y_res`` (all derived from
    ``ScreenResolution``) and ``Cpu Name`` (derived from the shortened
    ``Cpu``), and rewrites ``Cpu``, ``Gpu``, ``Ram``, ``OpSys`` and
    ``Weight`` with their cleaned forms.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table containing at least ``ScreenResolution``, ``Cpu``, ``Gpu``,
        ``Ram``, ``OpSys`` and ``Weight``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns; ``data`` is not modified.
    """
    # Work on a copy: the cleaning steps overwrite raw columns, so mutating
    # the caller's frame would make a second call on the same input fail
    # (e.g. Weight is no longer a string) and would change data the notebook
    # has already displayed.
    engineered = data.copy()

    # Add new columns
    engineered["Touchscreen"] = engineered["ScreenResolution"].apply(check_touchsreen)
    engineered["IPS"] = engineered["ScreenResolution"].apply(check_ips)
    engineered["X_res"] = get_xresolution(engineered["ScreenResolution"])
    engineered["Y_res"] = get_yresolution(engineered["ScreenResolution"])

    # Cleaning columns
    # Cpu must be shortened first: "Cpu Name" is derived from the shortened value.
    engineered['Cpu'] = engineered['Cpu'].apply(get_cpu_name)
    engineered['Cpu Name'] = engineered['Cpu'].apply(fetch_processor)
    engineered['Gpu'] = engineered['Gpu'].apply(brand_gpu)
    engineered["Ram"] = clean_ram(engineered["Ram"])
    engineered['OpSys'] = engineered['OpSys'].apply(cat_os)
    engineered["Weight"] = clean_weight(engineered["Weight"])

    return engineered

In [20]:
# # Fit standardizer
# standardizer = fit_standardize(data = train_set[])

# # Transform
# X_train_std = transform_standardize(data = X_train_imputed,
#                                     standardizer = standardizer)
# X_train_std.describe()

In [21]:
def generate_preprocessor(train_data, config_data, return_file=True):
    """Fit the scaler and the one-hot encoder on the training features.

    The scaler is fitted on the columns listed under
    ``config_data["numerical_columns"]`` and the encoder on those listed
    under ``config_data["cat_columns"]``.

    Returns
    -------
    dict or None
        ``{'standardizer': ..., 'ohe': ...}`` when ``return_file`` is true,
        otherwise ``None``.
    """
    # Numerical columns -> standardizer
    numeric_part = train_data[config_data["numerical_columns"]]
    fitted_scaler = fit_standardize(numeric_part, config_data)

    # Categorical columns -> one-hot encoder
    categorical_part = train_data[config_data["cat_columns"]]
    fitted_encoder = fit_ohe(categorical_part, config_data)

    bundle = {'standardizer': fitted_scaler, 'ohe': fitted_encoder}

    # Persisting the bundle is disabled for now:
    # utils.pickle_dump(preprocessor, config_data['preprocessor_path'])
    # utils.pickle_dump(preprocessor, config_data['ohe_path'])

    if not return_file:
        return None
    return bundle

In [22]:
def preprocess_data(config_data, type_data = 'train' , return_file=True):
    """Load one data split, engineer its features and scale/encode them.

    Parameters
    ----------
    config_data
        Configuration providing ``{type_data}_set_path`` (features path at
        index 0, target path at index 1), ``numerical_columns``,
        ``cat_columns`` and, for non-train splits, ``preprocessor_path``.
    type_data : str, default 'train'
        Prefix of the ``*_set_path`` key to load. For ``'train'`` the
        preprocessor is fitted on the split itself; for any other value it is
        loaded from ``config_data['preprocessor_path']``.
    return_file : bool, default True
        When true ``(X_clean, y_clean)`` is returned, otherwise ``None``.

    Returns
    -------
    tuple or None
        The standardized numerical columns joined with the one-hot encoded
        categorical columns, and the unchanged target.
    """
    # Load data
    X = utils.pickle_load(config_data[f'{type_data}_set_path'][0])
    y = utils.pickle_load(config_data[f'{type_data}_set_path'][1])

    # Feature Engineering
    X_fe = feature_engineering(X)

    # Load preprocessor
    if type_data == "train":
        # Use the configuration passed to this function, not the
        # notebook-global `config`, so the function does not depend on
        # kernel state.
        preprocessor = generate_preprocessor(X_fe, config_data)
    else:
        # NOTE(review): the dump calls in this notebook are commented out, so
        # this reads whatever was saved at preprocessor_path earlier; confirm
        # it matches the preprocessor fitted on the current training split.
        preprocessor = utils.pickle_load(config_data['preprocessor_path'])

    # Standardization
    standardizer = preprocessor['standardizer']
    X_clean_numerical = transform_standardize(X_fe[config_data['numerical_columns']], standardizer)

    # One Hot Encoder
    ohe = preprocessor['ohe']
    X_clean_categorical = transform_ohe(X_fe[config_data['cat_columns']], ohe)

    # Combine numerical and categorical columns
    X_clean = pd.concat([X_clean_numerical, X_clean_categorical], axis = 1)

    # The target is passed through untouched.
    y_clean = y

    # Print shape
    print("X clean shape:", X_clean.shape)
    print("y clean shape:", y_clean.shape)

    # Dump file (disabled for now)
    # utils.pickle_dump(X_clean, config_data[f'{type_data}_clean_path'][0])
    # utils.pickle_dump(y_clean, config_data[f'{type_data}_clean_path'][1])

    if return_file:
        return X_clean, y_clean

In [23]:
# Transform X_train
X_train_clean, y_train_clean = preprocess_data(config, type_data = 'train', return_file = True)

# Transform X_valid
X_valid_clean, y_valid_clean = preprocess_data(config, type_data = 'valid', return_file = True)

# Transform X_train
X_test_clean, y_test_clean = preprocess_data(config, type_data = 'test', return_file = True)

X clean shape: (638, 45)
y clean shape: (638,)
X clean shape: (274, 45)
y clean shape: (274,)
X clean shape: (391, 45)
y clean shape: (391,)
