In [56]:
from src.utils import (
    PreprocessingPipeline, 
    ConfigManager, 
    SerializationManager
)
import pandas as pd
import numpy as np 
import joblib

# 1. Load Config

In [57]:
# Load the project configuration from the YAML file at CONFIG_PATH (a path
# relative to the working directory).
# `config_manager` is kept around because later cells call `update_config`
# on it; `config` is the loaded configuration that the serialization manager
# and the preprocessing pipelines receive.
CONFIG_PATH = 'config/config.yaml'
config_manager = ConfigManager(config_path=CONFIG_PATH)
config = config_manager.load_config()

# 2. Deserialize Data

In [58]:
# Build the serialization manager from the loaded configuration.
serilization_manager = SerializationManager(config=config)

# Restore every processed split in a fixed order (train, validation, test),
# features first and the matching target second.
(
    X_train, y_train,
    X_valid, y_valid,
    X_test, y_test,
) = [
    serilization_manager.deserialize_data(split_name)
    for split_name in (
        'X_train', 'y_train',
        'X_valid', 'y_valid',
        'X_test', 'y_test',
    )
]

Deserialized data loaded from data/processed/X_train.pkl
Deserialized data loaded from data/processed/y_train.pkl
Deserialized data loaded from data/processed/X_valid.pkl
Deserialized data loaded from data/processed/y_valid.pkl
Deserialized data loaded from data/processed/X_test.pkl
Deserialized data loaded from data/processed/y_test.pkl


# 3. Data Preprocessing

In [59]:
# Preview rows that belong to a duplicate group; keep=False marks every member
# of the group, not only the repeated occurrences.
X_train.loc[X_train.duplicated(keep=False)].head()

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
16664,RENT,EDUCATION,C,N,22,36000,1.0,5000,12.68,0.14,4
7289,MORTGAGE,VENTURE,A,N,24,52000,3.0,3500,6.76,0.07,3
24458,MORTGAGE,MEDICAL,C,Y,29,72500,2.0,7000,13.79,0.1,8
4097,MORTGAGE,PERSONAL,A,N,21,40000,5.0,5000,6.62,0.13,2
5496,RENT,MEDICAL,B,N,22,61000,3.0,5000,9.99,0.08,2


In [60]:
# Index labels of the repeated rows only: keep='first' leaves the first
# occurrence of each duplicate group unmarked, so it survives the later drop.
duplicated_index = X_train.loc[X_train.duplicated(keep='first')].index
print(f'Jumlah index yang terduplikasi: {len(duplicated_index)}')

Jumlah index yang terduplikasi: 118


In [61]:
# Remove the repeated rows from the features and from the target using the
# same index labels so both stay aligned, then report the resulting shapes.
X_train = X_train.drop(duplicated_index, axis='index')
y_train = y_train.drop(duplicated_index, axis='index')

print(f"Shape X_train after drop duplicates: {X_train.shape}")
print(f"Number of duplicates from X_train: {X_train.duplicated().sum()}")
print(f"Shape y_train after drop duplicates: {y_train.shape}")


Shape X_train after drop duplicates: (25946, 11)
Number of duplicates from X_train: 0
Shape y_train after drop duplicates: (25946, 1)


In [62]:
# Flag implausible rows: person_age above 100 or person_emp_length above 50.
# A missing value compares as False, so rows with NaN in these columns are kept.
cond1 = X_train['person_age'].gt(100)
cond2 = X_train['person_emp_length'].gt(50)
invalid_data = X_train.loc[cond1 | cond2].index
print(f"Jumlah data yang tidak valid: {len(invalid_data)}")

# Drop the same labels from the features and the target to keep them aligned.
X_train = X_train.drop(invalid_data, axis='index')
y_train = y_train.drop(invalid_data, axis='index')
print(f"Shape X_train after drop invalid data: {X_train.shape}")
print(f"Shape y_train after drop invalid data: {y_train.shape}")

Jumlah data yang tidak valid: 5
Shape X_train after drop invalid data: (25941, 11)
Shape y_train after drop invalid data: (25941, 1)


### Handling Missing Values

In [63]:
def num_imputer(data: pd.DataFrame, cols: list) -> object:
    """Fit a median imputer on the selected columns of ``data``.

    Args:
        data: Frame holding the columns the medians are learned from.
        cols: Labels of the columns to fit on.

    Returns:
        The fitted ``SimpleImputer`` (median strategy, ``np.nan`` treated as
        the missing-value marker).
    """
    from sklearn.impute import SimpleImputer

    # ``fit`` returns the estimator itself, so the fitted imputer can be
    # handed back directly from the chained call.
    return SimpleImputer(missing_values=np.nan, strategy='median').fit(data[cols])

def num_imputer_transform(data: pd.DataFrame, cols: list, imputer: object) -> pd.DataFrame:
    """Apply a fitted imputer to ``cols`` and return the result as a new frame.

    The caller's ``data`` is never modified: the previous implementation
    dropped ``cols`` from it in place, so the input frame silently lost
    columns as a side effect of calling this function.

    Args:
        data: Frame containing at least the columns listed in ``cols``.
        cols: Labels of the columns to impute (any iterable of labels,
            e.g. a list or a pandas ``Index``).
        imputer: Fitted object exposing ``transform(frame)`` that returns a
            2-D array with one column per entry of ``cols``.

    Returns:
        A new DataFrame with the same index as ``data``. Columns that were not
        imputed come first in their original order, followed by the imputed
        columns in the order given by ``cols``.
    """
    cols = list(cols)

    imputed_values = imputer.transform(data[cols])
    imputed_df = pd.DataFrame(data=imputed_values, index=data.index, columns=cols)

    # Non-inplace drop: builds a new frame and leaves the caller's data intact.
    untouched_df = data.drop(columns=cols)

    # Same column layout as before: untouched columns first, imputed ones last.
    return pd.concat([untouched_df, imputed_df], axis=1)

In [64]:
# Numeric feature columns to impute. select_dtypes reads the dtypes directly
# instead of computing full summary statistics (describe) just to obtain the
# column names; the labels are passed on as a plain list.
col = X_train.select_dtypes(include='number').columns.tolist()

# Fit the median imputer on the training split only, then reuse that same
# fitted imputer for validation and test so their values never influence the
# learned medians.
imputer = num_imputer(data=X_train, cols=col)
X_train = num_imputer_transform(data=X_train, imputer=imputer, cols=col)
X_valid = num_imputer_transform(data=X_valid, imputer=imputer, cols=col)
X_test = num_imputer_transform(data=X_test, imputer=imputer, cols=col)

### 1. Pipeline Log Transform

In [65]:
# Build the preprocessing pipeline with is_log_transform=True and
# is_robust=False (log-transform variant only).
prep_log = PreprocessingPipeline(config=config, is_log_transform=True, is_robust=False)
# Fit on the training split only, then apply the fitted pipeline to the
# validation and test splits.
x_train_log = prep_log.fit_transform(X_train)
x_valid_log = prep_log.transform(X_valid)
x_test_log = prep_log.transform(X_test)
# Persist the fitted log-transform pipeline.
prep_log.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_log.pkl


In [66]:
# Record where the log-transform pipeline was saved under the config key
# 'path.pipeline.log'; the load-and-test cell reads this entry back.
# NOTE(review): this literal repeats the path reported by save_pipeline();
# keep the two in sync if the save location ever changes.
PATH = 'models/preprocessing_pipeline_log.pkl'
config_manager.update_config(key='path.pipeline.log', value=PATH)

Updated: path.pipeline.log = models/preprocessing_pipeline_log.pkl


In [67]:
# Spot-check five transformed training rows; the fixed random_state keeps the
# displayed sample identical across re-runs of the notebook.
x_train_log.sample(5, random_state=42)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
6190,3.178054,10.158169,1.098612,8.699681,2.484073,0.207014,1.386294,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
22640,3.526361,10.829748,1.098612,8.2943,2.484073,0.076961,2.197225,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
12835,3.178054,10.5187,1.94591,7.026427,1.915451,0.029559,1.386294,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5497,3.218876,10.71444,1.098612,9.472782,2.452728,0.254642,1.098612,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6770,3.091042,9.862718,1.791759,8.853808,2.731115,0.307485,1.098612,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### 2. Pipeline RobustScaler

In [68]:
# Build the preprocessing pipeline with is_log_transform=False and
# is_robust=True (robust-scaler variant only).
prep_rob = PreprocessingPipeline(config=config, is_log_transform=False, is_robust=True)
# Fit on the training split only, then apply the fitted pipeline to the
# validation and test splits.
x_train_rob = prep_rob.fit_transform(X_train)
x_valid_rob = prep_rob.transform(X_valid)
x_test_rob = prep_rob.transform(X_test)
# Persist the fitted robust-scaler pipeline.
prep_rob.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_robust.pkl


In [69]:
# Record where the robust-scaler pipeline was saved under the config key
# 'path.pipeline.robust'.
# NOTE(review): this literal repeats the path reported by save_pipeline();
# keep the two in sync if the save location ever changes.
PATH = 'models/preprocessing_pipeline_robust.pkl'
config_manager.update_config(key='path.pipeline.robust', value=PATH)

Updated: path.pipeline.robust = models/preprocessing_pipeline_robust.pkl


In [70]:
# Spot-check five transformed training rows; the fixed random_state keeps the
# displayed sample identical across re-runs of the notebook.
x_train_rob.sample(5, random_state=42)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
4773,-0.571429,-0.469136,-0.2,-0.137931,0.833333,0.285714,-0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
17876,0.285714,6.938272,-0.6,-0.806897,0.448052,-1.0,0.2,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
11760,-0.142857,0.498296,1.0,-0.275862,-0.753247,-0.5,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17743,-0.428571,-0.291358,-0.4,0.151724,0.487013,0.428571,-0.2,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
27479,0.428571,2.148148,2.0,2.241379,0.106061,0.142857,0.6,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Pipeline Log Transform + Robust

In [71]:
# Build the preprocessing pipeline with both is_log_transform=True and
# is_robust=True (log transform combined with the robust scaler).
# NOTE(review): in the sample output recorded for x_train_logrob, columns such
# as person_income and loan_amnt are almost constant (about -1.3577 and
# -1.1023). That is what (log(value) - raw median) / raw IQR would produce,
# which suggests the scaler statistics come from un-logged data. Verify the
# order of the log and scaling steps inside PreprocessingPipeline.
prep_logrob = PreprocessingPipeline(config=config, is_log_transform=True, is_robust=True)
# Fit on the training split only, then apply the fitted pipeline to the
# validation and test splits.
x_train_logrob = prep_logrob.fit_transform(X_train)
x_valid_logrob = prep_logrob.transform(X_valid)
x_test_logrob = prep_logrob.transform(X_test)
# Persist the fitted log + robust-scaler pipeline.
prep_logrob.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_log_robust.pkl


In [72]:
# Record where the log + robust-scaler pipeline was saved under the config key
# 'path.pipeline.log_robust'.
# NOTE(review): this literal repeats the path reported by save_pipeline();
# keep the two in sync if the save location ever changes.
PATH = 'models/preprocessing_pipeline_log_robust.pkl'
config_manager.update_config(key='path.pipeline.log_robust', value=PATH)

Updated: path.pipeline.log_robust = models/preprocessing_pipeline_log_robust.pkl


In [73]:
# Spot-check five transformed training rows; the fixed random_state keeps the
# displayed sample identical across re-runs of the notebook.
x_train_logrob.sample(5, random_state=42)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
20410,-3.2284,-1.357749,-0.478112,-1.102288,-1.939226,-0.655222,-0.339483,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26258,-3.2284,-1.35774,-0.580278,-1.102273,-1.766048,-0.722927,-0.360555,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
21332,-3.21052,-1.357762,-0.661371,-1.10226,-1.867169,-0.198445,-0.320421,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23948,-3.233243,-1.35777,-0.384112,-1.102178,-1.852412,0.965564,-0.441648,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25675,-3.233243,-1.357743,-0.478112,-1.102279,-1.976316,-0.722927,-0.360555,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Load & test preprocessing pipeline 

In [74]:
# Draw a small validation sample to exercise the saved pipeline; the fixed
# random_state makes the same five rows come back on every re-run.
test_data = X_valid.sample(5, random_state=42)
# Look up the saved log-transform pipeline path from the config.
PATH = config['path']['pipeline']['log']
print(PATH)
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load pipeline artifacts produced by this project.
prep_log_loaded = joblib.load(PATH)
# Transform the sample with the reloaded pipeline and display the result.
test_data_log = prep_log_loaded.transform(test_data)
test_data_log

models/preprocessing_pipeline_log.pkl


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
20834,3.496508,10.778977,1.791759,8.517393,2.524928,0.09531,2.197225,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
20084,3.401197,10.545368,1.791759,8.987322,2.260721,0.19062,1.94591,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31892,3.688879,12.42922,3.091042,8.2943,2.786861,0.019803,2.639057,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5980,3.135494,10.896758,1.609438,8.6307,2.36368,0.09531,1.098612,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
11105,3.178054,11.097425,2.079442,8.2943,2.810607,0.058269,1.609438,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [75]:
# Show the same five sampled rows before transformation, for side-by-side
# comparison with test_data_log.
test_data

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
20834,RENT,DEBTCONSOLIDATION,B,N,32.0,48000.0,5.0,5000.0,11.49,0.1,8.0
20084,MORTGAGE,EDUCATION,A,N,29.0,38000.0,5.0,8000.0,8.59,0.21,6.0
31892,MORTGAGE,VENTURE,C,N,39.0,250000.0,21.0,4000.0,15.23,0.02,13.0
5980,RENT,VENTURE,B,N,22.0,54000.0,4.0,5600.0,9.63,0.1,2.0
11105,MORTGAGE,DEBTCONSOLIDATION,D,Y,23.0,66000.0,7.0,4000.0,15.62,0.06,4.0
