In [20]:
from src.utils import (
    PreprocessingPipeline, 
    ConfigManager, 
    SerializationManager
)
import pandas as pd
import numpy as np 
import joblib

# 1. Load Config

In [2]:
# Path Config
CONFIG_PATH = 'config/config.yaml'
config_manager = ConfigManager(config_path=CONFIG_PATH)
config = config_manager.load_config()

# 2. Deserialized Data

In [3]:
# Load serilization manager
serilization_manager = SerializationManager(config=config)
# Deserialize processed data train
X_train = serilization_manager.deserialize_data('X_train')
y_train = serilization_manager.deserialize_data('y_train')
# Deserialize processed data validation
X_valid = serilization_manager.deserialize_data('X_valid')
y_valid = serilization_manager.deserialize_data('y_valid')
# Deserialize processed data test
X_test = serilization_manager.deserialize_data('X_test')
y_test = serilization_manager.deserialize_data('y_test')

Deserialized data loaded from data/processed/X_train.pkl
Deserialized data loaded from data/processed/y_train.pkl
Deserialized data loaded from data/processed/X_valid.pkl
Deserialized data loaded from data/processed/y_valid.pkl
Deserialized data loaded from data/processed/X_test.pkl
Deserialized data loaded from data/processed/y_test.pkl


# 3. Data Preprocessing

In [4]:
# Drop duplicates
X_train = X_train.drop_duplicates()
print(f"Shape after drop duplicates: {X_train.shape}")
print(f"Number of duplicates: {X_train.duplicated().sum()}")

Shape after drop duplicates: (25946, 11)
Number of duplicates: 0


In [5]:
# Delete invalid data
cond1 = X_train['person_age'] <= 100
cond2 = X_train['person_emp_length'] <= 50
# --- IGNORE ---
X_train = X_train[cond1 & cond2]
print(f"Shape after drop invalid data: {X_train.shape}")

Shape after drop invalid data: (25230, 11)


### Handling Missing Value

In [6]:
def num_imputer(data: pd.DataFrame, cols: list) -> object:
    from sklearn.impute import SimpleImputer
    num_imputer = SimpleImputer(strategy='median', missing_values=np.nan)
    num_imputer.fit(data[cols])
    return num_imputer

def num_imputer_transform(data: pd.DataFrame, cols: list, imputer: object) -> pd.DataFrame:
    imputed_data = imputer.transform(data[cols])
    result_df = pd.DataFrame(
        data=imputed_data, index=data.index, columns=cols
    )
    # drop data
    data.drop(cols, axis=1, inplace=True)
    # hasil
    data = pd.concat([data, result_df], axis=1)
    return data

In [7]:
# Set Col name
col = X_train.describe().columns

imputer = num_imputer(data=X_train, cols=col)
# do imputer with median
X_train = num_imputer_transform(data=X_train, imputer=imputer, cols=col)
X_valid = num_imputer_transform(data=X_valid, imputer=imputer, cols=col)
X_test = num_imputer_transform(data=X_test, imputer=imputer, cols=col)

### 1. Pipeline Log Transform

In [8]:
# Define Preprocessing Pipeline for Log Transform
prep_log = PreprocessingPipeline(config=config, is_log_transform=True, is_robust=False)
# Fitting
x_train_log = prep_log.fit_transform(X_train)
x_valid_log = prep_log.transform(X_valid)
x_test_log = prep_log.transform(X_test)
# Simpan Pipeline Log Transform  
prep_log.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_log.pkl


In [16]:
# Update Config
PATH = 'models/preprocessing_pipeline_log.pkl'
config_manager.update_config(key='path.pipeline.log', value=PATH)

Updated: path.pipeline.log = models/preprocessing_pipeline_log.pkl


In [10]:
x_train_log.sample(5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
23959,3.583519,11.034906,2.302585,9.472782,2.327278,0.19062,2.397895,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4597,3.258097,10.637417,1.94591,8.2943,1.944481,0.09531,1.098612,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22662,3.367296,10.950824,2.639057,9.169623,2.830858,0.157004,2.302585,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
11977,3.178054,10.71444,1.098612,9.323758,2.450143,0.223144,1.098612,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6110,3.135494,9.798183,1.098612,8.699681,2.030776,0.285179,1.098612,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2. Pipeline RobustScaler

In [9]:
# Define Preprocessing Pipeline for Robust Scaler
prep_rob = PreprocessingPipeline(config=config, is_log_transform=False, is_robust=True)
# Fitting
x_train_rob = prep_rob.fit_transform(X_train)
x_valid_rob = prep_rob.transform(X_valid)
x_test_rob = prep_rob.transform(X_test)
# Simpan Pipeline Log Robust Scaler  
prep_rob.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_robust.pkl


In [15]:
# Update Config
PATH = 'models/preprocessing_pipeline_robust.pkl'
config_manager.update_config(key='path.pipeline.robust', value=PATH)

Updated: path.pipeline.robust = models/preprocessing_pipeline_robust.pkl


In [11]:
x_train_rob.sample(5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
20271,0.285714,0.136139,-0.6,-0.533333,-0.008565,-0.571429,0.8,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6190,-0.428571,-0.747525,-0.4,-0.266667,0.0,0.571429,-0.2,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
5486,0.0,0.09901,-0.8,-0.4,-0.824411,-0.5,-0.4,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27141,0.285714,-0.817624,0.0,-0.866667,-1.062099,-0.571429,0.6,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32140,1.714286,0.544554,0.8,-0.233333,-0.905782,-0.5,2.4,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Pipeline Log Transform + Robust

In [12]:
# Define Preprocessing Pipeline for Robust Scaler
prep_logrob = PreprocessingPipeline(config=config, is_log_transform=True, is_robust=True)
# Fitting
x_train_logrob = prep_logrob.fit_transform(X_train)
x_valid_logrob = prep_logrob.transform(X_valid)
x_test_logrob = prep_logrob.transform(X_test)
# Simpan Pipeline Log Robust Scaler  
prep_logrob.save_pipeline()

Pipeline saved to models/preprocessing_pipeline_log_robust.pkl


In [17]:
# Update Config
PATH = 'models/preprocessing_pipeline_log_robust.pkl'
config_manager.update_config(key='path.pipeline.log_robust', value=PATH)

Updated: path.pipeline.log_robust = models/preprocessing_pipeline_log_robust.pkl


In [13]:
x_train_logrob.sample(5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
17510,-3.254446,-1.385862,-0.580278,-1.065368,-1.894809,0.465081,-0.478112,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5889,-3.254446,-1.385872,-0.441648,-1.065439,-1.776456,0.290145,-0.478112,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4906,-3.254446,-1.385876,-0.360555,-1.065531,-1.770319,-0.198445,-0.478112,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3018,-3.266358,-1.385871,-0.661371,-1.065591,-1.871469,-0.655222,-0.478112,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14639,-3.254446,-1.385874,-0.8,-1.065653,-1.882821,-0.722927,-0.478112,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Load & test preprocessing pipeline 

In [24]:
test_data = X_valid.sample(5)
# Load pipeline log transform
PATH = config['path']['pipeline']['log']
print(PATH)
prep_log_loaded = joblib.load(PATH)
# Test transformasi
test_data_log = prep_log_loaded.transform(test_data)
test_data_log

models/preprocessing_pipeline_log.pkl


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_2,loan_grade_3,loan_grade_4,loan_grade_5,loan_grade_6,loan_grade_7,cb_person_default_on_file_1
19074,3.332205,10.085851,0.693147,7.673688,2.754934,0.086178,2.079442,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22085,3.496508,10.799596,2.833213,8.748464,2.38968,0.122218,2.197225,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23919,3.401197,11.115771,1.609438,9.21044,2.496506,0.139762,2.302585,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32092,3.713572,10.736331,1.609438,9.903538,2.747912,0.357674,2.639057,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
18878,3.526361,10.778977,1.098612,7.313887,2.556452,0.029559,1.94591,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
test_data

Unnamed: 0,person_home_ownership,loan_intent,loan_grade,cb_person_default_on_file,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length
19074,RENT,MEDICAL,C,N,27.0,24000.0,1.0,2150.0,14.72,0.09,7.0
22085,RENT,MEDICAL,B,N,32.0,49000.0,16.0,6300.0,9.91,0.13,8.0
23919,MORTGAGE,DEBTCONSOLIDATION,B,N,29.0,67222.0,4.0,10000.0,11.14,0.15,9.0
32092,RENT,DEBTCONSOLIDATION,D,Y,40.0,45996.0,4.0,20000.0,14.61,0.43,13.0
18878,RENT,VENTURE,B,N,33.0,48000.0,2.0,1500.0,11.89,0.03,6.0
