In [3]:
import sys
import os

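# Put the project's src/ directory on sys.path before the local imports below (presumably where `utils` lives); assumes the notebook runs from the project root.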
sys.path.append(os.path.join(os.getcwd(), 'src'))

import utils
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder
import numpy as np


In [62]:
# Load the interim train / test / validation splits (features X, target y).
# The paths are listed in the same order as the names they are unpacked into.
(
    X_train, y_train,
    X_test, y_test,
    X_valid, y_valid,
) = (
    utils.deserialize_data(pickle_path)
    for pickle_path in (
        "./data/interim/X_train.pkl",
        "./data/interim/y_train.pkl",
        "./data/interim/X_test.pkl",
        "./data/interim/y_test.pkl",
        "./data/interim/X_valid.pkl",
        "./data/interim/y_valid.pkl",
    )
)

In [63]:
import pandas as pd
def drop_duplicate_data(X, y):
    """
    Remove duplicated feature rows from X and the matching entries from y.

    A row counts as a duplicate when all of its feature values repeat an
    earlier row of X; the first occurrence is kept. The target values are not
    part of the comparison. The inputs are not modified.

    Parameters:
    X : pd.DataFrame
        DataFrame containing the data (train, test, or validation set) from which duplicates should be removed.
    y : pd.Series
        Series containing the target data corresponding to the rows of X.

    Returns:
    X : pd.DataFrame
        New DataFrame without the duplicated rows.
    y : pd.Series
        New Series restricted to the rows that were kept in X.

    Raises:
    TypeError
        If X is not a DataFrame or y is not a Series.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError("Parameter X harus bertipe DataFrame")

    if not isinstance(y, pd.Series):
        raise TypeError("Parameter y harus bertipe Series")

    print("Fungsi drop_duplicate_data: parameter telah divalidasi.")

    # Scan for duplicates once and reuse the mask for both the report and the filter.
    is_duplicate = X.duplicated().to_numpy()
    keep = ~is_duplicate
    n_duplicates = int(is_duplicate.sum())  # plain int so the tuple prints cleanly

    print(f"Fungsi drop_duplicate_data: shape dari data yang duplicate adalah {(n_duplicates, X.shape[1])}.")

    expected_shape = (X.shape[0] - n_duplicates, X.shape[1])

    print(f"Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah {expected_shape}.")

    X_clean = X.loc[keep].copy()

    if y.index.equals(X.index):
        # Row-aligned inputs: filter by position. A label lookup would return
        # every row sharing a kept label when the index contains repeated
        # labels, leaving y longer than X.
        y_clean = y.loc[keep].copy()
    else:
        # Indexes differ (e.g. different order): fall back to label lookup.
        y_clean = y.loc[X_clean.index].copy()

    print(f"Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah {X_clean.shape}.")

    return X_clean, y_clean

In [64]:
# De-duplicate the training split; in the visible cells no other split is passed through this step.
# NOTE(review): this rebinds X_train / y_train, so re-running the cell operates on already-cleaned data.
X_train, y_train  = drop_duplicate_data(X_train, y_train)

Fungsi drop_duplicate_data: parameter telah divalidasi.
Fungsi drop_duplicate_data: shape dari data yang duplicate adalah (96, 11).
Fungsi drop_duplicate_data: shape dataset setelah drop duplicate seharusnya adalah (25968, 11).
Fungsi drop_duplicate_data: shape dataset setelah dropping duplicate adalah (25968, 11).


In [65]:
def median_imputation(data, subset_data, fit):
    """
    Fit per-column medians, or fill missing values with previously fitted medians.

    Parameters:
    data : pd.DataFrame
        DataFrame containing the data (train, test, or validation set).
    subset_data : list or dict
        When fit is True, a list of column names whose medians are computed.
        When fit is False, a dictionary mapping column names to the values used to fill missing entries.
    fit : bool
        If True, compute and return the medians; data is not modified.
        If False, return an imputed copy of data.

    Returns:
    dict or pd.DataFrame
        If fit is True, a dictionary of column name -> median.
        If fit is False, a new DataFrame with the missing values filled.

    Raises:
    RuntimeError
        If data is not a DataFrame, fit is not a bool, or subset_data has the
        wrong type for the chosen mode.
    """
    if not isinstance(data, pd.DataFrame):
        raise RuntimeError("Fungsi median_imputation: parameter data haruslah bertipe DataFrame!")

    # Check the type of `fit` before branching on it; otherwise a truthy
    # non-bool such as 1 is reported as a subset_data type problem.
    if not isinstance(fit, bool):
        raise RuntimeError("Fungsi median_imputation: parameter fit haruslah bertipe boolean, bernilai True atau False.")

    if fit and not isinstance(subset_data, list):
        raise RuntimeError("Fungsi median_imputation: untuk nilai parameter fit = True, subset_data harus bertipe list dan berisi daftar nama kolom yang ingin dicari nilai mediannya guna menjadi data imputasi pada kolom tersebut.")

    if not fit and not isinstance(subset_data, dict):
        raise RuntimeError("Fungsi median_imputation: untuk nilai parameter fit = False, subset_data harus bertipe dict dan berisi key yang merupakan nama kolom beserta value yang merupakan nilai median dari kolom tersebut.")

    print("Fungsi median_imputation: parameter telah divalidasi.")

    if fit:
        # Fitting only reads from `data`, so no defensive copy is needed.
        imputation_data = {column: data[column].median() for column in subset_data}
        print(f"Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {imputation_data}.")
        return imputation_data

    print("Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:")
    print(data.isna().sum())
    print()

    # fillna without inplace returns a new frame, so the caller's data stays untouched.
    imputed = data.fillna(value=subset_data)

    print("Fungsi median_imputation: informasi count na setelah dilakukan imputasi:")
    print(imputed.isna().sum())
    print()

    return imputed


In [66]:
# Learn the medians from the training split only ...
subset_data = median_imputation(
    data=X_train,
    subset_data=['person_emp_length', 'loan_int_rate'],
    fit=True,
)

# ... then fill every split with those same training medians (train, test, valid in that order).
X_train, X_test, X_valid = (
    median_imputation(data=split, subset_data=subset_data, fit=False)
    for split in (X_train, X_test, X_valid)
)

Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: proses fitting telah selesai, berikut hasilnya {'person_emp_length': 4.0, 'loan_int_rate': 10.99}.
Fungsi median_imputation: parameter telah divalidasi.
Fungsi median_imputation: informasi count na sebelum dilakukan imputasi:
person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              734
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 2491
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

Fungsi median_imputation: informasi count na setelah dilakukan imputasi:
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     

In [67]:
def create_onehot_encoder(categories, path):
    """
    Fit a OneHotEncoder on an explicit list of category values and persist it.

    Parameters:
    categories : list
        Category values the encoder should learn; they are arranged as a
        single feature column before fitting.
    path : str
        Destination handed to utils.serialize_data for the fitted encoder.

    Returns:
    OneHotEncoder
        The fitted encoder.

    Raises:
    RuntimeError
        If categories is not a list or path is not a str.
    """
    if not isinstance(categories, list):
        raise RuntimeError("Fungsi create_onehot_encoder: parameter categories haruslah bertipe list, berisi kategori yang akan dibuat encodernya.")

    if not isinstance(path, str):
        raise RuntimeError("Fungsi create_onehot_encoder: parameter path haruslah bertipe string, berisi lokasi pada disk komputer dimana encoder akan disimpan.")

    # One row per category, one column: the encoder sees a single feature.
    single_feature = np.reshape(categories, (-1, 1))
    encoder = OneHotEncoder().fit(single_feature)

    utils.serialize_data(encoder, path)

    learned = encoder.categories_[0].tolist()
    print(f"Kategori yang telah dipelajari adalah {learned}")

    return encoder


In [72]:
# Known category values for each categorical feature.
person_home_ownership = ['MORTGAGE', 'RENT', 'OWN', 'OTHER']
loan_intent = ['EDUCATION', 'PERSONAL', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT', 'DEBTCONSOLIDATION']
loan_grade = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
cb_person_default_on_file = ['N', 'Y']

# Fit and store one encoder per feature; the pairs are processed in the order
# of the names on the left-hand side.
(
    ohe_home_ownership,
    ohe_loan_intent,
    ohe_loan_grade,
    ohe_default_on_file,
) = (
    create_onehot_encoder(category_values, encoder_path)
    for category_values, encoder_path in (
        (person_home_ownership, 'models/ohe_home_ownership.pkl'),
        (loan_intent, 'models/ohe_loan_intent.pkl'),
        (loan_grade, 'models/ohe_loan_grade.pkl'),
        (cb_person_default_on_file, 'models/ohe_default_on_file.pkl'),
    )
)

Kategori yang telah dipelajari adalah ['MORTGAGE', 'OTHER', 'OWN', 'RENT']
Kategori yang telah dipelajari adalah ['DEBTCONSOLIDATION', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL', 'VENTURE']
Kategori yang telah dipelajari adalah ['A', 'B', 'C', 'D', 'E', 'F', 'G']
Kategori yang telah dipelajari adalah ['N', 'Y']


In [73]:
def ohe_transform(dataset, subset, prefix, ohe):
    """
    Replace one column of a dataset with its one-hot encoding from a pre-fitted encoder.

    Parameters:
    dataset : pd.DataFrame
        The dataset to be transformed. It is not modified.
    subset : str
        The column name in the dataset to be transformed.
    prefix : str
        The prefix used for the new encoded columns ("<prefix>_<category>").
    ohe : OneHotEncoder
        The pre-fitted OneHotEncoder instance; its first feature's categories
        define the new columns.

    Returns:
    pd.DataFrame
        A new DataFrame with the `subset` column removed and the encoded
        columns appended at the end, keeping the original index.

    Raises:
    RuntimeError
        If an argument has the wrong type or `subset` is not a column of dataset.
    """
    if not isinstance(dataset, pd.DataFrame):
        raise RuntimeError("Fungsi ohe_transform: parameter dataset harus bertipe DataFrame!")

    if not isinstance(ohe, OneHotEncoder):
        raise RuntimeError("Fungsi ohe_transform: parameter ohe harus bertipe OneHotEncoder!")

    if not isinstance(prefix, str):
        raise RuntimeError("Fungsi ohe_transform: parameter prefix harus bertipe str!")

    if not isinstance(subset, str):
        raise RuntimeError("Fungsi ohe_transform: parameter subset harus bertipe str!")

    if subset not in dataset.columns:
        raise RuntimeError("Fungsi ohe_transform: parameter subset string namun data tidak ditemukan dalam daftar kolom yang terdapat pada parameter dataset.")

    print("Fungsi ohe_transform: parameter telah divalidasi.")

    print(f"Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah {list(dataset.columns)}")

    col_names = [f"{prefix}_{col}" for col in ohe.categories_[0].tolist()]

    # Match the input container to how the encoder was fitted: an encoder
    # fitted on a bare array has no feature names, and handing it a DataFrame
    # makes scikit-learn warn about the mismatch.
    column_frame = dataset[[subset]]
    if hasattr(ohe, "feature_names_in_"):
        encoder_input = column_frame
    else:
        encoder_input = column_frame.to_numpy()

    transformed_data = ohe.transform(encoder_input)
    # Sparse results need densifying; encoders configured for dense output
    # already return an ndarray.
    if hasattr(transformed_data, "toarray"):
        transformed_data = transformed_data.toarray()

    encoded = pd.DataFrame(transformed_data, columns=col_names, index=dataset.index)
    # drop() returns a new frame, so the caller's dataset is left untouched.
    dataset = pd.concat([dataset.drop(columns=[subset]), encoded], axis=1)

    print(f'Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah {list(dataset.columns)}')

    return dataset

In [75]:
# One-hot encode each categorical column of the training split in turn:
# (source column, prefix for the new columns, fitted encoder).
for column_name, new_prefix, fitted_ohe in (
    ("person_home_ownership", "home_ownership", ohe_home_ownership),
    ("loan_intent", "loan_intent", ohe_loan_intent),
    ("loan_grade", "loan_grade", ohe_loan_grade),
    ("cb_person_default_on_file", "default_onfile", ohe_default_on_file),
):
    X_train = ohe_transform(dataset=X_train, subset=column_name, prefix=new_prefix, ohe=fitted_ohe)

print(X_train.columns)

Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT']
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',



In [76]:
# Same encoding steps as for the training split:
# (source column, prefix for the new columns, fitted encoder).
encoding_plan = (
    ("person_home_ownership", "home_ownership", ohe_home_ownership),
    ("loan_intent", "loan_intent", ohe_loan_intent),
    ("loan_grade", "loan_grade", ohe_loan_grade),
    ("cb_person_default_on_file", "default_onfile", ohe_default_on_file),
)

# Test split first, all four columns ...
for column_name, new_prefix, fitted_ohe in encoding_plan:
    X_test = ohe_transform(dataset=X_test, subset=column_name, prefix=new_prefix, ohe=fitted_ohe)

# ... then the validation split.
for column_name, new_prefix, fitted_ohe in encoding_plan:
    X_valid = ohe_transform(dataset=X_valid, subset=column_name, prefix=new_prefix, ohe=fitted_ohe)



Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
Fungsi ohe_transform: daftar nama kolom setelah dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length', 'home_ownership_MORTGAGE', 'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT']
Fungsi ohe_transform: parameter telah divalidasi.
Fungsi ohe_transform: daftar nama kolom sebelum dilakukan pengkodean adalah ['person_age', 'person_income', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_default_on_file',



In [77]:
# Persist the preprocessed feature frames, one file per split.
for processed_frame, output_path in (
    (X_train, 'data/processed/X_train_prep.pkl'),
    (X_test, 'data/processed/X_test_prep.pkl'),
    (X_valid, 'data/processed/X_valid_prep.pkl'),
):
    utils.serialize_data(processed_frame, output_path)

In [78]:
# Persist the training target next to the processed features (y_train was filtered by the de-duplication step above).
utils.serialize_data(y_train, 'data/processed/y_train_prep.pkl')