In [1]:
import os
os.chdir("../")

In [26]:
from LoanApproval import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LoanApproval.utils import read_yam_file, create_directories

In [27]:
from pathlib import Path

In [28]:
class DataPreprocessConfigurationManager:
    """Resolve the paths used by the data-preprocessing stage.

    The configuration is read once in the constructor through the project's
    ``read_yam_file`` helper; ``data_preprocess_config`` then turns the
    ``data_preprocessing`` section into a plain dict of ``Path`` objects.
    """

    def __init__(self, config_file_path=CONFIG_FILE_PATH):
        """Load the configuration and prepare the artifacts root.

        Args:
            config_file_path: location of the configuration file handed to
                ``read_yam_file``. Defaults to the project-wide
                ``CONFIG_FILE_PATH``.
        """
        self.config = read_yam_file(config_file_path)

        # The preprocessing input is the file produced by the ingestion stage.
        ingestion = self.config['data_ingestion']
        self.dataset_load = os.path.join(ingestion['root_dir'], ingestion['local_files'])

        create_directories([self.config['artifacts_root']])

    def data_preprocess_config(self):
        """Build the settings consumed by the preprocessing step.

        Passes the stage's ``root_dir`` to ``create_directories`` as a side
        effect.

        Returns:
            dict: ``root_dir``, ``dataset_load`` and ``local_file``, each
            wrapped in ``pathlib.Path``.
        """
        section = self.config['data_preprocessing']
        create_directories([section['root_dir']])

        return {
            "root_dir": Path(section['root_dir']),
            "dataset_load": Path(self.dataset_load),
            "local_file": Path(section['local_file']),
        }
   

In [50]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [51]:
class DataPreprocessing:
    """Impute, encode, balance, scale and persist the loan dataset.

    The steps communicate through instance attributes, so they must be called
    in this order: ``handle_missing_value`` -> ``cat_to_numeric`` ->
    ``handle_imbalanced_data`` -> ``scaling_data`` -> ``save_preprocess_data``.

    Attributes created along the way:
        df: frame after loading, imputation and (later) encoding.
        df_over_sample: frame after SMOTE oversampling.
        df_scaled: standardized features plus the unscaled target column.
    """

    ID_COLUMN = 'Loan_ID'
    TARGET_COLUMN = 'Loan_Status'
    # Mapping of the raw target labels to their numeric encoding.
    TARGET_LABELS = {'Y': 1, 'N': 0}

    def __init__(self, config):
        """Store the stage configuration.

        Args:
            config: mapping providing ``dataset_load`` (input CSV path),
                ``root_dir`` (output directory) and ``local_file`` (output
                file name).
        """
        self.config = config

    def handle_missing_value(self):
        """Load the CSV, drop the identifier column and impute missing values.

        Non-numeric columns are filled with their most frequent value and
        numeric columns with their median. The result is stored in ``self.df``.

        Raises:
            KeyError: if the identifier column is absent from the file.
        """
        frame = pd.read_csv(self.config['dataset_load'])

        # first drop Loan_ID column
        frame = frame.drop(columns=self.ID_COLUMN)

        # handling categorical missing value
        categorical_cols = frame.select_dtypes(exclude='number').columns
        if len(categorical_cols) > 0:
            modes = frame[categorical_cols].mode()
            # mode() has no rows for an empty frame; nothing to fill in that case.
            if not modes.empty:
                frame[categorical_cols] = frame[categorical_cols].fillna(modes.iloc[0])

        # handling numerical missing value
        # numeric_only=True is required: the frame still contains string
        # columns, and recent pandas raises instead of silently skipping them.
        frame = frame.fillna(frame.median(numeric_only=True))

        self.df = frame

    def cat_to_numeric(self):
        """Encode the target as 1/0 and one-hot encode the remaining categoricals.

        Raises:
            ValueError: if the target holds a label that is neither in
                ``TARGET_LABELS`` nor convertible to a number.
        """
        # first convert our dependent variable to numeric
        labels = self.TARGET_LABELS
        encoded = self.df[self.TARGET_COLUMN].map(lambda value: labels.get(value, value))
        # Force a numeric dtype so get_dummies below leaves the target alone.
        self.df[self.TARGET_COLUMN] = pd.to_numeric(encoded)

        # now convert our independent features to numeric with OneHotEncoding
        self.df = pd.get_dummies(self.df)

    def handle_imbalanced_data(self, random_state=42):
        """Balance the classes with SMOTE oversampling.

        Args:
            random_state: seed forwarded to ``SMOTE`` so repeated runs produce
                the same synthetic rows. Pass ``None`` for an unseeded run.
        """
        # using oversampling SMOTE
        features = self.df.drop(columns=self.TARGET_COLUMN)
        target = self.df[self.TARGET_COLUMN]

        sampler = SMOTE(random_state=random_state)
        features_resampled, target_resampled = sampler.fit_resample(features, target)

        # Keep the target as the last column of the resampled frame.
        features_resampled[self.TARGET_COLUMN] = target_resampled
        self.df_over_sample = features_resampled

    def scaling_data(self):
        """Standardize every feature column; the target is copied unscaled."""
        features = self.df_over_sample.drop(columns=self.TARGET_COLUMN)

        scaler = StandardScaler()
        # Reuse the feature frame's own labels and index so the target column
        # re-attaches row-for-row regardless of column order or index values.
        self.df_scaled = pd.DataFrame(
            scaler.fit_transform(features),
            columns=features.columns,
            index=features.index,
        )
        self.df_scaled[self.TARGET_COLUMN] = self.df_over_sample[self.TARGET_COLUMN]

    def save_preprocess_data(self):
        """Write ``self.df_scaled`` as CSV (without the index) to the configured path."""
        output_path = os.path.join(self.config['root_dir'], self.config['local_file'])

        # Make sure the target directory exists before writing.
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        self.df_scaled.to_csv(output_path, index=False)

In [52]:
# Resolve the stage paths from the project configuration file.
config = DataPreprocessConfigurationManager()
data_preprocess_config = config.data_preprocess_config()
# Run the pipeline. Each step reads attributes set by the previous one,
# so the calls below must stay in this order.
data_preprocess = DataPreprocessing(config=data_preprocess_config)
data_preprocess.handle_missing_value()
data_preprocess.cat_to_numeric()
data_preprocess.handle_imbalanced_data()
data_preprocess.scaling_data()
# Persist the scaled frame as CSV under the preprocessing root directory.
data_preprocess.save_preprocess_data()

[2023-03-21 12:24:30,774: INFO: utils]: yaml file configs\config.yaml load  successfully
[2023-03-21 12:24:30,775: INFO: utils]: created directory at artifacts
[2023-03-21 12:24:30,777: INFO: utils]: created directory at artifacts/data_preprocessing


  self.df = self.df.fillna(self.df.median())
