In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [6]:
def ModelPreProcessing(credit_risk, random_state=None):
    """Clean, encode, scale and SMOTE-balance a credit-risk table for training.

    Processing steps, in order:

    1. Keep only rows with ``person_age <= 70`` and ``person_emp_length < 42``.
       A NaN ``person_emp_length`` fails the comparison, so those rows are
       dropped too (they are not imputed).
    2. Fill missing ``loan_int_rate`` values with the median of the kept rows.
    3. One-hot encode ``person_home_ownership`` and ``loan_intent`` with the
       first level dropped. The dummy columns are named after the raw category
       values (no prefix). ``cb_person_default_on_file`` becomes 1 for ``'Y'``
       and 0 for anything else.
    4. Standardise every remaining column with ``StandardScaler``.
       ``loan_grade`` is discarded and does not appear in the outputs.
    5. Oversample the feature/target pair with ``SMOTE``.

    NOTE(review): the scaler and SMOTE are fitted on all rows passed in. If the
    caller splits into train/test afterwards, the held-out rows will already
    have influenced both — confirm this is intended.

    Parameters
    ----------
    credit_risk : pandas.DataFrame
        Must contain ``person_age``, ``person_emp_length``, ``loan_int_rate``,
        ``person_home_ownership``, ``loan_intent``, ``loan_grade``,
        ``loan_status`` and ``cb_person_default_on_file``. Every other column
        is handed to ``StandardScaler`` and therefore has to be numeric.
        The input frame is not modified.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``SMOTE`` so the synthetic samples can be reproduced.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(cr_data_to_scale, features, target, balanced_features,
        balanced_target)`` where ``cr_data_to_scale`` holds the unscaled
        numeric columns, ``features``/``target`` are the scaled, encoded
        design matrix and ``loan_status``, and the last two items are whatever
        ``SMOTE.fit_resample(features, target)`` returns.
    """
    # Outlier treatment: both limits are applied in one pass. NaN compares as
    # False, so rows with a missing employment length are removed here as well.
    age_ok = credit_risk['person_age'] <= 70
    emp_ok = credit_risk['person_emp_length'] < 42
    cr_data = credit_risk.loc[age_ok & emp_ok].reset_index(drop=True)

    # Missing-value treatment: the median is taken after the row filter above.
    rate_median = cr_data['loan_int_rate'].median()
    cr_data['loan_int_rate'] = cr_data['loan_int_rate'].fillna(rate_median)

    # Categorical treatment / one-hot encoding.
    person_home_ownership = pd.get_dummies(
        cr_data['person_home_ownership'], drop_first=True
    ).astype(int)
    loan_intent = pd.get_dummies(cr_data['loan_intent'], drop_first=True).astype(int)
    cr_data_cat_treated = cr_data.assign(
        cb_person_default_on_file_binary=np.where(
            cr_data['cb_person_default_on_file'] == 'Y', 1, 0
        )
    )

    # Everything that is not categorical, the target, or the binary flag is scaled.
    cr_data_to_scale = cr_data_cat_treated.drop(
        columns=[
            'person_home_ownership',
            'loan_intent',
            'loan_grade',
            'loan_status',
            'cb_person_default_on_file',
            'cb_person_default_on_file_binary',
        ]
    )

    # Scaling. The result gets a fresh RangeIndex, which lines up with the
    # dummy frames because cr_data was re-indexed after filtering.
    scaler = StandardScaler()
    scaled_df = pd.DataFrame(
        scaler.fit_transform(cr_data_to_scale), columns=cr_data_to_scale.columns
    )

    # Feature and target creation.
    features = pd.concat([scaled_df, person_home_ownership, loan_intent], axis=1)
    features['cb_person_default_on_file'] = cr_data_cat_treated[
        'cb_person_default_on_file_binary'
    ]
    target = cr_data_cat_treated['loan_status']

    # SMOTE balancing, seeded through random_state for reproducible output.
    smote = SMOTE(random_state=random_state)
    balanced_features, balanced_target = smote.fit_resample(features, target)

    return cr_data_to_scale, features, target, balanced_features, balanced_target

In [11]:
def ModelDataProcessing(credit_risk):
    """Clean, encode and scale a credit-risk table without balancing it.

    The rows are filtered to ``person_age <= 70`` and
    ``person_emp_length < 42`` (a NaN employment length fails the comparison
    and is dropped), missing ``loan_int_rate`` values are replaced by the
    median of the kept rows, ``person_home_ownership`` and ``loan_intent`` are
    one-hot encoded with the first level dropped, and
    ``cb_person_default_on_file`` is mapped to 1 for ``'Y'`` and 0 otherwise.
    All remaining columns except ``loan_grade`` are standardised with
    ``StandardScaler``.

    No target split and no SMOTE resampling happen here: the second return
    value still carries ``loan_status`` as its last column.

    Parameters
    ----------
    credit_risk : pandas.DataFrame
        Needs ``person_age``, ``person_emp_length``, ``loan_int_rate``,
        ``person_home_ownership``, ``loan_intent``, ``loan_grade``,
        ``loan_status`` and ``cb_person_default_on_file``.

    Returns
    -------
    tuple
        ``(cr_data_to_scale, features)`` — the unscaled numeric columns, and
        the scaled/encoded frame including ``loan_status``.
    """
    # Row filtering: age limit first, then employment length, re-indexing
    # after each step so later column assignments align positionally.
    within_age = credit_risk[credit_risk['person_age'] <= 70].reset_index(drop=True)
    base = within_age[within_age['person_emp_length'] < 42].reset_index(drop=True)

    # Median imputation of the interest rate, computed on the filtered rows.
    rate_median = base['loan_int_rate'].median()
    base['loan_int_rate'] = base['loan_int_rate'].fillna(rate_median)

    # Dummy variables for the two nominal columns, plus a 0/1 default flag.
    home_dummies = pd.get_dummies(base['person_home_ownership'], drop_first=True).astype(int)
    intent_dummies = pd.get_dummies(base['loan_intent'], drop_first=True).astype(int)
    default_flag = np.where(base['cb_person_default_on_file'] == 'Y', 1, 0)
    encoded = base.assign(cb_person_default_on_file_binary=default_flag)

    # Columns that must not go through the scaler.
    excluded = [
        'person_home_ownership',
        'loan_intent',
        'loan_grade',
        'loan_status',
        'cb_person_default_on_file',
        'cb_person_default_on_file_binary',
    ]
    cr_data_to_scale = encoded.drop(excluded, axis=1)
    numeric_names = cr_data_to_scale.columns

    # Standardise, then stitch the encoded pieces back on.
    standardized = StandardScaler().fit_transform(cr_data_to_scale)
    features = pd.concat(
        [pd.DataFrame(standardized, columns=numeric_names), home_dummies, intent_dummies],
        axis=1,
    )
    features['cb_person_default_on_file'] = encoded['cb_person_default_on_file_binary']
    features['loan_status'] = encoded['loan_status']

    return cr_data_to_scale, features

In [15]:
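# Example usage (left commented out). The path below is specific to one machine;
# replace it with the location of your own copy of credit_risk_dataset.csv.
# df = pd.read_csv(r"C:\Users\dwipa\Desktop\pd_model_development\input\credit_risk_dataset.csv")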
# cr_data_to_scale, features, target, balanced_features, balanced_target = ModelPreProcessing(df)

# df1 = pd.read_csv(r"C:\Users\dwipa\Desktop\pd_model_development\input\credit_risk_dataset.csv")
# cr_data_to_scale, features = ModelDataProcessing(df1)
# features.head()