## Split Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder,OrdinalEncoder,PolynomialFeatures
from sklearn.compose import ColumnTransformer

In [408]:
df = pd.read_csv('/kaggle/working/df_cleaned.csv')

In [409]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,income_category,age_category,income_to_loan_ratio
0,37.0,35000.0,RENT,0.0,EDUCATION,B,6000.0,11.49,0.17,N,14.0,0,Low,30-40,5.833333
1,22.0,56000.0,OWN,6.0,MEDICAL,C,4000.0,13.35,0.07,N,2.0,0,Medium,20-30,14.0
2,29.0,28800.0,OWN,8.0,PERSONAL,A,6000.0,8.9,0.21,N,10.0,0,Low,20-30,4.8
3,30.0,70000.0,RENT,14.0,VENTURE,B,12000.0,11.11,0.17,N,5.0,0,Medium,20-30,5.833333
4,22.0,60000.0,RENT,2.0,MEDICAL,A,6000.0,6.92,0.1,N,3.0,0,Medium,20-30,10.0


In [391]:
x = df.drop(columns = 'loan_status' , axis = 1)
y = df['loan_status']

In [392]:
x_train , x_test , y_train , y_test  = train_test_split(x,y,test_size = 0.2)
x_train.shape ,y_train.shape , x_test.shape ,y_test.shape

((46905, 14), (46905,), (11727, 14), (11727,))

In [393]:
x_train.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,income_category,age_category,income_to_loan_ratio
40676,22.0,48000.0,RENT,0.0,MEDICAL,A,12000.0,8.94,0.25,N,2.0,Low,20-30,4.0
30248,24.0,65004.0,MORTGAGE,0.0,EDUCATION,C,8000.0,12.87,0.12,Y,3.0,Medium,20-30,8.1255
31899,27.0,59000.0,MORTGAGE,11.0,EDUCATION,A,3500.0,7.68,0.06,N,10.0,Medium,20-30,16.857143
15786,25.0,100000.0,MORTGAGE,9.0,HOMEIMPROVEMENT,B,15000.0,10.59,0.15,N,4.0,Medium,20-30,6.666667
20322,21.0,45200.0,RENT,1.0,DEBTCONSOLIDATION,A,5000.0,8.59,0.11,N,3.0,Low,20-30,9.04


In [394]:
num_cols = ['person_age','person_income','person_emp_length','loan_amnt','loan_percent_income','cb_person_cred_hist_length','income_to_loan_ratio']
nominal_cols = ['person_home_ownership','loan_intent','cb_person_default_on_file','age_category']
ordinal_cols = ['loan_grade','income_category']

In [395]:
num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy = 'mean')),
    #('poly', PolynomialFeatures(degree = 3)),
    ('poly', PolynomialFeatures()),
    ('scaler', StandardScaler())
])

ordinal_pipeline = Pipeline([
    ('ord_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ord_encoder', OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = -1)),
])

nominal_pipeline = Pipeline([
    ('nom_imputer', SimpleImputer(strategy = 'most_frequent')),
    ('ohe', OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore'))
])

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, num_cols),
    ('ordinal_pipeline', ordinal_pipeline, ordinal_cols),
    ('nominal_pipeline', nominal_pipeline, nominal_cols)
]).set_output(transform = 'pandas')

In [396]:
x_train_prep = preprocessor.fit_transform(x_train)
x_test_prep = preprocessor.transform(x_test)

In [397]:
x_train_prep.head()

Unnamed: 0,num_pipeline__1,num_pipeline__person_age,num_pipeline__person_income,num_pipeline__person_emp_length,num_pipeline__loan_amnt,num_pipeline__loan_percent_income,num_pipeline__cb_person_cred_hist_length,num_pipeline__income_to_loan_ratio,num_pipeline__person_age^2,num_pipeline__person_age person_income,...,nominal_pipeline__loan_intent_MEDICAL,nominal_pipeline__loan_intent_PERSONAL,nominal_pipeline__loan_intent_VENTURE,nominal_pipeline__cb_person_default_on_file_Y,nominal_pipeline__age_category_30-40,nominal_pipeline__age_category_40-50,nominal_pipeline__age_category_50-60,nominal_pipeline__age_category_60-70,nominal_pipeline__age_category_70-80,nominal_pipeline__age_category_80-90
40676,0.0,-1.073306,-0.496392,-1.30792,0.743012,1.192453,-1.049596,-0.720993,-0.974017,-0.769339,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30248,0.0,-0.618707,0.240255,-1.30792,-0.135332,-0.40181,-0.738182,-0.158513,-0.614745,-0.05554,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
31899,0.0,0.063192,-0.01985,1.949208,-1.123468,-1.137624,1.441722,1.031978,-0.017261,-0.008948,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15786,0.0,-0.391407,1.756351,1.357003,1.401769,-0.033903,-0.426767,-0.357414,-0.423394,1.275362,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20322,0.0,-1.300605,-0.617694,-1.011817,-0.794089,-0.524446,-0.738182,-0.033828,-1.141938,-0.920567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [398]:
x_train_prep.to_csv('x_train_prep.csv',index=False)
x_test_prep.to_csv('x_test_prep.csv',index=False)
y_train.to_csv('y_train.csv',index=False)
y_test.to_csv('y_test.csv',index=False)