Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as ans

In [2]:
# load your dataset
df = pd.read_csv("/content/loan_dataset_20000.csv")

#Display first 7 rows
df.head(7)

Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1
5,28,Male,Single,High School,55559.8,4629.98,Employed,0.081,774,12724.02,...,60,287.75,B1,3,76147.61,14885.25,0,0,1,1
6,41,Male,Single,Bachelor's,24642.88,2053.57,Unemployed,0.165,742,5905.27,...,36,197.5,B3,6,18380.15,12683.33,3,0,3,0


In [3]:
# Shape of dataset
print("Shape of dataset:", df.shape)

Shape of dataset: (20000, 22)


In [4]:
# Column names
print("\nColumns:\n", df.columns)


Columns:
 Index(['age', 'gender', 'marital_status', 'education_level', 'annual_income',
       'monthly_income', 'employment_status', 'debt_to_income_ratio',
       'credit_score', 'loan_amount', 'loan_purpose', 'interest_rate',
       'loan_term', 'installment', 'grade_subgrade', 'num_of_open_accounts',
       'total_credit_limit', 'current_balance', 'delinquency_history',
       'public_records', 'num_of_delinquencies', 'loan_paid_back'],
      dtype='object')


In [5]:
# Data types
print("\nData Types:\n")
print(df.dtypes)


Data Types:

age                       int64
gender                   object
marital_status           object
education_level          object
annual_income           float64
monthly_income          float64
employment_status        object
debt_to_income_ratio    float64
credit_score              int64
loan_amount             float64
loan_purpose             object
interest_rate           float64
loan_term                 int64
installment             float64
grade_subgrade           object
num_of_open_accounts      int64
total_credit_limit      float64
current_balance         float64
delinquency_history       int64
public_records            int64
num_of_delinquencies      int64
loan_paid_back            int64
dtype: object


In [6]:
# General info
print("\nDataset Info:\n")
df.info()


Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   20000 non-null  int64  
 1   gender                20000 non-null  object 
 2   marital_status        20000 non-null  object 
 3   education_level       20000 non-null  object 
 4   annual_income         20000 non-null  float64
 5   monthly_income        20000 non-null  float64
 6   employment_status     20000 non-null  object 
 7   debt_to_income_ratio  20000 non-null  float64
 8   credit_score          20000 non-null  int64  
 9   loan_amount           20000 non-null  float64
 10  loan_purpose          20000 non-null  object 
 11  interest_rate         20000 non-null  float64
 12  loan_term             20000 non-null  int64  
 13  installment           20000 non-null  float64
 14  grade_subgrade        20000 non-null  object 
 15  num

In [7]:
# Check missing values
print("Missing Values:\n")
print(df.isnull().sum())

Missing Values:

age                     0
gender                  0
marital_status          0
education_level         0
annual_income           0
monthly_income          0
employment_status       0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
loan_purpose            0
interest_rate           0
loan_term               0
installment             0
grade_subgrade          0
num_of_open_accounts    0
total_credit_limit      0
current_balance         0
delinquency_history     0
public_records          0
num_of_delinquencies    0
loan_paid_back          0
dtype: int64


In [10]:
# fill categorical missing values with mode
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


In [12]:
# fill numerical missing values with median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_cols:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [13]:
# Verify again
print("Missing values after treatment:\n")
print(df.isnull().sum())

Missing values after treatment:

age                     0
gender                  0
marital_status          0
education_level         0
annual_income           0
monthly_income          0
employment_status       0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
loan_purpose            0
interest_rate           0
loan_term               0
installment             0
grade_subgrade          0
num_of_open_accounts    0
total_credit_limit      0
current_balance         0
delinquency_history     0
public_records          0
num_of_delinquencies    0
loan_paid_back          0
dtype: int64


In [14]:
# Incorrect Data Types

if 'Credit_History' in df.columns:
    df['Credit_History'] = df['Credit_History'].astype(int)

df.dtypes

Unnamed: 0,0
age,int64
gender,object
marital_status,object
education_level,object
annual_income,float64
monthly_income,float64
employment_status,object
debt_to_income_ratio,float64
credit_score,int64
loan_amount,float64


In [15]:
# Remove Duplicate Records
# Check duplicates
print("Duplicate rows:", df.duplicated().sum())

Duplicate rows: 0


In [16]:
# Remove duplicates
df.drop_duplicates(inplace=True)

print("After removing duplicates:", df.duplicated().sum())

After removing duplicates: 0


In [17]:
# Detect Outliers
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print("Outliers removed perfectly")

Outliers removed perfectly


In [18]:
# Irrelevant or Redundant Features
if 'Loan_ID' in df.columns:
    df.drop('Loan_ID', axis=1, inplace=True)

df.head()

Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,1
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,1
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,1
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,1
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,1


In [19]:
# final Cleaned Dataset Shape
print("Final dataset shape after preprocessing:", df.shape)


Final dataset shape after preprocessing: (12683, 22)


categorical_encoding

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [21]:
#Label Encoding
def label_encoding(df, column):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    return df

In [22]:
#One-Hot Encoding
def one_hot_encoding(df, column):
    df = pd.get_dummies(df, columns=[column], drop_first=True)
    return df

In [23]:
# Ordinal Encoding
def ordinal_encoding(df, column, categories_order):
    oe = OrdinalEncoder(categories=[categories_order])
    df[column] = oe.fit_transform(df[[column]])
    return df

In [24]:
# Frequency Encoding
def frequency_encoding(df, column):
    freq = df[column].value_counts() / len(df)
    df[column] = df[column].map(freq)
    return df


In [25]:
# Target Encoding
def target_encoding(df, column, target):
    target_mean = df.groupby(column)[target].mean()
    df[column] = df[column].map(target_mean)
    return df

In [30]:
# label encoding
df = label_encoding(df, 'loan_paid_back')
df.head()

Unnamed: 0,age,gender,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,...,loan_term,installment,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,Male,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,...,36,581.88,B5,7,40833.47,24302.07,1,0,1,0
1,72,Female,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,...,60,573.17,F1,5,27968.01,10803.01,1,0,3,0
2,49,Female,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,...,60,76.32,B4,2,15502.25,4505.44,0,0,0,0
3,35,Female,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,...,36,468.07,A5,7,18157.79,5525.63,4,0,5,0
4,63,Other,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,...,60,395.5,D5,1,17467.56,3593.91,2,0,2,0


In [31]:
# One-Hot Encoding (gender)
df = one_hot_encoding(df, 'gender')
df.head()

Unnamed: 0,age,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,loan_purpose,...,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back,gender_Male,gender_Other
0,59,Married,Master's,24240.19,2020.02,Employed,0.074,743,17173.72,Car,...,B5,7,40833.47,24302.07,1,0,1,0,True,False
1,72,Married,Bachelor's,20172.98,1681.08,Employed,0.219,531,22663.89,Debt consolidation,...,F1,5,27968.01,10803.01,1,0,3,0,False,False
2,49,Single,High School,26181.8,2181.82,Employed,0.234,779,3631.36,Business,...,B4,2,15502.25,4505.44,0,0,0,0,False,False
3,35,Single,High School,11873.84,989.49,Employed,0.264,809,14939.23,Other,...,A5,7,18157.79,5525.63,4,0,5,0,False,False
4,63,Single,Other,25326.44,2110.54,Employed,0.26,663,16551.71,Car,...,D5,1,17467.56,3593.91,2,0,2,0,False,True


In [36]:
# Ordinal Encoding (education_level)
df['education_level'] = df['education_level'].str.strip()
df['education_level'] = df['education_level'].str.title()

df['education_level'].unique()

array(["Master'S", "Bachelor'S", 'High School', 'Other', 'Phd'],
      dtype=object)

In [37]:
categories = sorted(df['education_level'].unique())

df = ordinal_encoding(df, 'education_level', categories)

df.head()

Unnamed: 0,age,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,loan_purpose,...,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back,gender_Male,gender_Other
0,59,Married,2.0,24240.19,2020.02,Employed,0.074,743,17173.72,Car,...,B5,7,40833.47,24302.07,1,0,1,0,True,False
1,72,Married,0.0,20172.98,1681.08,Employed,0.219,531,22663.89,Debt consolidation,...,F1,5,27968.01,10803.01,1,0,3,0,False,False
2,49,Single,1.0,26181.8,2181.82,Employed,0.234,779,3631.36,Business,...,B4,2,15502.25,4505.44,0,0,0,0,False,False
3,35,Single,1.0,11873.84,989.49,Employed,0.264,809,14939.23,Other,...,A5,7,18157.79,5525.63,4,0,5,0,False,False
4,63,Single,3.0,25326.44,2110.54,Employed,0.26,663,16551.71,Car,...,D5,1,17467.56,3593.91,2,0,2,0,False,True


In [38]:
# Frequency Encoding (loan_purpose)
df = frequency_encoding(df, 'loan_purpose')
df.head()

Unnamed: 0,age,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,loan_purpose,...,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back,gender_Male,gender_Other
0,59,Married,2.0,24240.19,2020.02,Employed,0.074,743,17173.72,0.119215,...,B5,7,40833.47,24302.07,1,0,1,0,True,False
1,72,Married,0.0,20172.98,1681.08,Employed,0.219,531,22663.89,0.401798,...,F1,5,27968.01,10803.01,1,0,3,0,False,False
2,49,Single,1.0,26181.8,2181.82,Employed,0.234,779,3631.36,0.081369,...,B4,2,15502.25,4505.44,0,0,0,0,False,False
3,35,Single,1.0,11873.84,989.49,Employed,0.264,809,14939.23,0.127336,...,A5,7,18157.79,5525.63,4,0,5,0,False,False
4,63,Single,3.0,25326.44,2110.54,Employed,0.26,663,16551.71,0.119215,...,D5,1,17467.56,3593.91,2,0,2,0,False,True


In [39]:
# Target Encoding (employment_status)
df = target_encoding(df, 'employment_status', 'loan_paid_back')
df.head()

Unnamed: 0,age,marital_status,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,loan_purpose,...,grade_subgrade,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back,gender_Male,gender_Other
0,59,Married,2.0,24240.19,2020.02,0.0,0.074,743,17173.72,0.119215,...,B5,7,40833.47,24302.07,1,0,1,0,True,False
1,72,Married,0.0,20172.98,1681.08,0.0,0.219,531,22663.89,0.401798,...,F1,5,27968.01,10803.01,1,0,3,0,False,False
2,49,Single,1.0,26181.8,2181.82,0.0,0.234,779,3631.36,0.081369,...,B4,2,15502.25,4505.44,0,0,0,0,False,False
3,35,Single,1.0,11873.84,989.49,0.0,0.264,809,14939.23,0.127336,...,A5,7,18157.79,5525.63,4,0,5,0,False,False
4,63,Single,3.0,25326.44,2110.54,0.0,0.26,663,16551.71,0.119215,...,D5,1,17467.56,3593.91,2,0,2,0,False,True


Feature Scaling

In [40]:
# separate numerical columns
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, Normalizer, StandardScaler

In [41]:
# Select only numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

In [42]:
# Drop target column from scaling (important)
if 'Loan_Status' in numerical_cols:
    numerical_cols = numerical_cols.drop('Loan_Status')

X_numeric = df[numerical_cols]

X_numeric.head()

Unnamed: 0,age,education_level,annual_income,monthly_income,employment_status,debt_to_income_ratio,credit_score,loan_amount,loan_purpose,interest_rate,loan_term,installment,num_of_open_accounts,total_credit_limit,current_balance,delinquency_history,public_records,num_of_delinquencies,loan_paid_back
0,59,2.0,24240.19,2020.02,0.0,0.074,743,17173.72,0.119215,13.39,36,581.88,7,40833.47,24302.07,1,0,1,0
1,72,0.0,20172.98,1681.08,0.0,0.219,531,22663.89,0.401798,17.81,60,573.17,5,27968.01,10803.01,1,0,3,0
2,49,1.0,26181.8,2181.82,0.0,0.234,779,3631.36,0.081369,9.53,60,76.32,2,15502.25,4505.44,0,0,0,0
3,35,1.0,11873.84,989.49,0.0,0.264,809,14939.23,0.127336,7.99,36,468.07,7,18157.79,5525.63,4,0,5,0
4,63,3.0,25326.44,2110.54,0.0,0.26,663,16551.71,0.119215,15.2,60,395.5,1,17467.56,3593.91,2,0,2,0


In [43]:
# Min-Max Scaling
minmax_scaler = MinMaxScaler()

X_minmax = minmax_scaler.fit_transform(X_numeric)

print("Min-Max Scaling Applied Successfully")

Min-Max Scaling Applied Successfully


In [44]:
# Max Absolute Scaling
maxabs_scaler = MaxAbsScaler()

X_maxabs = maxabs_scaler.fit_transform(X_numeric)

print("Max Absolute Scaling Applied Successfully")

Max Absolute Scaling Applied Successfully


In [45]:
# Vector Normalization
normalizer = Normalizer()

X_normalized = normalizer.fit_transform(X_numeric)

print("Vector Normalization Applied Successfully")

Vector Normalization Applied Successfully


In [46]:
# Z-Score Standardization (Standard Scaling)
standard_scaler = StandardScaler()

X_standardized = standard_scaler.fit_transform(X_numeric)

print("Z-Score Standardization Applied Successfully")

Z-Score Standardization Applied Successfully


In [47]:
# Check skewness of numerical features
skew_values = df[numerical_cols].skew()

print("Skewness of Numerical Features:\n")
print(skew_values)

Skewness of Numerical Features:

age                     0.008013
education_level         0.917753
annual_income           0.714783
monthly_income          0.714783
employment_status       0.000000
debt_to_income_ratio    0.687718
credit_score           -0.048756
loan_amount             0.142386
loan_purpose            0.339565
interest_rate           0.066822
loan_term               0.855807
installment             0.320515
num_of_open_accounts    0.225731
total_credit_limit      0.727586
current_balance         0.827724
delinquency_history     0.576852
public_records          0.000000
num_of_delinquencies    0.398683
loan_paid_back          0.000000
dtype: float64


In [50]:
# Train-Test Split
from sklearn.model_selection import train_test_split

# Define Features (X) and Target (y)
X = df.drop('loan_paid_back', axis=1)
y = df['loan_paid_back']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (10146, 22)
Testing set shape: (2537, 22)
