In [19]:
#Importing Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

# Load the dataset
df = pd.read_csv('LoanAmountlPrediction.csv')


In [2]:
#Analyse dataset
import pandas as pd

file_path = "LoanAmountlPrediction.csv"  # Replace with the actual file path
data = pd.read_csv(file_path)
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [3]:
#Head, tail, check for null values, duplicate values
print("Head:\n", data.head())
print("\nTail:\n", data.tail())
print("\nNull values:\n", data.isnull().sum())
print("\nDuplicate values:\n", data.duplicated().sum())


Head:
     Loan_ID Gender Married  Dependents     Education Self_Employed  \
0  LP001002   Male      No         0.0      Graduate            No   
1  LP001003   Male     Yes         1.0      Graduate            No   
2  LP001005   Male     Yes         0.0      Graduate           Yes   
3  LP001006   Male     Yes         0.0  Not Graduate            No   
4  LP001008   Male      No         0.0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2      

In [5]:
#Impute null values using KNN imputer for numerical columns
numerical_columns = data.select_dtypes(include=['number']).columns
categorical_columns = data.select_dtypes(include=['object']).columns

data_numerical = data[numerical_columns]
data_categorical = data[categorical_columns]


imputer_numerical = KNNImputer()
data_numerical_imputed = pd.DataFrame(imputer_numerical.fit_transform(data_numerical), columns=numerical_columns)

data_categorical_imputed = data_categorical.apply(lambda x: x.fillna(x.value_counts().index[0]))


data_imputed = pd.concat([data_numerical_imputed, data_categorical_imputed], axis=1)


In [6]:
# Explore all columns
print(data_imputed.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dependents         598 non-null    float64
 1   ApplicantIncome    598 non-null    float64
 2   CoapplicantIncome  598 non-null    float64
 3   LoanAmount         598 non-null    float64
 4   Loan_Amount_Term   598 non-null    float64
 5   Credit_History     598 non-null    float64
 6   Loan_ID            598 non-null    object 
 7   Gender             598 non-null    object 
 8   Married            598 non-null    object 
 9   Education          598 non-null    object 
 10  Self_Employed      598 non-null    object 
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(6), object(7)
memory usage: 60.9+ KB
None


In [7]:
# Find relationship between LoanAmount and other variables
data_encoded = pd.get_dummies(data_imputed.drop('Loan_ID', axis=1))
correlation_matrix = data_encoded.corr()
print(correlation_matrix['LoanAmount'].sort_values(ascending=False))


LoanAmount                 1.000000
ApplicantIncome            0.540108
CoapplicantIncome          0.201106
Education_Graduate         0.174792
Married_Yes                0.146335
Dependents                 0.114805
Self_Employed_Yes          0.111699
Gender_Male                0.106263
Property_Area_Rural        0.058145
Loan_Status_N              0.056782
Loan_Amount_Term           0.051024
Property_Area_Semiurban    0.001834
Credit_History            -0.008535
Loan_Status_Y             -0.056782
Property_Area_Urban       -0.058100
Gender_Female             -0.106263
Self_Employed_No          -0.111699
Married_No                -0.146335
Education_Not Graduate    -0.174792
Name: LoanAmount, dtype: float64


In [8]:
#Spearman correlation among all numerical columns
from scipy.stats import spearmanr

spearman_corr, _ = spearmanr(data_imputed)
print(pd.DataFrame(spearman_corr, columns=data_imputed.columns, index=data_imputed.columns))


                   Dependents  ApplicantIncome  CoapplicantIncome  LoanAmount  \
Dependents           1.000000         0.133291          -0.037326    0.122953   
ApplicantIncome      0.133291         1.000000          -0.304677    0.533598   
CoapplicantIncome   -0.037326        -0.304677           1.000000    0.240790   
LoanAmount           0.122953         0.533598           0.240790    1.000000   
Loan_Amount_Term    -0.089271        -0.012648          -0.023570    0.058840   
Credit_History      -0.072931         0.029022          -0.005077   -0.023742   
Loan_ID              0.074897         0.007606          -0.022689    0.032467   
Gender               0.171554         0.079530           0.208748    0.151682   
Married              0.364892         0.010908           0.266932    0.181308   
Education            0.056621        -0.189320          -0.016468   -0.169800   
Self_Employed        0.057631         0.167118          -0.060943    0.091316   
Property_Area        0.03533

In [9]:
#Drop 'Loan_ID'
data_imputed = data_imputed.drop('Loan_ID', axis=1)


In [18]:
#Perform Level encoding to convert categorical data to numerical data
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']

for col in categorical_columns:
    data_imputed[col] = label_encoder.fit_transform(data_imputed[col])


In [11]:
#Remove outliers using IQR method
Q1 = data_imputed.quantile(0.25)
Q3 = data_imputed.quantile(0.75)
IQR = Q3 - Q1
data_no_outliers = data_imputed[~((data_imputed < (Q1 - 1.5 * IQR)) | (data_imputed > (Q3 + 1.5 * IQR))).any(axis=1)]


In [12]:
#  Sanity check for outliers removal
print("Original dataset shape:", data_imputed.shape)
print("Dataset shape after removing outliers:", data_no_outliers.shape)


Original dataset shape: (598, 12)
Dataset shape after removing outliers: (193, 12)


In [13]:
#Split data into x and y (LoanAmount)
X = data_no_outliers.drop('LoanAmount', axis=1)
y = data_no_outliers['LoanAmount']


In [14]:
#Split data into x_train, y_train, x_test, and y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
#Perform linear regression, decision tree, and random forest
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)

rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)


In [16]:
#Verify accuracy using r2 square
lr_r2 = r2_score(y_test, lr_model.predict(X_test))
dt_r2 = r2_score(y_test, dt_model.predict(X_test))
rf_r2 = r2_score(y_test, rf_model.predict(X_test))

print(f"Linear Regression R^2: {lr_r2}")
print(f"Decision Tree R^2: {dt_r2}")
print(f"Random Forest R^2: {rf_r2}")


Linear Regression R^2: 0.6422322996124173
Decision Tree R^2: 0.39109855344488664
Random Forest R^2: 0.6070245338531786
