In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


First, we'll see a roundup of the dataset.

In [None]:
# Read loan
Loan_DataSet = pd.read_csv('dataset/loan.csv')

# Info
print("\nInfo :")
Loan_DataSet.info()

# Count NAN values 
print("\nNAN values :\n ",Loan_DataSet.isna().sum())

# Head 
print("\nHead:")
Loan_DataSet.head()


Info :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB

NAN values :
  Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome 

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


After that we handel missing values.

In [3]:
# Fill categorical columnumns with the mode
for column in ['Gender', 'Married', 'Self_Employed','Credit_History','Loan_Amount_Term','Dependents']:
    Loan_DataSet[column].fillna(Loan_DataSet[column].mode()[0], inplace=True)

# Fill LoanAmount with median
Loan_DataSet['LoanAmount'].fillna(Loan_DataSet['LoanAmount'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Loan_DataSet[column].fillna(Loan_DataSet[column].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  Loan_DataSet['LoanAmount'].fillna(Loan_DataSet['LoanAmount'].median(), inplace=True)


Encoding Categorical Variables

In [4]:
# Convert '3+' to 3 and cast to int
Loan_DataSet['Dependents'] = Loan_DataSet['Dependents'].replace('3+', 3).astype(int)

# Encode the target variable: Y = 1, N = 0
Loan_DataSet['Loan_Status'] = Loan_DataSet['Loan_Status'].map({'Y': 1, 'N': 0})

# List of categorical columns to encode
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# Apply LabelEncoder
label_encoders = {}

for column in categorical_columns:
    label_encoder = LabelEncoder()
    Loan_DataSet[column] = label_encoder .fit_transform(Loan_DataSet[column])
    # label_encoders[column] = label_encoder   

> In this project, we observed that scaling had a significant effect on KNN and ANN, while Logistic Regression performed equally well with or without scaling.

Splitting the Dataset

In [5]:
# Separate features and label
X = Loan_DataSet.drop("Loan_Status", axis=1)
y = Loan_DataSet["Loan_Status"]

# Step 1: Split off 15% for the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Step 2: From the remaining 85%, split 20% for validation (0.2 of 0.85 ≈ 0.235)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.235, random_state=42, stratify=y_train_val)


Feature scaling

In [6]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


### Logistic Regression - With and Without Feature Scaling
#### Data Scaling Comparison

In [7]:
# (1) Without Scaling
logistic_model_raw = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
logistic_model_raw.fit(X_train, y_train)

y_val_pred_raw = logistic_model_raw.predict(X_val)
y_test_pred_raw = logistic_model_raw.predict(X_test)

val_acc_raw = accuracy_score(y_val, y_val_pred_raw)
test_acc_raw = accuracy_score(y_test, y_test_pred_raw)

print("Logistic Regression WITHOUT Scaling")
print(f"Validation Accuracy: {val_acc_raw:.4f}")
print(f"Test Accuracy: {test_acc_raw:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_raw))


# (2) With Scaling

logistic_model_scaled = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
logistic_model_scaled.fit(X_train_scaled, y_train)

y_val_pred_scaled = logistic_model_scaled.predict(X_val_scaled)
y_test_pred_scaled = logistic_model_scaled.predict(X_test_scaled)

val_acc_scaled = accuracy_score(y_val, y_val_pred_scaled)
test_acc_scaled = accuracy_score(y_test, y_test_pred_scaled)

print("Logistic Regression WITH Scaling")
print(f"Validation Accuracy: {val_acc_scaled:.4f}")
print(f"Test Accuracy: {test_acc_scaled:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_scaled))


Logistic Regression WITHOUT Scaling
Validation Accuracy: 0.7805
Test Accuracy: 0.8495

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.82      0.37      0.51        38
           1       0.77      0.96      0.86        85

    accuracy                           0.78       123
   macro avg       0.80      0.67      0.68       123
weighted avg       0.79      0.78      0.75       123

Logistic Regression WITH Scaling
Validation Accuracy: 0.7805
Test Accuracy: 0.8495

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.82      0.37      0.51        38
           1       0.77      0.96      0.86        85

    accuracy                           0.78       123
   macro avg       0.80      0.67      0.68       123
weighted avg       0.79      0.78      0.75       123



### K-Nearest Neighbors (KNN) - With and Without Feature Scaling

In [8]:
# (1) KNN WITHOUT Scaling
knn_raw = KNeighborsClassifier(n_neighbors=5)
knn_raw.fit(X_train, y_train)

y_val_pred_knn_raw = knn_raw.predict(X_val)
y_test_pred_knn_raw = knn_raw.predict(X_test)

val_acc_knn_raw = accuracy_score(y_val, y_val_pred_knn_raw)
test_acc_knn_raw = accuracy_score(y_test, y_test_pred_knn_raw)

print("KNN WITHOUT Scaling")
print(f"Validation Accuracy: {val_acc_knn_raw:.4f}")
print(f"Test Accuracy: {test_acc_knn_raw:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_knn_raw))


# (2) KNN WITH Scaling
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(X_train_scaled, y_train)

y_val_pred_knn_scaled = knn_scaled.predict(X_val_scaled)
y_test_pred_knn_scaled = knn_scaled.predict(X_test_scaled)

val_acc_knn_scaled = accuracy_score(y_val, y_val_pred_knn_scaled)
test_acc_knn_scaled = accuracy_score(y_test, y_test_pred_knn_scaled)

print("KNN WITH Scaling")
print(f"Validation Accuracy: {val_acc_knn_scaled:.4f}")
print(f"Test Accuracy: {test_acc_knn_scaled:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_knn_scaled))


KNN WITHOUT Scaling
Validation Accuracy: 0.6667
Test Accuracy: 0.6344

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.42      0.21      0.28        38
           1       0.71      0.87      0.78        85

    accuracy                           0.67       123
   macro avg       0.57      0.54      0.53       123
weighted avg       0.62      0.67      0.63       123

KNN WITH Scaling
Validation Accuracy: 0.7317
Test Accuracy: 0.8280

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.62      0.34      0.44        38
           1       0.75      0.91      0.82        85

    accuracy                           0.73       123
   macro avg       0.69      0.62      0.63       123
weighted avg       0.71      0.73      0.71       123



### Artificial Neural Network (ANN) - With and Without Feature Scaling

In [9]:
# (1) ANN WITHOUT Scaling
ann_raw = MLPClassifier(hidden_layer_sizes=(64,), max_iter=1000, random_state=42)
ann_raw.fit(X_train, y_train)

y_val_pred_ann_raw = ann_raw.predict(X_val)
y_test_pred_ann_raw = ann_raw.predict(X_test)

val_acc_ann_raw = accuracy_score(y_val, y_val_pred_ann_raw)
test_acc_ann_raw = accuracy_score(y_test, y_test_pred_ann_raw)

print("ANN WITHOUT Scaling")
print(f"Validation Accuracy: {val_acc_ann_raw:.4f}")
print(f"Test Accuracy: {test_acc_ann_raw:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_ann_raw))


# (2) ANN WITH Scaling
ann_scaled = MLPClassifier(hidden_layer_sizes=(64,), max_iter=2000, random_state=42)
ann_scaled.fit(X_train_scaled, y_train)

y_val_pred_ann_scaled = ann_scaled.predict(X_val_scaled)
y_test_pred_ann_scaled = ann_scaled.predict(X_test_scaled)

val_acc_ann_scaled = accuracy_score(y_val, y_val_pred_ann_scaled)
test_acc_ann_scaled = accuracy_score(y_test, y_test_pred_ann_scaled)

print("ANN WITH Scaling")
print(f"Validation Accuracy: {val_acc_ann_scaled:.4f}")
print(f"Test Accuracy: {test_acc_ann_scaled:.4f}")
print("\nClassification Report (Validation):\n", classification_report(y_val, y_val_pred_ann_scaled))


ANN WITHOUT Scaling
Validation Accuracy: 0.6585
Test Accuracy: 0.6882

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.68      0.95      0.79        85

    accuracy                           0.66       123
   macro avg       0.34      0.48      0.40       123
weighted avg       0.47      0.66      0.55       123

ANN WITH Scaling
Validation Accuracy: 0.6992
Test Accuracy: 0.7742

Classification Report (Validation):
               precision    recall  f1-score   support

           0       0.51      0.50      0.51        38
           1       0.78      0.79      0.78        85

    accuracy                           0.70       123
   macro avg       0.65      0.64      0.65       123
weighted avg       0.70      0.70      0.70       123






### 🔧 Feature Scaling (Summary)

We applied feature scaling using `StandardScaler` to improve model performance. It is especially important for distance- and gradient-based models like KNN and ANN.



### 📊 Final Model Comparison

| Model               | Validation Accuracy | Test Accuracy |
|--------------------|---------------------|----------------|
| Logistic Regression | 0.7805              | 0.8495         |
| KNN                 | 0.7317              | 0.8280         |
| ANN                 | 0.6992              | 0.7742         |

**Conclusion:** Logistic Regression showed the best performance. Scaling significantly improved KNN and ANN results.



### So... does scaling even matter for Logistic Regression?

Honestly, in this project — not really. We ran Logistic Regression on both the raw and scaled data, and the performance stayed basically the same. Accuracy, precision, recall, even the confusion matrix — they were all pretty much identical.

And that’s actually kind of expected. Logistic Regression doesn’t rely on distances like KNN does. So whether your numbers are small or large doesn’t affect it that much.

The models that really care about scaling are things like KNN and neural networks. They look at distances or do a bunch of math that’s sensitive to the size of numbers. Logistic Regression? Not so much.

So yeah — scaling is super important in general, but in this specific case, it didn’t move the needle for Logistic Regression.


### But if i use grid search

In [10]:

# Logistic Regression with GridSearchCV
param_grid_log = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}

grid_log = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log, cv=5, scoring='accuracy')
grid_log.fit(X_train_scaled, y_train)

best_log_model = grid_log.best_estimator_
val_pred_log = best_log_model.predict(X_val_scaled)
test_pred_log = best_log_model.predict(X_test_scaled)

print("Best Logistic Regression Params:", grid_log.best_params_)
print("Val Accuracy:", accuracy_score(y_val, val_pred_log))
print("Test Accuracy:", accuracy_score(y_test, test_pred_log))


Best Logistic Regression Params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Val Accuracy: 0.7804878048780488
Test Accuracy: 0.8494623655913979


In [11]:

# KNN with GridSearchCV
param_grid_knn = {
    'n_neighbors': list(range(1, 21)),
    'weights': ['uniform', 'distance']
}

grid_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5, scoring='accuracy')
grid_knn.fit(X_train_scaled, y_train)

best_knn_model = grid_knn.best_estimator_
val_pred_knn = best_knn_model.predict(X_val_scaled)
test_pred_knn = best_knn_model.predict(X_test_scaled)

print("Best KNN Params:", grid_knn.best_params_)
print("Val Accuracy:", accuracy_score(y_val, val_pred_knn))
print("Test Accuracy:", accuracy_score(y_test, test_pred_knn))


Best KNN Params: {'n_neighbors': 6, 'weights': 'distance'}
Val Accuracy: 0.7560975609756098
Test Accuracy: 0.8387096774193549


In [12]:

# ANN with GridSearchCV
param_grid_ann = {
    'hidden_layer_sizes': [(32,), (64,), (32, 32)],
    'alpha': [0.0001, 0.001, 0.01]
}

grid_ann = GridSearchCV(MLPClassifier(max_iter=1000, random_state=42), param_grid_ann, cv=3, scoring='accuracy')
grid_ann.fit(X_train_scaled, y_train)

best_ann_model = grid_ann.best_estimator_
val_pred_ann = best_ann_model.predict(X_val_scaled)
test_pred_ann = best_ann_model.predict(X_test_scaled)

print("Best ANN Params:", grid_ann.best_params_)
print("Val Accuracy:", accuracy_score(y_val, val_pred_ann))
print("Test Accuracy:", accuracy_score(y_test, test_pred_ann))




Best ANN Params: {'alpha': 0.01, 'hidden_layer_sizes': (32,)}
Val Accuracy: 0.7560975609756098
Test Accuracy: 0.7956989247311828





### Model Accuracy Comparison (Validation / Test)

| Model Variant        | Validation / Test Accuracy |
|----------------------|----------------------------|
| Logistic (Raw)       | 0.7805 / 0.8495            |
| Logistic (Scaled)    | 0.7805 / 0.8495            |
| Logistic (Grid)      | 0.7805 / 0.8495            |
| KNN (Raw)            | 0.6344 / 0.6667            |
| KNN (Scaled)         | 0.7317 / 0.8280            |
| KNN (Grid)           | 0.7561 / 0.8387            |
| ANN (Raw)            | 0.6585 / 0.6882            |
| ANN (Scaled)         | 0.6992 / 0.7742            |
| ANN (Grid)           | 0.7560 / 0.7956            |

### Summary:

- Scaling clearly improved KNN and ANN.
- Logistic Regression was the most stable and accurate model acrros scenarios , achieving **84.95 accuracy** on the test set with ot without feature scaling.Thid suggest that the model is robust and does not heavily rely on scaling or hyperparameter tuning.
- GridSearchCV consistently improved performance on ANN,KNN models and it's ineffective on Logistic Regreesion .
- Best overall accuracy was from **Logistic Regression with GridSearch**.
