In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')


In [3]:
# Load your dataset
df = pd.read_csv('loan_data_set.csv')
df.head()


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
# Drop target and ID columns
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)
y = df['Loan_Status']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Column types
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 
                    'Self_Employed', 'Property_Area', 'Credit_History']
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 
                  'LoanAmount', 'Loan_Amount_Term']


In [5]:
# For categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# For numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Combine all into a column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [6]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)


In [7]:
logreg_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

dtree_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])


In [8]:
models = {
    "Logistic Regression": logreg_pipeline,
    "Decision Tree": dtree_pipeline
}

for name, model in models.items():
    print(f"🔍 Evaluating: {name}")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in skf.split(X_train, y_train_encoded):
        X_fold_train = X_train.iloc[train_idx]
        X_fold_val = X_train.iloc[val_idx]
        y_fold_train = y_train_encoded[train_idx]
        y_fold_val = y_train_encoded[val_idx]

        model.fit(X_fold_train, y_fold_train)
        y_pred_val = model.predict(X_fold_val)
        score = classification_report(y_fold_val, y_pred_val, output_dict=True)
        cv_scores.append(score)

    avg_recall = np.mean([score['1']['recall'] for score in cv_scores])
    avg_f1 = np.mean([score['1']['f1-score'] for score in cv_scores])

    print(f"✅ Average Recall (CV): {avg_recall:.4f}")
    print(f"✅ Average F1-score (CV): {avg_f1:.4f}")


🔍 Evaluating: Logistic Regression
✅ Average Recall (CV): 0.8487
✅ Average F1-score (CV): 0.8177
🔍 Evaluating: Decision Tree
✅ Average Recall (CV): 0.8309
✅ Average F1-score (CV): 0.8178


In [9]:
    print(f"\n📌 Final Test Evaluation - {name}")
    model.fit(X_train, y_train_encoded)
    y_pred_test = model.predict(X_test)

    print("Classification Report:")
    print(classification_report(y_test_encoded, y_pred_test))

    cm = confusion_matrix(y_test_encoded, y_pred_test)
    print("Confusion Matrix:")
    print(cm, "\n")



📌 Final Test Evaluation - Decision Tree
Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.61      0.52        38
           1       0.79      0.67      0.73        85

    accuracy                           0.65       123
   macro avg       0.62      0.64      0.62       123
weighted avg       0.69      0.65      0.66       123

Confusion Matrix:
[[23 15]
 [28 57]] 

