In [3]:
import numpy as np
import pandas as pd

In [4]:
loan_df = pd.read_csv("../Data/loan.csv")
loan_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


Fill missing values in Dependents column
- Creat a class
- if Married was  Yes fill with 2, else fill with 0

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DependentsImputer(BaseEstimator, TransformerMixin):
    def __init__(self, dependents_col="Dependents", married_col="Married"):
        self.dependents_col = dependents_col
        self.married_col = married_col

    def fit(self, X, y=None):
        # Nothing to learn ‚Äî rule-based
        return self

    def transform(self, X):
        X = X.copy()
        mask = X[self.dependents_col].isna()
        X.loc[mask, self.dependents_col] = np.where(
            X.loc[mask, self.married_col] == "Yes", "2", "0"
        )
        return X

In [None]:
# ordinal_pipeline = Pipeline(
#     steps=[
#         ("impute" ,DependentsImputer()),
#         ("select_dependents", FunctionTransformer(lambda X: X[["Dependents"]], validate=False)), #Drops "Married" befor encodeing
#         ("Encoder",OrdinalEncoder(categories=[['0', '1', '2', '3+']],handle_unknown="use_encoded_value", unknown_value=-1))  # specific order  
#     ]
# )


# ("ordinal", ordinal_pipeline, ["Dependents", "Married"]),

## Imputation of Missing Data

‚úÖ Choose Simple Imputer when:

- Working with large datasets

- Need fast processing

- Prototyping or building MVP

- Computational resources are limited

‚úÖ Choose KNN Imputer when:

- You have medium-sized datasets (not too large)

- Feature relationships are important

- You have computational resources to spare

- Data quality is critical



1.Use *k*-nearest neighbors imputer (KNNImputer) :

First we need encoding categoricals features 

‚ö†Ô∏è Caution:Using numeric encodings for categories makes the imputer treat them as ordered or continuous ‚Äî which may lead to incorrect assumptions.
üîÅ Better alternative: Use methods that treat categorical data natively.



In [5]:
numerical_cols = [
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
    "Loan_Amount_Term"
    ]
binary_cols = [
    "Gender",
    "Married",
    "Education",
    "Self_Employed",
    "Credit_History"
    ]

ordinal_cols = ["Dependents"]

nominal_cols = ["Property_Area"]

## Encodeing categorical features
- Use OrdinalEncoder for binary features.
- Use OneHotEncoder for Multi-class nominal feature.
- Use LabelEncoder only for the target.
- LabelEncoder in scikit-learn is meant for target variables (y), not features (X).

In [6]:
from sklearn.preprocessing import  OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

#Encode categorical data
preprocessing_encode = ColumnTransformer([
    ('binary', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), binary_cols),
    ('ordinal', OrdinalEncoder(categories=[['0', '1', '2', '3+']] ,handle_unknown="use_encoded_value", unknown_value=-1), ordinal_cols),
    ('nominal', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_cols)
],remainder='passthrough')  # Keep numerical features as they are

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

log_reg_clf = LogisticRegression(random_state=0)
log_reg_clf_pipeline = Pipeline(
    steps=[
        ("encode", preprocessing_encode),
        ("impute", KNNImputer(n_neighbors= 5)),
        ("scale", StandardScaler()),
        ("model", log_reg_clf),
    ]
)

In [11]:
from sklearn import set_config

set_config(display="diagram")
display(log_reg_clf_pipeline)

0,1,2
,steps,"[('encode', ...), ('impute', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('binary', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,"[['0', '1', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,0
,solver,'lbfgs'
,max_iter,100


In [12]:
from sklearn.model_selection import train_test_split

X = loan_df[numerical_cols + binary_cols + ordinal_cols + nominal_cols]
y = loan_df["Loan_Status"]

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y # 'stratify' is good for classification
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=(20/85), random_state=42, stratify=y_temp
)

In [13]:
log_reg_clf_pipeline.fit(X_train, y_train)

score = log_reg_clf_pipeline.score(X_val, y_val)
print(f"Model score: {score}")  # model accuracy

Model score: 0.7723577235772358


In [14]:
param_grid_logistic_regression = {
    "model__penalty": ["l1", "l2"],
    "model__C": np.logspace(-4, 4, 20),
    "model__solver": ["liblinear"],
}

In [15]:
from sklearn.model_selection import GridSearchCV

grid_search_logistic_regression = GridSearchCV(
    log_reg_clf_pipeline,
    param_grid=param_grid_logistic_regression,
    cv=5,
    scoring="accuracy",
    verbose=True,
    n_jobs=-1,
)

In [16]:
grid_search_logistic_regression.fit(X_train, y_train)

print("Best Score of train set: " + str(grid_search_logistic_regression.best_score_))
print("Best parameter set: " + str(grid_search_logistic_regression.best_params_))
print("Test Score: " + str(grid_search_logistic_regression.score(X_val, y_val)))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best Score of train set: 0.8139556962025315
Best parameter set: {'model__C': 0.08858667904100823, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
Test Score: 0.7642276422764228


2.Use MissForest from missingpy (Better choice)

‚úÖ Supports both categorical and numerical natively

‚úÖ Automatically selects classification or regression

‚úÖ Handles nonlinear relationships

‚úÖ No need to manually define estimators

‚ö†Ô∏è Slower for large datasets

‚ö†Ô∏è Requires everything to be encoded numerically (but handles it internally well)

In [8]:
# from missingpy import MissForest

---------------------------------------------------------------------------
ModuleNotFoundError: No module named 'sklearn.neighbors.base'

‚ùå Problem:

- The missingpy library is using private internal modules from an old version of scikit-learn.
- These internals (like sklearn.neighbors.base) were removed or refactored in scikit-learn ‚â• 0.24

3.Use IterativeImputer

‚úÖHandles Mixed Data: Works well with both categorical and numerical data.

‚úÖAccurate: Uses regression models (like Random Forest) for better imputation accuracy.

‚úÖCaptures Relationships: Can learn relationships between features for more precise imputations.


‚ö†Ô∏èComputationally Intensive: Slower than simpler methods like KNN, especially for larger datasets.

‚ö†Ô∏èRisk of Overfitting: Can overfit on smaller datasets if not tuned properly.

‚ö†Ô∏èNot Ideal for Sparse Data: Struggles if too much data is missing.

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.experimental import enable_iterative_imputer  # Required for IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_openml

In [10]:
numerical_scale_pipeline = Pipeline(
    steps = [
        ("Imputer",IterativeImputer()),
        ("Scaler",StandardScaler())
        ]
)
binary_pipeline = Pipeline(
    steps = [
        ("Encoder",OrdinalEncoder()),
        ("Imputer",IterativeImputer())
        ]
)
ordinal_pipeline = Pipeline(
    steps=[
        ("Encoder",OrdinalEncoder(categories=[['0', '1', '2', '3+']])),  # specific order
        ("Imputer",IterativeImputer())
    ]
)
nominal_pipeline = Pipeline(
    steps=[
        ("Encoder",OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)