# Finance Loan approval Prediction

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from skopt import BayesSearchCV
from skopt.space import Real, Categorical
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

In [2]:
data = pd.read_csv("test.csv")
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113.0,360.0,1.0,Urban
363,LP002975,Male,Yes,0,Graduate,No,4158,709,115.0,360.0,1.0,Urban
364,LP002980,Male,No,0,Graduate,No,3250,1993,126.0,360.0,,Semiurban
365,LP002986,Male,Yes,0,Graduate,No,5000,2393,158.0,360.0,1.0,Rural


In [3]:
data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,367.0,367.0,362.0,361.0,338.0
mean,4805.599455,1569.577657,136.132597,342.537396,0.825444
std,4910.685399,2334.232099,61.366652,65.156643,0.38015
min,0.0,0.0,28.0,6.0,0.0
25%,2864.0,0.0,100.25,360.0,1.0
50%,3786.0,1025.0,125.0,360.0,1.0
75%,5060.0,2430.5,158.0,360.0,1.0
max,72529.0,24000.0,550.0,480.0,1.0


In [4]:
data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [5]:
data.columns = data.columns.str.strip()

In [6]:
data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
data['Dependents'].fillna(data['Dependents'].mode()[0], inplace=True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0], inplace=True)
data['LoanAmount'].fillna(data['LoanAmount'].median(), inplace=True)
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].median(), inplace=True)
data['Credit_History'].fillna(data['Credit_History'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Gender'].fillna(data['Gender'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Dependents'].fillna(data['Dependents'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

In [7]:
data['Dependents'] = data['Dependents'].replace('3+', 3).astype(int)

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

## Regresión logística con regularización

In [10]:
target_col = "Credit_History"
y = data[target_col]
X = data.drop(columns=[target_col])

In [11]:
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(exclude="number").columns.tolist()

In [12]:
pre = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

In [13]:
baseline = Pipeline([
    ("pre", pre),
    ("clf", DummyClassifier(strategy="most_frequent"))
])

In [14]:
logreg = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(solver="saga", max_iter=6000, random_state=42))
])

In [15]:
k = 5
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [18]:
scoring = "roc_auc"

search_spaces = [
    {
        "clf__penalty": Categorical(["l2"]),
        "clf__C": Real(0.01, 10, prior="log-uniform"),
        "clf__class_weight": Categorical([None, "balanced"]),
    },
    {
        "clf__penalty": Categorical(["elasticnet"]),
        "clf__l1_ratio": Real(0.0, 1.0),
        "clf__C": Real(0.01, 10, prior="log-uniform"),
        "clf__class_weight": Categorical([None, "balanced"]),
    },
]

In [19]:
bo = BayesSearchCV(
    estimator=logreg,
    search_spaces=search_spaces,
    n_iter=40,
    cv=cv,
    scoring=scoring, 
    n_jobs=-1,
    random_state=42,
    refit=True,
)

bo.fit(X, y)

In [20]:
# Modelo óptimo obtenido
bo.best_estimator_

In [21]:
# Regresión logística optimizada
bo.best_estimator_.named_steps["clf"]

In [22]:
# Mejores hiperparámetros encontrados
bo.best_params_

OrderedDict([('clf__C', 1.022134789124424),
             ('clf__class_weight', 'balanced'),
             ('clf__l1_ratio', 1.0),
             ('clf__penalty', 'elasticnet')])

In [27]:
# Rendimiento del modelo
scores = cross_val_score(bo.best_estimator_, X, y, cv=5, scoring="roc_auc")
print("ROC-AUC promedio:", scores.mean())
print("Desviación estándar:", scores.std())

ROC-AUC promedio: 0.5960618880502541
Desviación estándar: 0.04489352645011028
