In [7]:
## Import of data
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Load the dataset----------------------------------------------------------------------------------------------------------------
loan = pd.read_csv('loan.csv')
print(loan.head())


## Feature Engineering--------------------------------------------------------------------------------------------------------------
# 1. Debt-to-Income Ratio (DTI)
loan['Debt_to_Income_Ratio'] = loan['loan_amnt'] / loan['person_income']

# 2. Employment Stability Score
loan['Employment_Stability'] = loan['person_emp_exp'] / loan['person_age']

# 3. Credit Score Category (Binned Feature)
def categorize_credit_score(score):
    if score >= 750:
        return 3  # Excellent
    elif 650 <= score < 750:
        return 2  # Good
    elif 550 <= score < 650:
        return 1  # Fair
    else:
        return 0  # Poor

loan['Credit_Score_Category'] = loan['credit_score'].apply(categorize_credit_score)

# 4. Loan Affordability Index
loan['Loan_Affordability_Index'] = loan['person_income'] / (loan['loan_amnt'] * (1 + loan['loan_int_rate']))

# 5. Loan Intent Risk Score (Categorical Encoding)
loan_intent_mapping = {
    "home_improvement": 0.2,
    "education": 0.3,
    "medical": 0.5,
    "personal": 0.6,
    "business": 0.8
}
 # Default risk if category is unknown
loan['Loan_Intent_Risk'] = loan['loan_intent'].map(loan_intent_mapping).fillna(0.5) 
x = loan.drop(columns=['loan_status','person_education','loan_intent','person_home_ownership','person_gender','previous_loan_defaults_on_file'])
print(loan.head())


##TRAINING/TESTING AND EVALUATION MODELS ----------------------------------------------------------------------------------------------------------
# Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(x)
y = loan['loan_status']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Store model results
model_results = {}

### LOGISTIC REGRESSION ###
log_model = LogisticRegression(C=1.0, penalty='l2', solver='liblinear')
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

# Evaluation Metrics
model_results['Logistic Regression'] = {
    'Precision': precision_score(y_test, y_pred_log),
    'Recall': recall_score(y_test, y_pred_log),
    'F1-Score': f1_score(y_test, y_pred_log),
    'ROC-AUC': roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])
}

### RANDOM FOREST CLASSIFIER ###
rf_model = RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=10, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluation Metrics
model_results['Random Forest'] = {
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1-Score': f1_score(y_test, y_pred_rf),
    'ROC-AUC': roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
}

xgb_model = XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


# Evaluation Metrics
model_results['XGBoost'] = {
    'Precision': precision_score(y_test, y_pred_xgb),
    'Recall': recall_score(y_test, y_pred_xgb),
    'F1-Score': f1_score(y_test, y_pred_xgb),
    'ROC-AUC': roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])
}

# Display results
for model, metrics in model_results.items():
    print(f"\n🔹 {model} Performance:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



   person_age person_gender person_education  person_income  person_emp_exp  \
0        22.0        female           Master        71948.0               0   
1        21.0        female      High School        12282.0               0   
2        25.0        female      High School        12438.0               3   
3        23.0        female         Bachelor        79753.0               0   
4        24.0          male           Master        66135.0               1   

  person_home_ownership  loan_amnt loan_intent  loan_int_rate  \
0                  RENT    35000.0    PERSONAL          16.02   
1                   OWN     1000.0   EDUCATION          11.14   
2              MORTGAGE     5500.0     MEDICAL          12.87   
3                  RENT    35000.0     MEDICAL          15.23   
4                  RENT    35000.0     MEDICAL          14.27   

   loan_percent_income  cb_person_cred_hist_length  credit_score  \
0                 0.49                         3.0           561  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
