In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Sample data (replace this with your actual data)
data = pd.read_csv("Downloads/Task 3 and 4_Loan_Data.csv")

# Convert to DataFrame
df = pd.DataFrame(data)

# 1. Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())

# 2. Handle missing data (if any) - Here, no missing values, so we can skip this step.

# 3. Separate features (X) and target variable (y)
X = df.drop(columns=['default'])
y = df['default']

# 4. Feature Scaling: Standardize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Scale the features

# 5. Split the data into training and testing sets (80% train, 20% test)
X_scaled_train, X_scaled_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Initialize the logistic regression model
log_reg_model = LogisticRegression()

# Train the model on the training data
log_reg_model.fit(X_scaled_train, y_train)

# Predict probabilities of default on the test data
y_pred_prob = log_reg_model.predict_proba(X_scaled_test)[:, 1]  # Get probability for class 1 (default)

# Predict binary outcome (0 or 1) based on a threshold (0.5)
y_pred = log_reg_model.predict(X_scaled_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Print evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)
print(f'ROC-AUC Score: {roc_auc:.4f}')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_scaled_train, y_train)

# Predict probabilities of default on the test data
y_pred_prob_rf = rf_model.predict_proba(X_scaled_test)[:, 1]  # Get probability for class 1 (default)

# Predict binary outcome (0 or 1) based on a threshold (0.5)
y_pred_rf = rf_model.predict(X_scaled_test)

# Evaluate the model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

# Print evaluation metrics for Random Forest
print(f'Random Forest Accuracy: {accuracy_rf:.4f}')
print('Random Forest Confusion Matrix:')
print(conf_matrix_rf)
print('Random Forest Classification Report:')
print(class_report_rf)
print(f'Random Forest ROC-AUC Score: {roc_auc_rf:.4f}')

import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Train the model on the training data
xgb_model.fit(X_scaled_train, y_train)

# Predict probabilities of default on the test data
y_pred_prob_xgb = xgb_model.predict_proba(X_scaled_test)[:, 1]  # Get probability for class 1 (default)

# Predict binary outcome (0 or 1) based on a threshold (0.5)
y_pred_xgb = xgb_model.predict(X_scaled_test)

# Evaluate the model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
class_report_xgb = classification_report(y_test, y_pred_xgb)
roc_auc_xgb = roc_auc_score(y_test, y_pred_prob_xgb)

# Print evaluation metrics for XGBoost
print(f'XGBoost Accuracy: {accuracy_xgb:.4f}')
print('XGBoost Confusion Matrix:')
print(conf_matrix_xgb)
print('XGBoost Classification Report:')
print(class_report_xgb)
print(f'XGBoost ROC-AUC Score: {roc_auc_xgb:.4f}')


Missing values in each column:
customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64
Accuracy: 0.9960
Confusion Matrix:
[[1651    1]
 [   7  341]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       1.00      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000

ROC-AUC Score: 1.0000
Random Forest Accuracy: 0.9955
Random Forest Confusion Matrix:
[[1650    2]
 [   7  341]]
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.99      0.98      0.99   

In [8]:
# Summary of results
results = {
    'Logistic Regression': {
        'Accuracy': accuracy,
        'ROC-AUC': roc_auc,
        'Confusion Matrix': conf_matrix,
        'Classification Report': class_report
    },
    'Random Forest': {
        'Accuracy': accuracy_rf,
        'ROC-AUC': roc_auc_rf,
        'Confusion Matrix': conf_matrix_rf,
        'Classification Report': class_report_rf
    },
    'XGBoost': {
        'Accuracy': accuracy_xgb,
        'ROC-AUC': roc_auc_xgb,
        'Confusion Matrix': conf_matrix_xgb,
        'Classification Report': class_report_xgb
    }
}

# Print comparison
for model_name, metrics in results.items():
    print(f'\n{model_name} Performance:')
    print(f'Accuracy: {metrics["Accuracy"]:.4f}')
    print(f'ROC-AUC Score: {metrics["ROC-AUC"]:.4f}')
    print('Confusion Matrix:')
    print(metrics['Confusion Matrix'])
    print('Classification Report:')
    print(metrics['Classification Report'])



Logistic Regression Performance:
Accuracy: 0.9960
ROC-AUC Score: 1.0000
Confusion Matrix:
[[1651    1]
 [   7  341]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       1.00      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000


Random Forest Performance:
Accuracy: 0.9955
ROC-AUC Score: 0.9997
Confusion Matrix:
[[1650    2]
 [   7  341]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1652
           1       0.99      0.98      0.99       348

    accuracy                           1.00      2000
   macro avg       0.99      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000


XGBoost Performance:
Accuracy: 0.9950
ROC-AUC Score: 0.9998
Confusion

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split

def calculate_expected_loss(attributes, model_type):
    # Define column names
    columns = ['credit_lines_outstanding', 'loan_amt_outstanding', 'total_debt_outstanding', 
               'income', 'years_employed', 'fico_score']
    
    # Convert the attributes list into a DataFrame
    data = pd.DataFrame([attributes], columns=columns)
    
    # Load and preprocess the data
    df = pd.read_csv("Downloads/Task 3 and 4_Loan_Data.csv")
    df = df.drop(columns=['customer_id'])  # Drop customer_id as it's not needed
    df.fillna(df.mean(), inplace=True)  # Fill missing values with column means

    # Split the data into features (X) and target (y)
    X = df.drop(columns=['default'])
    y = df['default']
    
    # Split the data into train and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    data_scaled = scaler.transform(data)  # Scale the input attributes
    
    # Select model based on user input
    if model_type == "LogisticRegression":
        model_instance = LogisticRegression(random_state=42)
    elif model_type == "RandomForest":
        model_instance = RandomForestClassifier(random_state=42)
    elif model_type == "XGBoost":
        model_instance = xgb.XGBClassifier(random_state=42)
    else:
        raise ValueError("Model not recognized. Choose 'LogisticRegression', 'RandomForest', or 'XGBoost'.")
    
    # Fit the model
    model_instance.fit(X_train_scaled, y_train)
    
    # Model evaluation on the test set
    y_pred = model_instance.predict(X_test_scaled)
    y_pred_proba = model_instance.predict_proba(X_test_scaled)[:, 1]  # Get probability of default
    
    # Calculate accuracy and other metrics
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    # Print the model's evaluation results
    print(f"Model: {model_type}")
    print("Model Parameters:")
    print(model_instance.get_params())
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{cr}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    
    # Calculate Expected Loss for the input loan
    loan_amount = attributes[1]  # loan_amt_outstanding corresponds to index 1
    recovery_rate = 0.10
    loss_factor = 1 - recovery_rate
    
    # Expected Loss formula
    prob_default = model_instance.predict_proba(data_scaled)[:, 1]
    expected_loss = loan_amount * prob_default * loss_factor
    
    print(f"\nExpected Loss for the loan: {expected_loss[0]:.2f}")
    
    return expected_loss[0]

# Example usage
loan_attributes = [5, 5221.545193, 3915.471226, 78039.38546, 5, 605]  # Example loan attributes
model = "LogisticRegression"  # Change model type as needed to 'RandomForest', or 'XGBoost' or 

expected_loss = calculate_expected_loss(loan_attributes, model)


Model: XGBoost
Model Parameters:
{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}

Accuracy: 0.9945
Confusion Matrix:
[[1648    4]
 [   7  341]]
Classi