#### Imports

In [1]:
import pandas as pd
import numpy as np
import joblib
import boto3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

#### Load Data & Train-Test Split

In [2]:
# Read the raw loan records from disk.
df = pd.read_csv('../data/loan-data.csv')

# Separate the label column from the remaining columns.
y = df['loan_status']
X = df.drop(columns=['loan_status'])

# Train-test split (80/20), stratified on the label so both partitions keep
# the same class balance; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Quick sanity check of the partition sizes and the training label balance.
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution - Train: {y_train.value_counts().to_dict()}")

Training set: (36000, 17)
Test set: (9000, 17)
Class distribution - Train: {0: 28000, 1: 8000}


#### Feature Groups

In [3]:
# Columns routed to the numeric branch of the preprocessing ColumnTransformer.
# NOTE(review): the dataset also contains a `credit_score` column that is not
# listed in any group, so it is dropped by `remainder='drop'` - confirm intended.
numeric_features = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]

# Nominal columns that are one-hot encoded.
categorical_ohe = [
    'person_gender',
    'employment_type',
    'person_home_ownership',
    'loan_intent',
    'account_type',
]

# Columns encoded with an explicit category order.
ordinal_features = [
    'person_education',
    'previous_loan_defaults_on_file',
]

# One category list per entry of `ordinal_features`, in the same order;
# each list runs from the lowest encoded value to the highest.
ordinal_categories = [
    ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate'],
    ['No', 'Yes'],
]

#### Preprocessing Pipeline

In [4]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Nominal branch: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, i.e. the same as the dropped baseline
# category - confirm this is acceptable at inference time.
onehot_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

# Ordinal branch: fill gaps with the most frequent value, then map each
# category to its position in `ordinal_categories` (unknown values become -1).
ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(
        categories=ordinal_categories,
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ))
])

# Route each feature group to its branch; columns not listed in any group are discarded.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, numeric_features),
        ('cat_ohe', onehot_pipeline, categorical_ohe),
        ('cat_ord', ordinal_pipeline, ordinal_features),
    ],
    remainder='drop'
)

print("Preprocessing pipeline created successfully")

Preprocessing pipeline created successfully


#### Train Best Model (Random Forest)

In [5]:
# Random Forest configuration; the fixed seed makes the fitted forest reproducible
# and 'balanced_subsample' reweights classes within each bootstrap sample.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced_subsample',
    random_state=42
)

# Full pipeline: preprocessing followed by the classifier, so raw feature
# frames can be passed straight to fit/predict.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# Fit preprocessing and model together on the training partition only.
print("Training Random Forest model...")
rf_pipeline.fit(X_train, y_train)
print("Model training completed!")

Training Random Forest model...
Model training completed!


#### Model Evaluation

In [9]:
# Score the held-out test partition with the fitted pipeline.
y_pred = rf_pipeline.predict(X_test)

# Confusion matrix (rows = actual, columns = predicted); for the two labels
# 0 and 1, ravel() yields the counts in the order tn, fp, fn, tp.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

# Headline metrics for the positive class (label 1), plus overall accuracy
# derived from the confusion-matrix counts.
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
accuracy = (tn + tp) / cm.sum()

# Text report: headline metrics, the raw confusion matrix, per-class breakdown.
banner = "=" * 70
print(banner)
print("MODEL PERFORMANCE ON TEST SET")
print(banner)
print(f"Precision: {test_precision:.4f} (Primary Metric)")
print(f"Recall:    {test_recall:.4f}")
print(f"F1-Score:  {test_f1:.4f}")
print(banner)
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

MODEL PERFORMANCE ON TEST SET
Precision: 0.9046 (Primary Metric)
Recall:    0.7440
F1-Score:  0.8165

Confusion Matrix:
[[6843  157]
 [ 512 1488]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      7000
           1       0.90      0.74      0.82      2000

    accuracy                           0.93      9000
   macro avg       0.92      0.86      0.88      9000
weighted avg       0.92      0.93      0.92      9000



In [10]:
# Arrange the counts with the positive class first:
# rows = actual (positive, negative), columns = predicted (positive, negative).
cm_display = np.array([[tp, fn], [fp, tn]])
cm_percent = cm_display / cm_display.sum() * 100

# One annotation per cell showing the count and its share of all test rows.
# x/y are category indices on the heatmap axes; the text colour flips to white
# on the darker (high-count) cells to stay readable.
annotations = [
    dict(
        x=j, y=i,
        text=f"<b>{cm_display[i, j]:,}</b><br>({cm_percent[i, j]:.1f}%)",
        showarrow=False,
        font=dict(size=18, color='white' if cm_display[i, j] > cm_display.max()/2 else '#1a1a1a')
    )
    for i in range(2)
    for j in range(2)
]

fig = go.Figure(data=go.Heatmap(
    z=cm_display,
    x=['Predicted Positive', 'Predicted Negative'],
    y=['Actual Positive', 'Actual Negative'],
    colorscale=[[0, '#E3F2FD'], [0.5, '#42A5F5'], [1, '#1565C0']],
    showscale=False,
    text=cm_display,
    texttemplate='',
    hovertemplate='<b>%{y}</b><br>%{x}<br>Count: %{z:,}<extra></extra>'
))

fig.update_layout(
    annotations=annotations,
    title=dict(
        text='<b>Confusion Matrix - Random Forest</b><br>' +
             f'<sub>Precision: {test_precision:.3f} | Recall: {test_recall:.3f} | Accuracy: {accuracy:.3f}</sub>',
        x=0.5,
        xanchor='center',
        font=dict(size=20, color='#1a1a1a')
    ),
    xaxis=dict(side='top', tickfont=dict(size=13, color='#333')),
    # Heatmap rows are drawn bottom-up by default; reverse the axis so the first
    # row ('Actual Positive') sits at the top, matching the matrix layout above.
    yaxis=dict(autorange='reversed', tickfont=dict(size=13, color='#333')),
    width=700,
    height=600,
    plot_bgcolor='white',
    paper_bgcolor='white',
    margin=dict(l=120, r=100, t=120, b=80)
)

fig.show()

#### Save Models Locally

In [12]:
# Artifacts to persist, in the order they are written:
#   - the complete pipeline (preprocessor + model),
#   - the preprocessor on its own (optional, for flexibility),
#   - the trained classifier on its own (optional).
# Both sub-steps are taken from the fitted pipeline so the saved objects are
# guaranteed to be the fitted ones, independent of how the pipeline handles
# the step objects it was constructed with.
artifacts = {
    'loan_approval_pipeline.pkl': rf_pipeline,
    'loan_preprocessing_pipeline.pkl': rf_pipeline.named_steps['preprocessor'],
    'loan_approval_model.pkl': rf_pipeline.named_steps['classifier'],
}

for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
    print(f"Saved: {filename}")

print("\nAll models saved successfully!")

Saved: loan_approval_pipeline.pkl
Saved: loan_preprocessing_pipeline.pkl
Saved: loan_approval_model.pkl

All models saved successfully!


#### Upload to S3

In [None]:
# Destination bucket for the model artifacts.
bucket_name = '' #change to your bucket name

# Fail early with an actionable message instead of a low-level AWS validation
# error when the placeholder above has not been filled in.
if not bucket_name:
    raise ValueError("bucket_name is empty - set it to your S3 bucket name before uploading.")

#initialize S3 client
s3 = boto3.client('s3')

#upload files to S3
print("Uploading models to S3...")

# Each local artifact is stored under the `models/` prefix with the same file name.
for artifact_file in (
    'loan_approval_pipeline.pkl',
    'loan_preprocessing_pipeline.pkl',
    'loan_approval_model.pkl',
):
    object_key = f'models/{artifact_file}'
    s3.upload_file(artifact_file, bucket_name, object_key)
    print(f"Uploaded to s3://{bucket_name}/{object_key}")

print("\nAll models successfully uploaded to S3!")

#### Test Model Loading (Verification)

In [None]:
# Verify that the saved pipeline can be loaded back from disk.
# The file was written by this notebook; only load pickles from trusted sources.
loaded_pipeline = joblib.load('loan_approval_pipeline.pkl')

# Predict a small sample with the reloaded pipeline.
sample_rows = X_test.iloc[:5]
sample_prediction = loaded_pipeline.predict(sample_rows)

# The round-tripped pipeline must reproduce the in-memory pipeline exactly;
# printing alone would not catch a stale or mismatched file on disk.
assert (sample_prediction == rf_pipeline.predict(sample_rows)).all(), \
    "Reloaded pipeline predictions differ from the in-memory pipeline"

print("Sample predictions:", sample_prediction)
print("Actual values:    ", y_test.iloc[:5].values)
print("\nModel loading and prediction successful!")

#### Tableau Data Frame

In [16]:
# Build the export frame from a copy of the test features so X_test stays untouched.
tableau_df = X_test.copy()

# Class-probability estimates for every test row (one column per class).
y_pred_proba = rf_pipeline.predict_proba(X_test)

# Ground truth next to the model's decision.
tableau_df['actual_loan_status'] = y_test.values
tableau_df['predicted_loan_status'] = y_pred

# Boolean correctness flag plus a human-readable label for dashboards.
tableau_df['prediction_correct'] = (y_test.values == y_pred)
result_labels = {True: 'Correct', False: 'Incorrect'}
tableau_df['prediction_result'] = tableau_df['prediction_correct'].map(result_labels)

# NOTE(review): column 0 / column 1 are named denied / approved, which assumes the
# classifier's classes are ordered (0, 1) and that label 1 means approved - confirm.
tableau_df['probability_denied'] = y_pred_proba[:, 0]
tableau_df['probability_approved'] = y_pred_proba[:, 1]

# Confidence bucket based on the larger of the two class probabilities:
# (0, 0.6] -> Low, (0.6, 0.8] -> Medium, (0.8, 1.0] -> High.
top_probability = tableau_df[['probability_denied', 'probability_approved']].max(axis=1)
tableau_df['confidence_level'] = pd.cut(
    top_probability,
    bins=[0, 0.6, 0.8, 1.0],
    labels=['Low', 'Medium', 'High']
)

def classify_prediction(row):
    """Return the confusion-matrix outcome name for a single prediction row.

    Parameters
    ----------
    row : mapping-like
        Must provide 'actual_loan_status' and 'predicted_loan_status'.

    Returns
    -------
    str
        'True Positive', 'True Negative' or 'False Positive' for the matching
        (actual, predicted) pair; every other combination yields 'False Negative'.
    """
    # (actual, predicted) -> outcome name; pairs not listed fall through to
    # 'False Negative'.
    outcomes = {
        (1, 1): 'True Positive',
        (0, 0): 'True Negative',
        (0, 1): 'False Positive',
    }
    pair = (row['actual_loan_status'], row['predicted_loan_status'])
    return outcomes.get(pair, 'False Negative')

# Label every row with its confusion-matrix outcome by applying
# classify_prediction row by row (axis=1).
tableau_df['prediction_type'] = tableau_df.apply(classify_prediction, axis=1)

# Preview the first rows of the export frame.
tableau_df.head()

Unnamed: 0,person_age,person_name,person_gender,person_education,employment_type,person_income,person_emp_exp,person_home_ownership,bank_name,account_type,...,credit_score,previous_loan_defaults_on_file,actual_loan_status,predicted_loan_status,prediction_correct,prediction_result,probability_denied,probability_approved,confidence_level,prediction_type
10750,25.0,Norma Martin,female,Bachelor,unemployed,84973.0,2.0,MORTGAGE,Morgan Stanley,saving,...,634.0,No,0,0,True,Correct,0.99,0.01,High,True Negative
17512,24.0,Jeffrey Meyer,male,Bachelor,contract,87280.0,2.0,OWN,Wells Fargo,checking,...,610.0,Yes,0,0,True,Correct,1.0,0.0,High,True Negative
17070,22.0,Ashley Espinoza,female,Associate,unemployed,70178.0,0.0,OWN,Goldman Sachs,checking,...,668.0,Yes,0,0,True,Correct,1.0,0.0,High,True Negative
35943,27.0,Thomas Moore,male,Bachelor,contract,176144.0,1.0,MORTGAGE,U.S. Bank,checking,...,591.0,Yes,0,0,True,Correct,1.0,0.0,High,True Negative
15749,26.0,Stacy Webb,female,Bachelor,contract,181548.0,3.0,MORTGAGE,Goldman Sachs,checking,...,643.0,No,1,1,True,Correct,0.17,0.83,High,True Positive


In [17]:
# Write the export frame to CSV without the index column.
# NOTE(review): the exported columns include person_name and bank_name -
# confirm this file is safe to share before publishing it to a dashboard.
tableau_df.to_csv('../data/loan_predictions_tableau.csv', index=False)
# Report the size and the column names of what was written.
print(f"\nSaved: {tableau_df.shape[0]} rows, {tableau_df.shape[1]} columns")
print(f"Columns included: {list(tableau_df.columns)}")


Saved: 9000 rows, 25 columns
Columns included: ['person_age', 'person_name', 'person_gender', 'person_education', 'employment_type', 'person_income', 'person_emp_exp', 'person_home_ownership', 'bank_name', 'account_type', 'loan_amnt', 'loan_intent', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'previous_loan_defaults_on_file', 'actual_loan_status', 'predicted_loan_status', 'prediction_correct', 'prediction_result', 'probability_denied', 'probability_approved', 'confidence_level', 'prediction_type']
